sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #ifdef HAVE_STRING_H
  35 # include <string.h>
  36 #else
  37 # include <strings.h>
  38 #endif
  39 #include <sys/types.h>
  40 #ifdef HAVE_UNISTD_H
  41 # include <unistd.h>
  42 #endif
  43 #include <errno.h>
  44 #include <assert.h>
  45
  46 #include "wget.h"
  47 #include "utils.h"
  48 #include "url.h"
  49 #include "host.h"
  50 #include "hash.h"
  51
  52 #ifndef errno
  53 extern int errno;
  54 #endif
  55
  56 /* Is X "."?  */
  57 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
  58 /* Is X ".."?  */
  59 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  60
/* Static description of one URL scheme handled by this file.  */
struct scheme_data
{
  /* Prefix that identifies the scheme in a URL, e.g. "http://";
     NULL in the terminating table entry.  */
  char *leading_string;
  /* Port reported by scheme_default_port() for this scheme.  */
  int default_port;
  /* Non-zero while url_scheme() is allowed to recognize the scheme;
     cleared by scheme_disable().  */
  int enabled;
};
  67
/* Supported schemes.  url_scheme() returns the index of the matching
   entry cast to enum url_scheme, so the entries presumably have to
   stay in the same order as the enumerators -- confirm against the
   enum declaration before reordering.  */
static struct scheme_data supported_schemes[] =
{
  { "http://",  DEFAULT_HTTP_PORT,  1 },
#ifdef HAVE_SSL
  { "https://", DEFAULT_HTTPS_PORT, 1 },
#endif
  { "ftp://",   DEFAULT_FTP_PORT,   1 },

  /* SCHEME_INVALID; the NULL leading_string also ends the scan in
     url_scheme().  */
  { NULL,       -1,                 0 }
};
  80
/* Forward declarations of static helpers that are referenced before
   their definitions (path_simplify is called from url_parse): */

static char *construct_relative PARAMS ((const char *, const char *));
static int path_simplify PARAMS ((char *));
  85
  86
  87 \f
  88 /* Support for encoding and decoding of URL strings.  We determine
  89    whether a character is unsafe through static table lookup.  This
  90    code assumes ASCII character set and 8-bit chars.  */
  91
  92 enum {
  93   urlchr_reserved = 1,
  94   urlchr_unsafe   = 2
  95 };
  96
  97 #define R  urlchr_reserved
  98 #define U  urlchr_unsafe
  99 #define RU R|U
 100
 101 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
 102
 103 /* rfc1738 reserved chars, preserved from encoding.  */
 104
 105 #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
 106
 107 /* rfc1738 unsafe chars, plus some more.  */
 108
 109 #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 110
 111 const static unsigned char urlchr_table[256] =
 112 {
 113   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 114   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 115   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 116   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 117   U,  0,  U, RU,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 118   0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 119   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 120   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 121  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 122   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 123   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 124   0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 125   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 126   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 127   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 128   0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */
 129
 130   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 131   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 132   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 133   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 134
 135   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 136   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 137   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 138   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 139 };
 140
 141 /* Decodes the forms %xy in a URL to the character the hexadecimal
 142    code of which is xy.  xy are hexadecimal digits from
 143    [0123456789ABCDEF] (case-insensitive).  If x or y are not
 144    hex-digits or `%' precedes `\0', the sequence is inserted
 145    literally.  */
 146
 147 static void
 148 decode_string (char *s)
 149 {
 150   char *t = s;                  /* t - tortoise */
 151   char *h = s;                  /* h - hare     */
 152
 153   for (; *h; h++, t++)
 154     {
 155       if (*h != '%')
 156         {
 157         copychar:
 158           *t = *h;
 159         }
 160       else
 161         {
 162           /* Do nothing if '%' is not followed by two hex digits. */
 163           if (!*(h + 1) || !*(h + 2)
 164               || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
 165             goto copychar;
 166           *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
 167           h += 2;
 168         }
 169     }
 170   *t = '\0';
 171 }
 172
 173 /* Like encode_string, but return S if there are no unsafe chars.  */
 174
 175 static char *
 176 encode_string_maybe (const char *s)
 177 {
 178   const char *p1;
 179   char *p2, *newstr;
 180   int newlen;
 181   int addition = 0;
 182
 183   for (p1 = s; *p1; p1++)
 184     if (UNSAFE_CHAR (*p1))
 185       addition += 2;            /* Two more characters (hex digits) */
 186
 187   if (!addition)
 188     return (char *)s;
 189
 190   newlen = (p1 - s) + addition;
 191   newstr = (char *)xmalloc (newlen + 1);
 192
 193   p1 = s;
 194   p2 = newstr;
 195   while (*p1)
 196     {
 197       if (UNSAFE_CHAR (*p1))
 198         {
 199           unsigned char c = *p1++;
 200           *p2++ = '%';
 201           *p2++ = XDIGIT_TO_XCHAR (c >> 4);
 202           *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
 203         }
 204       else
 205         *p2++ = *p1++;
 206     }
 207   *p2 = '\0';
 208   assert (p2 - newstr == newlen);
 209
 210   return newstr;
 211 }
 212
 213 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
 214    given string, returning a malloc-ed %XX encoded string.  */
 215
 216 char *
 217 encode_string (const char *s)
 218 {
 219   char *encoded = encode_string_maybe (s);
 220   if (encoded != s)
 221     return encoded;
 222   else
 223     return xstrdup (s);
 224 }
 225
 226 /* Encode unsafe characters in PTR to %xx.  If such encoding is done,
 227    the old value of PTR is freed and PTR is made to point to the newly
 228    allocated storage.  */
 229
 230 #define ENCODE(ptr) do {                        \
 231   char *e_new = encode_string_maybe (ptr);      \
 232   if (e_new != ptr)                             \
 233     {                                           \
 234       xfree (ptr);                              \
 235       ptr = e_new;                              \
 236     }                                           \
 237 } while (0)
 238 \f
 239 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
 240
 241 /* Decide whether to encode, decode, or pass through the char at P.
 242    This used to be a macro, but it got a little too convoluted.  */
 243 static inline enum copy_method
 244 decide_copy_method (const char *p)
 245 {
 246   if (*p == '%')
 247     {
 248       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
 249         {
 250           /* %xx sequence: decode it, unless it would decode to an
 251              unsafe or a reserved char; in that case, leave it as
 252              is. */
 253           char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
 254             XCHAR_TO_XDIGIT (*(p + 2));
 255
 256           if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
 257             return CM_PASSTHROUGH;
 258           else
 259             return CM_DECODE;
 260         }
 261       else
 262         /* Garbled %.. sequence: encode `%'. */
 263         return CM_ENCODE;
 264     }
 265   else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
 266     return CM_ENCODE;
 267   else
 268     return CM_PASSTHROUGH;
 269 }
 270
 271 /* Translate a %-quoting (but possibly non-conformant) input string S
 272    into a %-quoting (and conformant) output string.  If no characters
 273    are encoded or decoded, return the same string S; otherwise, return
 274    a freshly allocated string with the new contents.
 275
 276    After a URL has been run through this function, the protocols that
 277    use `%' as the quote character can use the resulting string as-is,
 278    while those that don't call decode_string() to get to the intended
 279    data.  This function is also stable: after an input string is
 280    transformed the first time, all further transformations of the
 281    result yield the same result string.
 282
 283    Let's discuss why this function is needed.
 284
 285    Imagine Wget is to retrieve `http://abc.xyz/abc def'.  Since a raw
 286    space character would mess up the HTTP request, it needs to be
 287    quoted, like this:
 288
 289        GET /abc%20def HTTP/1.0
 290
 291    So it appears that the unsafe chars need to be quoted, as with
 292    encode_string.  But what if we're requested to download
 293    `abc%20def'?  Remember that %-encoding is valid URL syntax, so what
 294    the user meant was a literal space, and he was kind enough to quote
 295    it.  In that case, Wget should obviously leave the `%20' as is, and
 296    send the same request as above.  So in this case we may not call
 297    encode_string.
 298
 299    But what if the requested URI is `abc%20 def'?  If we call
 300    encode_string, we end up with `/abc%2520%20def', which is almost
 301    certainly not intended.  If we don't call encode_string, we are
 302    left with the embedded space and cannot send the request.  What the
 303    user meant was for Wget to request `/abc%20%20def', and this is
 304    where reencode_string kicks in.
 305
 306    Wget used to solve this by first decoding %-quotes, and then
 307    encoding all the "unsafe" characters found in the resulting string.
 308    This was wrong because it didn't preserve certain URL special
 309    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
 310    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
 311    whether we considered `+' reserved (it is).  One of these results
 312    is inevitable because by the second step we would lose information
 313    on whether the `+' was originally encoded or not.  Both results
 314    were wrong because in CGI parameters + means space, while %2B means
 315    literal plus.  reencode_string correctly translates the above to
 316    "a%2B+b", i.e. returns the original string.
 317
 318    This function uses an algorithm proposed by Anon Sricharoenchai:
 319
 320    1. Encode all URL_UNSAFE and the "%" that are not followed by 2
 321       hexdigits.
 322
 323    2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
 324       "+".
 325
 326    ...except that this code conflates the two steps, and decides
 327    whether to encode, decode, or pass through each character in turn.
 328    The function still uses two passes, but their logic is the same --
 329    the first pass exists merely for the sake of allocation.  Another
 330    small difference is that we include `+' to URL_RESERVED.
 331
 332    Anon's test case:
 333
 334    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
 335    ->
 336    "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
 337
 338    Simpler test cases:
 339
 340    "foo bar"         -> "foo%20bar"
 341    "foo%20bar"       -> "foo%20bar"
 342    "foo %20bar"      -> "foo%20%20bar"
 343    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
 344    "foo%25%20bar"    -> "foo%25%20bar"
 345    "foo%2%20bar"     -> "foo%252%20bar"
 346    "foo+bar"         -> "foo+bar"            (plus is reserved!)
 347    "foo%2b+bar"      -> "foo%2b+bar"  */
 348
 349 static char *
 350 reencode_string (const char *s)
 351 {
 352   const char *p1;
 353   char *newstr, *p2;
 354   int oldlen, newlen;
 355
 356   int encode_count = 0;
 357   int decode_count = 0;
 358
 359   /* First, pass through the string to see if there's anything to do,
 360      and to calculate the new length.  */
 361   for (p1 = s; *p1; p1++)
 362     {
 363       switch (decide_copy_method (p1))
 364         {
 365         case CM_ENCODE:
 366           ++encode_count;
 367           break;
 368         case CM_DECODE:
 369           ++decode_count;
 370           break;
 371         case CM_PASSTHROUGH:
 372           break;
 373         }
 374     }
 375
 376   if (!encode_count && !decode_count)
 377     /* The string is good as it is. */
 378     return (char *)s;           /* C const model sucks. */
 379
 380   oldlen = p1 - s;
 381   /* Each encoding adds two characters (hex digits), while each
 382      decoding removes two characters.  */
 383   newlen = oldlen + 2 * (encode_count - decode_count);
 384   newstr = xmalloc (newlen + 1);
 385
 386   p1 = s;
 387   p2 = newstr;
 388
 389   while (*p1)
 390     {
 391       switch (decide_copy_method (p1))
 392         {
 393         case CM_ENCODE:
 394           {
 395             unsigned char c = *p1++;
 396             *p2++ = '%';
 397             *p2++ = XDIGIT_TO_XCHAR (c >> 4);
 398             *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
 399           }
 400           break;
 401         case CM_DECODE:
 402           *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
 403                    + (XCHAR_TO_XDIGIT (*(p1 + 2))));
 404           p1 += 3;              /* skip %xx */
 405           break;
 406         case CM_PASSTHROUGH:
 407           *p2++ = *p1++;
 408         }
 409     }
 410   *p2 = '\0';
 411   assert (p2 - newstr == newlen);
 412   return newstr;
 413 }
 414
 415 /* Run PTR_VAR through reencode_string.  If a new string is consed,
 416    free PTR_VAR and make it point to the new storage.  Obviously,
 417    PTR_VAR needs to be an lvalue.  */
 418
 419 #define REENCODE(ptr_var) do {                  \
 420   char *rf_new = reencode_string (ptr_var);     \
 421   if (rf_new != ptr_var)                        \
 422     {                                           \
 423       xfree (ptr_var);                          \
 424       ptr_var = rf_new;                         \
 425     }                                           \
 426 } while (0)
 427 \f
 428 /* Returns the scheme type if the scheme is supported, or
 429    SCHEME_INVALID if not.  */
 430 enum url_scheme
 431 url_scheme (const char *url)
 432 {
 433   int i;
 434
 435   for (i = 0; supported_schemes[i].leading_string; i++)
 436     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 437                           strlen (supported_schemes[i].leading_string)))
 438       {
 439         if (supported_schemes[i].enabled)
 440           return (enum url_scheme) i;
 441         else
 442           return SCHEME_INVALID;
 443       }
 444
 445   return SCHEME_INVALID;
 446 }
 447
 448 /* Return the number of characters needed to skip the scheme part of
 449    the URL, e.g. `http://'.  If no scheme is found, returns 0.  */
 450 int
 451 url_skip_scheme (const char *url)
 452 {
 453   const char *p = url;
 454
 455   /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
 456      etc. */
 457   while (ISALNUM (*p) || *p == '-' || *p == '+')
 458     ++p;
 459   if (*p != ':')
 460     return 0;
 461   /* Skip ':'. */
 462   ++p;
 463
 464   /* Skip "//" if found. */
 465   if (*p == '/' && *(p + 1) == '/')
 466     p += 2;
 467
 468   return p - url;
 469 }
 470
 471 /* Returns 1 if the URL begins with a scheme (supported or
 472    unsupported), 0 otherwise.  */
 473 int
 474 url_has_scheme (const char *url)
 475 {
 476   const char *p = url;
 477   while (ISALNUM (*p) || *p == '-' || *p == '+')
 478     ++p;
 479   return *p == ':';
 480 }
 481
 482 int
 483 scheme_default_port (enum url_scheme scheme)
 484 {
 485   return supported_schemes[scheme].default_port;
 486 }
 487
 488 void
 489 scheme_disable (enum url_scheme scheme)
 490 {
 491   supported_schemes[scheme].enabled = 0;
 492 }
 493
 494 /* Skip the username and password, if present here.  The function
 495    should be called *not* with the complete URL, but with the part
 496    right after the scheme.
 497
 498    If no username and password are found, return 0.  */
 499 int
 500 url_skip_uname (const char *url)
 501 {
 502   const char *p;
 503
 504   /* Look for '@' that comes before '/' or '?'. */
 505   p = (const char *)strpbrk (url, "/?@");
 506   if (!p || *p != '@')
 507     return 0;
 508
 509   return p - url + 1;
 510 }
 511
 512 static int
 513 parse_uname (const char *str, int len, char **user, char **passwd)
 514 {
 515   char *colon;
 516
 517   if (len == 0)
 518     /* Empty user name not allowed. */
 519     return 0;
 520
 521   colon = memchr (str, ':', len);
 522   if (colon == str)
 523     /* Empty user name again. */
 524     return 0;
 525
 526   if (colon)
 527     {
 528       int pwlen = len - (colon + 1 - str);
 529       *passwd = xmalloc (pwlen + 1);
 530       memcpy (*passwd, colon + 1, pwlen);
 531       (*passwd)[pwlen] = '\0';
 532       len -= pwlen + 1;
 533     }
 534   else
 535     *passwd = NULL;
 536
 537   *user = xmalloc (len + 1);
 538   memcpy (*user, str, len);
 539   (*user)[len] = '\0';
 540
 541   if (*user)
 542     decode_string (*user);
 543   if (*passwd)
 544     decode_string (*passwd);
 545
 546   return 1;
 547 }
 548
 549 /* Used by main.c: detect URLs written using the "shorthand" URL forms
 550    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
 551
 552    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
 553    www.foo.com[:port]            -> http://www.foo.com[:port]
 554
 555    FTP shorthands look like this:
 556
 557    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
 558    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
 559
 560    If the URL needs not or cannot be rewritten, return NULL.  */
 561 char *
 562 rewrite_shorthand_url (const char *url)
 563 {
 564   const char *p;
 565
 566   if (url_has_scheme (url))
 567     return NULL;
 568
 569   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
 570      latter Netscape.  */
 571   for (p = url; *p && *p != ':' && *p != '/'; p++)
 572     ;
 573
 574   if (p == url)
 575     return NULL;
 576
 577   if (*p == ':')
 578     {
 579       const char *pp;
 580       char *res;
 581       /* If the characters after the colon and before the next slash
 582          or end of string are all digits, it's HTTP.  */
 583       int digits = 0;
 584       for (pp = p + 1; ISDIGIT (*pp); pp++)
 585         ++digits;
 586       if (digits > 0 && (*pp == '/' || *pp == '\0'))
 587         goto http;
 588
 589       /* Prepend "ftp://" to the entire URL... */
 590       res = xmalloc (6 + strlen (url) + 1);
 591       sprintf (res, "ftp://%s", url);
 592       /* ...and replace ':' with '/'. */
 593       res[6 + (p - url)] = '/';
 594       return res;
 595     }
 596   else
 597     {
 598       char *res;
 599     http:
 600       /* Just prepend "http://" to what we have. */
 601       res = xmalloc (7 + strlen (url) + 1);
 602       sprintf (res, "http://%s", url);
 603       return res;
 604     }
 605 }
 606 \f
/* Forward declaration: url_parse calls parse_path to fill in u->dir
   and u->file before the function is defined.  */
static void parse_path PARAMS ((const char *, char **, char **));
 608
 609 static char *
 610 strpbrk_or_eos (const char *s, const char *accept)
 611 {
 612   char *p = strpbrk (s, accept);
 613   if (!p)
 614     p = (char *)s + strlen (s);
 615   return p;
 616 }
 617
 618 /* Turn STR into lowercase; return non-zero if a character was
 619    actually changed. */
 620
 621 static int
 622 lowercase_str (char *str)
 623 {
 624   int change = 0;
 625   for (; *str; str++)
 626     if (ISUPPER (*str))
 627       {
 628         change = 1;
 629         *str = TOLOWER (*str);
 630       }
 631   return change;
 632 }
 633
/* Messages for the error codes that url_parse stores through its
   ERROR argument.  Each PE_* code is defined right next to its
   message so that the code and the array index stay in step;
   url_error() indexes this array by the code.  */
static char *parse_errors[] = {
#define PE_NO_ERROR                     0
  "No error",
#define PE_UNSUPPORTED_SCHEME           1
  "Unsupported scheme",
#define PE_EMPTY_HOST                   2
  "Empty host",
#define PE_BAD_PORT_NUMBER              3
  "Bad port number",
#define PE_INVALID_USER_NAME            4
  "Invalid user name",
#define PE_UNTERMINATED_IPV6_ADDRESS    5
  "Unterminated IPv6 numeric address",
#define PE_INVALID_IPV6_ADDRESS         6
  "Invalid char in IPv6 numeric address"
};
 650
/* Store the value V through the pointer P, unless P is NULL.  P is
   evaluated twice.  */
#define SETERR(p, v) do {                       \
  if (p)                                        \
    *(p) = (v);                                 \
} while (0)
 655
 656 /* Parse a URL.
 657
 658    Return a new struct url if successful, NULL on error.  In case of
 659    error, and if ERROR is not NULL, also set *ERROR to the appropriate
 660    error code. */
 661 struct url *
 662 url_parse (const char *url, int *error)
 663 {
 664   struct url *u;
 665   const char *p;
 666   int path_modified, host_modified;
 667
 668   enum url_scheme scheme;
 669
 670   const char *uname_b,     *uname_e;
 671   const char *host_b,      *host_e;
 672   const char *path_b,      *path_e;
 673   const char *params_b,    *params_e;
 674   const char *query_b,     *query_e;
 675   const char *fragment_b,  *fragment_e;
 676
 677   int port;
 678   char *user = NULL, *passwd = NULL;
 679
 680   char *url_encoded;
 681
 682   scheme = url_scheme (url);
 683   if (scheme == SCHEME_INVALID)
 684     {
 685       SETERR (error, PE_UNSUPPORTED_SCHEME);
 686       return NULL;
 687     }
 688
 689   url_encoded = reencode_string (url);
 690   p = url_encoded;
 691
 692   p += strlen (supported_schemes[scheme].leading_string);
 693   uname_b = p;
 694   p += url_skip_uname (p);
 695   uname_e = p;
 696
 697   /* scheme://user:pass@host[:port]... */
 698   /*                    ^              */
 699
 700   /* We attempt to break down the URL into the components path,
 701      params, query, and fragment.  They are ordered like this:
 702
 703        scheme://host[:port][/path][;params][?query][#fragment]  */
 704
 705   params_b   = params_e   = NULL;
 706   query_b    = query_e    = NULL;
 707   fragment_b = fragment_e = NULL;
 708
 709   host_b = p;
 710
 711   if (*p == '[')
 712     {
 713       /* Support http://[::1]/ used by IPv6. */
 714       int invalid = 0;
 715       ++p;
 716       while (1)
 717         {
 718           char c = *p++;
 719           switch (c)
 720             {
 721             case ']':
 722               goto out;
 723             case '\0':
 724               SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS);
 725               return NULL;
 726             case ':': case '.':
 727               break;
 728             default:
 729               if (ISXDIGIT (c))
 730                 break;
 731               invalid = 1;
 732             }
 733         }
 734     out:
 735       if (invalid)
 736         {
 737           SETERR (error, PE_INVALID_IPV6_ADDRESS);
 738           return NULL;
 739         }
 740       /* Don't include brackets in [host_b, host_p). */
 741       ++host_b;
 742       host_e = p - 1;
 743     }
 744   else
 745     {
 746       p = strpbrk_or_eos (p, ":/;?#");
 747       host_e = p;
 748     }
 749
 750   if (host_b == host_e)
 751     {
 752       SETERR (error, PE_EMPTY_HOST);
 753       return NULL;
 754     }
 755
 756   port = scheme_default_port (scheme);
 757   if (*p == ':')
 758     {
 759       const char *port_b, *port_e, *pp;
 760
 761       /* scheme://host:port/tralala */
 762       /*              ^             */
 763       ++p;
 764       port_b = p;
 765       p = strpbrk_or_eos (p, "/;?#");
 766       port_e = p;
 767
 768       if (port_b == port_e)
 769         {
 770           /* http://host:/whatever */
 771           /*             ^         */
 772           SETERR (error, PE_BAD_PORT_NUMBER);
 773           return NULL;
 774         }
 775
 776       for (port = 0, pp = port_b; pp < port_e; pp++)
 777         {
 778           if (!ISDIGIT (*pp))
 779             {
 780               /* http://host:12randomgarbage/blah */
 781               /*               ^                  */
 782               SETERR (error, PE_BAD_PORT_NUMBER);
 783               return NULL;
 784             }
 785           port = 10 * port + (*pp - '0');
 786         }
 787     }
 788
 789   if (*p == '/')
 790     {
 791       ++p;
 792       path_b = p;
 793       p = strpbrk_or_eos (p, ";?#");
 794       path_e = p;
 795     }
 796   else
 797     {
 798       /* Path is not allowed not to exist. */
 799       path_b = path_e = p;
 800     }
 801
 802   if (*p == ';')
 803     {
 804       ++p;
 805       params_b = p;
 806       p = strpbrk_or_eos (p, "?#");
 807       params_e = p;
 808     }
 809   if (*p == '?')
 810     {
 811       ++p;
 812       query_b = p;
 813       p = strpbrk_or_eos (p, "#");
 814       query_e = p;
 815
 816       /* Hack that allows users to use '?' (a wildcard character) in
 817          FTP URLs without it being interpreted as a query string
 818          delimiter.  */
 819       if (scheme == SCHEME_FTP)
 820         {
 821           query_b = query_e = NULL;
 822           path_e = p;
 823         }
 824     }
 825   if (*p == '#')
 826     {
 827       ++p;
 828       fragment_b = p;
 829       p += strlen (p);
 830       fragment_e = p;
 831     }
 832   assert (*p == 0);
 833
 834   if (uname_b != uname_e)
 835     {
 836       /* http://user:pass@host */
 837       /*        ^         ^    */
 838       /*     uname_b   uname_e */
 839       if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
 840         {
 841           SETERR (error, PE_INVALID_USER_NAME);
 842           return NULL;
 843         }
 844     }
 845
 846   u = (struct url *)xmalloc (sizeof (struct url));
 847   memset (u, 0, sizeof (*u));
 848
 849   u->scheme = scheme;
 850   u->host   = strdupdelim (host_b, host_e);
 851   u->port   = port;
 852   u->user   = user;
 853   u->passwd = passwd;
 854
 855   u->path = strdupdelim (path_b, path_e);
 856   path_modified = path_simplify (u->path);
 857   parse_path (u->path, &u->dir, &u->file);
 858
 859   host_modified = lowercase_str (u->host);
 860
 861   if (params_b)
 862     u->params = strdupdelim (params_b, params_e);
 863   if (query_b)
 864     u->query = strdupdelim (query_b, query_e);
 865   if (fragment_b)
 866     u->fragment = strdupdelim (fragment_b, fragment_e);
 867
 868   if (path_modified || u->fragment || host_modified || path_b == path_e)
 869     {
 870       /* If we suspect that a transformation has rendered what
 871          url_string might return different from URL_ENCODED, rebuild
 872          u->url using url_string.  */
 873       u->url = url_string (u, 0);
 874
 875       if (url_encoded != url)
 876         xfree ((char *) url_encoded);
 877     }
 878   else
 879     {
 880       if (url_encoded == url)
 881         u->url    = xstrdup (url);
 882       else
 883         u->url    = url_encoded;
 884     }
 885   url_encoded = NULL;
 886
 887   return u;
 888 }
 889
 890 const char *
 891 url_error (int error_code)
 892 {
 893   assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
 894   return parse_errors[error_code];
 895 }
 896
 897 static void
 898 parse_path (const char *quoted_path, char **dir, char **file)
 899 {
 900   char *path, *last_slash;
 901
 902   STRDUP_ALLOCA (path, quoted_path);
 903   decode_string (path);
 904
 905   last_slash = strrchr (path, '/');
 906   if (!last_slash)
 907     {
 908       *dir = xstrdup ("");
 909       *file = xstrdup (path);
 910     }
 911   else
 912     {
 913       *dir = strdupdelim (path, last_slash);
 914       *file = xstrdup (last_slash + 1);
 915     }
 916 }
 917
 918 /* Note: URL's "full path" is the path with the query string and
 919    params appended.  The "fragment" (#foo) is intentionally ignored,
 920    but that might be changed.  For example, if the original URL was
 921    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
 922    the full path will be "/foo/bar/baz;bullshit?querystring".  */
 923
 924 /* Return the length of the full path, without the terminating
 925    zero.  */
 926
 927 static int
 928 full_path_length (const struct url *url)
 929 {
 930   int len = 0;
 931
 932 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
 933
 934   FROB (path);
 935   FROB (params);
 936   FROB (query);
 937
 938 #undef FROB
 939
 940   return len;
 941 }
 942
 943 /* Write out the full path. */
 944
 945 static void
 946 full_path_write (const struct url *url, char *where)
 947 {
 948 #define FROB(el, chr) do {                      \
 949   char *f_el = url->el;                         \
 950   if (f_el) {                                   \
 951     int l = strlen (f_el);                      \
 952     *where++ = chr;                             \
 953     memcpy (where, f_el, l);                    \
 954     where += l;                                 \
 955   }                                             \
 956 } while (0)
 957
 958   FROB (path, '/');
 959   FROB (params, ';');
 960   FROB (query, '?');
 961
 962 #undef FROB
 963 }
 964
 965 /* Public function for getting the "full path".  E.g. if u->path is
 966    "foo/bar" and u->query is "param=value", full_path will be
 967    "/foo/bar?param=value". */
 968
 969 char *
 970 url_full_path (const struct url *url)
 971 {
 972   int length = full_path_length (url);
 973   char *full_path = (char *)xmalloc(length + 1);
 974
 975   full_path_write (url, full_path);
 976   full_path[length] = '\0';
 977
 978   return full_path;
 979 }
 980
 981 /* Sync u->path and u->url with u->dir and u->file. */
 982
 983 static void
 984 sync_path (struct url *url)
 985 {
 986   char *newpath;
 987
 988   xfree (url->path);
 989
 990   if (!*url->dir)
 991     {
 992       newpath = xstrdup (url->file);
 993       REENCODE (newpath);
 994     }
 995   else
 996     {
 997       int dirlen = strlen (url->dir);
 998       int filelen = strlen (url->file);
 999
1000       newpath = xmalloc (dirlen + 1 + filelen + 1);
1001       memcpy (newpath, url->dir, dirlen);
1002       newpath[dirlen] = '/';
1003       memcpy (newpath + dirlen + 1, url->file, filelen);
1004       newpath[dirlen + 1 + filelen] = '\0';
1005       REENCODE (newpath);
1006     }
1007
1008   url->path = newpath;
1009
1010   /* Synchronize u->url. */
1011   xfree (url->url);
1012   url->url = url_string (url, 0);
1013 }
1014
1015 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1016    This way we can sync u->path and u->url when they get changed.  */
1017
1018 void
1019 url_set_dir (struct url *url, const char *newdir)
1020 {
1021   xfree (url->dir);
1022   url->dir = xstrdup (newdir);
1023   sync_path (url);
1024 }
1025
1026 void
1027 url_set_file (struct url *url, const char *newfile)
1028 {
1029   xfree (url->file);
1030   url->file = xstrdup (newfile);
1031   sync_path (url);
1032 }
1033
1034 void
1035 url_free (struct url *url)
1036 {
1037   xfree (url->host);
1038   xfree (url->path);
1039   xfree (url->url);
1040
1041   FREE_MAYBE (url->params);
1042   FREE_MAYBE (url->query);
1043   FREE_MAYBE (url->fragment);
1044   FREE_MAYBE (url->user);
1045   FREE_MAYBE (url->passwd);
1046
1047   xfree (url->dir);
1048   xfree (url->file);
1049
1050   xfree (url);
1051 }
1052 \f
1053 struct urlpos *
1054 get_urls_file (const char *file)
1055 {
1056   struct file_memory *fm;
1057   struct urlpos *head, *tail;
1058   const char *text, *text_end;
1059
1060   /* Load the file.  */
1061   fm = read_file (file);
1062   if (!fm)
1063     {
1064       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
1065       return NULL;
1066     }
1067   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
1068
1069   head = tail = NULL;
1070   text = fm->content;
1071   text_end = fm->content + fm->length;
1072   while (text < text_end)
1073     {
1074       const char *line_beg = text;
1075       const char *line_end = memchr (text, '\n', text_end - text);
1076       if (!line_end)
1077         line_end = text_end;
1078       else
1079         ++line_end;
1080       text = line_end;
1081
1082       /* Strip whitespace from the beginning and end of line. */
1083       while (line_beg < line_end && ISSPACE (*line_beg))
1084         ++line_beg;
1085       while (line_end > line_beg && ISSPACE (*(line_end - 1)))
1086         --line_end;
1087
1088       if (line_end > line_beg)
1089         {
1090           /* URL is in the [line_beg, line_end) region. */
1091
1092           int up_error_code;
1093           char *url_text;
1094           struct urlpos *entry;
1095           struct url *url;
1096
1097           /* We must copy the URL to a zero-terminated string, and we
1098              can't use alloca because we're in a loop.  *sigh*.  */
1099           url_text = strdupdelim (line_beg, line_end);
1100
1101           if (opt.base_href)
1102             {
1103               /* Merge opt.base_href with URL. */
1104               char *merged = uri_merge (opt.base_href, url_text);
1105               xfree (url_text);
1106               url_text = merged;
1107             }
1108
1109           url = url_parse (url_text, &up_error_code);
1110           if (!url)
1111             {
1112               logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1113                          file, url_text, url_error (up_error_code));
1114               xfree (url_text);
1115               continue;
1116             }
1117           xfree (url_text);
1118
1119           entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1120           memset (entry, 0, sizeof (*entry));
1121           entry->next = NULL;
1122           entry->url = url;
1123
1124           if (!head)
1125             head = entry;
1126           else
1127             tail->next = entry;
1128           tail = entry;
1129         }
1130     }
1131   read_file_free (fm);
1132   return head;
1133 }
1134 \f
1135 /* Free the linked list of urlpos.  */
1136 void
1137 free_urlpos (struct urlpos *l)
1138 {
1139   while (l)
1140     {
1141       struct urlpos *next = l->next;
1142       if (l->url)
1143         url_free (l->url);
1144       FREE_MAYBE (l->local_name);
1145       xfree (l);
1146       l = next;
1147     }
1148 }
1149
1150 /* Rotate FNAME opt.backups times */
1151 void
1152 rotate_backups(const char *fname)
1153 {
1154   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1155   char *from = (char *)alloca (maxlen);
1156   char *to = (char *)alloca (maxlen);
1157   struct stat sb;
1158   int i;
1159
1160   if (stat (fname, &sb) == 0)
1161     if (S_ISREG (sb.st_mode) == 0)
1162       return;
1163
1164   for (i = opt.backups; i > 1; i--)
1165     {
1166       sprintf (from, "%s.%d", fname, i - 1);
1167       sprintf (to, "%s.%d", fname, i);
1168       /* #### This will fail on machines without the rename() system
1169          call.  */
1170       rename (from, to);
1171     }
1172
1173   sprintf (to, "%s.%d", fname, 1);
1174   rename(fname, to);
1175 }
1176
1177 /* Create all the necessary directories for PATH (a file).  Calls
1178    mkdirhier() internally.  */
1179 int
1180 mkalldirs (const char *path)
1181 {
1182   const char *p;
1183   char *t;
1184   struct stat st;
1185   int res;
1186
1187   p = path + strlen (path);
1188   for (; *p != '/' && p != path; p--);
1189   /* Don't create if it's just a file.  */
1190   if ((p == path) && (*p != '/'))
1191     return 0;
1192   t = strdupdelim (path, p);
1193   /* Check whether the directory exists.  */
1194   if ((stat (t, &st) == 0))
1195     {
1196       if (S_ISDIR (st.st_mode))
1197         {
1198           xfree (t);
1199           return 0;
1200         }
1201       else
1202         {
1203           /* If the dir exists as a file name, remove it first.  This
1204              is *only* for Wget to work with buggy old CERN http
1205              servers.  Here is the scenario: When Wget tries to
1206              retrieve a directory without a slash, e.g.
1207              http://foo/bar (bar being a directory), CERN server will
1208              not redirect it too http://foo/bar/ -- it will generate a
1209              directory listing containing links to bar/file1,
1210              bar/file2, etc.  Wget will lose because it saves this
1211              HTML listing to a file `bar', so it cannot create the
1212              directory.  To work around this, if the file of the same
1213              name exists, we just remove it and create the directory
1214              anyway.  */
1215           DEBUGP (("Removing %s because of directory danger!\n", t));
1216           unlink (t);
1217         }
1218     }
1219   res = make_directory (t);
1220   if (res != 0)
1221     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1222   xfree (t);
1223   return res;
1224 }
1225
/* Return the number of '/' characters in the zero-terminated
   string S.  */
static int
count_slashes (const char *s)
{
  int total = 0;
  const char *c;

  for (c = s; *c != '\0'; c++)
    total += (*c == '/');
  return total;
}
1235
1236 /* Return the path name of the URL-equivalent file name, with a
1237    remote-like structure of directories.  */
1238 static char *
1239 mkstruct (const struct url *u)
1240 {
1241   char *dir, *file;
1242   char *res, *dirpref;
1243   int l;
1244
1245   if (opt.cut_dirs)
1246     {
1247       char *ptr = u->dir + (*u->dir == '/');
1248       int slash_count = 1 + count_slashes (ptr);
1249       int cut = MINVAL (opt.cut_dirs, slash_count);
1250       for (; cut && *ptr; ptr++)
1251         if (*ptr == '/')
1252           --cut;
1253       STRDUP_ALLOCA (dir, ptr);
1254     }
1255   else
1256     dir = u->dir + (*u->dir == '/');
1257
1258   /* Check for the true name (or at least a consistent name for saving
1259      to directory) of HOST, reusing the hlist if possible.  */
1260   if (opt.add_hostdir)
1261     {
1262       /* Add dir_prefix and hostname (if required) to the beginning of
1263          dir.  */
1264       dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1265                                 + strlen (u->host)
1266                                 + 1 + numdigit (u->port)
1267                                 + 1);
1268       if (!DOTP (opt.dir_prefix))
1269         sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
1270       else
1271         strcpy (dirpref, u->host);
1272
1273       if (u->port != scheme_default_port (u->scheme))
1274         {
1275           int len = strlen (dirpref);
1276           dirpref[len] = ':';
1277           number_to_string (dirpref + len + 1, u->port);
1278         }
1279     }
1280   else                          /* not add_hostdir */
1281     {
1282       if (!DOTP (opt.dir_prefix))
1283         dirpref = opt.dir_prefix;
1284       else
1285         dirpref = "";
1286     }
1287
1288   /* If there is a prefix, prepend it.  */
1289   if (*dirpref)
1290     {
1291       char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1292       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1293       dir = newdir;
1294     }
1295
1296   l = strlen (dir);
1297   if (l && dir[l - 1] == '/')
1298     dir[l - 1] = '\0';
1299
1300   if (!*u->file)
1301     file = "index.html";
1302   else
1303     file = u->file;
1304
1305   /* Finally, construct the full name.  */
1306   res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
1307                          + 1);
1308   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1309
1310   return res;
1311 }
1312
/* Compose a file name out of BASE, an unescaped file name, and QUERY,
   an escaped query string (or NULL for none).  Unsafe characters in
   BASE are written as %XX, and slashes in QUERY are written as %2F.
   When QUERY is non-NULL it is separated from BASE by '?'.

   The composed name is limited to 255 characters; longer input is
   truncated.  Truncation happens on whole output units, so a %XX
   escape is either emitted completely or not at all.

   Returns a freshly allocated string that the caller must free.  */

static char *
compose_file_name (char *base, char *query)
{
  char result[256];
  char *to = result;
  /* One past the last position that may hold a character; the final
     byte of RESULT is reserved for the terminating zero.  */
  char *limit = result + sizeof (result) - 1;
  char *from;

  /* Copy BASE to RESULT and encode all unsafe characters.  */
  for (from = base; *from; from++)
    {
      if (UNSAFE_CHAR (*from))
        {
          unsigned char c = *from;
          /* An escape needs three bytes; checking for only one free
             byte would let the write run past RESULT.  */
          if (limit - to < 3)
            goto done;
          *to++ = '%';
          *to++ = XDIGIT_TO_XCHAR (c >> 4);
          *to++ = XDIGIT_TO_XCHAR (c & 0xf);
        }
      else
        {
          if (to >= limit)
            goto done;
          *to++ = *from;
        }
    }

  if (query)
    {
      if (to >= limit)
        goto done;
      *to++ = '?';

      /* Copy QUERY to RESULT and encode all '/' characters. */
      for (from = query; *from; from++)
        {
          if (*from == '/')
            {
              if (limit - to < 3)
                goto done;
              *to++ = '%';
              *to++ = '2';
              *to++ = 'F';
            }
          else
            {
              if (to >= limit)
                goto done;
              *to++ = *from;
            }
        }
    }

 done:
  /* TO never exceeds LIMIT, so this store is always in bounds.  Input
     which is too long, presumably due to a huge query string, simply
     ends here.  */
  *to = '\0';

  return xstrdup (result);
}
1369
1370 /* Create a unique filename, corresponding to a given URL.  Calls
1371    mkstruct if necessary.  Does *not* actually create any directories.  */
1372 char *
1373 url_filename (const struct url *u)
1374 {
1375   char *file, *name;
1376
1377   char *query = u->query && *u->query ? u->query : NULL;
1378
1379   if (opt.dirstruct)
1380     {
1381       char *base = mkstruct (u);
1382       file = compose_file_name (base, query);
1383       xfree (base);
1384     }
1385   else
1386     {
1387       char *base = *u->file ? u->file : "index.html";
1388       file = compose_file_name (base, query);
1389
1390       /* Check whether the prefix directory is something other than "."
1391          before prepending it.  */
1392       if (!DOTP (opt.dir_prefix))
1393         {
1394           /* #### should just realloc FILE and prepend dir_prefix. */
1395           char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1396                                          + 1 + strlen (file) + 1);
1397           sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1398           xfree (file);
1399           file = nfile;
1400         }
1401     }
1402
1403   /* DOS-ish file systems don't like `%' signs in them; we change it
1404      to `@'.  */
1405 #ifdef WINDOWS
1406   {
1407     char *p = file;
1408     for (p = file; *p; p++)
1409       if (*p == '%')
1410         *p = '@';
1411   }
1412 #endif /* WINDOWS */
1413
1414   /* Check the cases in which the unique extensions are not used:
1415      1) Clobbering is turned off (-nc).
1416      2) Retrieval with regetting.
1417      3) Timestamping is used.
1418      4) Hierarchy is built.
1419
1420      The exception is the case when file does exist and is a
1421      directory (actually support for bad httpd-s).  */
1422   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1423       && !(file_exists_p (file) && !file_non_directory_p (file)))
1424     return file;
1425
1426   /* Find a unique name.  */
1427   name = unique_name (file);
1428   xfree (file);
1429   return name;
1430 }
1431
/* Return the length of URL's path.  The path is considered to end at
   the first '?', ';' or '#', or at the end of the string when none of
   those is present.  */
static int
path_length (const char *url)
{
  const char *path_end = strpbrk_or_eos (url, "?;#");
  int length = path_end - url;
  return length;
}
1441
1442 /* Find the last occurrence of character C in the range [b, e), or
1443    NULL, if none are present.  This is equivalent to strrchr(b, c),
1444    except that it accepts an END argument instead of requiring the
1445    string to be zero-terminated.  Why is there no memrchr()?  */
1446 static const char *
1447 find_last_char (const char *b, const char *e, char c)
1448 {
1449   for (; e > b; e--)
1450     if (*e == c)
1451       return e;
1452   return NULL;
1453 }
1454 \f
/* Resolve "." and ".." elements of PATH by destructively modifying
   PATH.  "." is resolved by removing that path element, and ".." is
   resolved by removing the preceding path element.  Empty path
   elements (runs of slashes) are collapsed as well.  Leading and
   trailing slashes are preserved.  A ".." that has no preceding
   element to remove is simply dropped.

   Return non-zero if any changes have been made.

   For example, "a/b/c/./../d/.." will yield "a/b/".  If you change
   anything in this function, run test_path_simplify to make sure you
   haven't broken a test case.

   A previous version of this function was based on path_simplify()
   from GNU Bash, but it has been rewritten for Wget 1.8.1.  */

static int
path_simplify (char *path)
{
  int change = 0;
  char *p, *end;

  if (path[0] == '/')
    ++path;                     /* preserve the leading '/'. */

  p = path;
  end = p + strlen (p) + 1;     /* position past the terminating zero. */

  while (1)
    {
      /* P should point to the beginning of a path element. */

      if (*p == '.' && (*(p + 1) == '/' || *(p + 1) == '\0'))
        {
          change = 1;
          if (*(p + 1) == '\0')
            {
              /* A trailing "." just disappears.  */
              *p = '\0';
              break;
            }

          /* Handle "./foo" by moving "foo" two characters to the
             left.  Only the bytes from P+2 up to END exist, so that
             is the amount to move; using END-P here would read two
             bytes beyond the string.  */
          memmove (p, p + 2, end - (p + 2));
          end -= 2;
          continue;
        }
      else if (*p == '.' && *(p + 1) == '.'
               && (*(p + 2) == '/' || *(p + 2) == '\0'))
        {
          /* Handle "../foo" by moving "foo" one path element to the
             left.  */
          char *b = p;          /* not p-1 because P can equal PATH */

          /* Backtrack by one path element, but not past the beginning
             of PATH. */

          /* foo/bar/../baz */
          /*         ^ p    */
          /*     ^ b        */

          if (b > path)
            {
              /* Move backwards until B hits the beginning of the
                 previous path element or the beginning of path. */
              for (--b; b > path && *(b - 1) != '/'; b--)
                ;
            }

          change = 1;
          if (*(p + 2) == '\0')
            {
              /* A trailing ".." cuts the path at the start of the
                 preceding element.  */
              *b = '\0';
              break;
            }

          memmove (b, p + 3, end - (p + 3));
          end -= (p + 3) - b;
          p = b;
          continue;
        }
      else if (*p == '/')
        {
          /* Remove empty path elements.  Not mandated by rfc1808 et
             al, but empty path elements are not all that useful, and
             the rest of Wget might not deal with them well. */
          char *q = p;
          while (*q == '/')
            ++q;
          change = 1;
          if (*q == '\0')
            {
              *p = '\0';
              break;
            }
          memmove (p, q, end - q);
          end -= q - p;
          continue;
        }

      /* Skip to the next path element. */
      while (*p && *p != '/')
        ++p;
      if (*p == '\0')
        break;

      /* Make sure P points to the beginning of the next path element,
         which is location after the slash. */
      ++p;
    }

  return change;
}
1574 \f
/* Allocate and return a zero-terminated string consisting of the
   first SPAN bytes of BASE followed by the first LINKLENGTH bytes of
   LINK.  Either count may be zero.  */
static char *
merge_base_prefix_with_link (const char *base, int span,
                             const char *link, int linklength)
{
  char *constr = (char *)xmalloc (span + linklength + 1);
  if (span)
    memcpy (constr, base, span);
  if (linklength)
    memcpy (constr + span, link, linklength);
  constr[span + linklength] = '\0';
  return constr;
}

/* Resolve the result of "linking" a base URI (BASE) to a
   link-specified URI (LINK).

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to behave "reasonably" in all
   foreseeable cases.  It employs little specific knowledge about
   schemes or URL-specific stuff -- it just works on strings.

   BASE must be zero-terminated.  LINK is LINKLENGTH bytes long and
   need not be zero-terminated.  NO_SCHEME is non-zero when LINK does
   not start with a scheme and therefore has to be merged with BASE;
   when it is zero, the result is simply a copy of LINK.
   See uri_merge for a gentler interface to this functionality.

   The result is freshly allocated and owned by the caller.

   Perhaps this function should call path_simplify so that the callers
   don't have to call url_parse unconditionally.  */
static char *
uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
{
  const char *end;

  if (!no_scheme)
    /* LINK has its own scheme: nothing to merge.  */
    return strdupdelim (link, link + linklength);

  /* Empty LINK points back to BASE, query string and all.  The length
     is tested first because a zero-length LINK that is not
     zero-terminated has no first character to look at.  */
  if (linklength == 0 || !*link)
    return xstrdup (base);

  /* END marks where the path part of BASE stops.  */
  end = base + path_length (base);

  if (*link == '?')
    {
      /* LINK points to the same location, but changes the query
         string.  Examples: */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      return merge_base_prefix_with_link (base, end - base, link, linklength);
    }

  if (*link == '#')
    {
      /* uri_merge("path",         "#new") -> "path#new"     */
      /* uri_merge("path#foo",     "#new") -> "path#new"     */
      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
      const char *end1 = strchr (base, '#');
      if (!end1)
        end1 = base + strlen (base);
      return merge_base_prefix_with_link (base, end1 - base, link, linklength);
    }

  if (linklength > 1 && *link == '/' && *(link + 1) == '/')
    {
      /* LINK begins with "//" and so is a net path: we need to
         replace everything after (and including) the double slash
         with LINK. */

      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

      /* Look for first slash.  If it starts a double slash, replace
         from that point, else replace from the beginning.  */
      const char *slash = memchr (base, '/', end - base);
      const char *start_insert
        = (slash && *(slash + 1) == '/') ? slash : base;

      return merge_base_prefix_with_link (base, start_insert - base,
                                          link, linklength);
    }

  if (*link == '/')
    {
      /* LINK is an absolute path: we need to replace everything
         after (and including) the FIRST slash with LINK.

         So, if BASE is "http://host/whatever/foo/bar", and LINK is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      const char *start_insert;
      int seen_slash_slash = 0;

      /* We're looking for the first slash, but want to ignore a
         double slash: if the first slash found begins "//", search
         again from just past it.  */
      const char *slash = memchr (base, '/', end - base);
      if (slash && *(slash + 1) == '/')
        {
          seen_slash_slash = 1;
          slash = memchr (slash + 2, '/', end - (slash + 2));
        }

      /* START_INSERT is the location where LINK will be inserted;
         keep in mind that LINK begins with '/'.

         no "//":            "foo" or "foo/bar"    -> start of BASE
         "//", no slash:     "http://foo"          -> end of path
         "//", then slash:   "http://something/"   -> that slash  */
      if (!seen_slash_slash)
        start_insert = base;
      else if (slash)
        start_insert = slash;
      else
        start_insert = end;

      return merge_base_prefix_with_link (base, start_insert - base,
                                          link, linklength);
    }

  {
    /* LINK is a relative URL: we need to replace everything
       after last slash (possibly empty) with LINK.

       So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
       our result should be "whatever/foo/qux/xyzzy".  */
    int need_explicit_slash = 0;
    int span;
    char *constr;
    const char *start_insert;
    const char *last_slash = find_last_char (base, end, '/');

    if (!last_slash
        || (last_slash != base && *(last_slash - 1) == '/'))
      {
        /* Either no slash was found at all (example: "foo?..."), or
           the last slash belongs to a "//" (example: "http://host").
           Append LINK to what we have, but we'll need a slash as a
           separator: with base == "foo" and link == "qux/xyzzy" we
           want "foo/qux/xyzzy", not "fooqux/xyzzy".

           START_INSERT is set to end + 1 so that the length
           calculations account for one more (slash) character.
           Accessing that character of BASE is fine, since it is the
           delimiter, '\0' or '?'; it gets overwritten by '/' in the
           result.  */
        start_insert = end + 1;
        need_explicit_slash = 1;
      }
    else
      {
        /* example: "whatever/foo/bar" */
        /*                        ^    */
        start_insert = last_slash + 1;
      }

    span = start_insert - base;
    constr = merge_base_prefix_with_link (base, span, link, linklength);
    if (need_explicit_slash)
      constr[span - 1] = '/';
    return constr;
  }
}
1782
/* Merge BASE with the zero-terminated LINK and return the resulting
   URI.  This is a convenience wrapper around uri_merge_1 that works
   out LINK's length and whether it lacks a scheme.  */
char *
uri_merge (const char *base, const char *link)
{
  int link_len = strlen (link);
  int lacks_scheme = !url_has_scheme (link);

  return uri_merge_1 (base, link, link_len, lacks_scheme);
}
1791 \f
/* Copy the zero-terminated string S to the location P points to and
   advance P past the copied characters.  No terminating zero is
   written.  P must be a modifiable pointer lvalue.  S is evaluated
   exactly once, so it may be an expression with side effects, and
   the macro's temporaries are named so that they are unlikely to
   capture a caller's variable.  */
#define APPEND(p, s) do {                       \
  const char *append_src_ = (s);                \
  int append_len_ = strlen (append_src_);       \
  memcpy ((p), append_src_, append_len_);       \
  (p) += append_len_;                           \
} while (0)

/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  */
#define HIDDEN_PASSWORD "*password*"
1803
1804 /* Recreate the URL string from the data in URL.
1805
1806    If HIDE is non-zero (as it is when we're calling this on a URL we
1807    plan to print, but not when calling it to canonicalize a URL for
1808    use within the program), password will be hidden.  Unsafe
1809    characters in the URL will be quoted.  */
1810
1811 char *
1812 url_string (const struct url *url, int hide_password)
1813 {
1814   int size;
1815   char *result, *p;
1816   char *quoted_user = NULL, *quoted_passwd = NULL;
1817
1818   int scheme_port  = supported_schemes[url->scheme].default_port;
1819   char *scheme_str = supported_schemes[url->scheme].leading_string;
1820   int fplen = full_path_length (url);
1821
1822   int brackets_around_host = 0;
1823
1824   assert (scheme_str != NULL);
1825
1826   /* Make sure the user name and password are quoted. */
1827   if (url->user)
1828     {
1829       quoted_user = encode_string_maybe (url->user);
1830       if (url->passwd)
1831         {
1832           if (hide_password)
1833             quoted_passwd = HIDDEN_PASSWORD;
1834           else
1835             quoted_passwd = encode_string_maybe (url->passwd);
1836         }
1837     }
1838
1839   if (strchr (url->host, ':'))
1840     brackets_around_host = 1;
1841
1842   size = (strlen (scheme_str)
1843           + strlen (url->host)
1844           + (brackets_around_host ? 2 : 0)
1845           + fplen
1846           + 1);
1847   if (url->port != scheme_port)
1848     size += 1 + numdigit (url->port);
1849   if (quoted_user)
1850     {
1851       size += 1 + strlen (quoted_user);
1852       if (quoted_passwd)
1853         size += 1 + strlen (quoted_passwd);
1854     }
1855
1856   p = result = xmalloc (size);
1857
1858   APPEND (p, scheme_str);
1859   if (quoted_user)
1860     {
1861       APPEND (p, quoted_user);
1862       if (quoted_passwd)
1863         {
1864           *p++ = ':';
1865           APPEND (p, quoted_passwd);
1866         }
1867       *p++ = '@';
1868     }
1869
1870   if (brackets_around_host)
1871     *p++ = '[';
1872   APPEND (p, url->host);
1873   if (brackets_around_host)
1874     *p++ = ']';
1875   if (url->port != scheme_port)
1876     {
1877       *p++ = ':';
1878       p = number_to_string (p, url->port);
1879     }
1880
1881   full_path_write (url, p);
1882   p += fplen;
1883   *p++ = '\0';
1884
1885   assert (p - result == size);
1886
1887   if (quoted_user && quoted_user != url->user)
1888     xfree (quoted_user);
1889   if (quoted_passwd && !hide_password
1890       && quoted_passwd != url->passwd)
1891     xfree (quoted_passwd);
1892
1893   return result;
1894 }
1895 \f
1896 /* Return the URL of the proxy appropriate for url U.  */
1897 char *
1898 getproxy (struct url *u)
1899 {
1900   char *proxy = NULL;
1901   char *rewritten_url;
1902   static char rewritten_storage[1024];
1903
1904   if (!opt.use_proxy)
1905     return NULL;
1906   if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
1907     return NULL;
1908
1909   switch (u->scheme)
1910     {
1911     case SCHEME_HTTP:
1912       proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1913       break;
1914 #ifdef HAVE_SSL
1915     case SCHEME_HTTPS:
1916       proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1917       break;
1918 #endif
1919     case SCHEME_FTP:
1920       proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1921       break;
1922     case SCHEME_INVALID:
1923       break;
1924     }
1925   if (!proxy || !*proxy)
1926     return NULL;
1927
1928   /* Handle shorthands.  `rewritten_storage' is a kludge to allow
1929      getproxy() to return static storage. */
1930   rewritten_url = rewrite_shorthand_url (proxy);
1931   if (rewritten_url)
1932     {
1933       strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
1934       rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1935       proxy = rewritten_storage;
1936     }
1937
1938   return proxy;
1939 }
1940
/* Should HOST be accessed through the proxy, as far as the NO_PROXY
   list is concerned?  Returns 1 when NO_PROXY is NULL, and otherwise
   the negation of sufmatch (NO_PROXY, HOST).  */
int
no_proxy_match (const char *host, const char **no_proxy)
{
  return no_proxy ? !sufmatch (no_proxy, host) : 1;
}
1950 \f
1951 /* Support for converting links for local viewing in downloaded HTML
1952    files.  This should be moved to another file, because it has
1953    nothing to do with processing URLs.  */
1954
1955 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1956 static const char *replace_attr PARAMS ((const char *, int, FILE *,
1957                                          const char *));
1958 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
1959                                                       const char *, int));
1960 static char *local_quote_string PARAMS ((const char *));
1961
1962 /* Change the links in one HTML file.  LINKS is a list of links in the
1963    document, along with their positions and the desired direction of
1964    the conversion.  */
1965 void
1966 convert_links (const char *file, struct urlpos *links)
1967 {
1968   struct file_memory *fm;
1969   FILE *fp;
1970   const char *p;
1971   downloaded_file_t downloaded_file_return;
1972
1973   struct urlpos *link;
1974   int to_url_count = 0, to_file_count = 0;
1975
1976   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1977
1978   {
1979     /* First we do a "dry run": go through the list L and see whether
1980        any URL needs to be converted in the first place.  If not, just
1981        leave the file alone.  */
1982     int dry_count = 0;
1983     struct urlpos *dry = links;
1984     for (dry = links; dry; dry = dry->next)
1985       if (dry->convert != CO_NOCONVERT)
1986         ++dry_count;
1987     if (!dry_count)
1988       {
1989         logputs (LOG_VERBOSE, _("nothing to do.\n"));
1990         return;
1991       }
1992   }
1993
1994   fm = read_file (file);
1995   if (!fm)
1996     {
1997       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1998                  file, strerror (errno));
1999       return;
2000     }
2001
2002   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
2003   if (opt.backup_converted && downloaded_file_return)
2004     write_backup_file (file, downloaded_file_return);
2005
2006   /* Before opening the file for writing, unlink the file.  This is
2007      important if the data in FM is mmaped.  In such case, nulling the
2008      file, which is what fopen() below does, would make us read all
2009      zeroes from the mmaped region.  */
2010   if (unlink (file) < 0 && errno != ENOENT)
2011     {
2012       logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
2013                  file, strerror (errno));
2014       read_file_free (fm);
2015       return;
2016     }
2017   /* Now open the file for writing.  */
2018   fp = fopen (file, "wb");
2019   if (!fp)
2020     {
2021       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2022                  file, strerror (errno));
2023       read_file_free (fm);
2024       return;
2025     }
2026
2027   /* Here we loop through all the URLs in file, replacing those of
2028      them that are downloaded with relative references.  */
2029   p = fm->content;
2030   for (link = links; link; link = link->next)
2031     {
2032       char *url_start = fm->content + link->pos;
2033
2034       if (link->pos >= fm->length)
2035         {
2036           DEBUGP (("Something strange is going on.  Please investigate."));
2037           break;
2038         }
2039       /* If the URL is not to be converted, skip it.  */
2040       if (link->convert == CO_NOCONVERT)
2041         {
2042           DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
2043           continue;
2044         }
2045
2046       /* Echo the file contents, up to the offending URL's opening
2047          quote, to the outfile.  */
2048       fwrite (p, 1, url_start - p, fp);
2049       p = url_start;
2050
2051       switch (link->convert)
2052         {
2053         case CO_CONVERT_TO_RELATIVE:
2054           /* Convert absolute URL to relative. */
2055           {
2056             char *newname = construct_relative (file, link->local_name);
2057             char *quoted_newname = local_quote_string (newname);
2058
2059             if (!link->link_refresh_p)
2060               p = replace_attr (p, link->size, fp, quoted_newname);
2061             else
2062               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
2063                                              link->refresh_timeout);
2064
2065             DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
2066                      link->url->url, newname, link->pos, file));
2067             xfree (newname);
2068             xfree (quoted_newname);
2069             ++to_file_count;
2070             break;
2071           }
2072         case CO_CONVERT_TO_COMPLETE:
2073           /* Convert the link to absolute URL. */
2074           {
2075             char *newlink = link->url->url;
2076             char *quoted_newlink = html_quote_string (newlink);
2077
2078             if (!link->link_refresh_p)
2079               p = replace_attr (p, link->size, fp, quoted_newlink);
2080             else
2081               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
2082                                              link->refresh_timeout);
2083
2084             DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
2085                      newlink, link->pos, file));
2086             xfree (quoted_newlink);
2087             ++to_url_count;
2088             break;
2089           }
2090         case CO_NULLIFY_BASE:
2091           /* Change the base href to "". */
2092           p = replace_attr (p, link->size, fp, "");
2093           break;
2094         case CO_NOCONVERT:
2095           abort ();
2096           break;
2097         }
2098     }
2099
2100   /* Output the rest of the file. */
2101   if (p - fm->content < fm->length)
2102     fwrite (p, 1, fm->length - (p - fm->content), fp);
2103   fclose (fp);
2104   read_file_free (fm);
2105
2106   logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
2107 }
2108
/* Build a relative link, in freshly allocated memory, that leads from
   the local file S1 (the referrer) to the local file S2 (the
   referred-to file).

   Examples:
     S1 = "jagor.srce.hr/index.html", S2 = "jagor.srce.hr/images/news.gif"
       => "images/news.gif"
     S1 = "fly.cc.fer.hr/ioccc/index.html", S2 = "fly.cc.fer.hr/images/fly.gif"
       => "../images/fly.gif"

   If S2 is absolute (starts with '/'), a plain copy of S2 is
   returned.  Otherwise S1 must not be absolute either; this is
   asserted.  S1 is expected to be free of ".." and similar components
   -- construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") gives a wrong answer.  Running something like
   path_simplify() on S1 beforehand avoids that.  */
static char *
construct_relative (const char *s1, const char *s2)
{
  int pos = 0;                  /* scan position, shared by S1 and S2 */
  int common = 0;               /* offset just past the last shared '/' */
  int updirs = 0;               /* number of "../" steps needed */
  int k;
  char *result, *out;

  if (s2[0] == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (s1[0] != '/');

  /* Walk over the leading directories the two names have in common,
     one path component per iteration.  */
  for (;;)
    {
      while (s1[pos] != '\0'
             && s1[pos] == s2[pos]
             && s1[pos] != '/')
        ++pos;
      if (s1[pos] != '/' || s2[pos] != '/')
        break;
      common = ++pos;
    }

  /* Every separator left in S1 is a directory we must climb out of.  */
  for (; s1[pos] != '\0'; ++pos)
    if (s1[pos] == '/')
      ++updirs;

  /* The result is "../" repeated UPDIRS times, followed by the part
     of S2 that is not shared with S1.  */
  result = (char *) xmalloc (3 * updirs + strlen (s2 + common) + 1);
  out = result;
  for (k = 0; k < updirs; k++)
    {
      memcpy (out, "../", 3);
      out += 3;
    }
  strcpy (out, s2 + common);
  return result;
}
2162 \f
2163 static void
2164 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
2165 {
2166   /* Rather than just writing over the original .html file with the
2167      converted version, save the former to *.orig.  Note we only do
2168      this for files we've _successfully_ downloaded, so we don't
2169      clobber .orig files sitting around from previous invocations. */
2170
2171   /* Construct the backup filename as the original name plus ".orig". */
2172   size_t         filename_len = strlen(file);
2173   char*          filename_plus_orig_suffix;
2174   boolean        already_wrote_backup_file = FALSE;
2175   slist*         converted_file_ptr;
2176   static slist*  converted_files = NULL;
2177
2178   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
2179     {
2180       /* Just write "orig" over "html".  We need to do it this way
2181          because when we're checking to see if we've downloaded the
2182          file before (to see if we can skip downloading it), we don't
2183          know if it's a text/html file.  Therefore we don't know yet
2184          at that stage that -E is going to cause us to tack on
2185          ".html", so we need to compare vs. the original URL plus
2186          ".orig", not the original URL plus ".html.orig". */
2187       filename_plus_orig_suffix = alloca (filename_len + 1);
2188       strcpy(filename_plus_orig_suffix, file);
2189       strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
2190     }
2191   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
2192     {
2193       /* Append ".orig" to the name. */
2194       filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
2195       strcpy(filename_plus_orig_suffix, file);
2196       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
2197     }
2198
2199   /* We can get called twice on the same URL thanks to the
2200      convert_all_links() call in main().  If we write the .orig file
2201      each time in such a case, it'll end up containing the first-pass
2202      conversion, not the original file.  So, see if we've already been
2203      called on this file. */
2204   converted_file_ptr = converted_files;
2205   while (converted_file_ptr != NULL)
2206     if (strcmp(converted_file_ptr->string, file) == 0)
2207       {
2208         already_wrote_backup_file = TRUE;
2209         break;
2210       }
2211     else
2212       converted_file_ptr = converted_file_ptr->next;
2213
2214   if (!already_wrote_backup_file)
2215     {
2216       /* Rename <file> to <file>.orig before former gets written over. */
2217       if (rename(file, filename_plus_orig_suffix) != 0)
2218         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
2219                    file, filename_plus_orig_suffix, strerror (errno));
2220
2221       /* Remember that we've already written a .orig backup for this file.
2222          Note that we never free this memory since we need it till the
2223          convert_all_links() call, which is one of the last things the
2224          program does before terminating.  BTW, I'm not sure if it would be
2225          safe to just set 'converted_file_ptr->string' to 'file' below,
2226          rather than making a copy of the string...  Another note is that I
2227          thought I could just add a field to the urlpos structure saying
2228          that we'd written a .orig file for this URL, but that didn't work,
2229          so I had to make this separate list.
2230          -- Dan Harkless <wget@harkless.org>
2231
2232          This [adding a field to the urlpos structure] didn't work
2233          because convert_file() is called from convert_all_links at
2234          the end of the retrieval with a freshly built new urlpos
2235          list.
2236          -- Hrvoje Niksic <hniksic@arsdigita.com>
2237       */
2238       converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2239       converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
2240       converted_file_ptr->next = converted_files;
2241       converted_files = converted_file_ptr;
2242     }
2243 }
2244
2245 static int find_fragment PARAMS ((const char *, int, const char **,
2246                                   const char **));
2247
/* Write NEW_TEXT to FP in place of the attribute value that starts at
   P and spans SIZE bytes, and return the position just past that
   value in the input.

   The old value is either quoted, in which case SIZE includes both
   quote characters, or bare:

       "...old-contents..."        ...old-contents...
       <---    size    --->        <---    size   -->

   The output is always quoted: with the original quote character if
   there was one, with a double quote otherwise.  A fragment
   identifier found in the old value by find_fragment() is appended
   after NEW_TEXT.  */

static const char *
replace_attr (const char *p, int size, FILE *fp, const char *new_text)
{
  const char *frag_start, *frag_stop;
  const char *value = p;        /* old value without its opening quote */
  int value_len = size;         /* old value length without quotes */
  int was_quoted = (*p == '\"' || *p == '\'');
  char delim = was_quoted ? *p : '\"';

  if (was_quoted)
    {
      ++value;
      value_len -= 2;           /* disregard opening and closing quote */
    }

  putc (delim, fp);
  fputs (new_text, fp);
  /* Carry over the fragment identifier, if any. */
  if (find_fragment (value, value_len, &frag_start, &frag_stop))
    fwrite (frag_start, 1, frag_stop - frag_start, fp);
  putc (delim, fp);

  /* Step over the value and, when present, its closing quote. */
  return value + value_len + (was_quoted ? 1 : 0);
}
2286
/* Variant of replace_attr() for
   <meta http-equiv=refresh content="...">: the replacement written to
   FP is "TIMEOUT; URL=NEW_TEXT" rather than NEW_TEXT alone.  P, SIZE
   and the return value are as for replace_attr().  */

static const char *
replace_attr_refresh_hack (const char *p, int size, FILE *fp,
                           const char *new_text, int timeout)
{
  /* Room for the digits of TIMEOUT, the literal "; URL=", NEW_TEXT
     and the terminating NUL.  */
  char *content = (char *)alloca (numdigit (timeout)
                                  + (sizeof ("; URL=") - 1)
                                  + strlen (new_text)
                                  + 1);

  sprintf (content, "%d; URL=%s", timeout, new_text);
  return replace_attr (p, size, fp, content);
}
2304
/* Locate the first '#' in [BEG, BEG+SIZE) that does not directly
   follow a '&'.  On success return 1, store the address of that '#'
   in *BP and the end of the scanned region (BEG+SIZE) in *EP.  If
   there is no such character, return 0 and leave *BP and *EP alone.

   A '#' right after '&' is skipped so that numeric character
   references such as "&#38;" are not mistaken for fragments.

   This is used for finding the fragment identifiers in URLs.  */

static int
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *limit = beg + size;
  const char *cur;
  int after_amp = 0;            /* was the previous character '&'? */

  for (cur = beg; cur < limit; cur++)
    {
      if (*cur == '#' && !after_amp)
        {
          *bp = cur;
          *ep = limit;
          return 1;
        }
      /* Only an immediately preceding '&' protects a '#'. */
      after_amp = (*cur == '&');
    }
  return 0;
}
2338
2339 /* Quote FILE for use as local reference to an HTML file.
2340
2341    We quote ? as %3F to avoid passing part of the file name as the
2342    parameter when browsing the converted file through HTTP.  However,
2343    it is safe to do this only when `--html-extension' is turned on.
2344    This is because converting "index.html?foo=bar" to
2345    "index.html%3Ffoo=bar" would break local browsing, as the latter
2346    isn't even recognized as an HTML file!  However, converting
2347    "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
2348    safe for both local and HTTP-served browsing.  */
2349
2350 static char *
2351 local_quote_string (const char *file)
2352 {
2353   const char *file_sans_qmark;
2354   int qm;
2355
2356   if (!opt.html_extension)
2357     return html_quote_string (file);
2358
2359   qm = count_char (file, '?');
2360
2361   if (qm)
2362     {
2363       const char *from = file;
2364       char *to, *newname;
2365
2366       /* qm * 2 because we replace each question mark with "%3F",
2367          i.e. replace one char with three, hence two more.  */
2368       int fsqlen = strlen (file) + qm * 2;
2369
2370       to = newname = (char *)alloca (fsqlen + 1);
2371       for (; *from; from++)
2372         {
2373           if (*from != '?')
2374             *to++ = *from;
2375           else
2376             {
2377               *to++ = '%';
2378               *to++ = '3';
2379               *to++ = 'F';
2380             }
2381         }
2382       assert (to - newname == fsqlen);
2383       *to = '\0';
2384
2385       file_sans_qmark = newname;
2386     }
2387   else
2388     file_sans_qmark = file;
2389
2390   return html_quote_string (file_sans_qmark);
2391 }
2392
2393 /* We're storing "modes" of type downloaded_file_t in the hash table.
2394    However, our hash tables only accept pointers for keys and values.
2395    So when we need a pointer, we use the address of a
2396    downloaded_file_t variable of static storage.  */
2397
2398 static downloaded_file_t *
2399 downloaded_mode_to_ptr (downloaded_file_t mode)
2400 {
2401   static downloaded_file_t
2402     v1 = FILE_NOT_ALREADY_DOWNLOADED,
2403     v2 = FILE_DOWNLOADED_NORMALLY,
2404     v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2405     v4 = CHECK_FOR_FILE;
2406
2407   switch (mode)
2408     {
2409     case FILE_NOT_ALREADY_DOWNLOADED:
2410       return &v1;
2411     case FILE_DOWNLOADED_NORMALLY:
2412       return &v2;
2413     case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2414       return &v3;
2415     case CHECK_FOR_FILE:
2416       return &v4;
2417     }
2418   return NULL;
2419 }
2420
/* This should really be merged with dl_file_url_map and
   downloaded_html_files in recur.c.  This was originally a list, but
   I changed it to a hash table because it was actually taking a lot
   of time to find things in it.  */
2425
2426 static struct hash_table *downloaded_files_hash;
2427
2428 /* Remembers which files have been downloaded.  In the standard case, should be
2429    called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2430    download successfully (i.e. not for ones we have failures on or that we skip
2431    due to -N).
2432
2433    When we've downloaded a file and tacked on a ".html" extension due to -E,
2434    call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2435    FILE_DOWNLOADED_NORMALLY.
2436
2437    If you just want to check if a file has been previously added without adding
2438    it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
2439    with local filenames, not remote URLs. */
2440 downloaded_file_t
2441 downloaded_file (downloaded_file_t mode, const char *file)
2442 {
2443   downloaded_file_t *ptr;
2444
2445   if (mode == CHECK_FOR_FILE)
2446     {
2447       if (!downloaded_files_hash)
2448         return FILE_NOT_ALREADY_DOWNLOADED;
2449       ptr = hash_table_get (downloaded_files_hash, file);
2450       if (!ptr)
2451         return FILE_NOT_ALREADY_DOWNLOADED;
2452       return *ptr;
2453     }
2454
2455   if (!downloaded_files_hash)
2456     downloaded_files_hash = make_string_hash_table (0);
2457
2458   ptr = hash_table_get (downloaded_files_hash, file);
2459   if (ptr)
2460     return *ptr;
2461
2462   ptr = downloaded_mode_to_ptr (mode);
2463   hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2464
2465   return FILE_NOT_ALREADY_DOWNLOADED;
2466 }
2467
/* Callback handed to hash_table_map() by downloaded_files_free().
   Releases KEY, which downloaded_file() allocated with xstrdup().
   VALUE and IGNORED are not used.  Always returns 0.  */
static int
df_free_mapper (void *key, void *value, void *ignored)
{
  xfree (key);

  /* NOTE(review): 0 presumably tells hash_table_map() to go on with
     the next entry -- confirm against hash.c.  */
  return 0;
}
2474
2475 void
2476 downloaded_files_free (void)
2477 {
2478   if (downloaded_files_hash)
2479     {
2480       hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2481       hash_table_destroy (downloaded_files_hash);
2482       downloaded_files_hash = NULL;
2483     }
2484 }
2485
2486 /* Return non-zero if scheme a is similar to scheme b.
2487
2488    Schemes are similar if they are equal.  If SSL is supported, schemes
2489    are also similar if one is http (SCHEME_HTTP) and the other is https
2490    (SCHEME_HTTPS).  */
2491 int
2492 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2493 {
2494   if (a == b)
2495     return 1;
2496 #ifdef HAVE_SSL
2497   if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2498       || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2499     return 1;
2500 #endif
2501   return 0;
2502 }
2503 \f
2504 #if 0
2505 /* Debugging and testing support for path_simplify. */
2506
/* Debugger helper: apply path_simplify to a private copy of PATH and
   return that copy, leaving PATH itself unmodified.  The result comes
   from xstrdup.  */
static char *
ps (char *path)
{
  char *simplified = xstrdup (path);

  path_simplify (simplified);
  return simplified;
}
2516
/* Run path_simplify on a copy of TEST and report, on stdout, any
   deviation from what is expected: the simplified string must equal
   EXPECTED_RESULT, and path_simplify's return value must equal
   EXPECTED_CHANGE (1 if the path should be modified, 0 if not).
   Nothing is printed when the test passes.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *test_copy = xstrdup (test);
  int modified = path_simplify (test_copy);

  if (0 != strcmp (test_copy, expected_result))
    {
      printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
              test, expected_result, test_copy);
    }
  if (modified != expected_change)
    {
      /* Report what was expected.  The two messages used to be
         swapped, so a missing modification was reported as
         "Expected no modification" and vice versa.  */
      if (expected_change == 1)
        printf ("Expected modification with path_simplify(\"%s\").\n",
                test);
      else
        printf ("Expected no modification with path_simplify(\"%s\").\n",
                test);
    }
  xfree (test_copy);
}
2539
/* Exercise path_simplify through run_test with a fixed table of
   cases.  Every case is run twice: first as written, then with a '/'
   prepended to both the input and the expected output, to show that a
   leading slash is preserved.  */
static void
test_path_simplify (void)
{
  static struct {
    char *input, *expected;
    int modifies;
  } cases[] = {
    { "",               "",             0 },
    { ".",              "",             1 },
    { "..",             "",             1 },
    { "foo",            "foo",          0 },
    { "foo/bar",        "foo/bar",      0 },
    { "foo///bar",      "foo/bar",      1 },
    { "foo/.",          "foo/",         1 },
    { "foo/./",         "foo/",         1 },
    { "foo./",          "foo./",        0 },
    { "foo/../bar",     "bar",          1 },
    { "foo/../bar/",    "bar/",         1 },
    { "foo/bar/..",     "foo/",         1 },
    { "foo/bar/../x",   "foo/x",        1 },
    { "foo/bar/../x/",  "foo/x/",       1 },
    { "foo/..",         "",             1 },
    { "foo/../..",      "",             1 },
    { "a/b/../../c",    "c",            1 },
    { "./a/../b",       "b",            1 }
  };
  int idx;

  /* First pass: the cases exactly as listed.  */
  for (idx = 0; idx < ARRAY_SIZE (cases); idx++)
    run_test (cases[idx].input, cases[idx].expected, cases[idx].modifies);

  /* Second pass: the same cases with a leading slash on both the
     input and the expected result.  */
  for (idx = 0; idx < ARRAY_SIZE (cases); idx++)
    {
      /* One byte for the slash, one for the terminating NUL.  */
      char *slashed_input = xmalloc (1 + strlen (cases[idx].input) + 1);
      char *slashed_expected;

      sprintf (slashed_input, "/%s", cases[idx].input);

      slashed_expected = xmalloc (1 + strlen (cases[idx].expected) + 1);
      sprintf (slashed_expected, "/%s", cases[idx].expected);

      run_test (slashed_input, slashed_expected, cases[idx].modifies);

      xfree (slashed_input);
      xfree (slashed_expected);
    }
}
2595 #endif