sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with this program; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif
  29 #include <ctype.h>
  30 #include <sys/types.h>
  31 #ifdef HAVE_UNISTD_H
  32 # include <unistd.h>
  33 #endif
  34 #include <errno.h>
  35 #include <assert.h>
  36
  37 #include "wget.h"
  38 #include "utils.h"
  39 #include "url.h"
  40 #include "host.h"
  41
  42 #ifndef errno
  43 extern int errno;
  44 #endif
  45
/* Default port numbers of the supported protocols.  They are stored
   in the sup_protos table and used when a URL names no port.  */
#define DEFAULT_HTTP_PORT 80
#define DEFAULT_FTP_PORT 21
#define DEFAULT_HTTPS_PORT 443
  50
/* Table of unsafe chars, indexed by character code; a non-zero entry
   marks the character as unsafe.  This is initialized in
   init_unsafe_char_table.  */

static char unsafe_char_table[256];

/* Non-zero if the character C is marked in unsafe_char_table.  The
   cast to unsigned char keeps the index non-negative when plain char
   is signed.  */
#define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)])
  57
  58 /* If S contains unsafe characters, free it and replace it with a
  59    version that doesn't.  */
  60 #define URL_CLEANSE(s) do                       \
  61 {                                               \
  62   if (contains_unsafe (s))                      \
  63     {                                           \
  64       char *uc_tmp = encode_string (s);         \
  65       xfree (s);                                \
  66       (s) = uc_tmp;                             \
  67     }                                           \
  68 } while (0)
  69
  70 /* Is a directory "."?  */
  71 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
  72 /* Is a directory ".."?  */
  73 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  74
  75 #if 0
  76 static void path_simplify_with_kludge PARAMS ((char *));
  77 #endif
  78 static int urlpath_length PARAMS ((const char *));
  79
/* NULL-terminated list of strings to be recognized as prototypes (URL
   schemes).  Note that recognized doesn't mean supported -- only HTTP,
   HTTPS and FTP are currently supported.

   However, a string that does not match anything in the list will be
   considered a relative URL.  Thus it's important that this list has
   anything anyone could think of being legal.

   Each entry includes the trailing colon and is matched
   case-insensitively as a prefix (see skip_proto and has_proto).

   There are wild things here.  :-) Take a look at
   <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more
   fun.  */
static char *protostrings[] =
{
  "cid:",
  "clsid:",
  "file:",
  "finger:",
  "ftp:",
  "gopher:",
  "hdl:",
  "http:",
  "https:",
  "ilu:",
  "ior:",
  "irc:",
  "java:",
  "javascript:",
  "lifn:",
  "mailto:",
  "mid:",
  "news:",
  "nntp:",
  "path:",
  "prospero:",
  "rlogin:",
  "service:",
  "shttp:",
  "snews:",
  "stanf:",
  "telnet:",
  "tn3270:",
  "wais:",
  "whois++:",
  NULL
};
 125
/* Description of one supported protocol, as stored in sup_protos.  */
struct proto
{
  char *name;                   /* URL prefix, e.g. "http://" */
  uerr_t ind;                   /* protocol identifier, e.g. URLHTTP */
  unsigned short port;          /* default port of the protocol */
};
 132
/* Table of the protocols that are actually supported (as opposed to
   protostrings, which lists everything merely recognized).  Each
   entry gives the URL prefix including `//', the protocol identifier
   and the default port.  The HTTPS entry exists only when HAVE_SSL is
   defined.  */
static struct proto sup_protos[] =
{
  { "http://", URLHTTP, DEFAULT_HTTP_PORT },
#ifdef HAVE_SSL
  { "https://",URLHTTPS, DEFAULT_HTTPS_PORT},
#endif
  { "ftp://", URLFTP, DEFAULT_FTP_PORT },
  /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
};
 143
 144 static void parse_dir PARAMS ((const char *, char **, char **));
 145 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
 146 static char *construct PARAMS ((const char *, const char *, int , int));
 147 static char *construct_relative PARAMS ((const char *, const char *));
 148 static char process_ftp_type PARAMS ((char *));
 149
 150 \f
 151 /* Returns the number of characters to be skipped if the first thing
 152    in a URL is URL: (which is 0 or 4+).  The optional spaces after
 153    URL: are also skipped.  */
 154 int
 155 skip_url (const char *url)
 156 {
 157   int i;
 158
 159   if (TOUPPER (url[0]) == 'U'
 160       && TOUPPER (url[1]) == 'R'
 161       && TOUPPER (url[2]) == 'L'
 162       && url[3] == ':')
 163     {
 164       /* Skip blanks.  */
 165       for (i = 4; url[i] && ISSPACE (url[i]); i++);
 166       return i;
 167     }
 168   else
 169     return 0;
 170 }
 171
 172 /* Unsafe chars:
 173    - anything <= 32;
 174    - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
 175    - @ and :, for user/password encoding.
 176    - everything over 127 (but we don't bother with recording those.  */
 177 void
 178 init_unsafe_char_table (void)
 179 {
 180   int i;
 181   for (i = 0; i < 256; i++)
 182     if (i < 32 || i >= 127
 183         || i == ' '
 184         || i == '<'
 185         || i == '>'
 186         || i == '\"'
 187         || i == '#'
 188         || i == '%'
 189         || i == '{'
 190         || i == '}'
 191         || i == '|'
 192         || i == '\\'
 193         || i == '^'
 194         || i == '~'
 195         || i == '['
 196         || i == ']'
 197         || i == '`')
 198       unsafe_char_table[i] = 1;
 199 }
 200
 201 /* Returns 1 if the string contains unsafe characters, 0 otherwise.  */
 202 int
 203 contains_unsafe (const char *s)
 204 {
 205   for (; *s; s++)
 206     if (UNSAFE_CHAR (*s))
 207       return 1;
 208   return 0;
 209 }
 210
 211 /* Decodes the forms %xy in a URL to the character the hexadecimal
 212    code of which is xy.  xy are hexadecimal digits from
 213    [0123456789ABCDEF] (case-insensitive).  If x or y are not
 214    hex-digits or `%' precedes `\0', the sequence is inserted
 215    literally.  */
 216
 217 static void
 218 decode_string (char *s)
 219 {
 220   char *p = s;
 221
 222   for (; *s; s++, p++)
 223     {
 224       if (*s != '%')
 225         *p = *s;
 226       else
 227         {
 228           /* Do nothing if at the end of the string, or if the chars
 229              are not hex-digits.  */
 230           if (!*(s + 1) || !*(s + 2)
 231               || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
 232             {
 233               *p = *s;
 234               continue;
 235             }
 236           *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
 237           s += 2;
 238         }
 239     }
 240   *p = '\0';
 241 }
 242
 243 /* Encode the unsafe characters (as determined by URL_UNSAFE) in a
 244    given string, returning a malloc-ed %XX encoded string.  */
 245 char *
 246 encode_string (const char *s)
 247 {
 248   const char *b;
 249   char *p, *res;
 250   int i;
 251
 252   b = s;
 253   for (i = 0; *s; s++, i++)
 254     if (UNSAFE_CHAR (*s))
 255       i += 2; /* Two more characters (hex digits) */
 256   res = (char *)xmalloc (i + 1);
 257   s = b;
 258   for (p = res; *s; s++)
 259     if (UNSAFE_CHAR (*s))
 260       {
 261         const unsigned char c = *s;
 262         *p++ = '%';
 263         *p++ = HEXD2ASC (c >> 4);
 264         *p++ = HEXD2ASC (c & 0xf);
 265       }
 266     else
 267       *p++ = *s;
 268   *p = '\0';
 269   return res;
 270 }
 271 \f
 272 /* Returns the proto-type if URL's protocol is supported, or
 273    URLUNKNOWN if not.  */
 274 uerr_t
 275 urlproto (const char *url)
 276 {
 277   int i;
 278
 279   url += skip_url (url);
 280   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 281     if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
 282       return sup_protos[i].ind;
 283   for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
 284   if (url[i] == ':')
 285     {
 286       for (++i; url[i] && url[i] != '/'; i++)
 287         if (!ISDIGIT (url[i]))
 288           return URLBADPORT;
 289       if (url[i - 1] == ':')
 290         return URLFTP;
 291       else
 292         return URLHTTP;
 293     }
 294   else
 295     return URLHTTP;
 296 }
 297
 298 /* Skip the protocol part of the URL, e.g. `http://'.  If no protocol
 299    part is found, returns 0.  */
 300 int
 301 skip_proto (const char *url)
 302 {
 303   char **s;
 304   int l;
 305
 306   for (s = protostrings; *s; s++)
 307     if (!strncasecmp (*s, url, strlen (*s)))
 308       break;
 309   if (!*s)
 310     return 0;
 311   l = strlen (*s);
 312   /* HTTP and FTP protocols are expected to yield exact host names
 313      (i.e. the `//' part must be skipped, too).  */
 314   if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
 315     l += 2;
 316   return l;
 317 }
 318
 319 /* Returns 1 if the URL begins with a protocol (supported or
 320    unsupported), 0 otherwise.  */
 321 int
 322 has_proto (const char *url)
 323 {
 324   char **s;
 325
 326   url += skip_url (url);
 327   for (s = protostrings; *s; s++)
 328     if (strncasecmp (url, *s, strlen (*s)) == 0)
 329       return 1;
 330   return 0;
 331 }
 332
 333 /* Skip the username and password, if present here.  The function
 334    should be called *not* with the complete URL, but with the part
 335    right after the protocol.
 336
 337    If no username and password are found, return 0.  */
 338 int
 339 skip_uname (const char *url)
 340 {
 341   const char *p;
 342   for (p = url; *p && *p != '/'; p++)
 343     if (*p == '@')
 344       break;
 345   /* If a `@' was found before the first occurrence of `/', skip
 346      it.  */
 347   if (*p == '@')
 348     return p - url + 1;
 349   else
 350     return 0;
 351 }
 352 \f
 353 /* Allocate a new urlinfo structure, fill it with default values and
 354    return a pointer to it.  */
 355 struct urlinfo *
 356 newurl (void)
 357 {
 358   struct urlinfo *u;
 359
 360   u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
 361   memset (u, 0, sizeof (*u));
 362   u->proto = URLUNKNOWN;
 363   return u;
 364 }
 365
 366 /* Perform a "deep" free of the urlinfo structure.  The structure
 367    should have been created with newurl, but need not have been used.
 368    If free_pointer is non-0, free the pointer itself.  */
 369 void
 370 freeurl (struct urlinfo *u, int complete)
 371 {
 372   assert (u != NULL);
 373   FREE_MAYBE (u->url);
 374   FREE_MAYBE (u->host);
 375   FREE_MAYBE (u->path);
 376   FREE_MAYBE (u->file);
 377   FREE_MAYBE (u->dir);
 378   FREE_MAYBE (u->user);
 379   FREE_MAYBE (u->passwd);
 380   FREE_MAYBE (u->local);
 381   FREE_MAYBE (u->referer);
 382   if (u->proxy)
 383     freeurl (u->proxy, 1);
 384   if (complete)
 385     xfree (u);
 386   return;
 387 }
 388 \f
 389 /* Extract the given URL of the form
 390    (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
 391    1. hostname (terminated with `/' or `:')
 392    2. port number (terminated with `/'), or chosen for the protocol
 393    3. dirname (everything after hostname)
 394    Most errors are handled.  No allocation is done, you must supply
 395    pointers to allocated memory.
 396    ...and a host of other stuff :-)
 397
 398    - Recognizes hostname:dir/file for FTP and
 399      hostname (:portnum)?/dir/file for HTTP.
 400    - Parses the path to yield directory and file
 401    - Parses the URL to yield the username and passwd (if present)
 402    - Decodes the strings, in case they contain "forbidden" characters
 403    - Writes the result to struct urlinfo
 404
 405    If the argument STRICT is set, it recognizes only the canonical
 406    form.  */
 407 uerr_t
 408 parseurl (const char *url, struct urlinfo *u, int strict)
 409 {
 410   int i, l, abs_ftp;
 411   int recognizable;            /* Recognizable URL is the one where
 412                                   the protocol name was explicitly
 413                                   named, i.e. it wasn't deduced from
 414                                   the URL format.  */
 415   uerr_t type;
 416
 417   DEBUGP (("parseurl (\"%s\") -> ", url));
 418   url += skip_url (url);
 419   recognizable = has_proto (url);
 420   if (strict && !recognizable)
 421     return URLUNKNOWN;
 422   for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
 423     {
 424       l = strlen (sup_protos[i].name);
 425       if (!strncasecmp (sup_protos[i].name, url, l))
 426         break;
 427     }
 428   /* If protocol is recognizable, but unsupported, bail out, else
 429      suppose unknown.  */
 430   if (recognizable && i == ARRAY_SIZE (sup_protos))
 431     return URLUNKNOWN;
 432   else if (i == ARRAY_SIZE (sup_protos))
 433     type = URLUNKNOWN;
 434   else
 435     u->proto = type = sup_protos[i].ind;
 436
 437   if (type == URLUNKNOWN)
 438     l = 0;
 439   /* Allow a username and password to be specified (i.e. just skip
 440      them for now).  */
 441   if (recognizable)
 442     l += skip_uname (url + l);
 443   for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
 444   if (i == l)
 445     return URLBADHOST;
 446   /* Get the hostname.  */
 447   u->host = strdupdelim (url + l, url + i);
 448   DEBUGP (("host %s -> ", u->host));
 449
 450   /* Assume no port has been given.  */
 451   u->port = 0;
 452   if (url[i] == ':')
 453     {
 454       /* We have a colon delimiting the hostname.  It could mean that
 455          a port number is following it, or a directory.  */
 456       if (ISDIGIT (url[++i]))    /* A port number */
 457         {
 458           if (type == URLUNKNOWN)
 459             u->proto = type = URLHTTP;
 460           for (; url[i] && url[i] != '/'; i++)
 461             if (ISDIGIT (url[i]))
 462               u->port = 10 * u->port + (url[i] - '0');
 463             else
 464               return URLBADPORT;
 465           if (!u->port)
 466             return URLBADPORT;
 467           DEBUGP (("port %hu -> ", u->port));
 468         }
 469       else if (type == URLUNKNOWN) /* or a directory */
 470         u->proto = type = URLFTP;
 471       else                      /* or just a misformed port number */
 472         return URLBADPORT;
 473     }
 474   else if (type == URLUNKNOWN)
 475     u->proto = type = URLHTTP;
 476   if (!u->port)
 477     {
 478       int ind;
 479       for (ind = 0; ind < ARRAY_SIZE (sup_protos); ind++)
 480         if (sup_protos[ind].ind == type)
 481           break;
 482       if (ind == ARRAY_SIZE (sup_protos))
 483         return URLUNKNOWN;
 484       u->port = sup_protos[ind].port;
 485     }
 486   /* Some delimiter troubles...  */
 487   if (url[i] == '/' && url[i - 1] != ':')
 488     ++i;
 489   if (type == URLHTTP)
 490     while (url[i] && url[i] == '/')
 491       ++i;
 492   u->path = (char *)xmalloc (strlen (url + i) + 8);
 493   strcpy (u->path, url + i);
 494   if (type == URLFTP)
 495     {
 496       u->ftp_type = process_ftp_type (u->path);
 497       /* #### We don't handle type `d' correctly yet.  */
 498       if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
 499         u->ftp_type = 'I';
 500       DEBUGP (("ftp_type %c -> ", u->ftp_type));
 501     }
 502   DEBUGP (("opath %s -> ", u->path));
 503   /* Parse the username and password (if existing).  */
 504   parse_uname (url, &u->user, &u->passwd);
 505   /* Decode the strings, as per RFC 1738.  */
 506   decode_string (u->host);
 507   decode_string (u->path);
 508   if (u->user)
 509     decode_string (u->user);
 510   if (u->passwd)
 511     decode_string (u->passwd);
 512   /* Parse the directory.  */
 513   parse_dir (u->path, &u->dir, &u->file);
 514   DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
 515   /* Simplify the directory.  */
 516   path_simplify (u->dir);
 517   /* Remove the leading `/' in HTTP.  */
 518   if (type == URLHTTP && *u->dir == '/')
 519     strcpy (u->dir, u->dir + 1);
 520   DEBUGP (("ndir %s\n", u->dir));
 521   /* Strip trailing `/'.  */
 522   l = strlen (u->dir);
 523   if (l && u->dir[l - 1] == '/')
 524     u->dir[l - 1] = '\0';
 525   /* Re-create the path: */
 526   abs_ftp = (u->proto == URLFTP && *u->dir == '/');
 527   /*  sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
 528       abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
 529   strcpy (u->path, abs_ftp ? "%2F" : "/");
 530   strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
 531   strcat (u->path, *u->dir ? "/" : "");
 532   strcat (u->path, u->file);
 533   URL_CLEANSE (u->path);
 534   DEBUGP (("newpath: %s\n", u->path));
 535   /* Create the clean URL.  */
 536   u->url = str_url (u, 0);
 537   return URLOK;
 538 }
 539 \f
 540 /* Special versions of DOTP and DDOTP for parse_dir(). */
 541
 542 #define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
 543 #define PD_DDOTP(x) ((*(x) == '.') && (*(x) == '.')             \
 544                      && (!*((x) + 2) || *((x) + 2) == '?'))
 545
 546 /* Build the directory and filename components of the path.  Both
 547    components are *separately* malloc-ed strings!  It does not change
 548    the contents of path.
 549
 550    If the path ends with "." or "..", they are (correctly) counted as
 551    directories.  */
 552 static void
 553 parse_dir (const char *path, char **dir, char **file)
 554 {
 555   int i, l;
 556
 557   l = urlpath_length (path);
 558   for (i = l; i && path[i] != '/'; i--);
 559
 560   if (!i && *path != '/')   /* Just filename */
 561     {
 562       if (PD_DOTP (path) || PD_DDOTP (path))
 563         {
 564           *dir = strdupdelim (path, path + l);
 565           *file = xstrdup (path + l); /* normally empty, but could
 566                                          contain ?... */
 567         }
 568       else
 569         {
 570           *dir = xstrdup ("");     /* This is required because of FTP */
 571           *file = xstrdup (path);
 572         }
 573     }
 574   else if (!i)                 /* /filename */
 575     {
 576       if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
 577         {
 578           *dir = strdupdelim (path, path + l);
 579           *file = xstrdup (path + l); /* normally empty, but could
 580                                          contain ?... */
 581         }
 582       else
 583         {
 584           *dir = xstrdup ("/");
 585           *file = xstrdup (path + 1);
 586         }
 587     }
 588   else /* Nonempty directory with or without a filename */
 589     {
 590       if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
 591         {
 592           *dir = strdupdelim (path, path + l);
 593           *file = xstrdup (path + l); /* normally empty, but could
 594                                          contain ?... */
 595         }
 596       else
 597         {
 598           *dir = strdupdelim (path, path + i);
 599           *file = xstrdup (path + i + 1);
 600         }
 601     }
 602 }
 603
 604 /* Find the optional username and password within the URL, as per
 605    RFC1738.  The returned user and passwd char pointers are
 606    malloc-ed.  */
 607 static uerr_t
 608 parse_uname (const char *url, char **user, char **passwd)
 609 {
 610   int l;
 611   const char *p, *col;
 612   char **where;
 613
 614   *user = NULL;
 615   *passwd = NULL;
 616   url += skip_url (url);
 617   /* Look for end of protocol string.  */
 618   l = skip_proto (url);
 619   if (!l)
 620     return URLUNKNOWN;
 621   /* Add protocol offset.  */
 622   url += l;
 623   /* Is there an `@' character?  */
 624   for (p = url; *p && *p != '/'; p++)
 625     if (*p == '@')
 626       break;
 627   /* If not, return.  */
 628   if (*p != '@')
 629     return URLOK;
 630   /* Else find the username and password.  */
 631   for (p = col = url; *p != '@'; p++)
 632     {
 633       if (*p == ':' && !*user)
 634         {
 635           *user = (char *)xmalloc (p - url + 1);
 636           memcpy (*user, url, p - url);
 637           (*user)[p - url] = '\0';
 638           col = p + 1;
 639         }
 640     }
 641   /* Decide whether you have only the username or both.  */
 642   where = *user ? passwd : user;
 643   *where = (char *)xmalloc (p - col + 1);
 644   memcpy (*where, col, p - col);
 645   (*where)[p - col] = '\0';
 646   return URLOK;
 647 }
 648
 649 /* If PATH ends with `;type=X', return the character X.  */
 650 static char
 651 process_ftp_type (char *path)
 652 {
 653   int len = strlen (path);
 654
 655   if (len >= 7
 656       && !memcmp (path + len - 7, ";type=", 6))
 657     {
 658       path[len - 7] = '\0';
 659       return path[len - 1];
 660     }
 661   else
 662     return '\0';
 663 }
 664 \f
 665 /* Return the URL as fine-formed string, with a proper protocol, optional port
 666    number, directory and optional user/password.  If `hide' is non-zero (as it
 667    is when we're calling this on a URL we plan to print, but not when calling it
 668    to canonicalize a URL for use within the program), password will be hidden.
 669    The forbidden characters in the URL will be cleansed.  */
 670 char *
 671 str_url (const struct urlinfo *u, int hide)
 672 {
 673   char *res, *host, *user, *passwd, *proto_name, *dir, *file;
 674   int i, l, ln, lu, lh, lp, lf, ld;
 675   unsigned short proto_default_port;
 676
 677   /* Look for the protocol name.  */
 678   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 679     if (sup_protos[i].ind == u->proto)
 680       break;
 681   if (i == ARRAY_SIZE (sup_protos))
 682     return NULL;
 683   proto_name = sup_protos[i].name;
 684   proto_default_port = sup_protos[i].port;
 685   host = CLEANDUP (u->host);
 686   dir = CLEANDUP (u->dir);
 687   file = CLEANDUP (u->file);
 688   user = passwd = NULL;
 689   if (u->user)
 690     user = CLEANDUP (u->user);
 691   if (u->passwd)
 692     {
 693       if (hide)
 694         /* Don't output the password, or someone might see it over the user's
 695            shoulder (or in saved wget output).  Don't give away the number of
 696            characters in the password, either, as we did in past versions of
 697            this code, when we replaced the password characters with 'x's. */
 698         passwd = xstrdup("<password>");
 699       else
 700         passwd = CLEANDUP (u->passwd);
 701     }
 702   if (u->proto == URLFTP && *dir == '/')
 703     {
 704       char *tmp = (char *)xmalloc (strlen (dir) + 3);
 705       /*sprintf (tmp, "%%2F%s", dir + 1);*/
 706       tmp[0] = '%';
 707       tmp[1] = '2';
 708       tmp[2] = 'F';
 709       strcpy (tmp + 3, dir + 1);
 710       xfree (dir);
 711       dir = tmp;
 712     }
 713
 714   ln = strlen (proto_name);
 715   lu = user ? strlen (user) : 0;
 716   lp = passwd ? strlen (passwd) : 0;
 717   lh = strlen (host);
 718   ld = strlen (dir);
 719   lf = strlen (file);
 720   res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
 721   /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
 722      (user ? user : ""), (passwd ? ":" : ""),
 723      (passwd ? passwd : ""), (user ? "@" : ""),
 724      host, u->port, dir, *dir ? "/" : "", file); */
 725   l = 0;
 726   memcpy (res, proto_name, ln);
 727   l += ln;
 728   if (user)
 729     {
 730       memcpy (res + l, user, lu);
 731       l += lu;
 732       if (passwd)
 733         {
 734           res[l++] = ':';
 735           memcpy (res + l, passwd, lp);
 736           l += lp;
 737         }
 738       res[l++] = '@';
 739     }
 740   memcpy (res + l, host, lh);
 741   l += lh;
 742   if (u->port != proto_default_port)
 743     {
 744       res[l++] = ':';
 745       long_to_string (res + l, (long)u->port);
 746       l += numdigit (u->port);
 747     }
 748   res[l++] = '/';
 749   memcpy (res + l, dir, ld);
 750   l += ld;
 751   if (*dir)
 752     res[l++] = '/';
 753   strcpy (res + l, file);
 754   xfree (host);
 755   xfree (dir);
 756   xfree (file);
 757   FREE_MAYBE (user);
 758   FREE_MAYBE (passwd);
 759   return res;
 760 }
 761
 762 /* Check whether two URL-s are equivalent, i.e. pointing to the same
 763    location.  Uses parseurl to parse them, and compares the canonical
 764    forms.
 765
 766    Returns 1 if the URL1 is equivalent to URL2, 0 otherwise.  Also
 767    return 0 on error.  */
 768 int
 769 url_equal (const char *url1, const char *url2)
 770 {
 771   struct urlinfo *u1, *u2;
 772   uerr_t err;
 773   int res;
 774
 775   u1 = newurl ();
 776   err = parseurl (url1, u1, 0);
 777   if (err != URLOK)
 778     {
 779       freeurl (u1, 1);
 780       return 0;
 781     }
 782   u2 = newurl ();
 783   err = parseurl (url2, u2, 0);
 784   if (err != URLOK)
 785     {
 786       freeurl (u2, 1);
 787       return 0;
 788     }
 789   res = !strcmp (u1->url, u2->url);
 790   freeurl (u1, 1);
 791   freeurl (u2, 1);
 792   return res;
 793 }
 794 \f
 795 urlpos *
 796 get_urls_file (const char *file)
 797 {
 798   struct file_memory *fm;
 799   urlpos *head, *tail;
 800   const char *text, *text_end;
 801
 802   /* Load the file.  */
 803   fm = read_file (file);
 804   if (!fm)
 805     {
 806       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 807       return NULL;
 808     }
 809   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
 810   head = tail = NULL;
 811   text = fm->content;
 812   text_end = fm->content + fm->length;
 813   while (text < text_end)
 814     {
 815       const char *line_beg = text;
 816       const char *line_end = memchr (text, '\n', text_end - text);
 817       if (!line_end)
 818         line_end = text_end;
 819       else
 820         ++line_end;
 821       text = line_end;
 822       while (line_beg < line_end
 823              && ISSPACE (*line_beg))
 824         ++line_beg;
 825       while (line_end > line_beg + 1
 826              && ISSPACE (*(line_end - 1)))
 827         --line_end;
 828       if (line_end > line_beg)
 829         {
 830           urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
 831           memset (entry, 0, sizeof (*entry));
 832           entry->next = NULL;
 833           entry->url = strdupdelim (line_beg, line_end);
 834           if (!head)
 835             head = entry;
 836           else
 837             tail->next = entry;
 838           tail = entry;
 839         }
 840     }
 841   read_file_free (fm);
 842   return head;
 843 }
 844 \f
 845 /* Free the linked list of urlpos.  */
 846 void
 847 free_urlpos (urlpos *l)
 848 {
 849   while (l)
 850     {
 851       urlpos *next = l->next;
 852       xfree (l->url);
 853       FREE_MAYBE (l->local_name);
 854       xfree (l);
 855       l = next;
 856     }
 857 }
 858
 859 /* Rotate FNAME opt.backups times */
 860 void
 861 rotate_backups(const char *fname)
 862 {
 863   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
 864   char *from = (char *)alloca (maxlen);
 865   char *to = (char *)alloca (maxlen);
 866   struct stat sb;
 867   int i;
 868
 869   if (stat (fname, &sb) == 0)
 870     if (S_ISREG (sb.st_mode) == 0)
 871       return;
 872
 873   for (i = opt.backups; i > 1; i--)
 874     {
 875       sprintf (from, "%s.%d", fname, i - 1);
 876       sprintf (to, "%s.%d", fname, i);
 877       /* #### This will fail on machines without the rename() system
 878          call.  */
 879       rename (from, to);
 880     }
 881
 882   sprintf (to, "%s.%d", fname, 1);
 883   rename(fname, to);
 884 }
 885
 886 /* Create all the necessary directories for PATH (a file).  Calls
 887    mkdirhier() internally.  */
 888 int
 889 mkalldirs (const char *path)
 890 {
 891   const char *p;
 892   char *t;
 893   struct stat st;
 894   int res;
 895
 896   p = path + strlen (path);
 897   for (; *p != '/' && p != path; p--);
 898   /* Don't create if it's just a file.  */
 899   if ((p == path) && (*p != '/'))
 900     return 0;
 901   t = strdupdelim (path, p);
 902   /* Check whether the directory exists.  */
 903   if ((stat (t, &st) == 0))
 904     {
 905       if (S_ISDIR (st.st_mode))
 906         {
 907           xfree (t);
 908           return 0;
 909         }
 910       else
 911         {
 912           /* If the dir exists as a file name, remove it first.  This
 913              is *only* for Wget to work with buggy old CERN http
 914              servers.  Here is the scenario: When Wget tries to
 915              retrieve a directory without a slash, e.g.
 916              http://foo/bar (bar being a directory), CERN server will
 917              not redirect it too http://foo/bar/ -- it will generate a
 918              directory listing containing links to bar/file1,
 919              bar/file2, etc.  Wget will lose because it saves this
 920              HTML listing to a file `bar', so it cannot create the
 921              directory.  To work around this, if the file of the same
 922              name exists, we just remove it and create the directory
 923              anyway.  */
 924           DEBUGP (("Removing %s because of directory danger!\n", t));
 925           unlink (t);
 926         }
 927     }
 928   res = make_directory (t);
 929   if (res != 0)
 930     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
 931   xfree (t);
 932   return res;
 933 }
 934
 935 static int
 936 count_slashes (const char *s)
 937 {
 938   int i = 0;
 939   while (*s)
 940     if (*s++ == '/')
 941       ++i;
 942   return i;
 943 }
 944
 945 /* Return the path name of the URL-equivalent file name, with a
 946    remote-like structure of directories.  */
 947 static char *
 948 mkstruct (const struct urlinfo *u)
 949 {
 950   char *host, *dir, *file, *res, *dirpref;
 951   int l;
 952
 953   assert (u->dir != NULL);
 954   assert (u->host != NULL);
 955
 956   if (opt.cut_dirs)
 957     {
 958       char *ptr = u->dir + (*u->dir == '/');
 959       int slash_count = 1 + count_slashes (ptr);
 960       int cut = MINVAL (opt.cut_dirs, slash_count);
 961       for (; cut && *ptr; ptr++)
 962         if (*ptr == '/')
 963           --cut;
 964       STRDUP_ALLOCA (dir, ptr);
 965     }
 966   else
 967     dir = u->dir + (*u->dir == '/');
 968
 969   host = xstrdup (u->host);
 970   /* Check for the true name (or at least a consistent name for saving
 971      to directory) of HOST, reusing the hlist if possible.  */
 972   if (opt.add_hostdir && !opt.simple_check)
 973     {
 974       char *nhost = realhost (host);
 975       xfree (host);
 976       host = nhost;
 977     }
 978   /* Add dir_prefix and hostname (if required) to the beginning of
 979      dir.  */
 980   if (opt.add_hostdir)
 981     {
 982       if (!DOTP (opt.dir_prefix))
 983         {
 984           dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
 985                                     + strlen (host) + 1);
 986           sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
 987         }
 988       else
 989         STRDUP_ALLOCA (dirpref, host);
 990     }
 991   else                         /* not add_hostdir */
 992     {
 993       if (!DOTP (opt.dir_prefix))
 994         dirpref = opt.dir_prefix;
 995       else
 996         dirpref = "";
 997     }
 998   xfree (host);
 999
1000   /* If there is a prefix, prepend it.  */
1001   if (*dirpref)
1002     {
1003       char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1004       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1005       dir = newdir;
1006     }
1007   dir = xstrdup (dir);
1008   URL_CLEANSE (dir);
1009   l = strlen (dir);
1010   if (l && dir[l - 1] == '/')
1011     dir[l - 1] = '\0';
1012
1013   if (!*u->file)
1014     file = "index.html";
1015   else
1016     file = u->file;
1017
1018   /* Finally, construct the full name.  */
1019   res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1020   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1021   xfree (dir);
1022   return res;
1023 }
1024
1025 /* Create a unique filename, corresponding to a given URL.  Calls
1026    mkstruct if necessary.  Does *not* actually create any directories.  */
1027 char *
1028 url_filename (const struct urlinfo *u)
1029 {
1030   char *file, *name;
1031   int have_prefix = 0;          /* whether we must prepend opt.dir_prefix */
1032
1033   if (opt.dirstruct)
1034     {
1035       file = mkstruct (u);
1036       have_prefix = 1;
1037     }
1038   else
1039     {
1040       if (!*u->file)
1041         file = xstrdup ("index.html");
1042       else
1043         file = xstrdup (u->file);
1044     }
1045
1046   if (!have_prefix)
1047     {
1048       /* Check whether the prefix directory is something other than "."
1049          before prepending it.  */
1050       if (!DOTP (opt.dir_prefix))
1051         {
1052           char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1053                                          + 1 + strlen (file) + 1);
1054           sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1055           xfree (file);
1056           file = nfile;
1057         }
1058     }
1059   /* DOS-ish file systems don't like `%' signs in them; we change it
1060      to `@'.  */
1061 #ifdef WINDOWS
1062   {
1063     char *p = file;
1064     for (p = file; *p; p++)
1065       if (*p == '%')
1066         *p = '@';
1067   }
1068 #endif /* WINDOWS */
1069
1070   /* Check the cases in which the unique extensions are not used:
1071      1) Clobbering is turned off (-nc).
1072      2) Retrieval with regetting.
1073      3) Timestamping is used.
1074      4) Hierarchy is built.
1075
1076      The exception is the case when file does exist and is a
1077      directory (actually support for bad httpd-s).  */
1078   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1079       && !(file_exists_p (file) && !file_non_directory_p (file)))
1080     return file;
1081
1082   /* Find a unique name.  */
1083   name = unique_name (file);
1084   xfree (file);
1085   return name;
1086 }
1087
/* Length of URL up to, but not including, the first '?'; the full
   strlen() when URL contains no '?'.  */
static int
urlpath_length (const char *url)
{
  return (int) strcspn (url, "?");
}
1097
/* Find the last occurrence of character C in the range [B, E), or
   return NULL if there is none.  This is almost completely
   equivalent to { *e = '\0'; return strrchr(b, c); }, except that it
   doesn't change the contents of the string.

   The byte at E itself is never examined, and the byte at B is, so
   the search covers exactly the half-open range promised above.  */
static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Step back before testing: E is one past the last candidate.  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
1110
/* Build a new URL out of the absolute URL and the first SUBSIZE
   bytes of SUB.  The work is done purely on strings, with very
   little knowledge of protocols or URL syntax.

   If NO_PROTO is zero, SUB is taken to be complete by itself and the
   result is simply strdupdelim (sub, sub + subsize).

   Otherwise only the part of URL before the first '?' (if any) is
   considered, and:

   - if SUB does not begin with '/', it is relative: whatever follows
     the last slash of URL is replaced with SUB, e.g.
     "whatever/foo/bar" + "qux/xyzzy" -> "whatever/foo/qux/xyzzy";

   - if SUB begins with '/', it is an absolute path: everything from
     the first slash after the "//" pair onwards is replaced, e.g.
     "http://host/whatever/foo/bar" + "/qux/xyzzy"
     -> "http://host/qux/xyzzy".

   The result is freshly allocated.  */
static char *
construct (const char *url, const char *sub, int subsize, int no_proto)
{
  const char *path_end;         /* the '?' or '\0' ending URL's path */
  const char *cut;              /* [url, cut) is copied to the result */
  int add_slash = 0;            /* overwrite the last copied byte with '/' */
  int keep;
  char *merged;

  if (!no_proto)
    return strdupdelim (sub, sub + subsize);

  path_end = url + urlpath_length (url);

  if (*sub == '/')
    {
      /* Absolute path.  Locate the first slash, but look past one
         "//" pair if the first slash turns out to start one.  */
      const char *scan = url;
      const char *first_slash = memchr (scan, '/', path_end - scan);
      int skipped_double = 0;

      if (first_slash && *(first_slash + 1) == '/')
        {
          skipped_double = 1;
          scan = first_slash + 2;
          first_slash = memchr (scan, '/', path_end - scan);
        }

      /* Keep in mind below that SUB itself begins with '/'.  */
      if (!skipped_double)
        /* "foo" or "foo/bar": nothing of URL is kept.  */
        cut = url;
      else if (first_slash)
        /* "http://something/": keep everything before that slash.  */
        cut = first_slash;
      else
        /* "http://foo": keep the whole thing.  */
        cut = path_end;
    }
  else
    {
      /* Relative path.  */
      const char *last_slash = find_last_char (url, path_end, '/');

      if (!last_slash
          || (last_slash != url && *(last_slash - 1) == '/'))
        {
          /* Either there is no slash at all ("foo", "foo?..."), or
             the last one is the second half of a "//" pair
             ("http://host").  Plain appending would yield
             "fooqux/xyzzy" where "foo/qux/xyzzy" is wanted, so one
             byte past the path is copied as well and then turned
             into the separating '/'.  Reading that byte is safe: it
             is the delimiter, '\0' or '?'.  */
          cut = path_end + 1;
          add_slash = 1;
        }
      else
        /* "whatever/foo/bar": keep up to and including the last
           slash.  */
        cut = last_slash + 1;
    }

  keep = cut - url;
  merged = (char *)xmalloc (keep + subsize + 1);
  if (keep)
    memcpy (merged, url, keep);
  if (add_slash)
    merged[keep - 1] = '/';
  if (subsize)
    memcpy (merged + keep, sub, subsize);
  merged[keep + subsize] = '\0';
  return merged;
}
1243
/* Convenience front end to construct(): merge NEW_URL, taken in its
   entirety, into BASE_URL.  NEW_URL is treated as relative to
   BASE_URL exactly when has_proto() reports no protocol for it.  */
char *
url_concat (const char *base_url, const char *new_url)
{
  int new_len = strlen (new_url);
  int lacks_proto = !has_proto (new_url);

  return construct (base_url, new_url, new_len, lacks_proto);
}
1250 \f
1251 /* Optimize URL by host, destructively replacing u->host with realhost
1252    (u->host).  Do this regardless of opt.simple_check.  */
1253 void
1254 opt_url (struct urlinfo *u)
1255 {
1256   /* Find the "true" host.  */
1257   char *host = realhost (u->host);
1258   xfree (u->host);
1259   u->host = host;
1260   assert (u->dir != NULL);      /* the URL must have been parsed */
1261   /* Refresh the printed representation.  */
1262   xfree (u->url);
1263   u->url = str_url (u, 0);
1264 }
1265
/* The kludge below is compiled out: parse_dir now does the (almost)
   right thing, so a query can never end up as part of the
   directory.  */
#if 0
/* Run path_simplify on PATH while shielding everything from the
   first '?' onwards from its "optimizations".  */
void
path_simplify_with_kludge (char *path)
{
  char *query = strchr (path, '?');
  char *simplified_end;

  if (!query)
    {
      path_simplify (path);
      return;
    }

  /* path_simplify works destructively too, so writing into PATH is
     fair game: cut the query off for the duration of the call.  */
  *query = '\0';
  path_simplify (path);
  simplified_end = path + strlen (path);
  *query = '?';

  /* If the path part shrank, slide the query down next to it.  */
  if (simplified_end != query)
    memmove (simplified_end, query, strlen (query) + 1);
}
#endif
1291 \f
1292 /* Returns proxy host address, in accordance with PROTO.  */
1293 char *
1294 getproxy (uerr_t proto)
1295 {
1296   if (proto == URLHTTP)
1297     return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1298   else if (proto == URLFTP)
1299     return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1300 #ifdef HAVE_SSL
1301   else if (proto == URLHTTPS)
1302     return opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1303 #endif /* HAVE_SSL */
1304   else
1305     return NULL;
1306 }
1307
/* Decide, with respect to the NO_PROXY list, whether HOST should be
   accessed through the proxy.  Returns 1 when NO_PROXY is NULL, and
   otherwise the negation of sufmatch (no_proxy, host).  */
int
no_proxy_match (const char *host, const char **no_proxy)
{
  return !no_proxy || !sufmatch (no_proxy, host);
}
1317 \f
1318 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1319 static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
1320
1321 /* Change the links in an HTML document.  Accepts a structure that
1322    defines the positions of all the links.  */
1323 void
1324 convert_links (const char *file, urlpos *l)
1325 {
1326   struct file_memory *fm;
1327   FILE               *fp;
1328   const char         *p;
1329   downloaded_file_t  downloaded_file_return;
1330
1331   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1332
1333   {
1334     /* First we do a "dry run": go through the list L and see whether
1335        any URL needs to be converted in the first place.  If not, just
1336        leave the file alone.  */
1337     int count = 0;
1338     urlpos *dry = l;
1339     for (dry = l; dry; dry = dry->next)
1340       if (dry->convert != CO_NOCONVERT)
1341         ++count;
1342     if (!count)
1343       {
1344         logputs (LOG_VERBOSE, _("nothing to do.\n"));
1345         return;
1346       }
1347   }
1348
1349   fm = read_file (file);
1350   if (!fm)
1351     {
1352       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1353                  file, strerror (errno));
1354       return;
1355     }
1356
1357   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1358   if (opt.backup_converted && downloaded_file_return)
1359     write_backup_file (file, downloaded_file_return);
1360
1361   /* Before opening the file for writing, unlink the file.  This is
1362      important if the data in FM is mmaped.  In such case, nulling the
1363      file, which is what fopen() below does, would make us read all
1364      zeroes from the mmaped region.  */
1365   if (unlink (file) < 0 && errno != ENOENT)
1366     {
1367       logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1368                  file, strerror (errno));
1369       read_file_free (fm);
1370       return;
1371     }
1372   /* Now open the file for writing.  */
1373   fp = fopen (file, "wb");
1374   if (!fp)
1375     {
1376       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1377                  file, strerror (errno));
1378       read_file_free (fm);
1379       return;
1380     }
1381   /* Here we loop through all the URLs in file, replacing those of
1382      them that are downloaded with relative references.  */
1383   p = fm->content;
1384   for (; l; l = l->next)
1385     {
1386       char *url_start = fm->content + l->pos;
1387
1388       if (l->pos >= fm->length)
1389         {
1390           DEBUGP (("Something strange is going on.  Please investigate."));
1391           break;
1392         }
1393       /* If the URL is not to be converted, skip it.  */
1394       if (l->convert == CO_NOCONVERT)
1395         {
1396           DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
1397           continue;
1398         }
1399
1400       /* Echo the file contents, up to the offending URL's opening
1401          quote, to the outfile.  */
1402       fwrite (p, 1, url_start - p, fp);
1403       p = url_start;
1404       if (l->convert == CO_CONVERT_TO_RELATIVE)
1405         {
1406           /* Convert absolute URL to relative. */
1407           char *newname = construct_relative (file, l->local_name);
1408           char *quoted_newname = html_quote_string (newname);
1409           replace_attr (&p, l->size, fp, quoted_newname);
1410           DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1411                    l->url, newname, l->pos, file));
1412           xfree (newname);
1413           xfree (quoted_newname);
1414         }
1415       else if (l->convert == CO_CONVERT_TO_COMPLETE)
1416         {
1417           /* Convert the link to absolute URL. */
1418           char *newlink = l->url;
1419           char *quoted_newlink = html_quote_string (newlink);
1420           replace_attr (&p, l->size, fp, quoted_newlink);
1421           DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1422                    newlink, l->pos, file));
1423           xfree (quoted_newlink);
1424         }
1425     }
1426   /* Output the rest of the file. */
1427   if (p - fm->content < fm->length)
1428     fwrite (p, 1, fm->length - (p - fm->content), fp);
1429   fclose (fp);
1430   read_file_free (fm);
1431   logputs (LOG_VERBOSE, _("done.\n"));
1432 }
1433
/* Return a malloced relative link that leads from the file with
   local name S1 (the referrer) to the file with local name S2 (the
   one referred to).

   Examples:
     S1 "jagor.srce.hr/index.html", S2 "jagor.srce.hr/images/news.gif"
       -> "images/news.gif"
     S1 "fly.cc.fer.hr/ioccc/index.html", S2 "fly.cc.fer.hr/images/fly.gif"
       -> "../images/fly.gif"

   If S2 begins with '/', a plain copy of S2 is returned.  Otherwise
   S1 must not begin with '/' either.  S1 should not contain ".."
   and the like: construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") gives a wrong answer.  (Running something
   like path_simplify() on S1 first is a workaround.)  */
static char *
construct_relative (const char *s1, const char *s2)
{
  const char *from = s1;        /* cursor into S1 */
  const char *to = s2;          /* cursor into S2, in step with FROM */
  const char *tail = s2;        /* S2 past the shared directories */
  int levels_up = 0;
  int k;
  char *res, *out;

  if (*s2 == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');

  /* Walk over the leading directories the two names share.  */
  for (;;)
    {
      while (*from && *to && *from == *to && *from != '/')
        {
          ++from;
          ++to;
        }
      if (*from != '/' || *to != '/')
        break;
      ++from;
      ++to;
      tail = to;
    }

  /* Each slash left in S1 is one directory to climb out of.  */
  for (; *from; from++)
    if (*from == '/')
      ++levels_up;

  /* The result is "../" LEVELS_UP times, followed by the part of S2
     that is not shared.  */
  res = (char *)xmalloc (3 * levels_up + strlen (tail) + 1);
  out = res;
  for (k = 0; k < levels_up; k++)
    {
      memcpy (out, "../", 3);
      out += 3;
    }
  strcpy (out, tail);
  return res;
}
1487 \f
1488 /* Add URL to the head of the list L.  */
1489 urlpos *
1490 add_url (urlpos *l, const char *url, const char *file)
1491 {
1492   urlpos *t;
1493
1494   t = (urlpos *)xmalloc (sizeof (urlpos));
1495   memset (t, 0, sizeof (*t));
1496   t->url = xstrdup (url);
1497   t->local_name = xstrdup (file);
1498   t->next = l;
1499   return t;
1500 }
1501
1502 static void
1503 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1504 {
1505   /* Rather than just writing over the original .html file with the
1506      converted version, save the former to *.orig.  Note we only do
1507      this for files we've _successfully_ downloaded, so we don't
1508      clobber .orig files sitting around from previous invocations. */
1509
1510   /* Construct the backup filename as the original name plus ".orig". */
1511   size_t         filename_len = strlen(file);
1512   char*          filename_plus_orig_suffix;
1513   boolean        already_wrote_backup_file = FALSE;
1514   slist*         converted_file_ptr;
1515   static slist*  converted_files = NULL;
1516
1517   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1518     {
1519       /* Just write "orig" over "html".  We need to do it this way
1520          because when we're checking to see if we've downloaded the
1521          file before (to see if we can skip downloading it), we don't
1522          know if it's a text/html file.  Therefore we don't know yet
1523          at that stage that -E is going to cause us to tack on
1524          ".html", so we need to compare vs. the original URL plus
1525          ".orig", not the original URL plus ".html.orig". */
1526       filename_plus_orig_suffix = alloca (filename_len + 1);
1527       strcpy(filename_plus_orig_suffix, file);
1528       strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1529     }
1530   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1531     {
1532       /* Append ".orig" to the name. */
1533       filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1534       strcpy(filename_plus_orig_suffix, file);
1535       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1536     }
1537
1538   /* We can get called twice on the same URL thanks to the
1539      convert_all_links() call in main().  If we write the .orig file
1540      each time in such a case, it'll end up containing the first-pass
1541      conversion, not the original file.  So, see if we've already been
1542      called on this file. */
1543   converted_file_ptr = converted_files;
1544   while (converted_file_ptr != NULL)
1545     if (strcmp(converted_file_ptr->string, file) == 0)
1546       {
1547         already_wrote_backup_file = TRUE;
1548         break;
1549       }
1550     else
1551       converted_file_ptr = converted_file_ptr->next;
1552
1553   if (!already_wrote_backup_file)
1554     {
1555       /* Rename <file> to <file>.orig before former gets written over. */
1556       if (rename(file, filename_plus_orig_suffix) != 0)
1557         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1558                    file, filename_plus_orig_suffix, strerror (errno));
1559
1560       /* Remember that we've already written a .orig backup for this file.
1561          Note that we never free this memory since we need it till the
1562          convert_all_links() call, which is one of the last things the
1563          program does before terminating.  BTW, I'm not sure if it would be
1564          safe to just set 'converted_file_ptr->string' to 'file' below,
1565          rather than making a copy of the string...  Another note is that I
1566          thought I could just add a field to the urlpos structure saying
1567          that we'd written a .orig file for this URL, but that didn't work,
1568          so I had to make this separate list.
1569          -- Dan Harkless <wget@harkless.org>
1570
1571          This [adding a field to the urlpos structure] didn't work
1572          because convert_file() is called twice: once after all its
1573          sublinks have been retrieved in recursive_retrieve(), and
1574          once at the end of the day in convert_all_links().  The
1575          original linked list collected in recursive_retrieve() is
1576          lost after the first invocation of convert_links(), and
1577          convert_all_links() makes a new one (it calls get_urls_html()
1578          for each file it covers.)  That's why your first approach didn't
1579          work.  The way to make it work is perhaps to make this flag a
1580          field in the `urls_html' list.
1581          -- Hrvoje Niksic <hniksic@arsdigita.com>
1582       */
1583       converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1584       converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
1585       converted_file_ptr->next = converted_files;
1586       converted_files = converted_file_ptr;
1587     }
1588 }
1589
1590 static int find_fragment PARAMS ((const char *, int, const char **,
1591                                   const char **));
1592
/* Write NEW_STR to FP as a quoted attribute value in place of the
   old value that starts at *PP and is RAW_SIZE bytes long, then move
   *PP past the old value.

   The old value is laid out as either
       "...old-contents..."     RAW_SIZE includes both quotes
   or
       ...old-contents...       no quotes at all

   The output is always quoted: with the old value's own quote
   character if it had one, with '"' otherwise.  If find_fragment
   locates a fragment in the old contents, it is written out right
   after NEW_STR.  */
static void
replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
{
  const char *old = *pp;
  const char *frag_beg, *frag_end;
  char quote_char = '\"';
  int was_quoted = (*old == '\"' || *old == '\'');
  int inner_size = raw_size;

  if (was_quoted)
    {
      /* Reuse the quote the document used, and leave the opening and
         closing quote out of the contents.  */
      quote_char = *old++;
      inner_size -= 2;
    }

  putc (quote_char, fp);
  fputs (new_str, fp);

  /* Look for fragment identifier, if any. */
  if (find_fragment (old, inner_size, &frag_beg, &frag_end))
    fwrite (frag_beg, 1, frag_end - frag_beg, fp);

  putc (quote_char, fp);

  /* Step over the old contents and, if there was one, the closing
     quote.  */
  *pp = old + inner_size + was_quoted;
}
1628
/* Look in [BEG, BEG+SIZE) for the first '#' that does not come
   immediately after an '&'.  If there is none, return zero and leave
   BP and EP alone.  If there is one, return 1, point *BP at the '#'
   and *EP at BEG+SIZE.

   This is used for finding the fragment identifiers in URLs.  */

static int
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *limit = beg + size;
  const char *cur;
  char prev = '\0';             /* the character just before *CUR */

  for (cur = beg; cur < limit; cur++)
    {
      if (*cur == '#' && prev != '&')
        {
          *bp = cur;
          *ep = limit;
          return 1;
        }
      prev = *cur;
    }
  return 0;
}
1662
1663 typedef struct _downloaded_file_list {
1664   char*                          file;
1665   downloaded_file_t              download_type;
1666   struct _downloaded_file_list*  next;
1667 } downloaded_file_list;
1668
1669 static downloaded_file_list *downloaded_files;
1670
1671 /* Remembers which files have been downloaded.  In the standard case, should be
1672    called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1673    download successfully (i.e. not for ones we have failures on or that we skip
1674    due to -N).
1675
1676    When we've downloaded a file and tacked on a ".html" extension due to -E,
1677    call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1678    FILE_DOWNLOADED_NORMALLY.
1679
1680    If you just want to check if a file has been previously added without adding
1681    it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
1682    with local filenames, not remote URLs. */
1683 downloaded_file_t
1684 downloaded_file (downloaded_file_t  mode, const char*  file)
1685 {
1686   boolean                       found_file = FALSE;
1687   downloaded_file_list*         rover = downloaded_files;
1688
1689   while (rover != NULL)
1690     if (strcmp(rover->file, file) == 0)
1691       {
1692         found_file = TRUE;
1693         break;
1694       }
1695     else
1696       rover = rover->next;
1697
1698   if (found_file)
1699     return rover->download_type;  /* file had already been downloaded */
1700   else
1701     {
1702       if (mode != CHECK_FOR_FILE)
1703         {
1704           rover = xmalloc(sizeof(*rover));
1705           rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1706           rover->download_type = mode;
1707           rover->next = downloaded_files;
1708           downloaded_files = rover;
1709         }
1710
1711       return FILE_NOT_ALREADY_DOWNLOADED;
1712     }
1713 }
1714
1715 void
1716 downloaded_files_free (void)
1717 {
1718   downloaded_file_list*         rover = downloaded_files;
1719   while (rover)
1720     {
1721       downloaded_file_list *next = rover->next;
1722       xfree (rover->file);
1723       xfree (rover);
1724       rover = next;
1725     }
1726 }
1727 \f
/* Set up the static data of this module.  At present that amounts to
   filling in the table of unsafe characters.  */
void
url_init (void)
{
  init_unsafe_char_table ();
}