sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with this program; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif
  29 #include <ctype.h>
  30 #include <sys/types.h>
  31 #ifdef HAVE_UNISTD_H
  32 # include <unistd.h>
  33 #endif
  34 #include <errno.h>
  35 #include <assert.h>
  36
  37 #include "wget.h"
  38 #include "utils.h"
  39 #include "url.h"
  40 #include "host.h"
  41
  42 #ifndef errno
  43 extern int errno;
  44 #endif
  45
  46 /* Default port definitions */
  47 #define DEFAULT_HTTP_PORT 80
  48 #define DEFAULT_FTP_PORT 21
  49 #define DEFAULT_HTTPS_PORT 443
  50
  51 /* Table of Unsafe chars.  This is intialized in
  52    init_unsafe_char_table.  */
  53
  54 static char unsafe_char_table[256];
  55
  56 #define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)])
  57
  58 /* If S contains unsafe characters, free it and replace it with a
  59    version that doesn't.  */
  60 #define URL_CLEANSE(s) do                       \
  61 {                                               \
  62   if (contains_unsafe (s))                      \
  63     {                                           \
  64       char *uc_tmp = encode_string (s);         \
  65       xfree (s);                                \
  66       (s) = uc_tmp;                             \
  67     }                                           \
  68 } while (0)
  69
  70 /* Is a directory "."?  */
  71 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
  72 /* Is a directory ".."?  */
  73 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  74
  75 #if 0
  76 static void path_simplify_with_kludge PARAMS ((char *));
  77 #endif
  78 static int urlpath_length PARAMS ((const char *));
  79
  80 /* NULL-terminated list of strings to be recognized as prototypes (URL
  81    schemes).  Note that recognized doesn't mean supported -- only HTTP,
  82    HTTPS and FTP are currently supported .
  83
  84    However, a string that does not match anything in the list will be
  85    considered a relative URL.  Thus it's important that this list has
  86    anything anyone could think of being legal.
  87
  88    There are wild things here.  :-) Take a look at
  89    <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more
  90    fun.  */
  91 static char *protostrings[] =
  92 {
  93   "cid:",
  94   "clsid:",
  95   "file:",
  96   "finger:",
  97   "ftp:",
  98   "gopher:",
  99   "hdl:",
 100   "http:",
 101   "https:",
 102   "ilu:",
 103   "ior:",
 104   "irc:",
 105   "java:",
 106   "javascript:",
 107   "lifn:",
 108   "mailto:",
 109   "mid:",
 110   "news:",
 111   "nntp:",
 112   "path:",
 113   "prospero:",
 114   "rlogin:",
 115   "service:",
 116   "shttp:",
 117   "snews:",
 118   "stanf:",
 119   "telnet:",
 120   "tn3270:",
 121   "wais:",
 122   "whois++:",
 123   NULL
 124 };
 125
 126 struct proto
 127 {
 128   char *name;
 129   uerr_t ind;
 130   unsigned short port;
 131 };
 132
 133 /* Similar to former, but for supported protocols: */
 134 static struct proto sup_protos[] =
 135 {
 136   { "http://", URLHTTP, DEFAULT_HTTP_PORT },
 137 #ifdef HAVE_SSL
 138   { "https://",URLHTTPS, DEFAULT_HTTPS_PORT},
 139 #endif
 140   { "ftp://", URLFTP, DEFAULT_FTP_PORT },
 141   /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
 142 };
 143
 144 static void parse_dir PARAMS ((const char *, char **, char **));
 145 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
 146 static char *construct PARAMS ((const char *, const char *, int , int));
 147 static char *construct_relative PARAMS ((const char *, const char *));
 148 static char process_ftp_type PARAMS ((char *));
 149
 150 \f
 151 /* Returns the number of characters to be skipped if the first thing
 152    in a URL is URL: (which is 0 or 4+).  The optional spaces after
 153    URL: are also skipped.  */
 154 int
 155 skip_url (const char *url)
 156 {
 157   int i;
 158
 159   if (TOUPPER (url[0]) == 'U'
 160       && TOUPPER (url[1]) == 'R'
 161       && TOUPPER (url[2]) == 'L'
 162       && url[3] == ':')
 163     {
 164       /* Skip blanks.  */
 165       for (i = 4; url[i] && ISSPACE (url[i]); i++);
 166       return i;
 167     }
 168   else
 169     return 0;
 170 }
 171
 172 /* Unsafe chars:
 173    - anything <= 32;
 174    - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
 175    - @ and :, for user/password encoding.
 176    - everything over 127 (but we don't bother with recording those.  */
 177 void
 178 init_unsafe_char_table (void)
 179 {
 180   int i;
 181   for (i = 0; i < 256; i++)
 182     if (i < 32 || i >= 127
 183         || i == ' '
 184         || i == '<'
 185         || i == '>'
 186         || i == '\"'
 187         || i == '#'
 188         || i == '%'
 189         || i == '{'
 190         || i == '}'
 191         || i == '|'
 192         || i == '\\'
 193         || i == '^'
 194         || i == '~'
 195         || i == '['
 196         || i == ']'
 197         || i == '`')
 198       unsafe_char_table[i] = 1;
 199 }
 200
 201 /* Returns 1 if the string contains unsafe characters, 0 otherwise.  */
 202 int
 203 contains_unsafe (const char *s)
 204 {
 205   for (; *s; s++)
 206     if (UNSAFE_CHAR (*s))
 207       return 1;
 208   return 0;
 209 }
 210
 211 /* Decodes the forms %xy in a URL to the character the hexadecimal
 212    code of which is xy.  xy are hexadecimal digits from
 213    [0123456789ABCDEF] (case-insensitive).  If x or y are not
 214    hex-digits or `%' precedes `\0', the sequence is inserted
 215    literally.  */
 216
 217 static void
 218 decode_string (char *s)
 219 {
 220   char *p = s;
 221
 222   for (; *s; s++, p++)
 223     {
 224       if (*s != '%')
 225         *p = *s;
 226       else
 227         {
 228           /* Do nothing if at the end of the string, or if the chars
 229              are not hex-digits.  */
 230           if (!*(s + 1) || !*(s + 2)
 231               || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
 232             {
 233               *p = *s;
 234               continue;
 235             }
 236           *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
 237           s += 2;
 238         }
 239     }
 240   *p = '\0';
 241 }
 242
 243 /* Encode the unsafe characters (as determined by URL_UNSAFE) in a
 244    given string, returning a malloc-ed %XX encoded string.  */
 245 char *
 246 encode_string (const char *s)
 247 {
 248   const char *b;
 249   char *p, *res;
 250   int i;
 251
 252   b = s;
 253   for (i = 0; *s; s++, i++)
 254     if (UNSAFE_CHAR (*s))
 255       i += 2; /* Two more characters (hex digits) */
 256   res = (char *)xmalloc (i + 1);
 257   s = b;
 258   for (p = res; *s; s++)
 259     if (UNSAFE_CHAR (*s))
 260       {
 261         const unsigned char c = *s;
 262         *p++ = '%';
 263         *p++ = HEXD2ASC (c >> 4);
 264         *p++ = HEXD2ASC (c & 0xf);
 265       }
 266     else
 267       *p++ = *s;
 268   *p = '\0';
 269   return res;
 270 }
 271 \f
 272 /* Returns the proto-type if URL's protocol is supported, or
 273    URLUNKNOWN if not.  */
 274 uerr_t
 275 urlproto (const char *url)
 276 {
 277   int i;
 278
 279   url += skip_url (url);
 280   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 281     if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
 282       return sup_protos[i].ind;
 283   for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
 284   if (url[i] == ':')
 285     {
 286       for (++i; url[i] && url[i] != '/'; i++)
 287         if (!ISDIGIT (url[i]))
 288           return URLBADPORT;
 289       if (url[i - 1] == ':')
 290         return URLFTP;
 291       else
 292         return URLHTTP;
 293     }
 294   else
 295     return URLHTTP;
 296 }
 297
 298 /* Skip the protocol part of the URL, e.g. `http://'.  If no protocol
 299    part is found, returns 0.  */
 300 int
 301 skip_proto (const char *url)
 302 {
 303   char **s;
 304   int l;
 305
 306   for (s = protostrings; *s; s++)
 307     if (!strncasecmp (*s, url, strlen (*s)))
 308       break;
 309   if (!*s)
 310     return 0;
 311   l = strlen (*s);
 312   /* HTTP and FTP protocols are expected to yield exact host names
 313      (i.e. the `//' part must be skipped, too).  */
 314   if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
 315     l += 2;
 316   return l;
 317 }
 318
 319 /* Returns 1 if the URL begins with a protocol (supported or
 320    unsupported), 0 otherwise.  */
 321 int
 322 has_proto (const char *url)
 323 {
 324   char **s;
 325
 326   url += skip_url (url);
 327   for (s = protostrings; *s; s++)
 328     if (strncasecmp (url, *s, strlen (*s)) == 0)
 329       return 1;
 330   return 0;
 331 }
 332
 333 /* Skip the username and password, if present here.  The function
 334    should be called *not* with the complete URL, but with the part
 335    right after the protocol.
 336
 337    If no username and password are found, return 0.  */
 338 int
 339 skip_uname (const char *url)
 340 {
 341   const char *p;
 342   const char *q = NULL;
 343   for (p = url ; *p && *p != '/'; p++)
 344     if (*p == '@') q = p;
 345   /* If a `@' was found before the first occurrence of `/', skip
 346      it.  */
 347   if (q != NULL)
 348     return q - url + 1;
 349   else
 350     return 0;
 351 }
 352 \f
 353 /* Allocate a new urlinfo structure, fill it with default values and
 354    return a pointer to it.  */
 355 struct urlinfo *
 356 newurl (void)
 357 {
 358   struct urlinfo *u;
 359
 360   u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
 361   memset (u, 0, sizeof (*u));
 362   u->proto = URLUNKNOWN;
 363   return u;
 364 }
 365
 366 /* Perform a "deep" free of the urlinfo structure.  The structure
 367    should have been created with newurl, but need not have been used.
 368    If free_pointer is non-0, free the pointer itself.  */
 369 void
 370 freeurl (struct urlinfo *u, int complete)
 371 {
 372   assert (u != NULL);
 373   FREE_MAYBE (u->url);
 374   FREE_MAYBE (u->host);
 375   FREE_MAYBE (u->path);
 376   FREE_MAYBE (u->file);
 377   FREE_MAYBE (u->dir);
 378   FREE_MAYBE (u->user);
 379   FREE_MAYBE (u->passwd);
 380   FREE_MAYBE (u->local);
 381   FREE_MAYBE (u->referer);
 382   if (u->proxy)
 383     freeurl (u->proxy, 1);
 384   if (complete)
 385     xfree (u);
 386   return;
 387 }
 388 \f
 389 /* Extract the given URL of the form
 390    (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
 391    1. hostname (terminated with `/' or `:')
 392    2. port number (terminated with `/'), or chosen for the protocol
 393    3. dirname (everything after hostname)
 394    Most errors are handled.  No allocation is done, you must supply
 395    pointers to allocated memory.
 396    ...and a host of other stuff :-)
 397
 398    - Recognizes hostname:dir/file for FTP and
 399      hostname (:portnum)?/dir/file for HTTP.
 400    - Parses the path to yield directory and file
 401    - Parses the URL to yield the username and passwd (if present)
 402    - Decodes the strings, in case they contain "forbidden" characters
 403    - Writes the result to struct urlinfo
 404
 405    If the argument STRICT is set, it recognizes only the canonical
 406    form.  */
 407 uerr_t
 408 parseurl (const char *url, struct urlinfo *u, int strict)
 409 {
 410   int i, l, abs_ftp;
 411   int recognizable;            /* Recognizable URL is the one where
 412                                   the protocol name was explicitly
 413                                   named, i.e. it wasn't deduced from
 414                                   the URL format.  */
 415   uerr_t type;
 416
 417   DEBUGP (("parseurl (\"%s\") -> ", url));
 418   url += skip_url (url);
 419   recognizable = has_proto (url);
 420   if (strict && !recognizable)
 421     return URLUNKNOWN;
 422   for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
 423     {
 424       l = strlen (sup_protos[i].name);
 425       if (!strncasecmp (sup_protos[i].name, url, l))
 426         break;
 427     }
 428   /* If protocol is recognizable, but unsupported, bail out, else
 429      suppose unknown.  */
 430   if (recognizable && i == ARRAY_SIZE (sup_protos))
 431     return URLUNKNOWN;
 432   else if (i == ARRAY_SIZE (sup_protos))
 433     type = URLUNKNOWN;
 434   else
 435     u->proto = type = sup_protos[i].ind;
 436
 437   if (type == URLUNKNOWN)
 438     l = 0;
 439   /* Allow a username and password to be specified (i.e. just skip
 440      them for now).  */
 441   if (recognizable)
 442     l += skip_uname (url + l);
 443   for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
 444   if (i == l)
 445     return URLBADHOST;
 446   /* Get the hostname.  */
 447   u->host = strdupdelim (url + l, url + i);
 448   DEBUGP (("host %s -> ", u->host));
 449
 450   /* Assume no port has been given.  */
 451   u->port = 0;
 452   if (url[i] == ':')
 453     {
 454       /* We have a colon delimiting the hostname.  It could mean that
 455          a port number is following it, or a directory.  */
 456       if (ISDIGIT (url[++i]))    /* A port number */
 457         {
 458           if (type == URLUNKNOWN)
 459             u->proto = type = URLHTTP;
 460           for (; url[i] && url[i] != '/'; i++)
 461             if (ISDIGIT (url[i]))
 462               u->port = 10 * u->port + (url[i] - '0');
 463             else
 464               return URLBADPORT;
 465           if (!u->port)
 466             return URLBADPORT;
 467           DEBUGP (("port %hu -> ", u->port));
 468         }
 469       else if (type == URLUNKNOWN) /* or a directory */
 470         u->proto = type = URLFTP;
 471       else                      /* or just a misformed port number */
 472         return URLBADPORT;
 473     }
 474   else if (type == URLUNKNOWN)
 475     u->proto = type = URLHTTP;
 476   if (!u->port)
 477     {
 478       int ind;
 479       for (ind = 0; ind < ARRAY_SIZE (sup_protos); ind++)
 480         if (sup_protos[ind].ind == type)
 481           break;
 482       if (ind == ARRAY_SIZE (sup_protos))
 483         return URLUNKNOWN;
 484       u->port = sup_protos[ind].port;
 485     }
 486   /* Some delimiter troubles...  */
 487   if (url[i] == '/' && url[i - 1] != ':')
 488     ++i;
 489   if (type == URLHTTP)
 490     while (url[i] && url[i] == '/')
 491       ++i;
 492   u->path = (char *)xmalloc (strlen (url + i) + 8);
 493   strcpy (u->path, url + i);
 494   if (type == URLFTP)
 495     {
 496       u->ftp_type = process_ftp_type (u->path);
 497       /* #### We don't handle type `d' correctly yet.  */
 498       if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
 499         u->ftp_type = 'I';
 500       DEBUGP (("ftp_type %c -> ", u->ftp_type));
 501     }
 502   DEBUGP (("opath %s -> ", u->path));
 503   /* Parse the username and password (if existing).  */
 504   parse_uname (url, &u->user, &u->passwd);
 505   /* Decode the strings, as per RFC 1738.  */
 506   decode_string (u->host);
 507   decode_string (u->path);
 508   if (u->user)
 509     decode_string (u->user);
 510   if (u->passwd)
 511     decode_string (u->passwd);
 512   /* Parse the directory.  */
 513   parse_dir (u->path, &u->dir, &u->file);
 514   DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
 515   /* Simplify the directory.  */
 516   path_simplify (u->dir);
 517   /* Remove the leading `/' in HTTP.  */
 518   if (type == URLHTTP && *u->dir == '/')
 519     strcpy (u->dir, u->dir + 1);
 520   DEBUGP (("ndir %s\n", u->dir));
 521   /* Strip trailing `/'.  */
 522   l = strlen (u->dir);
 523   if (l && u->dir[l - 1] == '/')
 524     u->dir[l - 1] = '\0';
 525   /* Re-create the path: */
 526   abs_ftp = (u->proto == URLFTP && *u->dir == '/');
 527   /*  sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
 528       abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
 529   strcpy (u->path, abs_ftp ? "%2F" : "/");
 530   strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
 531   strcat (u->path, *u->dir ? "/" : "");
 532   strcat (u->path, u->file);
 533   URL_CLEANSE (u->path);
 534   DEBUGP (("newpath: %s\n", u->path));
 535   /* Create the clean URL.  */
 536   u->url = str_url (u, 0);
 537   return URLOK;
 538 }
 539 \f
 540 /* Special versions of DOTP and DDOTP for parse_dir(). */
 541
 542 #define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
 543 #define PD_DDOTP(x) ((*(x) == '.') && (*(x) == '.')             \
 544                      && (!*((x) + 2) || *((x) + 2) == '?'))
 545
 546 /* Build the directory and filename components of the path.  Both
 547    components are *separately* malloc-ed strings!  It does not change
 548    the contents of path.
 549
 550    If the path ends with "." or "..", they are (correctly) counted as
 551    directories.  */
 552 static void
 553 parse_dir (const char *path, char **dir, char **file)
 554 {
 555   int i, l;
 556
 557   l = urlpath_length (path);
 558   for (i = l; i && path[i] != '/'; i--);
 559
 560   if (!i && *path != '/')   /* Just filename */
 561     {
 562       if (PD_DOTP (path) || PD_DDOTP (path))
 563         {
 564           *dir = strdupdelim (path, path + l);
 565           *file = xstrdup (path + l); /* normally empty, but could
 566                                          contain ?... */
 567         }
 568       else
 569         {
 570           *dir = xstrdup ("");     /* This is required because of FTP */
 571           *file = xstrdup (path);
 572         }
 573     }
 574   else if (!i)                 /* /filename */
 575     {
 576       if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
 577         {
 578           *dir = strdupdelim (path, path + l);
 579           *file = xstrdup (path + l); /* normally empty, but could
 580                                          contain ?... */
 581         }
 582       else
 583         {
 584           *dir = xstrdup ("/");
 585           *file = xstrdup (path + 1);
 586         }
 587     }
 588   else /* Nonempty directory with or without a filename */
 589     {
 590       if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
 591         {
 592           *dir = strdupdelim (path, path + l);
 593           *file = xstrdup (path + l); /* normally empty, but could
 594                                          contain ?... */
 595         }
 596       else
 597         {
 598           *dir = strdupdelim (path, path + i);
 599           *file = xstrdup (path + i + 1);
 600         }
 601     }
 602 }
 603
 604 /* Find the optional username and password within the URL, as per
 605    RFC1738.  The returned user and passwd char pointers are
 606    malloc-ed.  */
 607 static uerr_t
 608 parse_uname (const char *url, char **user, char **passwd)
 609 {
 610   int l;
 611   const char *p, *q, *col;
 612   char **where;
 613
 614   *user = NULL;
 615   *passwd = NULL;
 616   url += skip_url (url);
 617   /* Look for end of protocol string.  */
 618   l = skip_proto (url);
 619   if (!l)
 620     return URLUNKNOWN;
 621   /* Add protocol offset.  */
 622   url += l;
 623   /* Is there an `@' character?  */
 624   for (p = url; *p && *p != '/'; p++)
 625     if (*p == '@')
 626       break;
 627   /* If not, return.  */
 628   if (*p != '@')
 629     return URLOK;
 630   /* Else find the username and password.  */
 631   for (p = q = col = url; *p != '/'; p++)
 632     {
 633       if (*p == ':' && !*user)
 634         {
 635           *user = (char *)xmalloc (p - url + 1);
 636           memcpy (*user, url, p - url);
 637           (*user)[p - url] = '\0';
 638           col = p + 1;
 639         }
 640       if (*p == '@') q = p;
 641     }
 642   /* Decide whether you have only the username or both.  */
 643   where = *user ? passwd : user;
 644   *where = (char *)xmalloc (q - col + 1);
 645   memcpy (*where, col, q - col);
 646   (*where)[q - col] = '\0';
 647   return URLOK;
 648 }
 649
 650 /* If PATH ends with `;type=X', return the character X.  */
 651 static char
 652 process_ftp_type (char *path)
 653 {
 654   int len = strlen (path);
 655
 656   if (len >= 7
 657       && !memcmp (path + len - 7, ";type=", 6))
 658     {
 659       path[len - 7] = '\0';
 660       return path[len - 1];
 661     }
 662   else
 663     return '\0';
 664 }
 665 \f
 666 /* Return the URL as fine-formed string, with a proper protocol, optional port
 667    number, directory and optional user/password.  If `hide' is non-zero (as it
 668    is when we're calling this on a URL we plan to print, but not when calling it
 669    to canonicalize a URL for use within the program), password will be hidden.
 670    The forbidden characters in the URL will be cleansed.  */
 671 char *
 672 str_url (const struct urlinfo *u, int hide)
 673 {
 674   char *res, *host, *user, *passwd, *proto_name, *dir, *file;
 675   int i, l, ln, lu, lh, lp, lf, ld;
 676   unsigned short proto_default_port;
 677
 678   /* Look for the protocol name.  */
 679   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 680     if (sup_protos[i].ind == u->proto)
 681       break;
 682   if (i == ARRAY_SIZE (sup_protos))
 683     return NULL;
 684   proto_name = sup_protos[i].name;
 685   proto_default_port = sup_protos[i].port;
 686   host = CLEANDUP (u->host);
 687   dir = CLEANDUP (u->dir);
 688   file = CLEANDUP (u->file);
 689   user = passwd = NULL;
 690   if (u->user)
 691     user = CLEANDUP (u->user);
 692   if (u->passwd)
 693     {
 694       if (hide)
 695         /* Don't output the password, or someone might see it over the user's
 696            shoulder (or in saved wget output).  Don't give away the number of
 697            characters in the password, either, as we did in past versions of
 698            this code, when we replaced the password characters with 'x's. */
 699         passwd = xstrdup("<password>");
 700       else
 701         passwd = CLEANDUP (u->passwd);
 702     }
 703   if (u->proto == URLFTP && *dir == '/')
 704     {
 705       char *tmp = (char *)xmalloc (strlen (dir) + 3);
 706       /*sprintf (tmp, "%%2F%s", dir + 1);*/
 707       tmp[0] = '%';
 708       tmp[1] = '2';
 709       tmp[2] = 'F';
 710       strcpy (tmp + 3, dir + 1);
 711       xfree (dir);
 712       dir = tmp;
 713     }
 714
 715   ln = strlen (proto_name);
 716   lu = user ? strlen (user) : 0;
 717   lp = passwd ? strlen (passwd) : 0;
 718   lh = strlen (host);
 719   ld = strlen (dir);
 720   lf = strlen (file);
 721   res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
 722   /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
 723      (user ? user : ""), (passwd ? ":" : ""),
 724      (passwd ? passwd : ""), (user ? "@" : ""),
 725      host, u->port, dir, *dir ? "/" : "", file); */
 726   l = 0;
 727   memcpy (res, proto_name, ln);
 728   l += ln;
 729   if (user)
 730     {
 731       memcpy (res + l, user, lu);
 732       l += lu;
 733       if (passwd)
 734         {
 735           res[l++] = ':';
 736           memcpy (res + l, passwd, lp);
 737           l += lp;
 738         }
 739       res[l++] = '@';
 740     }
 741   memcpy (res + l, host, lh);
 742   l += lh;
 743   if (u->port != proto_default_port)
 744     {
 745       res[l++] = ':';
 746       long_to_string (res + l, (long)u->port);
 747       l += numdigit (u->port);
 748     }
 749   res[l++] = '/';
 750   memcpy (res + l, dir, ld);
 751   l += ld;
 752   if (*dir)
 753     res[l++] = '/';
 754   strcpy (res + l, file);
 755   xfree (host);
 756   xfree (dir);
 757   xfree (file);
 758   FREE_MAYBE (user);
 759   FREE_MAYBE (passwd);
 760   return res;
 761 }
 762
 763 /* Check whether two URL-s are equivalent, i.e. pointing to the same
 764    location.  Uses parseurl to parse them, and compares the canonical
 765    forms.
 766
 767    Returns 1 if the URL1 is equivalent to URL2, 0 otherwise.  Also
 768    return 0 on error.  */
 769 int
 770 url_equal (const char *url1, const char *url2)
 771 {
 772   struct urlinfo *u1, *u2;
 773   uerr_t err;
 774   int res;
 775
 776   u1 = newurl ();
 777   err = parseurl (url1, u1, 0);
 778   if (err != URLOK)
 779     {
 780       freeurl (u1, 1);
 781       return 0;
 782     }
 783   u2 = newurl ();
 784   err = parseurl (url2, u2, 0);
 785   if (err != URLOK)
 786     {
 787       freeurl (u2, 1);
 788       return 0;
 789     }
 790   res = !strcmp (u1->url, u2->url);
 791   freeurl (u1, 1);
 792   freeurl (u2, 1);
 793   return res;
 794 }
 795 \f
 796 urlpos *
 797 get_urls_file (const char *file)
 798 {
 799   struct file_memory *fm;
 800   urlpos *head, *tail;
 801   const char *text, *text_end;
 802
 803   /* Load the file.  */
 804   fm = read_file (file);
 805   if (!fm)
 806     {
 807       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 808       return NULL;
 809     }
 810   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
 811   head = tail = NULL;
 812   text = fm->content;
 813   text_end = fm->content + fm->length;
 814   while (text < text_end)
 815     {
 816       const char *line_beg = text;
 817       const char *line_end = memchr (text, '\n', text_end - text);
 818       if (!line_end)
 819         line_end = text_end;
 820       else
 821         ++line_end;
 822       text = line_end;
 823       while (line_beg < line_end
 824              && ISSPACE (*line_beg))
 825         ++line_beg;
 826       while (line_end > line_beg + 1
 827              && ISSPACE (*(line_end - 1)))
 828         --line_end;
 829       if (line_end > line_beg)
 830         {
 831           urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
 832           memset (entry, 0, sizeof (*entry));
 833           entry->next = NULL;
 834           entry->url = strdupdelim (line_beg, line_end);
 835           if (!head)
 836             head = entry;
 837           else
 838             tail->next = entry;
 839           tail = entry;
 840         }
 841     }
 842   read_file_free (fm);
 843   return head;
 844 }
 845 \f
 846 /* Free the linked list of urlpos.  */
 847 void
 848 free_urlpos (urlpos *l)
 849 {
 850   while (l)
 851     {
 852       urlpos *next = l->next;
 853       xfree (l->url);
 854       FREE_MAYBE (l->local_name);
 855       xfree (l);
 856       l = next;
 857     }
 858 }
 859
 860 /* Rotate FNAME opt.backups times */
 861 void
 862 rotate_backups(const char *fname)
 863 {
 864   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
 865   char *from = (char *)alloca (maxlen);
 866   char *to = (char *)alloca (maxlen);
 867   struct stat sb;
 868   int i;
 869
 870   if (stat (fname, &sb) == 0)
 871     if (S_ISREG (sb.st_mode) == 0)
 872       return;
 873
 874   for (i = opt.backups; i > 1; i--)
 875     {
 876       sprintf (from, "%s.%d", fname, i - 1);
 877       sprintf (to, "%s.%d", fname, i);
 878       /* #### This will fail on machines without the rename() system
 879          call.  */
 880       rename (from, to);
 881     }
 882
 883   sprintf (to, "%s.%d", fname, 1);
 884   rename(fname, to);
 885 }
 886
 887 /* Create all the necessary directories for PATH (a file).  Calls
 888    mkdirhier() internally.  */
 889 int
 890 mkalldirs (const char *path)
 891 {
 892   const char *p;
 893   char *t;
 894   struct stat st;
 895   int res;
 896
 897   p = path + strlen (path);
 898   for (; *p != '/' && p != path; p--);
 899   /* Don't create if it's just a file.  */
 900   if ((p == path) && (*p != '/'))
 901     return 0;
 902   t = strdupdelim (path, p);
 903   /* Check whether the directory exists.  */
 904   if ((stat (t, &st) == 0))
 905     {
 906       if (S_ISDIR (st.st_mode))
 907         {
 908           xfree (t);
 909           return 0;
 910         }
 911       else
 912         {
 913           /* If the dir exists as a file name, remove it first.  This
 914              is *only* for Wget to work with buggy old CERN http
 915              servers.  Here is the scenario: When Wget tries to
 916              retrieve a directory without a slash, e.g.
 917              http://foo/bar (bar being a directory), CERN server will
 918              not redirect it too http://foo/bar/ -- it will generate a
 919              directory listing containing links to bar/file1,
 920              bar/file2, etc.  Wget will lose because it saves this
 921              HTML listing to a file `bar', so it cannot create the
 922              directory.  To work around this, if the file of the same
 923              name exists, we just remove it and create the directory
 924              anyway.  */
 925           DEBUGP (("Removing %s because of directory danger!\n", t));
 926           unlink (t);
 927         }
 928     }
 929   res = make_directory (t);
 930   if (res != 0)
 931     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
 932   xfree (t);
 933   return res;
 934 }
 935
 936 static int
 937 count_slashes (const char *s)
 938 {
 939   int i = 0;
 940   while (*s)
 941     if (*s++ == '/')
 942       ++i;
 943   return i;
 944 }
 945
 946 /* Return the path name of the URL-equivalent file name, with a
 947    remote-like structure of directories.  */
 948 static char *
 949 mkstruct (const struct urlinfo *u)
 950 {
 951   char *host, *dir, *file, *res, *dirpref;
 952   int l;
 953
 954   assert (u->dir != NULL);
 955   assert (u->host != NULL);
 956
 957   if (opt.cut_dirs)
 958     {
 959       char *ptr = u->dir + (*u->dir == '/');
 960       int slash_count = 1 + count_slashes (ptr);
 961       int cut = MINVAL (opt.cut_dirs, slash_count);
 962       for (; cut && *ptr; ptr++)
 963         if (*ptr == '/')
 964           --cut;
 965       STRDUP_ALLOCA (dir, ptr);
 966     }
 967   else
 968     dir = u->dir + (*u->dir == '/');
 969
 970   host = xstrdup (u->host);
 971   /* Check for the true name (or at least a consistent name for saving
 972      to directory) of HOST, reusing the hlist if possible.  */
 973   if (opt.add_hostdir && !opt.simple_check)
 974     {
 975       char *nhost = realhost (host);
 976       xfree (host);
 977       host = nhost;
 978     }
 979   /* Add dir_prefix and hostname (if required) to the beginning of
 980      dir.  */
 981   if (opt.add_hostdir)
 982     {
 983       if (!DOTP (opt.dir_prefix))
 984         {
 985           dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
 986                                     + strlen (host) + 1);
 987           sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
 988         }
 989       else
 990         STRDUP_ALLOCA (dirpref, host);
 991     }
 992   else                         /* not add_hostdir */
 993     {
 994       if (!DOTP (opt.dir_prefix))
 995         dirpref = opt.dir_prefix;
 996       else
 997         dirpref = "";
 998     }
 999   xfree (host);
1000
1001   /* If there is a prefix, prepend it.  */
1002   if (*dirpref)
1003     {
1004       char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1005       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1006       dir = newdir;
1007     }
1008   dir = xstrdup (dir);
1009   URL_CLEANSE (dir);
1010   l = strlen (dir);
1011   if (l && dir[l - 1] == '/')
1012     dir[l - 1] = '\0';
1013
1014   if (!*u->file)
1015     file = "index.html";
1016   else
1017     file = u->file;
1018
1019   /* Finally, construct the full name.  */
1020   res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1021   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1022   xfree (dir);
1023   return res;
1024 }
1025
1026 /* Create a unique filename, corresponding to a given URL.  Calls
1027    mkstruct if necessary.  Does *not* actually create any directories.  */
1028 char *
1029 url_filename (const struct urlinfo *u)
1030 {
1031   char *file, *name;
1032   int have_prefix = 0;          /* whether we must prepend opt.dir_prefix */
1033
1034   if (opt.dirstruct)
1035     {
1036       file = mkstruct (u);
1037       have_prefix = 1;
1038     }
1039   else
1040     {
1041       if (!*u->file)
1042         file = xstrdup ("index.html");
1043       else
1044         file = xstrdup (u->file);
1045     }
1046
1047   if (!have_prefix)
1048     {
1049       /* Check whether the prefix directory is something other than "."
1050          before prepending it.  */
1051       if (!DOTP (opt.dir_prefix))
1052         {
1053           char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1054                                          + 1 + strlen (file) + 1);
1055           sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1056           xfree (file);
1057           file = nfile;
1058         }
1059     }
1060   /* DOS-ish file systems don't like `%' signs in them; we change it
1061      to `@'.  */
1062 #ifdef WINDOWS
1063   {
1064     char *p = file;
1065     for (p = file; *p; p++)
1066       if (*p == '%')
1067         *p = '@';
1068   }
1069 #endif /* WINDOWS */
1070
1071   /* Check the cases in which the unique extensions are not used:
1072      1) Clobbering is turned off (-nc).
1073      2) Retrieval with regetting.
1074      3) Timestamping is used.
1075      4) Hierarchy is built.
1076
1077      The exception is the case when file does exist and is a
1078      directory (actually support for bad httpd-s).  */
1079   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1080       && !(file_exists_p (file) && !file_non_directory_p (file)))
1081     return file;
1082
1083   /* Find a unique name.  */
1084   name = unique_name (file);
1085   xfree (file);
1086   return name;
1087 }
1088
1089 /* Like strlen(), but allow the URL to be ended with '?'.  */
1090 static int
1091 urlpath_length (const char *url)
1092 {
1093   const char *q = strchr (url, '?');
1094   if (q)
1095     return q - url;
1096   return strlen (url);
1097 }
1098
1099 /* Find the last occurrence of character C in the range [b, e), or
1100    NULL, if none are present.  This is almost completely equivalent to
1101    { *e = '\0'; return strrchr(b); }, except that it doesn't change
1102    the contents of the string.  */
1103 static const char *
1104 find_last_char (const char *b, const char *e, char c)
1105 {
1106   for (; e > b; e--)
1107     if (*e == c)
1108       return e;
1109   return NULL;
1110 }
1111
1112 /* Construct a URL by concatenating an absolute URL and a path, which
1113    may or may not be absolute.  This tries to behave "reasonably" in
1114    all foreseeable cases.  It employs little specific knowledge about
1115    protocols or URL-specific stuff -- it just works on strings.  */
1116 static char *
1117 construct (const char *url, const char *sub, int subsize, int no_proto)
1118 {
1119   char *constr;
1120
1121   if (no_proto)
1122     {
1123       const char *end = url + urlpath_length (url);
1124
1125       if (*sub != '/')
1126         {
1127           /* SUB is a relative URL: we need to replace everything
1128              after last slash (possibly empty) with SUB.
1129
1130              So, if URL is "whatever/foo/bar", and SUB is "qux/xyzzy",
1131              our result should be "whatever/foo/qux/xyzzy".  */
1132           int need_explicit_slash = 0;
1133           int span;
1134           const char *start_insert;
1135           const char *last_slash = find_last_char (url, end, '/'); /* the last slash. */
1136           if (!last_slash)
1137             {
1138               /* No slash found at all.  Append SUB to what we have,
1139                  but we'll need a slash as a separator.
1140
1141                  Example: if url == "foo" and sub == "qux/xyzzy", then
1142                  we cannot just append sub to url, because we'd get
1143                  "fooqux/xyzzy", whereas what we want is
1144                  "foo/qux/xyzzy".
1145
1146                  To make sure the / gets inserted, we set
1147                  need_explicit_slash to 1.  We also set start_insert
1148                  to end + 1, so that the length calculations work out
1149                  correctly for one more (slash) character.  Accessing
1150                  that character is fine, since it will be the
1151                  delimiter, '\0' or '?'.  */
1152               /* example: "foo?..." */
1153               /*               ^    ('?' gets changed to '/') */
1154               start_insert = end + 1;
1155               need_explicit_slash = 1;
1156             }
1157           else if (last_slash && last_slash != url && *(last_slash - 1) == '/')
1158             {
1159               /* example: http://host"  */
1160               /*                      ^ */
1161               start_insert = end + 1;
1162               need_explicit_slash = 1;
1163             }
1164           else
1165             {
1166               /* example: "whatever/foo/bar" */
1167               /*                        ^    */
1168               start_insert = last_slash + 1;
1169             }
1170
1171           span = start_insert - url;
1172           constr = (char *)xmalloc (span + subsize + 1);
1173           if (span)
1174             memcpy (constr, url, span);
1175           if (need_explicit_slash)
1176             constr[span - 1] = '/';
1177           if (subsize)
1178             memcpy (constr + span, sub, subsize);
1179           constr[span + subsize] = '\0';
1180         }
1181       else /* *sub == `/' */
1182         {
1183           /* SUB is an absolute path: we need to replace everything
1184              after (and including) the FIRST slash with SUB.
1185
1186              So, if URL is "http://host/whatever/foo/bar", and SUB is
1187              "/qux/xyzzy", our result should be
1188              "http://host/qux/xyzzy".  */
1189           int span;
1190           const char *slash;
1191           const char *start_insert = NULL; /* for gcc to shut up. */
1192           const char *pos = url;
1193           int seen_slash_slash = 0;
1194           /* We're looking for the first slash, but want to ignore
1195              double slash. */
1196         again:
1197           slash = memchr (pos, '/', end - pos);
1198           if (slash && !seen_slash_slash)
1199             if (*(slash + 1) == '/')
1200               {
1201                 pos = slash + 2;
1202                 seen_slash_slash = 1;
1203                 goto again;
1204               }
1205
1206           /* At this point, SLASH is the location of the first / after
1207              "//", or the first slash altogether.  START_INSERT is the
1208              pointer to the location where SUB will be inserted.  When
1209              examining the last two examples, keep in mind that SUB
1210              begins with '/'. */
1211
1212           if (!slash && !seen_slash_slash)
1213             /* example: "foo" */
1214             /*           ^    */
1215             start_insert = url;
1216           else if (!slash && seen_slash_slash)
1217             /* example: "http://foo" */
1218             /*                     ^ */
1219             start_insert = end;
1220           else if (slash && !seen_slash_slash)
1221             /* example: "foo/bar" */
1222             /*           ^        */
1223             start_insert = url;
1224           else if (slash && seen_slash_slash)
1225             /* example: "http://something/" */
1226             /*                           ^  */
1227             start_insert = slash;
1228
1229           span = start_insert - url;
1230           constr = (char *)xmalloc (span + subsize + 1);
1231           if (span)
1232             memcpy (constr, url, span);
1233           if (subsize)
1234             memcpy (constr + span, sub, subsize);
1235           constr[span + subsize] = '\0';
1236         }
1237     }
1238   else /* !no_proto */
1239     {
1240       constr = strdupdelim (sub, sub + subsize);
1241     }
1242   return constr;
1243 }
1244
1245 /* Like the function above, but with a saner caller interface. */
1246 char *
1247 url_concat (const char *base_url, const char *new_url)
1248 {
1249   return construct (base_url, new_url, strlen (new_url), !has_proto (new_url));
1250 }
1251 \f
1252 /* Optimize URL by host, destructively replacing u->host with realhost
1253    (u->host).  Do this regardless of opt.simple_check.  */
1254 void
1255 opt_url (struct urlinfo *u)
1256 {
1257   /* Find the "true" host.  */
1258   char *host = realhost (u->host);
1259   xfree (u->host);
1260   u->host = host;
1261   assert (u->dir != NULL);      /* the URL must have been parsed */
1262   /* Refresh the printed representation.  */
1263   xfree (u->url);
1264   u->url = str_url (u, 0);
1265 }
1266
1267 /* This beautiful kludge is fortunately not needed, as I've made
1268    parse_dir do the (almost) right thing, so that a query can never
1269    become a part of directory.  */
1270 #if 0
1271 /* Call path_simplify, but make sure that the part after the
1272    question-mark, if any, is not destroyed by path_simplify's
1273    "optimizations".  */
1274 void
1275 path_simplify_with_kludge (char *path)
1276 {
1277   char *query = strchr (path, '?');
1278   if (query)
1279     /* path_simplify also works destructively, so we also have the
1280        license to write. */
1281     *query = '\0';
1282   path_simplify (path);
1283   if (query)
1284     {
1285       char *newend = path + strlen (path);
1286       *query = '?';
1287       if (newend != query)
1288         memmove (newend, query, strlen (query) + 1);
1289     }
1290 }
1291 #endif
1292 \f
1293 /* Returns proxy host address, in accordance with PROTO.  */
1294 char *
1295 getproxy (uerr_t proto)
1296 {
1297   if (proto == URLHTTP)
1298     return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1299   else if (proto == URLFTP)
1300     return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1301 #ifdef HAVE_SSL
1302   else if (proto == URLHTTPS)
1303     return opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1304 #endif /* HAVE_SSL */
1305   else
1306     return NULL;
1307 }
1308
1309 /* Should a host be accessed through proxy, concerning no_proxy?  */
1310 int
1311 no_proxy_match (const char *host, const char **no_proxy)
1312 {
1313   if (!no_proxy)
1314     return 1;
1315   else
1316     return !sufmatch (no_proxy, host);
1317 }
1318 \f
1319 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1320 static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
1321
1322 /* Change the links in an HTML document.  Accepts a structure that
1323    defines the positions of all the links.  */
1324 void
1325 convert_links (const char *file, urlpos *l)
1326 {
1327   struct file_memory *fm;
1328   FILE               *fp;
1329   const char         *p;
1330   downloaded_file_t  downloaded_file_return;
1331
1332   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1333
1334   {
1335     /* First we do a "dry run": go through the list L and see whether
1336        any URL needs to be converted in the first place.  If not, just
1337        leave the file alone.  */
1338     int count = 0;
1339     urlpos *dry = l;
1340     for (dry = l; dry; dry = dry->next)
1341       if (dry->convert != CO_NOCONVERT)
1342         ++count;
1343     if (!count)
1344       {
1345         logputs (LOG_VERBOSE, _("nothing to do.\n"));
1346         return;
1347       }
1348   }
1349
1350   fm = read_file (file);
1351   if (!fm)
1352     {
1353       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1354                  file, strerror (errno));
1355       return;
1356     }
1357
1358   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1359   if (opt.backup_converted && downloaded_file_return)
1360     write_backup_file (file, downloaded_file_return);
1361
1362   /* Before opening the file for writing, unlink the file.  This is
1363      important if the data in FM is mmaped.  In such case, nulling the
1364      file, which is what fopen() below does, would make us read all
1365      zeroes from the mmaped region.  */
1366   if (unlink (file) < 0 && errno != ENOENT)
1367     {
1368       logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1369                  file, strerror (errno));
1370       read_file_free (fm);
1371       return;
1372     }
1373   /* Now open the file for writing.  */
1374   fp = fopen (file, "wb");
1375   if (!fp)
1376     {
1377       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1378                  file, strerror (errno));
1379       read_file_free (fm);
1380       return;
1381     }
1382   /* Here we loop through all the URLs in file, replacing those of
1383      them that are downloaded with relative references.  */
1384   p = fm->content;
1385   for (; l; l = l->next)
1386     {
1387       char *url_start = fm->content + l->pos;
1388
1389       if (l->pos >= fm->length)
1390         {
1391           DEBUGP (("Something strange is going on.  Please investigate."));
1392           break;
1393         }
1394       /* If the URL is not to be converted, skip it.  */
1395       if (l->convert == CO_NOCONVERT)
1396         {
1397           DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
1398           continue;
1399         }
1400
1401       /* Echo the file contents, up to the offending URL's opening
1402          quote, to the outfile.  */
1403       fwrite (p, 1, url_start - p, fp);
1404       p = url_start;
1405       if (l->convert == CO_CONVERT_TO_RELATIVE)
1406         {
1407           /* Convert absolute URL to relative. */
1408           char *newname = construct_relative (file, l->local_name);
1409           char *quoted_newname = html_quote_string (newname);
1410           replace_attr (&p, l->size, fp, quoted_newname);
1411           DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1412                    l->url, newname, l->pos, file));
1413           xfree (newname);
1414           xfree (quoted_newname);
1415         }
1416       else if (l->convert == CO_CONVERT_TO_COMPLETE)
1417         {
1418           /* Convert the link to absolute URL. */
1419           char *newlink = l->url;
1420           char *quoted_newlink = html_quote_string (newlink);
1421           replace_attr (&p, l->size, fp, quoted_newlink);
1422           DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1423                    newlink, l->pos, file));
1424           xfree (quoted_newlink);
1425         }
1426     }
1427   /* Output the rest of the file. */
1428   if (p - fm->content < fm->length)
1429     fwrite (p, 1, fm->length - (p - fm->content), fp);
1430   fclose (fp);
1431   read_file_free (fm);
1432   logputs (LOG_VERBOSE, _("done.\n"));
1433 }
1434
1435 /* Construct and return a malloced copy of the relative link from two
1436    pieces of information: local name S1 of the referring file and
1437    local name S2 of the referred file.
1438
1439    So, if S1 is "jagor.srce.hr/index.html" and S2 is
1440    "jagor.srce.hr/images/news.gif", the function will return
1441    "images/news.gif".
1442
1443    Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1444    "fly.cc.fer.hr/images/fly.gif", the function will return
1445    "../images/fly.gif".
1446
1447    Caveats: S1 should not begin with `/', unless S2 also begins with
1448    '/'.  S1 should not contain things like ".." and such --
1449    construct_relative ("fly/ioccc/../index.html",
1450    "fly/images/fly.gif") will fail.  (A workaround is to call
1451    something like path_simplify() on S1).  */
1452 static char *
1453 construct_relative (const char *s1, const char *s2)
1454 {
1455   int i, cnt, sepdirs1;
1456   char *res;
1457
1458   if (*s2 == '/')
1459     return xstrdup (s2);
1460   /* S1 should *not* be absolute, if S2 wasn't.  */
1461   assert (*s1 != '/');
1462   i = cnt = 0;
1463   /* Skip the directories common to both strings.  */
1464   while (1)
1465     {
1466       while (s1[i] && s2[i]
1467              && (s1[i] == s2[i])
1468              && (s1[i] != '/')
1469              && (s2[i] != '/'))
1470         ++i;
1471       if (s1[i] == '/' && s2[i] == '/')
1472         cnt = ++i;
1473       else
1474         break;
1475     }
1476   for (sepdirs1 = 0; s1[i]; i++)
1477     if (s1[i] == '/')
1478       ++sepdirs1;
1479   /* Now, construct the file as of:
1480      - ../ repeated sepdirs1 time
1481      - all the non-mutual directories of S2.  */
1482   res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1483   for (i = 0; i < sepdirs1; i++)
1484     memcpy (res + 3 * i, "../", 3);
1485   strcpy (res + 3 * i, s2 + cnt);
1486   return res;
1487 }
1488 \f
1489 /* Add URL to the head of the list L.  */
1490 urlpos *
1491 add_url (urlpos *l, const char *url, const char *file)
1492 {
1493   urlpos *t;
1494
1495   t = (urlpos *)xmalloc (sizeof (urlpos));
1496   memset (t, 0, sizeof (*t));
1497   t->url = xstrdup (url);
1498   t->local_name = xstrdup (file);
1499   t->next = l;
1500   return t;
1501 }
1502
1503 static void
1504 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1505 {
1506   /* Rather than just writing over the original .html file with the
1507      converted version, save the former to *.orig.  Note we only do
1508      this for files we've _successfully_ downloaded, so we don't
1509      clobber .orig files sitting around from previous invocations. */
1510
1511   /* Construct the backup filename as the original name plus ".orig". */
1512   size_t         filename_len = strlen(file);
1513   char*          filename_plus_orig_suffix;
1514   boolean        already_wrote_backup_file = FALSE;
1515   slist*         converted_file_ptr;
1516   static slist*  converted_files = NULL;
1517
1518   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1519     {
1520       /* Just write "orig" over "html".  We need to do it this way
1521          because when we're checking to see if we've downloaded the
1522          file before (to see if we can skip downloading it), we don't
1523          know if it's a text/html file.  Therefore we don't know yet
1524          at that stage that -E is going to cause us to tack on
1525          ".html", so we need to compare vs. the original URL plus
1526          ".orig", not the original URL plus ".html.orig". */
1527       filename_plus_orig_suffix = alloca (filename_len + 1);
1528       strcpy(filename_plus_orig_suffix, file);
1529       strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1530     }
1531   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1532     {
1533       /* Append ".orig" to the name. */
1534       filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1535       strcpy(filename_plus_orig_suffix, file);
1536       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1537     }
1538
1539   /* We can get called twice on the same URL thanks to the
1540      convert_all_links() call in main().  If we write the .orig file
1541      each time in such a case, it'll end up containing the first-pass
1542      conversion, not the original file.  So, see if we've already been
1543      called on this file. */
1544   converted_file_ptr = converted_files;
1545   while (converted_file_ptr != NULL)
1546     if (strcmp(converted_file_ptr->string, file) == 0)
1547       {
1548         already_wrote_backup_file = TRUE;
1549         break;
1550       }
1551     else
1552       converted_file_ptr = converted_file_ptr->next;
1553
1554   if (!already_wrote_backup_file)
1555     {
1556       /* Rename <file> to <file>.orig before former gets written over. */
1557       if (rename(file, filename_plus_orig_suffix) != 0)
1558         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1559                    file, filename_plus_orig_suffix, strerror (errno));
1560
1561       /* Remember that we've already written a .orig backup for this file.
1562          Note that we never free this memory since we need it till the
1563          convert_all_links() call, which is one of the last things the
1564          program does before terminating.  BTW, I'm not sure if it would be
1565          safe to just set 'converted_file_ptr->string' to 'file' below,
1566          rather than making a copy of the string...  Another note is that I
1567          thought I could just add a field to the urlpos structure saying
1568          that we'd written a .orig file for this URL, but that didn't work,
1569          so I had to make this separate list.
1570          -- Dan Harkless <wget@harkless.org>
1571
1572          This [adding a field to the urlpos structure] didn't work
1573          because convert_file() is called twice: once after all its
1574          sublinks have been retrieved in recursive_retrieve(), and
1575          once at the end of the day in convert_all_links().  The
1576          original linked list collected in recursive_retrieve() is
1577          lost after the first invocation of convert_links(), and
1578          convert_all_links() makes a new one (it calls get_urls_html()
1579          for each file it covers.)  That's why your first approach didn't
1580          work.  The way to make it work is perhaps to make this flag a
1581          field in the `urls_html' list.
1582          -- Hrvoje Niksic <hniksic@arsdigita.com>
1583       */
1584       converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1585       converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
1586       converted_file_ptr->next = converted_files;
1587       converted_files = converted_file_ptr;
1588     }
1589 }
1590
1591 static int find_fragment PARAMS ((const char *, int, const char **,
1592                                   const char **));
1593
1594 static void
1595 replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
1596 {
1597   const char *p = *pp;
1598   int quote_flag = 0;
1599   int size = raw_size;
1600   char quote_char = '\"';
1601   const char *frag_beg, *frag_end;
1602
1603   /* Structure of our string is:
1604        "...old-contents..."
1605        <---  l->size   --->  (with quotes)
1606      OR:
1607        ...old-contents...
1608        <---  l->size  -->    (no quotes)   */
1609
1610   if (*p == '\"' || *p == '\'')
1611     {
1612       quote_char = *p;
1613       quote_flag = 1;
1614       ++p;
1615       size -= 2;                /* disregard opening and closing quote */
1616     }
1617   putc (quote_char, fp);
1618   fputs (new_str, fp);
1619
1620   /* Look for fragment identifier, if any. */
1621   if (find_fragment (p, size, &frag_beg, &frag_end))
1622     fwrite (frag_beg, 1, frag_end - frag_beg, fp);
1623   p += size;
1624   if (quote_flag)
1625     ++p;
1626   putc (quote_char, fp);
1627   *pp = p;
1628 }
1629
1630 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
1631    preceded by '&'.  If the character is not found, return zero.  If
1632    the character is found, return 1 and set BP and EP to point to the
1633    beginning and end of the region.
1634
1635    This is used for finding the fragment indentifiers in URLs.  */
1636
1637 static int
1638 find_fragment (const char *beg, int size, const char **bp, const char **ep)
1639 {
1640   const char *end = beg + size;
1641   int saw_amp = 0;
1642   for (; beg < end; beg++)
1643     {
1644       switch (*beg)
1645         {
1646         case '&':
1647           saw_amp = 1;
1648           break;
1649         case '#':
1650           if (!saw_amp)
1651             {
1652               *bp = beg;
1653               *ep = end;
1654               return 1;
1655             }
1656           /* fallthrough */
1657         default:
1658           saw_amp = 0;
1659         }
1660     }
1661   return 0;
1662 }
1663
1664 typedef struct _downloaded_file_list {
1665   char*                          file;
1666   downloaded_file_t              download_type;
1667   struct _downloaded_file_list*  next;
1668 } downloaded_file_list;
1669
1670 static downloaded_file_list *downloaded_files;
1671
1672 /* Remembers which files have been downloaded.  In the standard case, should be
1673    called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1674    download successfully (i.e. not for ones we have failures on or that we skip
1675    due to -N).
1676
1677    When we've downloaded a file and tacked on a ".html" extension due to -E,
1678    call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1679    FILE_DOWNLOADED_NORMALLY.
1680
1681    If you just want to check if a file has been previously added without adding
1682    it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
1683    with local filenames, not remote URLs. */
1684 downloaded_file_t
1685 downloaded_file (downloaded_file_t  mode, const char*  file)
1686 {
1687   boolean                       found_file = FALSE;
1688   downloaded_file_list*         rover = downloaded_files;
1689
1690   while (rover != NULL)
1691     if (strcmp(rover->file, file) == 0)
1692       {
1693         found_file = TRUE;
1694         break;
1695       }
1696     else
1697       rover = rover->next;
1698
1699   if (found_file)
1700     return rover->download_type;  /* file had already been downloaded */
1701   else
1702     {
1703       if (mode != CHECK_FOR_FILE)
1704         {
1705           rover = xmalloc(sizeof(*rover));
1706           rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1707           rover->download_type = mode;
1708           rover->next = downloaded_files;
1709           downloaded_files = rover;
1710         }
1711
1712       return FILE_NOT_ALREADY_DOWNLOADED;
1713     }
1714 }
1715
1716 void
1717 downloaded_files_free (void)
1718 {
1719   downloaded_file_list*         rover = downloaded_files;
1720   while (rover)
1721     {
1722       downloaded_file_list *next = rover->next;
1723       xfree (rover->file);
1724       xfree (rover);
1725       rover = next;
1726     }
1727 }
1728 \f
1729 /* Initialization of static stuff. */
1730 void
1731 url_init (void)
1732 {
1733   init_unsafe_char_table ();
1734 }