sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif
  29 #include <sys/types.h>
  30 #ifdef HAVE_UNISTD_H
  31 # include <unistd.h>
  32 #endif
  33 #include <errno.h>
  34 #include <assert.h>
  35
  36 #include "wget.h"
  37 #include "utils.h"
  38 #include "url.h"
  39 #include "host.h"
  40
  41 #ifndef errno
  42 extern int errno;
  43 #endif
  44
  45 /* Is X "."?  */
  46 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
  47 /* Is X ".."?  */
  48 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  49
  50 static int urlpath_length PARAMS ((const char *));
  51
  52 /* A NULL-terminated list of strings to be recognized as protocol
  53    types (URL schemes).  Note that recognized doesn't mean supported
  54    -- only HTTP, HTTPS and FTP are currently supported.
  55
  56    However, a string that does not match anything in the list will be
  57    considered a relative URL.  Thus it's important that this list has
  58    anything anyone could think of being legal.
  59
  60    #### This is probably broken.  Wget should use other means to
  61    distinguish between absolute and relative URIs in HTML links.
  62
  63    Take a look at <http://www.w3.org/pub/WWW/Addressing/schemes.html>
  64    for more.  */
  65 static char *protostrings[] =
  66 {
  67   "cid:",
  68   "clsid:",
  69   "file:",
  70   "finger:",
  71   "ftp:",
  72   "gopher:",
  73   "hdl:",
  74   "http:",
  75   "https:",
  76   "ilu:",
  77   "ior:",
  78   "irc:",
  79   "java:",
  80   "javascript:",
  81   "lifn:",
  82   "mailto:",
  83   "mid:",
  84   "news:",
  85   "nntp:",
  86   "path:",
  87   "prospero:",
  88   "rlogin:",
  89   "service:",
  90   "shttp:",
  91   "snews:",
  92   "stanf:",
  93   "telnet:",
  94   "tn3270:",
  95   "wais:",
  96   "whois++:",
  97   NULL
  98 };
  99
 100 struct proto
 101 {
 102   char *name;
 103   uerr_t ind;
 104   unsigned short port;
 105 };
 106
 107 /* Similar to former, but for supported protocols: */
 108 static struct proto sup_protos[] =
 109 {
 110   { "http://", URLHTTP, DEFAULT_HTTP_PORT },
 111 #ifdef HAVE_SSL
 112   { "https://",URLHTTPS, DEFAULT_HTTPS_PORT},
 113 #endif
 114   { "ftp://", URLFTP, DEFAULT_FTP_PORT }
 115 };
 116
 117 static void parse_dir PARAMS ((const char *, char **, char **));
 118 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
 119 static char *construct_relative PARAMS ((const char *, const char *));
 120 static char process_ftp_type PARAMS ((char *));
 121
 122 \f
 123 /* Support for encoding and decoding of URL strings.  We determine
 124    whether a character is unsafe through static table lookup.  This
 125    code assumes ASCII character set and 8-bit chars.  */
 126
 127 enum {
 128   urlchr_reserved = 1,
 129   urlchr_unsafe   = 2
 130 };
 131
 132 #define R  urlchr_reserved
 133 #define U  urlchr_unsafe
 134 #define RU R|U
 135
 136 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
 137
 138 /* rfc1738 reserved chars.  We don't use this yet; preservation of
 139    reserved chars will be implemented when I integrate the new
 140    `reencode_string' function.  */
 141
 142 #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
 143
 144 /* Unsafe chars:
 145    - anything <= 32;
 146    - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
 147    - '@' and ':'; needed for encoding URL username and password.
 148    - anything >= 127. */
 149
 150 #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 151
 152 const static unsigned char urlchr_table[256] =
 153 {
 154   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 155   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 156   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 157   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 158   U,  0,  U,  U,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 159   0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 160   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 161   0,  0,  U,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 162  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 163   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 164   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 165   0,  0,  0,  U,   U,  U,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 166   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 167   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 168   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 169   0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */
 170
 171   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 172   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 173   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 174   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 175
 176   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 177   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 178   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 179   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 180 };
 181
 182 /* Decodes the forms %xy in a URL to the character the hexadecimal
 183    code of which is xy.  xy are hexadecimal digits from
 184    [0123456789ABCDEF] (case-insensitive).  If x or y are not
 185    hex-digits or `%' precedes `\0', the sequence is inserted
 186    literally.  */
 187
 188 static void
 189 decode_string (char *s)
 190 {
 191   char *t = s;                  /* t - tortoise */
 192   char *h = s;                  /* h - hare     */
 193
 194   for (; *h; h++, t++)
 195     {
 196       if (*h != '%')
 197         {
 198         copychar:
 199           *t = *h;
 200         }
 201       else
 202         {
 203           /* Do nothing if '%' is not followed by two hex digits. */
 204           if (!*(h + 1) || !*(h + 2)
 205               || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
 206             goto copychar;
 207           *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
 208           h += 2;
 209         }
 210     }
 211   *t = '\0';
 212 }
 213
 214 /* Like encode_string, but return S if there are no unsafe chars.  */
 215
 216 static char *
 217 encode_string_maybe (const char *s)
 218 {
 219   const char *p1;
 220   char *p2, *newstr;
 221   int newlen;
 222   int addition = 0;
 223
 224   for (p1 = s; *p1; p1++)
 225     if (UNSAFE_CHAR (*p1))
 226       addition += 2;            /* Two more characters (hex digits) */
 227
 228   if (!addition)
 229     return (char *)s;
 230
 231   newlen = (p1 - s) + addition;
 232   newstr = (char *)xmalloc (newlen + 1);
 233
 234   p1 = s;
 235   p2 = newstr;
 236   while (*p1)
 237     {
 238       if (UNSAFE_CHAR (*p1))
 239         {
 240           const unsigned char c = *p1++;
 241           *p2++ = '%';
 242           *p2++ = XDIGIT_TO_XCHAR (c >> 4);
 243           *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
 244         }
 245       else
 246         *p2++ = *p1++;
 247     }
 248   *p2 = '\0';
 249   assert (p2 - newstr == newlen);
 250
 251   return newstr;
 252 }
 253
 254 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
 255    given string, returning a malloc-ed %XX encoded string.  */
 256
 257 char *
 258 encode_string (const char *s)
 259 {
 260   char *encoded = encode_string_maybe (s);
 261   if (encoded != s)
 262     return encoded;
 263   else
 264     return xstrdup (s);
 265 }
 266
 267 /* Encode unsafe characters in PTR to %xx.  If such encoding is done,
 268    the old value of PTR is freed and PTR is made to point to the newly
 269    allocated storage.  */
 270
 271 #define ENCODE(ptr) do {                        \
 272   char *e_new = encode_string_maybe (ptr);      \
 273   if (e_new != ptr)                             \
 274     {                                           \
 275       xfree (ptr);                              \
 276       ptr = e_new;                              \
 277     }                                           \
 278 } while (0)
 279 \f
 280 /* Returns the protocol type if URL's protocol is supported, or
 281    URLUNKNOWN if not.  */
 282 uerr_t
 283 urlproto (const char *url)
 284 {
 285   int i;
 286
 287   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 288     if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
 289       return sup_protos[i].ind;
 290   for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
 291   if (url[i] == ':')
 292     {
 293       for (++i; url[i] && url[i] != '/'; i++)
 294         if (!ISDIGIT (url[i]))
 295           return URLBADPORT;
 296       if (url[i - 1] == ':')
 297         return URLFTP;
 298       else
 299         return URLHTTP;
 300     }
 301   else
 302     return URLHTTP;
 303 }
 304
 305 /* Skip the protocol part of the URL, e.g. `http://'.  If no protocol
 306    part is found, returns 0.  */
 307 int
 308 skip_proto (const char *url)
 309 {
 310   char **s;
 311   int l;
 312
 313   for (s = protostrings; *s; s++)
 314     if (!strncasecmp (*s, url, strlen (*s)))
 315       break;
 316   if (!*s)
 317     return 0;
 318   l = strlen (*s);
 319   /* HTTP and FTP protocols are expected to yield exact host names
 320      (i.e. the `//' part must be skipped, too).  */
 321   if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
 322     l += 2;
 323   return l;
 324 }
 325
 326 /* Returns 1 if the URL begins with a protocol (supported or
 327    unsupported), 0 otherwise.  */
 328 int
 329 has_proto (const char *url)
 330 {
 331   char **s;
 332
 333   for (s = protostrings; *s; s++)
 334     if (strncasecmp (url, *s, strlen (*s)) == 0)
 335       return 1;
 336   return 0;
 337 }
 338
 339 /* Skip the username and password, if present here.  The function
 340    should be called *not* with the complete URL, but with the part
 341    right after the protocol.
 342
 343    If no username and password are found, return 0.  */
 344 int
 345 skip_uname (const char *url)
 346 {
 347   const char *p;
 348   const char *q = NULL;
 349   for (p = url ; *p && *p != '/'; p++)
 350     if (*p == '@') q = p;
 351   /* If a `@' was found before the first occurrence of `/', skip
 352      it.  */
 353   if (q != NULL)
 354     return q - url + 1;
 355   else
 356     return 0;
 357 }
 358 \f
 359 /* Allocate a new urlinfo structure, fill it with default values and
 360    return a pointer to it.  */
 361 struct urlinfo *
 362 newurl (void)
 363 {
 364   struct urlinfo *u;
 365
 366   u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
 367   memset (u, 0, sizeof (*u));
 368   u->proto = URLUNKNOWN;
 369   return u;
 370 }
 371
 372 /* Perform a "deep" free of the urlinfo structure.  The structure
 373    should have been created with newurl, but need not have been used.
 374    If free_pointer is non-0, free the pointer itself.  */
 375 void
 376 freeurl (struct urlinfo *u, int complete)
 377 {
 378   assert (u != NULL);
 379   FREE_MAYBE (u->url);
 380   FREE_MAYBE (u->host);
 381   FREE_MAYBE (u->path);
 382   FREE_MAYBE (u->file);
 383   FREE_MAYBE (u->dir);
 384   FREE_MAYBE (u->user);
 385   FREE_MAYBE (u->passwd);
 386   FREE_MAYBE (u->local);
 387   FREE_MAYBE (u->referer);
 388   if (u->proxy)
 389     freeurl (u->proxy, 1);
 390   if (complete)
 391     xfree (u);
 392   return;
 393 }
 394 \f
 395 /* Extract the given URL of the form
 396    (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
 397    1. hostname (terminated with `/' or `:')
 398    2. port number (terminated with `/'), or chosen for the protocol
 399    3. dirname (everything after hostname)
 400    Most errors are handled.  No allocation is done, you must supply
 401    pointers to allocated memory.
 402    ...and a host of other stuff :-)
 403
 404    - Recognizes hostname:dir/file for FTP and
 405      hostname (:portnum)?/dir/file for HTTP.
 406    - Parses the path to yield directory and file
 407    - Parses the URL to yield the username and passwd (if present)
 408    - Decodes the strings, in case they contain "forbidden" characters
 409    - Writes the result to struct urlinfo
 410
 411    If the argument STRICT is set, it recognizes only the canonical
 412    form.  */
 413 uerr_t
 414 parseurl (const char *url, struct urlinfo *u, int strict)
 415 {
 416   int i, l, abs_ftp;
 417   int recognizable;            /* Recognizable URL is the one where
 418                                   the protocol name was explicitly
 419                                   named, i.e. it wasn't deduced from
 420                                   the URL format.  */
 421   uerr_t type;
 422
 423   DEBUGP (("parseurl (\"%s\") -> ", url));
 424   recognizable = has_proto (url);
 425   if (strict && !recognizable)
 426     return URLUNKNOWN;
 427   for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
 428     {
 429       l = strlen (sup_protos[i].name);
 430       if (!strncasecmp (sup_protos[i].name, url, l))
 431         break;
 432     }
 433   /* If protocol is recognizable, but unsupported, bail out, else
 434      suppose unknown.  */
 435   if (recognizable && i == ARRAY_SIZE (sup_protos))
 436     return URLUNKNOWN;
 437   else if (i == ARRAY_SIZE (sup_protos))
 438     type = URLUNKNOWN;
 439   else
 440     u->proto = type = sup_protos[i].ind;
 441
 442   if (type == URLUNKNOWN)
 443     l = 0;
 444   /* Allow a username and password to be specified (i.e. just skip
 445      them for now).  */
 446   if (recognizable)
 447     l += skip_uname (url + l);
 448   for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
 449   if (i == l)
 450     return URLBADHOST;
 451   /* Get the hostname.  */
 452   u->host = strdupdelim (url + l, url + i);
 453   DEBUGP (("host %s -> ", u->host));
 454
 455   /* Assume no port has been given.  */
 456   u->port = 0;
 457   if (url[i] == ':')
 458     {
 459       /* We have a colon delimiting the hostname.  It could mean that
 460          a port number is following it, or a directory.  */
 461       if (ISDIGIT (url[++i]))    /* A port number */
 462         {
 463           if (type == URLUNKNOWN)
 464             u->proto = type = URLHTTP;
 465           for (; url[i] && url[i] != '/'; i++)
 466             if (ISDIGIT (url[i]))
 467               u->port = 10 * u->port + (url[i] - '0');
 468             else
 469               return URLBADPORT;
 470           if (!u->port)
 471             return URLBADPORT;
 472           DEBUGP (("port %hu -> ", u->port));
 473         }
 474       else if (type == URLUNKNOWN) /* or a directory */
 475         u->proto = type = URLFTP;
 476       else                      /* or just a misformed port number */
 477         return URLBADPORT;
 478     }
 479   else if (type == URLUNKNOWN)
 480     u->proto = type = URLHTTP;
 481   if (!u->port)
 482     {
 483       int ind;
 484       for (ind = 0; ind < ARRAY_SIZE (sup_protos); ind++)
 485         if (sup_protos[ind].ind == type)
 486           break;
 487       if (ind == ARRAY_SIZE (sup_protos))
 488         return URLUNKNOWN;
 489       u->port = sup_protos[ind].port;
 490     }
 491   /* Some delimiter troubles...  */
 492   if (url[i] == '/' && url[i - 1] != ':')
 493     ++i;
 494   if (type == URLHTTP)
 495     while (url[i] && url[i] == '/')
 496       ++i;
 497   u->path = (char *)xmalloc (strlen (url + i) + 8);
 498   strcpy (u->path, url + i);
 499   if (type == URLFTP)
 500     {
 501       u->ftp_type = process_ftp_type (u->path);
 502       /* #### We don't handle type `d' correctly yet.  */
 503       if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
 504         u->ftp_type = 'I';
 505       DEBUGP (("ftp_type %c -> ", u->ftp_type));
 506     }
 507   DEBUGP (("opath %s -> ", u->path));
 508   /* Parse the username and password (if existing).  */
 509   parse_uname (url, &u->user, &u->passwd);
 510   /* Decode the strings, as per RFC 1738.  */
 511   decode_string (u->host);
 512   decode_string (u->path);
 513   if (u->user)
 514     decode_string (u->user);
 515   if (u->passwd)
 516     decode_string (u->passwd);
 517   /* Parse the directory.  */
 518   parse_dir (u->path, &u->dir, &u->file);
 519   DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
 520   /* Simplify the directory.  */
 521   path_simplify (u->dir);
 522   /* Remove the leading `/' in HTTP.  */
 523   if (type == URLHTTP && *u->dir == '/')
 524     strcpy (u->dir, u->dir + 1);
 525   DEBUGP (("ndir %s\n", u->dir));
 526   /* Strip trailing `/'.  */
 527   l = strlen (u->dir);
 528   if (l > 1 && u->dir[l - 1] == '/')
 529     u->dir[l - 1] = '\0';
 530   /* Re-create the path: */
 531   abs_ftp = (u->proto == URLFTP && *u->dir == '/');
 532   /*  sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
 533       abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
 534   strcpy (u->path, abs_ftp ? "%2F" : "/");
 535   strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
 536   strcat (u->path, *u->dir ? "/" : "");
 537   strcat (u->path, u->file);
 538   ENCODE (u->path);
 539   DEBUGP (("newpath: %s\n", u->path));
 540   /* Create the clean URL.  */
 541   u->url = str_url (u, 0);
 542   return URLOK;
 543 }
 544 \f
 545 /* Special versions of DOTP and DDOTP for parse_dir().  They work like
 546    DOTP and DDOTP, but they also recognize `?' as end-of-string
 547    delimiter.  This is needed for correct handling of query
 548    strings.  */
 549
 550 #define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
 551 #define PD_DDOTP(x) ((*(x) == '.') && (*(x) == '.')             \
 552                      && (!*((x) + 2) || *((x) + 2) == '?'))
 553
 554 /* Build the directory and filename components of the path.  Both
 555    components are *separately* malloc-ed strings!  It does not change
 556    the contents of path.
 557
 558    If the path ends with "." or "..", they are (correctly) counted as
 559    directories.  */
 560 static void
 561 parse_dir (const char *path, char **dir, char **file)
 562 {
 563   int i, l;
 564
 565   l = urlpath_length (path);
 566   for (i = l; i && path[i] != '/'; i--);
 567
 568   if (!i && *path != '/')   /* Just filename */
 569     {
 570       if (PD_DOTP (path) || PD_DDOTP (path))
 571         {
 572           *dir = strdupdelim (path, path + l);
 573           *file = xstrdup (path + l); /* normally empty, but could
 574                                          contain ?... */
 575         }
 576       else
 577         {
 578           *dir = xstrdup ("");     /* This is required because of FTP */
 579           *file = xstrdup (path);
 580         }
 581     }
 582   else if (!i)                 /* /filename */
 583     {
 584       if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
 585         {
 586           *dir = strdupdelim (path, path + l);
 587           *file = xstrdup (path + l); /* normally empty, but could
 588                                          contain ?... */
 589         }
 590       else
 591         {
 592           *dir = xstrdup ("/");
 593           *file = xstrdup (path + 1);
 594         }
 595     }
 596   else /* Nonempty directory with or without a filename */
 597     {
 598       if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
 599         {
 600           *dir = strdupdelim (path, path + l);
 601           *file = xstrdup (path + l); /* normally empty, but could
 602                                          contain ?... */
 603         }
 604       else
 605         {
 606           *dir = strdupdelim (path, path + i);
 607           *file = xstrdup (path + i + 1);
 608         }
 609     }
 610 }
 611
 612 /* Find the optional username and password within the URL, as per
 613    RFC1738.  The returned user and passwd char pointers are
 614    malloc-ed.  */
 615 static uerr_t
 616 parse_uname (const char *url, char **user, char **passwd)
 617 {
 618   int l;
 619   const char *p, *q, *col;
 620   char **where;
 621
 622   *user = NULL;
 623   *passwd = NULL;
 624
 625   /* Look for the end of the protocol string.  */
 626   l = skip_proto (url);
 627   if (!l)
 628     return URLUNKNOWN;
 629   /* Add protocol offset.  */
 630   url += l;
 631   /* Is there an `@' character?  */
 632   for (p = url; *p && *p != '/'; p++)
 633     if (*p == '@')
 634       break;
 635   /* If not, return.  */
 636   if (*p != '@')
 637     return URLOK;
 638   /* Else find the username and password.  */
 639   for (p = q = col = url; *p && *p != '/'; p++)
 640     {
 641       if (*p == ':' && !*user)
 642         {
 643           *user = (char *)xmalloc (p - url + 1);
 644           memcpy (*user, url, p - url);
 645           (*user)[p - url] = '\0';
 646           col = p + 1;
 647         }
 648       if (*p == '@') q = p;
 649     }
 650   /* Decide whether you have only the username or both.  */
 651   where = *user ? passwd : user;
 652   *where = (char *)xmalloc (q - col + 1);
 653   memcpy (*where, col, q - col);
 654   (*where)[q - col] = '\0';
 655   return URLOK;
 656 }
 657
 658 /* If PATH ends with `;type=X', return the character X.  */
 659 static char
 660 process_ftp_type (char *path)
 661 {
 662   int len = strlen (path);
 663
 664   if (len >= 7
 665       && !memcmp (path + len - 7, ";type=", 6))
 666     {
 667       path[len - 7] = '\0';
 668       return path[len - 1];
 669     }
 670   else
 671     return '\0';
 672 }
 673 \f
 674 /* Return the URL as fine-formed string, with a proper protocol, optional port
 675    number, directory and optional user/password.  If `hide' is non-zero (as it
 676    is when we're calling this on a URL we plan to print, but not when calling it
 677    to canonicalize a URL for use within the program), password will be hidden.
 678    The forbidden characters in the URL will be cleansed.  */
 679 char *
 680 str_url (const struct urlinfo *u, int hide)
 681 {
 682   char *res, *host, *user, *passwd, *proto_name, *dir, *file;
 683   int i, l, ln, lu, lh, lp, lf, ld;
 684   unsigned short proto_default_port;
 685
 686   /* Look for the protocol name.  */
 687   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 688     if (sup_protos[i].ind == u->proto)
 689       break;
 690   if (i == ARRAY_SIZE (sup_protos))
 691     return NULL;
 692   proto_name = sup_protos[i].name;
 693   proto_default_port = sup_protos[i].port;
 694   host = encode_string (u->host);
 695   dir = encode_string (u->dir);
 696   file = encode_string (u->file);
 697   user = passwd = NULL;
 698   if (u->user)
 699     user = encode_string (u->user);
 700   if (u->passwd)
 701     {
 702       if (hide)
 703         /* Don't output the password, or someone might see it over the user's
 704            shoulder (or in saved wget output).  Don't give away the number of
 705            characters in the password, either, as we did in past versions of
 706            this code, when we replaced the password characters with 'x's. */
 707         passwd = xstrdup("<password>");
 708       else
 709         passwd = encode_string (u->passwd);
 710     }
 711   if (u->proto == URLFTP && *dir == '/')
 712     {
 713       char *tmp = (char *)xmalloc (strlen (dir) + 3);
 714       /*sprintf (tmp, "%%2F%s", dir + 1);*/
 715       tmp[0] = '%';
 716       tmp[1] = '2';
 717       tmp[2] = 'F';
 718       strcpy (tmp + 3, dir + 1);
 719       xfree (dir);
 720       dir = tmp;
 721     }
 722
 723   ln = strlen (proto_name);
 724   lu = user ? strlen (user) : 0;
 725   lp = passwd ? strlen (passwd) : 0;
 726   lh = strlen (host);
 727   ld = strlen (dir);
 728   lf = strlen (file);
 729   res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
 730   /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
 731      (user ? user : ""), (passwd ? ":" : ""),
 732      (passwd ? passwd : ""), (user ? "@" : ""),
 733      host, u->port, dir, *dir ? "/" : "", file); */
 734   l = 0;
 735   memcpy (res, proto_name, ln);
 736   l += ln;
 737   if (user)
 738     {
 739       memcpy (res + l, user, lu);
 740       l += lu;
 741       if (passwd)
 742         {
 743           res[l++] = ':';
 744           memcpy (res + l, passwd, lp);
 745           l += lp;
 746         }
 747       res[l++] = '@';
 748     }
 749   memcpy (res + l, host, lh);
 750   l += lh;
 751   if (u->port != proto_default_port)
 752     {
 753       res[l++] = ':';
 754       long_to_string (res + l, (long)u->port);
 755       l += numdigit (u->port);
 756     }
 757   res[l++] = '/';
 758   memcpy (res + l, dir, ld);
 759   l += ld;
 760   if (*dir)
 761     res[l++] = '/';
 762   strcpy (res + l, file);
 763   xfree (host);
 764   xfree (dir);
 765   xfree (file);
 766   FREE_MAYBE (user);
 767   FREE_MAYBE (passwd);
 768   return res;
 769 }
 770
 771 /* Check whether two URL-s are equivalent, i.e. pointing to the same
 772    location.  Uses parseurl to parse them, and compares the canonical
 773    forms.
 774
 775    Returns 1 if the URL1 is equivalent to URL2, 0 otherwise.  Also
 776    return 0 on error.  */
 777 int
 778 url_equal (const char *url1, const char *url2)
 779 {
 780   struct urlinfo *u1, *u2;
 781   uerr_t err;
 782   int res;
 783
 784   u1 = newurl ();
 785   err = parseurl (url1, u1, 0);
 786   if (err != URLOK)
 787     {
 788       freeurl (u1, 1);
 789       return 0;
 790     }
 791   u2 = newurl ();
 792   err = parseurl (url2, u2, 0);
 793   if (err != URLOK)
 794     {
 795       freeurl (u2, 1);
 796       return 0;
 797     }
 798   res = !strcmp (u1->url, u2->url);
 799   freeurl (u1, 1);
 800   freeurl (u2, 1);
 801   return res;
 802 }
 803 \f
 804 urlpos *
 805 get_urls_file (const char *file)
 806 {
 807   struct file_memory *fm;
 808   urlpos *head, *tail;
 809   const char *text, *text_end;
 810
 811   /* Load the file.  */
 812   fm = read_file (file);
 813   if (!fm)
 814     {
 815       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 816       return NULL;
 817     }
 818   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
 819   head = tail = NULL;
 820   text = fm->content;
 821   text_end = fm->content + fm->length;
 822   while (text < text_end)
 823     {
 824       const char *line_beg = text;
 825       const char *line_end = memchr (text, '\n', text_end - text);
 826       if (!line_end)
 827         line_end = text_end;
 828       else
 829         ++line_end;
 830       text = line_end;
 831       while (line_beg < line_end
 832              && ISSPACE (*line_beg))
 833         ++line_beg;
 834       while (line_end > line_beg + 1
 835              && ISSPACE (*(line_end - 1)))
 836         --line_end;
 837       if (line_end > line_beg)
 838         {
 839           urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
 840           memset (entry, 0, sizeof (*entry));
 841           entry->next = NULL;
 842           entry->url = strdupdelim (line_beg, line_end);
 843           if (!head)
 844             head = entry;
 845           else
 846             tail->next = entry;
 847           tail = entry;
 848         }
 849     }
 850   read_file_free (fm);
 851   return head;
 852 }
 853 \f
 854 /* Free the linked list of urlpos.  */
 855 void
 856 free_urlpos (urlpos *l)
 857 {
 858   while (l)
 859     {
 860       urlpos *next = l->next;
 861       xfree (l->url);
 862       FREE_MAYBE (l->local_name);
 863       xfree (l);
 864       l = next;
 865     }
 866 }
 867
 868 /* Rotate FNAME opt.backups times */
 869 void
 870 rotate_backups(const char *fname)
 871 {
 872   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
 873   char *from = (char *)alloca (maxlen);
 874   char *to = (char *)alloca (maxlen);
 875   struct stat sb;
 876   int i;
 877
 878   if (stat (fname, &sb) == 0)
 879     if (S_ISREG (sb.st_mode) == 0)
 880       return;
 881
 882   for (i = opt.backups; i > 1; i--)
 883     {
 884       sprintf (from, "%s.%d", fname, i - 1);
 885       sprintf (to, "%s.%d", fname, i);
 886       /* #### This will fail on machines without the rename() system
 887          call.  */
 888       rename (from, to);
 889     }
 890
 891   sprintf (to, "%s.%d", fname, 1);
 892   rename(fname, to);
 893 }
 894
 895 /* Create all the necessary directories for PATH (a file).  Calls
 896    mkdirhier() internally.  */
 897 int
 898 mkalldirs (const char *path)
 899 {
 900   const char *p;
 901   char *t;
 902   struct stat st;
 903   int res;
 904
 905   p = path + strlen (path);
 906   for (; *p != '/' && p != path; p--);
 907   /* Don't create if it's just a file.  */
 908   if ((p == path) && (*p != '/'))
 909     return 0;
 910   t = strdupdelim (path, p);
 911   /* Check whether the directory exists.  */
 912   if ((stat (t, &st) == 0))
 913     {
 914       if (S_ISDIR (st.st_mode))
 915         {
 916           xfree (t);
 917           return 0;
 918         }
 919       else
 920         {
 921           /* If the dir exists as a file name, remove it first.  This
 922              is *only* for Wget to work with buggy old CERN http
 923              servers.  Here is the scenario: When Wget tries to
 924              retrieve a directory without a slash, e.g.
 925              http://foo/bar (bar being a directory), CERN server will
 926              not redirect it too http://foo/bar/ -- it will generate a
 927              directory listing containing links to bar/file1,
 928              bar/file2, etc.  Wget will lose because it saves this
 929              HTML listing to a file `bar', so it cannot create the
 930              directory.  To work around this, if the file of the same
 931              name exists, we just remove it and create the directory
 932              anyway.  */
 933           DEBUGP (("Removing %s because of directory danger!\n", t));
 934           unlink (t);
 935         }
 936     }
 937   res = make_directory (t);
 938   if (res != 0)
 939     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
 940   xfree (t);
 941   return res;
 942 }
 943
 944 static int
 945 count_slashes (const char *s)
 946 {
 947   int i = 0;
 948   while (*s)
 949     if (*s++ == '/')
 950       ++i;
 951   return i;
 952 }
 953
 954 /* Return the path name of the URL-equivalent file name, with a
 955    remote-like structure of directories.  */
 956 static char *
 957 mkstruct (const struct urlinfo *u)
 958 {
 959   char *host, *dir, *file, *res, *dirpref;
 960   int l;
 961
 962   assert (u->dir != NULL);
 963   assert (u->host != NULL);
 964
 965   if (opt.cut_dirs)
 966     {
 967       char *ptr = u->dir + (*u->dir == '/');
 968       int slash_count = 1 + count_slashes (ptr);
 969       int cut = MINVAL (opt.cut_dirs, slash_count);
 970       for (; cut && *ptr; ptr++)
 971         if (*ptr == '/')
 972           --cut;
 973       STRDUP_ALLOCA (dir, ptr);
 974     }
 975   else
 976     dir = u->dir + (*u->dir == '/');
 977
 978   host = xstrdup (u->host);
 979   /* Check for the true name (or at least a consistent name for saving
 980      to directory) of HOST, reusing the hlist if possible.  */
 981   if (opt.add_hostdir && !opt.simple_check)
 982     {
 983       char *nhost = realhost (host);
 984       xfree (host);
 985       host = nhost;
 986     }
 987   /* Add dir_prefix and hostname (if required) to the beginning of
 988      dir.  */
 989   if (opt.add_hostdir)
 990     {
 991       if (!DOTP (opt.dir_prefix))
 992         {
 993           dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
 994                                     + strlen (host) + 1);
 995           sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
 996         }
 997       else
 998         STRDUP_ALLOCA (dirpref, host);
 999     }
1000   else                         /* not add_hostdir */
1001     {
1002       if (!DOTP (opt.dir_prefix))
1003         dirpref = opt.dir_prefix;
1004       else
1005         dirpref = "";
1006     }
1007   xfree (host);
1008
1009   /* If there is a prefix, prepend it.  */
1010   if (*dirpref)
1011     {
1012       char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1013       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1014       dir = newdir;
1015     }
1016   dir = encode_string (dir);
1017   l = strlen (dir);
1018   if (l && dir[l - 1] == '/')
1019     dir[l - 1] = '\0';
1020
1021   if (!*u->file)
1022     file = "index.html";
1023   else
1024     file = u->file;
1025
1026   /* Finally, construct the full name.  */
1027   res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1028   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1029   xfree (dir);
1030   return res;
1031 }
1032
1033 /* Create a unique filename, corresponding to a given URL.  Calls
1034    mkstruct if necessary.  Does *not* actually create any directories.  */
1035 char *
1036 url_filename (const struct urlinfo *u)
1037 {
1038   char *file, *name;
1039   int have_prefix = 0;          /* whether we must prepend opt.dir_prefix */
1040
1041   if (opt.dirstruct)
1042     {
1043       file = mkstruct (u);
1044       have_prefix = 1;
1045     }
1046   else
1047     {
1048       if (!*u->file)
1049         file = xstrdup ("index.html");
1050       else
1051         file = xstrdup (u->file);
1052     }
1053
1054   if (!have_prefix)
1055     {
1056       /* Check whether the prefix directory is something other than "."
1057          before prepending it.  */
1058       if (!DOTP (opt.dir_prefix))
1059         {
1060           char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1061                                          + 1 + strlen (file) + 1);
1062           sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1063           xfree (file);
1064           file = nfile;
1065         }
1066     }
1067   /* DOS-ish file systems don't like `%' signs in them; we change it
1068      to `@'.  */
1069 #ifdef WINDOWS
1070   {
1071     char *p = file;
1072     for (p = file; *p; p++)
1073       if (*p == '%')
1074         *p = '@';
1075   }
1076 #endif /* WINDOWS */
1077
1078   /* Check the cases in which the unique extensions are not used:
1079      1) Clobbering is turned off (-nc).
1080      2) Retrieval with regetting.
1081      3) Timestamping is used.
1082      4) Hierarchy is built.
1083
1084      The exception is the case when file does exist and is a
1085      directory (actually support for bad httpd-s).  */
1086   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1087       && !(file_exists_p (file) && !file_non_directory_p (file)))
1088     return file;
1089
1090   /* Find a unique name.  */
1091   name = unique_name (file);
1092   xfree (file);
1093   return name;
1094 }
1095
1096 /* Like strlen(), but allow the URL to be ended with '?'.  */
1097 static int
1098 urlpath_length (const char *url)
1099 {
1100   const char *q = strchr (url, '?');
1101   if (q)
1102     return q - url;
1103   return strlen (url);
1104 }
1105
1106 /* Find the last occurrence of character C in the range [b, e), or
1107    NULL, if none are present.  This is almost completely equivalent to
1108    { *e = '\0'; return strrchr(b); }, except that it doesn't change
1109    the contents of the string.  */
1110 static const char *
1111 find_last_char (const char *b, const char *e, char c)
1112 {
1113   for (; e > b; e--)
1114     if (*e == c)
1115       return e;
1116   return NULL;
1117 }
1118
1119 /* Resolve the result of "linking" a base URI (BASE) to a
1120    link-specified URI (LINK).
1121
1122    Either of the URIs may be absolute or relative, complete with the
1123    host name, or path only.  This tries to behave "reasonably" in all
1124    foreseeable cases.  It employs little specific knowledge about
1125    protocols or URL-specific stuff -- it just works on strings.
1126
1127    The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1128    See uri_merge for a gentler interface to this functionality.
1129
1130    #### This function should handle `./' and `../' so that the evil
1131    path_simplify can go.  */
1132 static char *
1133 uri_merge_1 (const char *base, const char *link, int linklength, int no_proto)
1134 {
1135   char *constr;
1136
1137   if (no_proto)
1138     {
1139       const char *end = base + urlpath_length (base);
1140
1141       if (*link != '/')
1142         {
1143           /* LINK is a relative URL: we need to replace everything
1144              after last slash (possibly empty) with LINK.
1145
1146              So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1147              our result should be "whatever/foo/qux/xyzzy".  */
1148           int need_explicit_slash = 0;
1149           int span;
1150           const char *start_insert;
1151           const char *last_slash = find_last_char (base, end, '/');
1152           if (!last_slash)
1153             {
1154               /* No slash found at all.  Append LINK to what we have,
1155                  but we'll need a slash as a separator.
1156
1157                  Example: if base == "foo" and link == "qux/xyzzy", then
1158                  we cannot just append link to base, because we'd get
1159                  "fooqux/xyzzy", whereas what we want is
1160                  "foo/qux/xyzzy".
1161
1162                  To make sure the / gets inserted, we set
1163                  need_explicit_slash to 1.  We also set start_insert
1164                  to end + 1, so that the length calculations work out
1165                  correctly for one more (slash) character.  Accessing
1166                  that character is fine, since it will be the
1167                  delimiter, '\0' or '?'.  */
1168               /* example: "foo?..." */
1169               /*               ^    ('?' gets changed to '/') */
1170               start_insert = end + 1;
1171               need_explicit_slash = 1;
1172             }
1173           else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1174             {
1175               /* example: http://host"  */
1176               /*                      ^ */
1177               start_insert = end + 1;
1178               need_explicit_slash = 1;
1179             }
1180           else
1181             {
1182               /* example: "whatever/foo/bar" */
1183               /*                        ^    */
1184               start_insert = last_slash + 1;
1185             }
1186
1187           span = start_insert - base;
1188           constr = (char *)xmalloc (span + linklength + 1);
1189           if (span)
1190             memcpy (constr, base, span);
1191           if (need_explicit_slash)
1192             constr[span - 1] = '/';
1193           if (linklength)
1194             memcpy (constr + span, link, linklength);
1195           constr[span + linklength] = '\0';
1196         }
1197       else /* *link == `/' */
1198         {
1199           /* LINK is an absolute path: we need to replace everything
1200              after (and including) the FIRST slash with LINK.
1201
1202              So, if BASE is "http://host/whatever/foo/bar", and LINK is
1203              "/qux/xyzzy", our result should be
1204              "http://host/qux/xyzzy".  */
1205           int span;
1206           const char *slash;
1207           const char *start_insert = NULL; /* for gcc to shut up. */
1208           const char *pos = base;
1209           int seen_slash_slash = 0;
1210           /* We're looking for the first slash, but want to ignore
1211              double slash. */
1212         again:
1213           slash = memchr (pos, '/', end - pos);
1214           if (slash && !seen_slash_slash)
1215             if (*(slash + 1) == '/')
1216               {
1217                 pos = slash + 2;
1218                 seen_slash_slash = 1;
1219                 goto again;
1220               }
1221
1222           /* At this point, SLASH is the location of the first / after
1223              "//", or the first slash altogether.  START_INSERT is the
1224              pointer to the location where LINK will be inserted.  When
1225              examining the last two examples, keep in mind that LINK
1226              begins with '/'. */
1227
1228           if (!slash && !seen_slash_slash)
1229             /* example: "foo" */
1230             /*           ^    */
1231             start_insert = base;
1232           else if (!slash && seen_slash_slash)
1233             /* example: "http://foo" */
1234             /*                     ^ */
1235             start_insert = end;
1236           else if (slash && !seen_slash_slash)
1237             /* example: "foo/bar" */
1238             /*           ^        */
1239             start_insert = base;
1240           else if (slash && seen_slash_slash)
1241             /* example: "http://something/" */
1242             /*                           ^  */
1243             start_insert = slash;
1244
1245           span = start_insert - base;
1246           constr = (char *)xmalloc (span + linklength + 1);
1247           if (span)
1248             memcpy (constr, base, span);
1249           if (linklength)
1250             memcpy (constr + span, link, linklength);
1251           constr[span + linklength] = '\0';
1252         }
1253     }
1254   else /* !no_proto */
1255     {
1256       constr = strdupdelim (link, link + linklength);
1257     }
1258   return constr;
1259 }
1260
1261 /* Merge BASE with LINK and return the resulting URI.  This is an
1262    interface to uri_merge_1 that assumes that LINK is a
1263    zero-terminated string.  */
1264 char *
1265 uri_merge (const char *base, const char *link)
1266 {
1267   return uri_merge_1 (base, link, strlen (link), !has_proto (link));
1268 }
1269 \f
1270 /* Optimize URL by host, destructively replacing u->host with realhost
1271    (u->host).  Do this regardless of opt.simple_check.  */
1272 void
1273 opt_url (struct urlinfo *u)
1274 {
1275   /* Find the "true" host.  */
1276   char *host = realhost (u->host);
1277   xfree (u->host);
1278   u->host = host;
1279   assert (u->dir != NULL);      /* the URL must have been parsed */
1280   /* Refresh the printed representation.  */
1281   xfree (u->url);
1282   u->url = str_url (u, 0);
1283 }
1284 \f
1285 /* Returns proxy host address, in accordance with PROTO.  */
1286 char *
1287 getproxy (uerr_t proto)
1288 {
1289   char *proxy;
1290
1291   if (proto == URLHTTP)
1292     proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1293   else if (proto == URLFTP)
1294     proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1295 #ifdef HAVE_SSL
1296   else if (proto == URLHTTPS)
1297     proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1298 #endif /* HAVE_SSL */
1299   else
1300     proxy = NULL;
1301   if (!proxy || !*proxy)
1302     return NULL;
1303   return proxy;
1304 }
1305
1306 /* Should a host be accessed through proxy, concerning no_proxy?  */
1307 int
1308 no_proxy_match (const char *host, const char **no_proxy)
1309 {
1310   if (!no_proxy)
1311     return 1;
1312   else
1313     return !sufmatch (no_proxy, host);
1314 }
1315 \f
1316 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1317 static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
1318
1319 /* Change the links in an HTML document.  Accepts a structure that
1320    defines the positions of all the links.  */
1321 void
1322 convert_links (const char *file, urlpos *l)
1323 {
1324   struct file_memory *fm;
1325   FILE               *fp;
1326   const char         *p;
1327   downloaded_file_t  downloaded_file_return;
1328
1329   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1330
1331   {
1332     /* First we do a "dry run": go through the list L and see whether
1333        any URL needs to be converted in the first place.  If not, just
1334        leave the file alone.  */
1335     int count = 0;
1336     urlpos *dry = l;
1337     for (dry = l; dry; dry = dry->next)
1338       if (dry->convert != CO_NOCONVERT)
1339         ++count;
1340     if (!count)
1341       {
1342         logputs (LOG_VERBOSE, _("nothing to do.\n"));
1343         return;
1344       }
1345   }
1346
1347   fm = read_file (file);
1348   if (!fm)
1349     {
1350       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1351                  file, strerror (errno));
1352       return;
1353     }
1354
1355   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1356   if (opt.backup_converted && downloaded_file_return)
1357     write_backup_file (file, downloaded_file_return);
1358
1359   /* Before opening the file for writing, unlink the file.  This is
1360      important if the data in FM is mmaped.  In such case, nulling the
1361      file, which is what fopen() below does, would make us read all
1362      zeroes from the mmaped region.  */
1363   if (unlink (file) < 0 && errno != ENOENT)
1364     {
1365       logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1366                  file, strerror (errno));
1367       read_file_free (fm);
1368       return;
1369     }
1370   /* Now open the file for writing.  */
1371   fp = fopen (file, "wb");
1372   if (!fp)
1373     {
1374       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1375                  file, strerror (errno));
1376       read_file_free (fm);
1377       return;
1378     }
1379   /* Here we loop through all the URLs in file, replacing those of
1380      them that are downloaded with relative references.  */
1381   p = fm->content;
1382   for (; l; l = l->next)
1383     {
1384       char *url_start = fm->content + l->pos;
1385
1386       if (l->pos >= fm->length)
1387         {
1388           DEBUGP (("Something strange is going on.  Please investigate."));
1389           break;
1390         }
1391       /* If the URL is not to be converted, skip it.  */
1392       if (l->convert == CO_NOCONVERT)
1393         {
1394           DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
1395           continue;
1396         }
1397
1398       /* Echo the file contents, up to the offending URL's opening
1399          quote, to the outfile.  */
1400       fwrite (p, 1, url_start - p, fp);
1401       p = url_start;
1402       if (l->convert == CO_CONVERT_TO_RELATIVE)
1403         {
1404           /* Convert absolute URL to relative. */
1405           char *newname = construct_relative (file, l->local_name);
1406           char *quoted_newname = html_quote_string (newname);
1407           replace_attr (&p, l->size, fp, quoted_newname);
1408           DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1409                    l->url, newname, l->pos, file));
1410           xfree (newname);
1411           xfree (quoted_newname);
1412         }
1413       else if (l->convert == CO_CONVERT_TO_COMPLETE)
1414         {
1415           /* Convert the link to absolute URL. */
1416           char *newlink = l->url;
1417           char *quoted_newlink = html_quote_string (newlink);
1418           replace_attr (&p, l->size, fp, quoted_newlink);
1419           DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1420                    newlink, l->pos, file));
1421           xfree (quoted_newlink);
1422         }
1423     }
1424   /* Output the rest of the file. */
1425   if (p - fm->content < fm->length)
1426     fwrite (p, 1, fm->length - (p - fm->content), fp);
1427   fclose (fp);
1428   read_file_free (fm);
1429   logputs (LOG_VERBOSE, _("done.\n"));
1430 }
1431
1432 /* Construct and return a malloced copy of the relative link from two
1433    pieces of information: local name S1 of the referring file and
1434    local name S2 of the referred file.
1435
1436    So, if S1 is "jagor.srce.hr/index.html" and S2 is
1437    "jagor.srce.hr/images/news.gif", the function will return
1438    "images/news.gif".
1439
1440    Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1441    "fly.cc.fer.hr/images/fly.gif", the function will return
1442    "../images/fly.gif".
1443
1444    Caveats: S1 should not begin with `/', unless S2 also begins with
1445    '/'.  S1 should not contain things like ".." and such --
1446    construct_relative ("fly/ioccc/../index.html",
1447    "fly/images/fly.gif") will fail.  (A workaround is to call
1448    something like path_simplify() on S1).  */
1449 static char *
1450 construct_relative (const char *s1, const char *s2)
1451 {
1452   int i, cnt, sepdirs1;
1453   char *res;
1454
1455   if (*s2 == '/')
1456     return xstrdup (s2);
1457   /* S1 should *not* be absolute, if S2 wasn't.  */
1458   assert (*s1 != '/');
1459   i = cnt = 0;
1460   /* Skip the directories common to both strings.  */
1461   while (1)
1462     {
1463       while (s1[i] && s2[i]
1464              && (s1[i] == s2[i])
1465              && (s1[i] != '/')
1466              && (s2[i] != '/'))
1467         ++i;
1468       if (s1[i] == '/' && s2[i] == '/')
1469         cnt = ++i;
1470       else
1471         break;
1472     }
1473   for (sepdirs1 = 0; s1[i]; i++)
1474     if (s1[i] == '/')
1475       ++sepdirs1;
1476   /* Now, construct the file as of:
1477      - ../ repeated sepdirs1 time
1478      - all the non-mutual directories of S2.  */
1479   res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1480   for (i = 0; i < sepdirs1; i++)
1481     memcpy (res + 3 * i, "../", 3);
1482   strcpy (res + 3 * i, s2 + cnt);
1483   return res;
1484 }
1485 \f
1486 /* Add URL to the head of the list L.  */
1487 urlpos *
1488 add_url (urlpos *l, const char *url, const char *file)
1489 {
1490   urlpos *t;
1491
1492   t = (urlpos *)xmalloc (sizeof (urlpos));
1493   memset (t, 0, sizeof (*t));
1494   t->url = xstrdup (url);
1495   t->local_name = xstrdup (file);
1496   t->next = l;
1497   return t;
1498 }
1499
1500 static void
1501 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1502 {
1503   /* Rather than just writing over the original .html file with the
1504      converted version, save the former to *.orig.  Note we only do
1505      this for files we've _successfully_ downloaded, so we don't
1506      clobber .orig files sitting around from previous invocations. */
1507
1508   /* Construct the backup filename as the original name plus ".orig". */
1509   size_t         filename_len = strlen(file);
1510   char*          filename_plus_orig_suffix;
1511   boolean        already_wrote_backup_file = FALSE;
1512   slist*         converted_file_ptr;
1513   static slist*  converted_files = NULL;
1514
1515   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1516     {
1517       /* Just write "orig" over "html".  We need to do it this way
1518          because when we're checking to see if we've downloaded the
1519          file before (to see if we can skip downloading it), we don't
1520          know if it's a text/html file.  Therefore we don't know yet
1521          at that stage that -E is going to cause us to tack on
1522          ".html", so we need to compare vs. the original URL plus
1523          ".orig", not the original URL plus ".html.orig". */
1524       filename_plus_orig_suffix = alloca (filename_len + 1);
1525       strcpy(filename_plus_orig_suffix, file);
1526       strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1527     }
1528   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1529     {
1530       /* Append ".orig" to the name. */
1531       filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1532       strcpy(filename_plus_orig_suffix, file);
1533       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1534     }
1535
1536   /* We can get called twice on the same URL thanks to the
1537      convert_all_links() call in main().  If we write the .orig file
1538      each time in such a case, it'll end up containing the first-pass
1539      conversion, not the original file.  So, see if we've already been
1540      called on this file. */
1541   converted_file_ptr = converted_files;
1542   while (converted_file_ptr != NULL)
1543     if (strcmp(converted_file_ptr->string, file) == 0)
1544       {
1545         already_wrote_backup_file = TRUE;
1546         break;
1547       }
1548     else
1549       converted_file_ptr = converted_file_ptr->next;
1550
1551   if (!already_wrote_backup_file)
1552     {
1553       /* Rename <file> to <file>.orig before former gets written over. */
1554       if (rename(file, filename_plus_orig_suffix) != 0)
1555         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1556                    file, filename_plus_orig_suffix, strerror (errno));
1557
1558       /* Remember that we've already written a .orig backup for this file.
1559          Note that we never free this memory since we need it till the
1560          convert_all_links() call, which is one of the last things the
1561          program does before terminating.  BTW, I'm not sure if it would be
1562          safe to just set 'converted_file_ptr->string' to 'file' below,
1563          rather than making a copy of the string...  Another note is that I
1564          thought I could just add a field to the urlpos structure saying
1565          that we'd written a .orig file for this URL, but that didn't work,
1566          so I had to make this separate list.
1567          -- Dan Harkless <wget@harkless.org>
1568
1569          This [adding a field to the urlpos structure] didn't work
1570          because convert_file() is called twice: once after all its
1571          sublinks have been retrieved in recursive_retrieve(), and
1572          once at the end of the day in convert_all_links().  The
1573          original linked list collected in recursive_retrieve() is
1574          lost after the first invocation of convert_links(), and
1575          convert_all_links() makes a new one (it calls get_urls_html()
1576          for each file it covers.)  That's why your first approach didn't
1577          work.  The way to make it work is perhaps to make this flag a
1578          field in the `urls_html' list.
1579          -- Hrvoje Niksic <hniksic@arsdigita.com>
1580       */
1581       converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1582       converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
1583       converted_file_ptr->next = converted_files;
1584       converted_files = converted_file_ptr;
1585     }
1586 }
1587
1588 static int find_fragment PARAMS ((const char *, int, const char **,
1589                                   const char **));
1590
1591 static void
1592 replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
1593 {
1594   const char *p = *pp;
1595   int quote_flag = 0;
1596   int size = raw_size;
1597   char quote_char = '\"';
1598   const char *frag_beg, *frag_end;
1599
1600   /* Structure of our string is:
1601        "...old-contents..."
1602        <---  l->size   --->  (with quotes)
1603      OR:
1604        ...old-contents...
1605        <---  l->size  -->    (no quotes)   */
1606
1607   if (*p == '\"' || *p == '\'')
1608     {
1609       quote_char = *p;
1610       quote_flag = 1;
1611       ++p;
1612       size -= 2;                /* disregard opening and closing quote */
1613     }
1614   putc (quote_char, fp);
1615   fputs (new_str, fp);
1616
1617   /* Look for fragment identifier, if any. */
1618   if (find_fragment (p, size, &frag_beg, &frag_end))
1619     fwrite (frag_beg, 1, frag_end - frag_beg, fp);
1620   p += size;
1621   if (quote_flag)
1622     ++p;
1623   putc (quote_char, fp);
1624   *pp = p;
1625 }
1626
1627 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
1628    preceded by '&'.  If the character is not found, return zero.  If
1629    the character is found, return 1 and set BP and EP to point to the
1630    beginning and end of the region.
1631
1632    This is used for finding the fragment indentifiers in URLs.  */
1633
1634 static int
1635 find_fragment (const char *beg, int size, const char **bp, const char **ep)
1636 {
1637   const char *end = beg + size;
1638   int saw_amp = 0;
1639   for (; beg < end; beg++)
1640     {
1641       switch (*beg)
1642         {
1643         case '&':
1644           saw_amp = 1;
1645           break;
1646         case '#':
1647           if (!saw_amp)
1648             {
1649               *bp = beg;
1650               *ep = end;
1651               return 1;
1652             }
1653           /* fallthrough */
1654         default:
1655           saw_amp = 0;
1656         }
1657     }
1658   return 0;
1659 }
1660
1661 typedef struct _downloaded_file_list {
1662   char*                          file;
1663   downloaded_file_t              download_type;
1664   struct _downloaded_file_list*  next;
1665 } downloaded_file_list;
1666
1667 static downloaded_file_list *downloaded_files;
1668
1669 /* Remembers which files have been downloaded.  In the standard case, should be
1670    called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1671    download successfully (i.e. not for ones we have failures on or that we skip
1672    due to -N).
1673
1674    When we've downloaded a file and tacked on a ".html" extension due to -E,
1675    call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1676    FILE_DOWNLOADED_NORMALLY.
1677
1678    If you just want to check if a file has been previously added without adding
1679    it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
1680    with local filenames, not remote URLs. */
1681 downloaded_file_t
1682 downloaded_file (downloaded_file_t  mode, const char*  file)
1683 {
1684   boolean                       found_file = FALSE;
1685   downloaded_file_list*         rover = downloaded_files;
1686
1687   while (rover != NULL)
1688     if (strcmp(rover->file, file) == 0)
1689       {
1690         found_file = TRUE;
1691         break;
1692       }
1693     else
1694       rover = rover->next;
1695
1696   if (found_file)
1697     return rover->download_type;  /* file had already been downloaded */
1698   else
1699     {
1700       if (mode != CHECK_FOR_FILE)
1701         {
1702           rover = xmalloc(sizeof(*rover));
1703           rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1704           rover->download_type = mode;
1705           rover->next = downloaded_files;
1706           downloaded_files = rover;
1707         }
1708
1709       return FILE_NOT_ALREADY_DOWNLOADED;
1710     }
1711 }
1712
1713 void
1714 downloaded_files_free (void)
1715 {
1716   downloaded_file_list*         rover = downloaded_files;
1717   while (rover)
1718     {
1719       downloaded_file_list *next = rover->next;
1720       xfree (rover->file);
1721       xfree (rover);
1722       rover = next;
1723     }
1724 }