sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif
  29 #include <sys/types.h>
  30 #ifdef HAVE_UNISTD_H
  31 # include <unistd.h>
  32 #endif
  33 #include <errno.h>
  34 #include <assert.h>
  35
  36 #include "wget.h"
  37 #include "utils.h"
  38 #include "url.h"
  39 #include "host.h"
  40
  41 #ifndef errno
  42 extern int errno;
  43 #endif
  44
  45 /* Is X "."?  */
  46 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
  47 /* Is X ".."?  */
  48 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  49
  50 static int urlpath_length PARAMS ((const char *));
  51
  52 /* A NULL-terminated list of strings to be recognized as protocol
  53    types (URL schemes).  Note that recognized doesn't mean supported
  54    -- only HTTP, HTTPS and FTP are currently supported.
  55
  56    However, a string that does not match anything in the list will be
  57    considered a relative URL.  Thus it's important that this list has
  58    anything anyone could think of being legal.
  59
  60    #### This is probably broken.  Wget should use other means to
  61    distinguish between absolute and relative URIs in HTML links.
  62
  63    Take a look at <http://www.w3.org/pub/WWW/Addressing/schemes.html>
  64    for more.  */
  65 static char *protostrings[] =
  66 {
  67   "cid:",
  68   "clsid:",
  69   "file:",
  70   "finger:",
  71   "ftp:",
  72   "gopher:",
  73   "hdl:",
  74   "http:",
  75   "https:",
  76   "ilu:",
  77   "ior:",
  78   "irc:",
  79   "java:",
  80   "javascript:",
  81   "lifn:",
  82   "mailto:",
  83   "mid:",
  84   "news:",
  85   "nntp:",
  86   "path:",
  87   "prospero:",
  88   "rlogin:",
  89   "service:",
  90   "shttp:",
  91   "snews:",
  92   "stanf:",
  93   "telnet:",
  94   "tn3270:",
  95   "wais:",
  96   "whois++:",
  97   NULL
  98 };
  99
 100 struct proto
 101 {
 102   char *name;
 103   uerr_t ind;
 104   unsigned short port;
 105 };
 106
 107 /* Similar to former, but for supported protocols: */
 108 static struct proto sup_protos[] =
 109 {
 110   { "http://", URLHTTP, DEFAULT_HTTP_PORT },
 111 #ifdef HAVE_SSL
 112   { "https://",URLHTTPS, DEFAULT_HTTPS_PORT},
 113 #endif
 114   { "ftp://", URLFTP, DEFAULT_FTP_PORT }
 115 };
 116
 117 static void parse_dir PARAMS ((const char *, char **, char **));
 118 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
 119 static char *construct_relative PARAMS ((const char *, const char *));
 120 static char process_ftp_type PARAMS ((char *));
 121
 122 \f
 123 /* Support for encoding and decoding of URL strings.  We determine
 124    whether a character is unsafe through static table lookup.  This
 125    code assumes ASCII character set and 8-bit chars.  */
 126
 127 enum {
 128   urlchr_reserved = 1,
 129   urlchr_unsafe   = 2
 130 };
 131
 132 #define R  urlchr_reserved
 133 #define U  urlchr_unsafe
 134 #define RU R|U
 135
 136 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
 137
 138 /* rfc1738 reserved chars.  We don't use this yet; preservation of
 139    reserved chars will be implemented when I integrate the new
 140    `reencode_string' function.  */
 141
 142 #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
 143
 144 /* Unsafe chars:
 145    - anything <= 32;
 146    - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
 147    - '@' and ':'; needed for encoding URL username and password.
 148    - anything >= 127. */
 149
 150 #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
 151
 152 const static unsigned char urlchr_table[256] =
 153 {
 154   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 155   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
 156   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
 157   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
 158   U,  0,  U,  U,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
 159   0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
 160   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
 161   0,  0,  U,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 162  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
 163   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
 164   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
 165   0,  0,  0,  U,   U,  U,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
 166   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
 167   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
 168   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
 169   0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */
 170
 171   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 172   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 173   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 174   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 175
 176   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 177   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 178   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 179   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
 180 };
 181
 182 /* Decodes the forms %xy in a URL to the character the hexadecimal
 183    code of which is xy.  xy are hexadecimal digits from
 184    [0123456789ABCDEF] (case-insensitive).  If x or y are not
 185    hex-digits or `%' precedes `\0', the sequence is inserted
 186    literally.  */
 187
 188 static void
 189 decode_string (char *s)
 190 {
 191   char *t = s;                  /* t - tortoise */
 192   char *h = s;                  /* h - hare     */
 193
 194   for (; *h; h++, t++)
 195     {
 196       if (*h != '%')
 197         {
 198         copychar:
 199           *t = *h;
 200         }
 201       else
 202         {
 203           /* Do nothing if '%' is not followed by two hex digits. */
 204           if (!*(h + 1) || !*(h + 2)
 205               || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
 206             goto copychar;
 207           *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
 208           h += 2;
 209         }
 210     }
 211   *t = '\0';
 212 }
 213
 214 /* Like encode_string, but return S if there are no unsafe chars.  */
 215
 216 static char *
 217 encode_string_maybe (const char *s)
 218 {
 219   const char *p1;
 220   char *p2, *newstr;
 221   int newlen;
 222   int addition = 0;
 223
 224   for (p1 = s; *p1; p1++)
 225     if (UNSAFE_CHAR (*p1))
 226       addition += 2;            /* Two more characters (hex digits) */
 227
 228   if (!addition)
 229     return (char *)s;
 230
 231   newlen = (p1 - s) + addition;
 232   newstr = (char *)xmalloc (newlen + 1);
 233
 234   p1 = s;
 235   p2 = newstr;
 236   while (*p1)
 237     {
 238       if (UNSAFE_CHAR (*p1))
 239         {
 240           const unsigned char c = *p1++;
 241           *p2++ = '%';
 242           *p2++ = XDIGIT_TO_XCHAR (c >> 4);
 243           *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
 244         }
 245       else
 246         *p2++ = *p1++;
 247     }
 248   *p2 = '\0';
 249   assert (p2 - newstr == newlen);
 250
 251   return newstr;
 252 }
 253
 254 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
 255    given string, returning a malloc-ed %XX encoded string.  */
 256
 257 char *
 258 encode_string (const char *s)
 259 {
 260   char *encoded = encode_string_maybe (s);
 261   if (encoded != s)
 262     return encoded;
 263   else
 264     return xstrdup (s);
 265 }
 266
 267 /* Encode unsafe characters in PTR to %xx.  If such encoding is done,
 268    the old value of PTR is freed and PTR is made to point to the newly
 269    allocated storage.  */
 270
 271 #define ENCODE(ptr) do {                        \
 272   char *e_new = encode_string_maybe (ptr);      \
 273   if (e_new != ptr)                             \
 274     {                                           \
 275       xfree (ptr);                              \
 276       ptr = e_new;                              \
 277     }                                           \
 278 } while (0)
 279 \f
 280 /* Returns the protocol type if URL's protocol is supported, or
 281    URLUNKNOWN if not.  */
 282 uerr_t
 283 urlproto (const char *url)
 284 {
 285   int i;
 286
 287   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 288     if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
 289       return sup_protos[i].ind;
 290   for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
 291   if (url[i] == ':')
 292     {
 293       for (++i; url[i] && url[i] != '/'; i++)
 294         if (!ISDIGIT (url[i]))
 295           return URLBADPORT;
 296       if (url[i - 1] == ':')
 297         return URLFTP;
 298       else
 299         return URLHTTP;
 300     }
 301   else
 302     return URLHTTP;
 303 }
 304
 305 /* Skip the protocol part of the URL, e.g. `http://'.  If no protocol
 306    part is found, returns 0.  */
 307 int
 308 skip_proto (const char *url)
 309 {
 310   char **s;
 311   int l;
 312
 313   for (s = protostrings; *s; s++)
 314     if (!strncasecmp (*s, url, strlen (*s)))
 315       break;
 316   if (!*s)
 317     return 0;
 318   l = strlen (*s);
 319   /* HTTP and FTP protocols are expected to yield exact host names
 320      (i.e. the `//' part must be skipped, too).  */
 321   if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
 322     l += 2;
 323   return l;
 324 }
 325
 326 /* Returns 1 if the URL begins with a protocol (supported or
 327    unsupported), 0 otherwise.  */
 328 int
 329 has_proto (const char *url)
 330 {
 331   char **s;
 332
 333   for (s = protostrings; *s; s++)
 334     if (strncasecmp (url, *s, strlen (*s)) == 0)
 335       return 1;
 336   return 0;
 337 }
 338
 339 /* Skip the username and password, if present here.  The function
 340    should be called *not* with the complete URL, but with the part
 341    right after the protocol.
 342
 343    If no username and password are found, return 0.  */
 344 int
 345 skip_uname (const char *url)
 346 {
 347   const char *p;
 348   const char *q = NULL;
 349   for (p = url ; *p && *p != '/'; p++)
 350     if (*p == '@') q = p;
 351   /* If a `@' was found before the first occurrence of `/', skip
 352      it.  */
 353   if (q != NULL)
 354     return q - url + 1;
 355   else
 356     return 0;
 357 }
 358 \f
 359 /* Allocate a new urlinfo structure, fill it with default values and
 360    return a pointer to it.  */
 361 struct urlinfo *
 362 newurl (void)
 363 {
 364   struct urlinfo *u;
 365
 366   u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
 367   memset (u, 0, sizeof (*u));
 368   u->proto = URLUNKNOWN;
 369   return u;
 370 }
 371
 372 /* Perform a "deep" free of the urlinfo structure.  The structure
 373    should have been created with newurl, but need not have been used.
 374    If free_pointer is non-0, free the pointer itself.  */
 375 void
 376 freeurl (struct urlinfo *u, int complete)
 377 {
 378   assert (u != NULL);
 379   FREE_MAYBE (u->url);
 380   FREE_MAYBE (u->host);
 381   FREE_MAYBE (u->path);
 382   FREE_MAYBE (u->file);
 383   FREE_MAYBE (u->dir);
 384   FREE_MAYBE (u->user);
 385   FREE_MAYBE (u->passwd);
 386   FREE_MAYBE (u->local);
 387   FREE_MAYBE (u->referer);
 388   if (u->proxy)
 389     freeurl (u->proxy, 1);
 390   if (complete)
 391     xfree (u);
 392   return;
 393 }
 394 \f
 395 /* Extract the given URL of the form
 396    (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
 397    1. hostname (terminated with `/' or `:')
 398    2. port number (terminated with `/'), or chosen for the protocol
 399    3. dirname (everything after hostname)
 400    Most errors are handled.  No allocation is done, you must supply
 401    pointers to allocated memory.
 402    ...and a host of other stuff :-)
 403
 404    - Recognizes hostname:dir/file for FTP and
 405      hostname (:portnum)?/dir/file for HTTP.
 406    - Parses the path to yield directory and file
 407    - Parses the URL to yield the username and passwd (if present)
 408    - Decodes the strings, in case they contain "forbidden" characters
 409    - Writes the result to struct urlinfo
 410
 411    If the argument STRICT is set, it recognizes only the canonical
 412    form.  */
 413 uerr_t
 414 parseurl (const char *url, struct urlinfo *u, int strict)
 415 {
 416   int i, l, abs_ftp;
 417   int recognizable;            /* Recognizable URL is the one where
 418                                   the protocol name was explicitly
 419                                   named, i.e. it wasn't deduced from
 420                                   the URL format.  */
 421   uerr_t type;
 422
 423   DEBUGP (("parseurl (\"%s\") -> ", url));
 424   recognizable = has_proto (url);
 425   if (strict && !recognizable)
 426     return URLUNKNOWN;
 427   for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
 428     {
 429       l = strlen (sup_protos[i].name);
 430       if (!strncasecmp (sup_protos[i].name, url, l))
 431         break;
 432     }
 433   /* If protocol is recognizable, but unsupported, bail out, else
 434      suppose unknown.  */
 435   if (recognizable && i == ARRAY_SIZE (sup_protos))
 436     return URLUNKNOWN;
 437   else if (i == ARRAY_SIZE (sup_protos))
 438     type = URLUNKNOWN;
 439   else
 440     u->proto = type = sup_protos[i].ind;
 441
 442   if (type == URLUNKNOWN)
 443     l = 0;
 444   /* Allow a username and password to be specified (i.e. just skip
 445      them for now).  */
 446   if (recognizable)
 447     l += skip_uname (url + l);
 448   for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
 449   if (i == l)
 450     return URLBADHOST;
 451   /* Get the hostname.  */
 452   u->host = strdupdelim (url + l, url + i);
 453   DEBUGP (("host %s -> ", u->host));
 454
 455   /* Assume no port has been given.  */
 456   u->port = 0;
 457   if (url[i] == ':')
 458     {
 459       /* We have a colon delimiting the hostname.  It could mean that
 460          a port number is following it, or a directory.  */
 461       if (ISDIGIT (url[++i]))    /* A port number */
 462         {
 463           if (type == URLUNKNOWN)
 464             u->proto = type = URLHTTP;
 465           for (; url[i] && url[i] != '/'; i++)
 466             if (ISDIGIT (url[i]))
 467               u->port = 10 * u->port + (url[i] - '0');
 468             else
 469               return URLBADPORT;
 470           if (!u->port)
 471             return URLBADPORT;
 472           DEBUGP (("port %hu -> ", u->port));
 473         }
 474       else if (type == URLUNKNOWN) /* or a directory */
 475         u->proto = type = URLFTP;
 476       else                      /* or just a misformed port number */
 477         return URLBADPORT;
 478     }
 479   else if (type == URLUNKNOWN)
 480     u->proto = type = URLHTTP;
 481   if (!u->port)
 482     {
 483       int ind;
 484       for (ind = 0; ind < ARRAY_SIZE (sup_protos); ind++)
 485         if (sup_protos[ind].ind == type)
 486           break;
 487       if (ind == ARRAY_SIZE (sup_protos))
 488         return URLUNKNOWN;
 489       u->port = sup_protos[ind].port;
 490     }
 491   /* Some delimiter troubles...  */
 492   if (url[i] == '/' && url[i - 1] != ':')
 493     ++i;
 494   if (type == URLHTTP)
 495     while (url[i] && url[i] == '/')
 496       ++i;
 497   u->path = (char *)xmalloc (strlen (url + i) + 8);
 498   strcpy (u->path, url + i);
 499   if (type == URLFTP)
 500     {
 501       u->ftp_type = process_ftp_type (u->path);
 502       /* #### We don't handle type `d' correctly yet.  */
 503       if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
 504         u->ftp_type = 'I';
 505       DEBUGP (("ftp_type %c -> ", u->ftp_type));
 506     }
 507   DEBUGP (("opath %s -> ", u->path));
 508   /* Parse the username and password (if existing).  */
 509   parse_uname (url, &u->user, &u->passwd);
 510   /* Decode the strings, as per RFC 1738.  */
 511   decode_string (u->host);
 512   decode_string (u->path);
 513   if (u->user)
 514     decode_string (u->user);
 515   if (u->passwd)
 516     decode_string (u->passwd);
 517   /* Parse the directory.  */
 518   parse_dir (u->path, &u->dir, &u->file);
 519   DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
 520   /* Simplify the directory.  */
 521   path_simplify (u->dir);
 522   /* Remove the leading `/' in HTTP.  */
 523   if (type == URLHTTP && *u->dir == '/')
 524     strcpy (u->dir, u->dir + 1);
 525   DEBUGP (("ndir %s\n", u->dir));
 526   /* Strip trailing `/'.  */
 527   l = strlen (u->dir);
 528   if (l > 1 && u->dir[l - 1] == '/')
 529     u->dir[l - 1] = '\0';
 530   /* Re-create the path: */
 531   abs_ftp = (u->proto == URLFTP && *u->dir == '/');
 532   /*  sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
 533       abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
 534   strcpy (u->path, abs_ftp ? "%2F" : "/");
 535   strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
 536   strcat (u->path, *u->dir ? "/" : "");
 537   strcat (u->path, u->file);
 538   ENCODE (u->path);
 539   DEBUGP (("newpath: %s\n", u->path));
 540   /* Create the clean URL.  */
 541   u->url = str_url (u, 0);
 542   return URLOK;
 543 }
 544 \f
 545 /* Special versions of DOTP and DDOTP for parse_dir().  They work like
 546    DOTP and DDOTP, but they also recognize `?' as end-of-string
 547    delimiter.  This is needed for correct handling of query
 548    strings.  */
 549
 550 #define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
 551 #define PD_DDOTP(x) ((*(x) == '.') && (*(x) == '.')             \
 552                      && (!*((x) + 2) || *((x) + 2) == '?'))
 553
 554 /* Build the directory and filename components of the path.  Both
 555    components are *separately* malloc-ed strings!  It does not change
 556    the contents of path.
 557
 558    If the path ends with "." or "..", they are (correctly) counted as
 559    directories.  */
 560 static void
 561 parse_dir (const char *path, char **dir, char **file)
 562 {
 563   int i, l;
 564
 565   l = urlpath_length (path);
 566   for (i = l; i && path[i] != '/'; i--);
 567
 568   if (!i && *path != '/')   /* Just filename */
 569     {
 570       if (PD_DOTP (path) || PD_DDOTP (path))
 571         {
 572           *dir = strdupdelim (path, path + l);
 573           *file = xstrdup (path + l); /* normally empty, but could
 574                                          contain ?... */
 575         }
 576       else
 577         {
 578           *dir = xstrdup ("");     /* This is required because of FTP */
 579           *file = xstrdup (path);
 580         }
 581     }
 582   else if (!i)                 /* /filename */
 583     {
 584       if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
 585         {
 586           *dir = strdupdelim (path, path + l);
 587           *file = xstrdup (path + l); /* normally empty, but could
 588                                          contain ?... */
 589         }
 590       else
 591         {
 592           *dir = xstrdup ("/");
 593           *file = xstrdup (path + 1);
 594         }
 595     }
 596   else /* Nonempty directory with or without a filename */
 597     {
 598       if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
 599         {
 600           *dir = strdupdelim (path, path + l);
 601           *file = xstrdup (path + l); /* normally empty, but could
 602                                          contain ?... */
 603         }
 604       else
 605         {
 606           *dir = strdupdelim (path, path + i);
 607           *file = xstrdup (path + i + 1);
 608         }
 609     }
 610 }
 611
 612 /* Find the optional username and password within the URL, as per
 613    RFC1738.  The returned user and passwd char pointers are
 614    malloc-ed.  */
 615 static uerr_t
 616 parse_uname (const char *url, char **user, char **passwd)
 617 {
 618   int l;
 619   const char *p, *q, *col;
 620   char **where;
 621
 622   *user = NULL;
 623   *passwd = NULL;
 624
 625   /* Look for the end of the protocol string.  */
 626   l = skip_proto (url);
 627   if (!l)
 628     return URLUNKNOWN;
 629   /* Add protocol offset.  */
 630   url += l;
 631   /* Is there an `@' character?  */
 632   for (p = url; *p && *p != '/'; p++)
 633     if (*p == '@')
 634       break;
 635   /* If not, return.  */
 636   if (*p != '@')
 637     return URLOK;
 638   /* Else find the username and password.  */
 639   for (p = q = col = url; *p && *p != '/'; p++)
 640     {
 641       if (*p == ':' && !*user)
 642         {
 643           *user = (char *)xmalloc (p - url + 1);
 644           memcpy (*user, url, p - url);
 645           (*user)[p - url] = '\0';
 646           col = p + 1;
 647         }
 648       if (*p == '@') q = p;
 649     }
 650   /* Decide whether you have only the username or both.  */
 651   where = *user ? passwd : user;
 652   *where = (char *)xmalloc (q - col + 1);
 653   memcpy (*where, col, q - col);
 654   (*where)[q - col] = '\0';
 655   return URLOK;
 656 }
 657
 658 /* If PATH ends with `;type=X', return the character X.  */
 659 static char
 660 process_ftp_type (char *path)
 661 {
 662   int len = strlen (path);
 663
 664   if (len >= 7
 665       && !memcmp (path + len - 7, ";type=", 6))
 666     {
 667       path[len - 7] = '\0';
 668       return path[len - 1];
 669     }
 670   else
 671     return '\0';
 672 }
 673 \f
 674 /* Return the URL as fine-formed string, with a proper protocol, optional port
 675    number, directory and optional user/password.  If `hide' is non-zero (as it
 676    is when we're calling this on a URL we plan to print, but not when calling it
 677    to canonicalize a URL for use within the program), password will be hidden.
 678    The forbidden characters in the URL will be cleansed.  */
 679 char *
 680 str_url (const struct urlinfo *u, int hide)
 681 {
 682   char *res, *host, *user, *passwd, *proto_name, *dir, *file;
 683   int i, l, ln, lu, lh, lp, lf, ld;
 684   unsigned short proto_default_port;
 685
 686   /* Look for the protocol name.  */
 687   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 688     if (sup_protos[i].ind == u->proto)
 689       break;
 690   if (i == ARRAY_SIZE (sup_protos))
 691     return NULL;
 692   proto_name = sup_protos[i].name;
 693   proto_default_port = sup_protos[i].port;
 694   host = encode_string (u->host);
 695   dir = encode_string (u->dir);
 696   file = encode_string (u->file);
 697   user = passwd = NULL;
 698   if (u->user)
 699     user = encode_string (u->user);
 700   if (u->passwd)
 701     {
 702       if (hide)
 703         /* Don't output the password, or someone might see it over the user's
 704            shoulder (or in saved wget output).  Don't give away the number of
 705            characters in the password, either, as we did in past versions of
 706            this code, when we replaced the password characters with 'x's. */
 707         passwd = xstrdup("<password>");
 708       else
 709         passwd = encode_string (u->passwd);
 710     }
 711   if (u->proto == URLFTP && *dir == '/')
 712     {
 713       char *tmp = (char *)xmalloc (strlen (dir) + 3);
 714       /*sprintf (tmp, "%%2F%s", dir + 1);*/
 715       tmp[0] = '%';
 716       tmp[1] = '2';
 717       tmp[2] = 'F';
 718       strcpy (tmp + 3, dir + 1);
 719       xfree (dir);
 720       dir = tmp;
 721     }
 722
 723   ln = strlen (proto_name);
 724   lu = user ? strlen (user) : 0;
 725   lp = passwd ? strlen (passwd) : 0;
 726   lh = strlen (host);
 727   ld = strlen (dir);
 728   lf = strlen (file);
 729   res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
 730   /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
 731      (user ? user : ""), (passwd ? ":" : ""),
 732      (passwd ? passwd : ""), (user ? "@" : ""),
 733      host, u->port, dir, *dir ? "/" : "", file); */
 734   l = 0;
 735   memcpy (res, proto_name, ln);
 736   l += ln;
 737   if (user)
 738     {
 739       memcpy (res + l, user, lu);
 740       l += lu;
 741       if (passwd)
 742         {
 743           res[l++] = ':';
 744           memcpy (res + l, passwd, lp);
 745           l += lp;
 746         }
 747       res[l++] = '@';
 748     }
 749   memcpy (res + l, host, lh);
 750   l += lh;
 751   if (u->port != proto_default_port)
 752     {
 753       res[l++] = ':';
 754       long_to_string (res + l, (long)u->port);
 755       l += numdigit (u->port);
 756     }
 757   res[l++] = '/';
 758   memcpy (res + l, dir, ld);
 759   l += ld;
 760   if (*dir)
 761     res[l++] = '/';
 762   strcpy (res + l, file);
 763   xfree (host);
 764   xfree (dir);
 765   xfree (file);
 766   FREE_MAYBE (user);
 767   FREE_MAYBE (passwd);
 768   return res;
 769 }
 770
 771 /* Check whether two URL-s are equivalent, i.e. pointing to the same
 772    location.  Uses parseurl to parse them, and compares the canonical
 773    forms.
 774
 775    Returns 1 if the URL1 is equivalent to URL2, 0 otherwise.  Also
 776    return 0 on error.  */
 777 int
 778 url_equal (const char *url1, const char *url2)
 779 {
 780   struct urlinfo *u1, *u2;
 781   uerr_t err;
 782   int res;
 783
 784   u1 = newurl ();
 785   err = parseurl (url1, u1, 0);
 786   if (err != URLOK)
 787     {
 788       freeurl (u1, 1);
 789       return 0;
 790     }
 791   u2 = newurl ();
 792   err = parseurl (url2, u2, 0);
 793   if (err != URLOK)
 794     {
 795       freeurl (u2, 1);
 796       return 0;
 797     }
 798   res = !strcmp (u1->url, u2->url);
 799   freeurl (u1, 1);
 800   freeurl (u2, 1);
 801   return res;
 802 }
 803 \f
 804 urlpos *
 805 get_urls_file (const char *file)
 806 {
 807   struct file_memory *fm;
 808   urlpos *head, *tail;
 809   const char *text, *text_end;
 810
 811   /* Load the file.  */
 812   fm = read_file (file);
 813   if (!fm)
 814     {
 815       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 816       return NULL;
 817     }
 818   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
 819   head = tail = NULL;
 820   text = fm->content;
 821   text_end = fm->content + fm->length;
 822   while (text < text_end)
 823     {
 824       const char *line_beg = text;
 825       const char *line_end = memchr (text, '\n', text_end - text);
 826       if (!line_end)
 827         line_end = text_end;
 828       else
 829         ++line_end;
 830       text = line_end;
 831       while (line_beg < line_end
 832              && ISSPACE (*line_beg))
 833         ++line_beg;
 834       while (line_end > line_beg + 1
 835              && ISSPACE (*(line_end - 1)))
 836         --line_end;
 837       if (line_end > line_beg)
 838         {
 839           urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
 840           memset (entry, 0, sizeof (*entry));
 841           entry->next = NULL;
 842           entry->url = strdupdelim (line_beg, line_end);
 843           if (!head)
 844             head = entry;
 845           else
 846             tail->next = entry;
 847           tail = entry;
 848         }
 849     }
 850   read_file_free (fm);
 851   return head;
 852 }
 853 \f
 854 /* Free the linked list of urlpos.  */
 855 void
 856 free_urlpos (urlpos *l)
 857 {
 858   while (l)
 859     {
 860       urlpos *next = l->next;
 861       xfree (l->url);
 862       FREE_MAYBE (l->local_name);
 863       xfree (l);
 864       l = next;
 865     }
 866 }
 867
 868 /* Rotate FNAME opt.backups times */
 869 void
 870 rotate_backups(const char *fname)
 871 {
 872   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
 873   char *from = (char *)alloca (maxlen);
 874   char *to = (char *)alloca (maxlen);
 875   struct stat sb;
 876   int i;
 877
 878   if (stat (fname, &sb) == 0)
 879     if (S_ISREG (sb.st_mode) == 0)
 880       return;
 881
 882   for (i = opt.backups; i > 1; i--)
 883     {
 884       sprintf (from, "%s.%d", fname, i - 1);
 885       sprintf (to, "%s.%d", fname, i);
 886       /* #### This will fail on machines without the rename() system
 887          call.  */
 888       rename (from, to);
 889     }
 890
 891   sprintf (to, "%s.%d", fname, 1);
 892   rename(fname, to);
 893 }
 894
 895 /* Create all the necessary directories for PATH (a file).  Calls
 896    mkdirhier() internally.  */
 897 int
 898 mkalldirs (const char *path)
 899 {
 900   const char *p;
 901   char *t;
 902   struct stat st;
 903   int res;
 904
 905   p = path + strlen (path);
 906   for (; *p != '/' && p != path; p--);
 907   /* Don't create if it's just a file.  */
 908   if ((p == path) && (*p != '/'))
 909     return 0;
 910   t = strdupdelim (path, p);
 911   /* Check whether the directory exists.  */
 912   if ((stat (t, &st) == 0))
 913     {
 914       if (S_ISDIR (st.st_mode))
 915         {
 916           xfree (t);
 917           return 0;
 918         }
 919       else
 920         {
 921           /* If the dir exists as a file name, remove it first.  This
 922              is *only* for Wget to work with buggy old CERN http
 923              servers.  Here is the scenario: When Wget tries to
 924              retrieve a directory without a slash, e.g.
 925              http://foo/bar (bar being a directory), CERN server will
 926              not redirect it too http://foo/bar/ -- it will generate a
 927              directory listing containing links to bar/file1,
 928              bar/file2, etc.  Wget will lose because it saves this
 929              HTML listing to a file `bar', so it cannot create the
 930              directory.  To work around this, if the file of the same
 931              name exists, we just remove it and create the directory
 932              anyway.  */
 933           DEBUGP (("Removing %s because of directory danger!\n", t));
 934           unlink (t);
 935         }
 936     }
 937   res = make_directory (t);
 938   if (res != 0)
 939     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
 940   xfree (t);
 941   return res;
 942 }
 943
 944 static int
 945 count_slashes (const char *s)
 946 {
 947   int i = 0;
 948   while (*s)
 949     if (*s++ == '/')
 950       ++i;
 951   return i;
 952 }
 953
 954 /* Return the path name of the URL-equivalent file name, with a
 955    remote-like structure of directories.  */
 956 static char *
 957 mkstruct (const struct urlinfo *u)
 958 {
 959   char *host, *dir, *file, *res, *dirpref;
 960   int l;
 961
 962   assert (u->dir != NULL);
 963   assert (u->host != NULL);
 964
 965   if (opt.cut_dirs)
 966     {
 967       char *ptr = u->dir + (*u->dir == '/');
 968       int slash_count = 1 + count_slashes (ptr);
 969       int cut = MINVAL (opt.cut_dirs, slash_count);
 970       for (; cut && *ptr; ptr++)
 971         if (*ptr == '/')
 972           --cut;
 973       STRDUP_ALLOCA (dir, ptr);
 974     }
 975   else
 976     dir = u->dir + (*u->dir == '/');
 977
 978   host = xstrdup (u->host);
 979   /* Check for the true name (or at least a consistent name for saving
 980      to directory) of HOST, reusing the hlist if possible.  */
 981   if (opt.add_hostdir && !opt.simple_check)
 982     {
 983       char *nhost = realhost (host);
 984       xfree (host);
 985       host = nhost;
 986     }
 987   /* Add dir_prefix and hostname (if required) to the beginning of
 988      dir.  */
 989   if (opt.add_hostdir)
 990     {
 991       if (!DOTP (opt.dir_prefix))
 992         {
 993           dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
 994                                     + strlen (host) + 1);
 995           sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
 996         }
 997       else
 998         STRDUP_ALLOCA (dirpref, host);
 999     }
1000   else                         /* not add_hostdir */
1001     {
1002       if (!DOTP (opt.dir_prefix))
1003         dirpref = opt.dir_prefix;
1004       else
1005         dirpref = "";
1006     }
1007   xfree (host);
1008
1009   /* If there is a prefix, prepend it.  */
1010   if (*dirpref)
1011     {
1012       char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1013       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1014       dir = newdir;
1015     }
1016   dir = encode_string (dir);
1017   l = strlen (dir);
1018   if (l && dir[l - 1] == '/')
1019     dir[l - 1] = '\0';
1020
1021   if (!*u->file)
1022     file = "index.html";
1023   else
1024     file = u->file;
1025
1026   /* Finally, construct the full name.  */
1027   res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1028   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1029   xfree (dir);
1030   return res;
1031 }
1032
1033 /* Return a malloced copy of S, but protect any '/' characters. */
1034
1035 static char *
1036 file_name_protect_query_string (const char *s)
1037 {
1038   const char *from;
1039   char *to, *dest;
1040   int destlen = 0;
1041   for (from = s; *from; from++)
1042     {
1043       ++destlen;
1044       if (*from == '/')
1045         destlen += 2;           /* each / gets replaced with %2F, so
1046                                    it adds two more chars.  */
1047     }
1048   dest = (char *)xmalloc (destlen + 1);
1049   for (from = s, to = dest; *from; from++)
1050     {
1051       if (*from != '/')
1052         *to++ = *from;
1053       else
1054         {
1055           *to++ = '%';
1056           *to++ = '2';
1057           *to++ = 'F';
1058         }
1059     }
1060   assert (to - dest == destlen);
1061   *to = '\0';
1062   return dest;
1063 }
1064
1065 /* Create a unique filename, corresponding to a given URL.  Calls
1066    mkstruct if necessary.  Does *not* actually create any directories.  */
1067 char *
1068 url_filename (const struct urlinfo *u)
1069 {
1070   char *file, *name;
1071   int have_prefix = 0;          /* whether we must prepend opt.dir_prefix */
1072
1073   if (opt.dirstruct)
1074     {
1075       file = mkstruct (u);
1076       have_prefix = 1;
1077     }
1078   else
1079     {
1080       if (!*u->file)
1081         file = xstrdup ("index.html");
1082       else
1083         {
1084           /* If the URL came with a query string, u->file will contain
1085              a question mark followed by query string contents.  These
1086              contents can contain '/' which would make us create
1087              unwanted directories.  These slashes must be protected
1088              explicitly.  */
1089           if (!strchr (u->file, '/'))
1090             file = xstrdup (u->file);
1091           else
1092             {
1093               /*assert (strchr (u->file, '?') != NULL);*/
1094               file = file_name_protect_query_string (u->file);
1095             }
1096         }
1097     }
1098
1099   if (!have_prefix)
1100     {
1101       /* Check whether the prefix directory is something other than "."
1102          before prepending it.  */
1103       if (!DOTP (opt.dir_prefix))
1104         {
1105           char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1106                                          + 1 + strlen (file) + 1);
1107           sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1108           xfree (file);
1109           file = nfile;
1110         }
1111     }
1112   /* DOS-ish file systems don't like `%' signs in them; we change it
1113      to `@'.  */
1114 #ifdef WINDOWS
1115   {
1116     char *p = file;
1117     for (p = file; *p; p++)
1118       if (*p == '%')
1119         *p = '@';
1120   }
1121 #endif /* WINDOWS */
1122
1123   /* Check the cases in which the unique extensions are not used:
1124      1) Clobbering is turned off (-nc).
1125      2) Retrieval with regetting.
1126      3) Timestamping is used.
1127      4) Hierarchy is built.
1128
1129      The exception is the case when file does exist and is a
1130      directory (actually support for bad httpd-s).  */
1131   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1132       && !(file_exists_p (file) && !file_non_directory_p (file)))
1133     return file;
1134
1135   /* Find a unique name.  */
1136   name = unique_name (file);
1137   xfree (file);
1138   return name;
1139 }
1140
1141 /* Like strlen(), but allow the URL to be ended with '?'.  */
1142 static int
1143 urlpath_length (const char *url)
1144 {
1145   const char *q = strchr (url, '?');
1146   if (q)
1147     return q - url;
1148   return strlen (url);
1149 }
1150
1151 /* Find the last occurrence of character C in the range [b, e), or
1152    NULL, if none are present.  This is almost completely equivalent to
1153    { *e = '\0'; return strrchr(b); }, except that it doesn't change
1154    the contents of the string.  */
1155 static const char *
1156 find_last_char (const char *b, const char *e, char c)
1157 {
1158   for (; e > b; e--)
1159     if (*e == c)
1160       return e;
1161   return NULL;
1162 }
1163
1164 /* Resolve the result of "linking" a base URI (BASE) to a
1165    link-specified URI (LINK).
1166
1167    Either of the URIs may be absolute or relative, complete with the
1168    host name, or path only.  This tries to behave "reasonably" in all
1169    foreseeable cases.  It employs little specific knowledge about
1170    protocols or URL-specific stuff -- it just works on strings.
1171
1172    The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1173    See uri_merge for a gentler interface to this functionality.
1174
1175    #### This function should handle `./' and `../' so that the evil
1176    path_simplify can go.  */
1177 static char *
1178 uri_merge_1 (const char *base, const char *link, int linklength, int no_proto)
1179 {
1180   char *constr;
1181
1182   if (no_proto)
1183     {
1184       const char *end = base + urlpath_length (base);
1185
1186       if (*link != '/')
1187         {
1188           /* LINK is a relative URL: we need to replace everything
1189              after last slash (possibly empty) with LINK.
1190
1191              So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1192              our result should be "whatever/foo/qux/xyzzy".  */
1193           int need_explicit_slash = 0;
1194           int span;
1195           const char *start_insert;
1196           const char *last_slash = find_last_char (base, end, '/');
1197           if (!last_slash)
1198             {
1199               /* No slash found at all.  Append LINK to what we have,
1200                  but we'll need a slash as a separator.
1201
1202                  Example: if base == "foo" and link == "qux/xyzzy", then
1203                  we cannot just append link to base, because we'd get
1204                  "fooqux/xyzzy", whereas what we want is
1205                  "foo/qux/xyzzy".
1206
1207                  To make sure the / gets inserted, we set
1208                  need_explicit_slash to 1.  We also set start_insert
1209                  to end + 1, so that the length calculations work out
1210                  correctly for one more (slash) character.  Accessing
1211                  that character is fine, since it will be the
1212                  delimiter, '\0' or '?'.  */
1213               /* example: "foo?..." */
1214               /*               ^    ('?' gets changed to '/') */
1215               start_insert = end + 1;
1216               need_explicit_slash = 1;
1217             }
1218           else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1219             {
1220               /* example: http://host"  */
1221               /*                      ^ */
1222               start_insert = end + 1;
1223               need_explicit_slash = 1;
1224             }
1225           else
1226             {
1227               /* example: "whatever/foo/bar" */
1228               /*                        ^    */
1229               start_insert = last_slash + 1;
1230             }
1231
1232           span = start_insert - base;
1233           constr = (char *)xmalloc (span + linklength + 1);
1234           if (span)
1235             memcpy (constr, base, span);
1236           if (need_explicit_slash)
1237             constr[span - 1] = '/';
1238           if (linklength)
1239             memcpy (constr + span, link, linklength);
1240           constr[span + linklength] = '\0';
1241         }
1242       else /* *link == `/' */
1243         {
1244           /* LINK is an absolute path: we need to replace everything
1245              after (and including) the FIRST slash with LINK.
1246
1247              So, if BASE is "http://host/whatever/foo/bar", and LINK is
1248              "/qux/xyzzy", our result should be
1249              "http://host/qux/xyzzy".  */
1250           int span;
1251           const char *slash;
1252           const char *start_insert = NULL; /* for gcc to shut up. */
1253           const char *pos = base;
1254           int seen_slash_slash = 0;
1255           /* We're looking for the first slash, but want to ignore
1256              double slash. */
1257         again:
1258           slash = memchr (pos, '/', end - pos);
1259           if (slash && !seen_slash_slash)
1260             if (*(slash + 1) == '/')
1261               {
1262                 pos = slash + 2;
1263                 seen_slash_slash = 1;
1264                 goto again;
1265               }
1266
1267           /* At this point, SLASH is the location of the first / after
1268              "//", or the first slash altogether.  START_INSERT is the
1269              pointer to the location where LINK will be inserted.  When
1270              examining the last two examples, keep in mind that LINK
1271              begins with '/'. */
1272
1273           if (!slash && !seen_slash_slash)
1274             /* example: "foo" */
1275             /*           ^    */
1276             start_insert = base;
1277           else if (!slash && seen_slash_slash)
1278             /* example: "http://foo" */
1279             /*                     ^ */
1280             start_insert = end;
1281           else if (slash && !seen_slash_slash)
1282             /* example: "foo/bar" */
1283             /*           ^        */
1284             start_insert = base;
1285           else if (slash && seen_slash_slash)
1286             /* example: "http://something/" */
1287             /*                           ^  */
1288             start_insert = slash;
1289
1290           span = start_insert - base;
1291           constr = (char *)xmalloc (span + linklength + 1);
1292           if (span)
1293             memcpy (constr, base, span);
1294           if (linklength)
1295             memcpy (constr + span, link, linklength);
1296           constr[span + linklength] = '\0';
1297         }
1298     }
1299   else /* !no_proto */
1300     {
1301       constr = strdupdelim (link, link + linklength);
1302     }
1303   return constr;
1304 }
1305
1306 /* Merge BASE with LINK and return the resulting URI.  This is an
1307    interface to uri_merge_1 that assumes that LINK is a
1308    zero-terminated string.  */
1309 char *
1310 uri_merge (const char *base, const char *link)
1311 {
1312   return uri_merge_1 (base, link, strlen (link), !has_proto (link));
1313 }
1314 \f
1315 /* Optimize URL by host, destructively replacing u->host with realhost
1316    (u->host).  Do this regardless of opt.simple_check.  */
1317 void
1318 opt_url (struct urlinfo *u)
1319 {
1320   /* Find the "true" host.  */
1321   char *host = realhost (u->host);
1322   xfree (u->host);
1323   u->host = host;
1324   assert (u->dir != NULL);      /* the URL must have been parsed */
1325   /* Refresh the printed representation.  */
1326   xfree (u->url);
1327   u->url = str_url (u, 0);
1328 }
1329 \f
1330 /* Returns proxy host address, in accordance with PROTO.  */
1331 char *
1332 getproxy (uerr_t proto)
1333 {
1334   char *proxy;
1335
1336   if (proto == URLHTTP)
1337     proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1338   else if (proto == URLFTP)
1339     proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1340 #ifdef HAVE_SSL
1341   else if (proto == URLHTTPS)
1342     proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1343 #endif /* HAVE_SSL */
1344   else
1345     proxy = NULL;
1346   if (!proxy || !*proxy)
1347     return NULL;
1348   return proxy;
1349 }
1350
1351 /* Should a host be accessed through proxy, concerning no_proxy?  */
1352 int
1353 no_proxy_match (const char *host, const char **no_proxy)
1354 {
1355   if (!no_proxy)
1356     return 1;
1357   else
1358     return !sufmatch (no_proxy, host);
1359 }
1360 \f
1361 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1362 static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
1363
1364 /* Change the links in an HTML document.  Accepts a structure that
1365    defines the positions of all the links.  */
1366 void
1367 convert_links (const char *file, urlpos *l)
1368 {
1369   struct file_memory *fm;
1370   FILE               *fp;
1371   const char         *p;
1372   downloaded_file_t  downloaded_file_return;
1373
1374   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1375
1376   {
1377     /* First we do a "dry run": go through the list L and see whether
1378        any URL needs to be converted in the first place.  If not, just
1379        leave the file alone.  */
1380     int count = 0;
1381     urlpos *dry = l;
1382     for (dry = l; dry; dry = dry->next)
1383       if (dry->convert != CO_NOCONVERT)
1384         ++count;
1385     if (!count)
1386       {
1387         logputs (LOG_VERBOSE, _("nothing to do.\n"));
1388         return;
1389       }
1390   }
1391
1392   fm = read_file (file);
1393   if (!fm)
1394     {
1395       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1396                  file, strerror (errno));
1397       return;
1398     }
1399
1400   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1401   if (opt.backup_converted && downloaded_file_return)
1402     write_backup_file (file, downloaded_file_return);
1403
1404   /* Before opening the file for writing, unlink the file.  This is
1405      important if the data in FM is mmaped.  In such case, nulling the
1406      file, which is what fopen() below does, would make us read all
1407      zeroes from the mmaped region.  */
1408   if (unlink (file) < 0 && errno != ENOENT)
1409     {
1410       logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1411                  file, strerror (errno));
1412       read_file_free (fm);
1413       return;
1414     }
1415   /* Now open the file for writing.  */
1416   fp = fopen (file, "wb");
1417   if (!fp)
1418     {
1419       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1420                  file, strerror (errno));
1421       read_file_free (fm);
1422       return;
1423     }
1424   /* Here we loop through all the URLs in file, replacing those of
1425      them that are downloaded with relative references.  */
1426   p = fm->content;
1427   for (; l; l = l->next)
1428     {
1429       char *url_start = fm->content + l->pos;
1430
1431       if (l->pos >= fm->length)
1432         {
1433           DEBUGP (("Something strange is going on.  Please investigate."));
1434           break;
1435         }
1436       /* If the URL is not to be converted, skip it.  */
1437       if (l->convert == CO_NOCONVERT)
1438         {
1439           DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
1440           continue;
1441         }
1442
1443       /* Echo the file contents, up to the offending URL's opening
1444          quote, to the outfile.  */
1445       fwrite (p, 1, url_start - p, fp);
1446       p = url_start;
1447       if (l->convert == CO_CONVERT_TO_RELATIVE)
1448         {
1449           /* Convert absolute URL to relative. */
1450           char *newname = construct_relative (file, l->local_name);
1451           char *quoted_newname = html_quote_string (newname);
1452           replace_attr (&p, l->size, fp, quoted_newname);
1453           DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1454                    l->url, newname, l->pos, file));
1455           xfree (newname);
1456           xfree (quoted_newname);
1457         }
1458       else if (l->convert == CO_CONVERT_TO_COMPLETE)
1459         {
1460           /* Convert the link to absolute URL. */
1461           char *newlink = l->url;
1462           char *quoted_newlink = html_quote_string (newlink);
1463           replace_attr (&p, l->size, fp, quoted_newlink);
1464           DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1465                    newlink, l->pos, file));
1466           xfree (quoted_newlink);
1467         }
1468     }
1469   /* Output the rest of the file. */
1470   if (p - fm->content < fm->length)
1471     fwrite (p, 1, fm->length - (p - fm->content), fp);
1472   fclose (fp);
1473   read_file_free (fm);
1474   logputs (LOG_VERBOSE, _("done.\n"));
1475 }
1476
1477 /* Construct and return a malloced copy of the relative link from two
1478    pieces of information: local name S1 of the referring file and
1479    local name S2 of the referred file.
1480
1481    So, if S1 is "jagor.srce.hr/index.html" and S2 is
1482    "jagor.srce.hr/images/news.gif", the function will return
1483    "images/news.gif".
1484
1485    Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1486    "fly.cc.fer.hr/images/fly.gif", the function will return
1487    "../images/fly.gif".
1488
1489    Caveats: S1 should not begin with `/', unless S2 also begins with
1490    '/'.  S1 should not contain things like ".." and such --
1491    construct_relative ("fly/ioccc/../index.html",
1492    "fly/images/fly.gif") will fail.  (A workaround is to call
1493    something like path_simplify() on S1).  */
1494 static char *
1495 construct_relative (const char *s1, const char *s2)
1496 {
1497   int i, cnt, sepdirs1;
1498   char *res;
1499
1500   if (*s2 == '/')
1501     return xstrdup (s2);
1502   /* S1 should *not* be absolute, if S2 wasn't.  */
1503   assert (*s1 != '/');
1504   i = cnt = 0;
1505   /* Skip the directories common to both strings.  */
1506   while (1)
1507     {
1508       while (s1[i] && s2[i]
1509              && (s1[i] == s2[i])
1510              && (s1[i] != '/')
1511              && (s2[i] != '/'))
1512         ++i;
1513       if (s1[i] == '/' && s2[i] == '/')
1514         cnt = ++i;
1515       else
1516         break;
1517     }
1518   for (sepdirs1 = 0; s1[i]; i++)
1519     if (s1[i] == '/')
1520       ++sepdirs1;
1521   /* Now, construct the file as of:
1522      - ../ repeated sepdirs1 time
1523      - all the non-mutual directories of S2.  */
1524   res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1525   for (i = 0; i < sepdirs1; i++)
1526     memcpy (res + 3 * i, "../", 3);
1527   strcpy (res + 3 * i, s2 + cnt);
1528   return res;
1529 }
1530 \f
1531 /* Add URL to the head of the list L.  */
1532 urlpos *
1533 add_url (urlpos *l, const char *url, const char *file)
1534 {
1535   urlpos *t;
1536
1537   t = (urlpos *)xmalloc (sizeof (urlpos));
1538   memset (t, 0, sizeof (*t));
1539   t->url = xstrdup (url);
1540   t->local_name = xstrdup (file);
1541   t->next = l;
1542   return t;
1543 }
1544
1545 static void
1546 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1547 {
1548   /* Rather than just writing over the original .html file with the
1549      converted version, save the former to *.orig.  Note we only do
1550      this for files we've _successfully_ downloaded, so we don't
1551      clobber .orig files sitting around from previous invocations. */
1552
1553   /* Construct the backup filename as the original name plus ".orig". */
1554   size_t         filename_len = strlen(file);
1555   char*          filename_plus_orig_suffix;
1556   boolean        already_wrote_backup_file = FALSE;
1557   slist*         converted_file_ptr;
1558   static slist*  converted_files = NULL;
1559
1560   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1561     {
1562       /* Just write "orig" over "html".  We need to do it this way
1563          because when we're checking to see if we've downloaded the
1564          file before (to see if we can skip downloading it), we don't
1565          know if it's a text/html file.  Therefore we don't know yet
1566          at that stage that -E is going to cause us to tack on
1567          ".html", so we need to compare vs. the original URL plus
1568          ".orig", not the original URL plus ".html.orig". */
1569       filename_plus_orig_suffix = alloca (filename_len + 1);
1570       strcpy(filename_plus_orig_suffix, file);
1571       strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1572     }
1573   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1574     {
1575       /* Append ".orig" to the name. */
1576       filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1577       strcpy(filename_plus_orig_suffix, file);
1578       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1579     }
1580
1581   /* We can get called twice on the same URL thanks to the
1582      convert_all_links() call in main().  If we write the .orig file
1583      each time in such a case, it'll end up containing the first-pass
1584      conversion, not the original file.  So, see if we've already been
1585      called on this file. */
1586   converted_file_ptr = converted_files;
1587   while (converted_file_ptr != NULL)
1588     if (strcmp(converted_file_ptr->string, file) == 0)
1589       {
1590         already_wrote_backup_file = TRUE;
1591         break;
1592       }
1593     else
1594       converted_file_ptr = converted_file_ptr->next;
1595
1596   if (!already_wrote_backup_file)
1597     {
1598       /* Rename <file> to <file>.orig before former gets written over. */
1599       if (rename(file, filename_plus_orig_suffix) != 0)
1600         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1601                    file, filename_plus_orig_suffix, strerror (errno));
1602
1603       /* Remember that we've already written a .orig backup for this file.
1604          Note that we never free this memory since we need it till the
1605          convert_all_links() call, which is one of the last things the
1606          program does before terminating.  BTW, I'm not sure if it would be
1607          safe to just set 'converted_file_ptr->string' to 'file' below,
1608          rather than making a copy of the string...  Another note is that I
1609          thought I could just add a field to the urlpos structure saying
1610          that we'd written a .orig file for this URL, but that didn't work,
1611          so I had to make this separate list.
1612          -- Dan Harkless <wget@harkless.org>
1613
1614          This [adding a field to the urlpos structure] didn't work
1615          because convert_file() is called twice: once after all its
1616          sublinks have been retrieved in recursive_retrieve(), and
1617          once at the end of the day in convert_all_links().  The
1618          original linked list collected in recursive_retrieve() is
1619          lost after the first invocation of convert_links(), and
1620          convert_all_links() makes a new one (it calls get_urls_html()
1621          for each file it covers.)  That's why your first approach didn't
1622          work.  The way to make it work is perhaps to make this flag a
1623          field in the `urls_html' list.
1624          -- Hrvoje Niksic <hniksic@arsdigita.com>
1625       */
1626       converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1627       converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
1628       converted_file_ptr->next = converted_files;
1629       converted_files = converted_file_ptr;
1630     }
1631 }
1632
1633 static int find_fragment PARAMS ((const char *, int, const char **,
1634                                   const char **));
1635
1636 static void
1637 replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
1638 {
1639   const char *p = *pp;
1640   int quote_flag = 0;
1641   int size = raw_size;
1642   char quote_char = '\"';
1643   const char *frag_beg, *frag_end;
1644
1645   /* Structure of our string is:
1646        "...old-contents..."
1647        <---  l->size   --->  (with quotes)
1648      OR:
1649        ...old-contents...
1650        <---  l->size  -->    (no quotes)   */
1651
1652   if (*p == '\"' || *p == '\'')
1653     {
1654       quote_char = *p;
1655       quote_flag = 1;
1656       ++p;
1657       size -= 2;                /* disregard opening and closing quote */
1658     }
1659   putc (quote_char, fp);
1660   fputs (new_str, fp);
1661
1662   /* Look for fragment identifier, if any. */
1663   if (find_fragment (p, size, &frag_beg, &frag_end))
1664     fwrite (frag_beg, 1, frag_end - frag_beg, fp);
1665   p += size;
1666   if (quote_flag)
1667     ++p;
1668   putc (quote_char, fp);
1669   *pp = p;
1670 }
1671
1672 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
1673    preceded by '&'.  If the character is not found, return zero.  If
1674    the character is found, return 1 and set BP and EP to point to the
1675    beginning and end of the region.
1676
1677    This is used for finding the fragment indentifiers in URLs.  */
1678
1679 static int
1680 find_fragment (const char *beg, int size, const char **bp, const char **ep)
1681 {
1682   const char *end = beg + size;
1683   int saw_amp = 0;
1684   for (; beg < end; beg++)
1685     {
1686       switch (*beg)
1687         {
1688         case '&':
1689           saw_amp = 1;
1690           break;
1691         case '#':
1692           if (!saw_amp)
1693             {
1694               *bp = beg;
1695               *ep = end;
1696               return 1;
1697             }
1698           /* fallthrough */
1699         default:
1700           saw_amp = 0;
1701         }
1702     }
1703   return 0;
1704 }
1705
1706 typedef struct _downloaded_file_list {
1707   char*                          file;
1708   downloaded_file_t              download_type;
1709   struct _downloaded_file_list*  next;
1710 } downloaded_file_list;
1711
1712 static downloaded_file_list *downloaded_files;
1713
1714 /* Remembers which files have been downloaded.  In the standard case, should be
1715    called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1716    download successfully (i.e. not for ones we have failures on or that we skip
1717    due to -N).
1718
1719    When we've downloaded a file and tacked on a ".html" extension due to -E,
1720    call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1721    FILE_DOWNLOADED_NORMALLY.
1722
1723    If you just want to check if a file has been previously added without adding
1724    it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
1725    with local filenames, not remote URLs. */
1726 downloaded_file_t
1727 downloaded_file (downloaded_file_t  mode, const char*  file)
1728 {
1729   boolean                       found_file = FALSE;
1730   downloaded_file_list*         rover = downloaded_files;
1731
1732   while (rover != NULL)
1733     if (strcmp(rover->file, file) == 0)
1734       {
1735         found_file = TRUE;
1736         break;
1737       }
1738     else
1739       rover = rover->next;
1740
1741   if (found_file)
1742     return rover->download_type;  /* file had already been downloaded */
1743   else
1744     {
1745       if (mode != CHECK_FOR_FILE)
1746         {
1747           rover = xmalloc(sizeof(*rover));
1748           rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1749           rover->download_type = mode;
1750           rover->next = downloaded_files;
1751           downloaded_files = rover;
1752         }
1753
1754       return FILE_NOT_ALREADY_DOWNLOADED;
1755     }
1756 }
1757
1758 void
1759 downloaded_files_free (void)
1760 {
1761   downloaded_file_list*         rover = downloaded_files;
1762   while (rover)
1763     {
1764       downloaded_file_list *next = rover->next;
1765       xfree (rover->file);
1766       xfree (rover);
1767       rover = next;
1768     }
1769 }