2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
29 #include <sys/types.h>
/* True iff X points to the one-character string ".", i.e. a
   current-directory path component.  */
#define DOTP(x) ((x)[0] == '.' && (x)[1] == '\0')
/* True iff X points to the two-character string "..", i.e. a
   parent-directory path component.  */
#define DDOTP(x) ((x)[0] == '.' && (x)[1] == '.' && (x)[2] == '\0')
50 static int urlpath_length PARAMS ((const char *));
/* A NULL-terminated list of strings to be recognized as protocol
   types (URL schemes).  Note that recognized doesn't mean supported
   -- only HTTP, HTTPS and FTP are currently supported.

   However, a string that does not match anything in the list will be
   considered a relative URL.  Thus it's important that this list has
   anything anyone could think of being legal.

   #### This is probably broken.  Wget should use other means to
   distinguish between absolute and relative URIs in HTML links.

   Take a look at <http://www.w3.org/pub/WWW/Addressing/schemes.html>
   for the full scheme registry.  */
/* NOTE(review): the initializer of this array is not visible in this
   chunk.  */
static char *protostrings[] =
/* Similar to former, but for supported protocols: each entry maps a
   scheme prefix to its protocol identifier and default port.  Scanned
   with strncasecmp() by urlproto(), parseurl() and str_url().  */
static struct proto sup_protos[] =
{ "http://", URLHTTP, DEFAULT_HTTP_PORT },
{ "https://",URLHTTPS, DEFAULT_HTTPS_PORT},
{ "ftp://", URLFTP, DEFAULT_FTP_PORT }
117 static void parse_dir PARAMS ((const char *, char **, char **));
118 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
119 static char *construct_relative PARAMS ((const char *, const char *));
120 static char process_ftp_type PARAMS ((char *));
/* Support for encoding and decoding of URL strings.  We determine
   whether a character is unsafe through static table lookup.  This
   code assumes ASCII character set and 8-bit chars.  */

/* Single-letter shorthands for the urlchr_table entries below.  */
#define R urlchr_reserved
#define U urlchr_unsafe

/* Nonzero iff character C has any of the bits in MASK set in
   urlchr_table.  The cast guards against negative plain-char
   indexing.  */
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))

/* rfc1738 reserved chars.  We don't use this yet; preservation of
   reserved chars will be implemented when I integrate the new
   `reencode_string' function.  */
#define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)

/* Unsafe chars:
   - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
   - '@' and ':'; needed for encoding URL username and password.
   - anything >= 127. */
#define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
/* Per-character classification table: R = reserved (rfc1738),
   U = unsafe, RU = both (only '@').  Indexed by the character code;
   queried through urlchr_test()/UNSAFE_CHAR()/RESERVED_CHAR().
   NOTE(review): `const static' is legal but `static const' is the
   conventional order; the opening brace of the initializer is not
   visible in this chunk.  */
const static unsigned char urlchr_table[256] =
U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
U, 0, U, U, 0, U, R, 0, /* SP ! " # $ % & ' */
0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
0, 0, U, R, U, R, U, R, /* 8 9 : ; < = > ? */
RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
0, 0, 0, U, U, U, U, 0, /* X Y Z [ \ ] ^ _ */
U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
/* Everything >= 128 is unsafe.  */
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
/* Decodes the forms %xy in a URL to the character the hexadecimal
   code of which is xy.  xy are hexadecimal digits from
   [0123456789ABCDEF] (case-insensitive).  If x or y are not
   hex-digits or `%' precedes `\0', the sequence is inserted
   literally.  Decoding is done in place, through a write cursor T
   that trails the read cursor H.  */
decode_string (char *s)
char *t = s; /* t - tortoise */
char *h = s; /* h - hare */
/* Do nothing if '%' is not followed by two hex digits. */
if (!*(h + 1) || !*(h + 2)
|| !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
/* Combine the two hex digits into a single byte at the write cursor. */
*t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
/* Like encode_string, but return S if there are no unsafe chars.
   Two passes: first count how much longer the encoded string will be
   (each unsafe char becomes "%XX", two extra chars), then allocate
   and fill.  */
encode_string_maybe (const char *s)
for (p1 = s; *p1; p1++)
if (UNSAFE_CHAR (*p1))
addition += 2; /* Two more characters (hex digits) */
newlen = (p1 - s) + addition;
newstr = (char *)xmalloc (newlen + 1);
/* Second pass: copy, %XX-escaping the unsafe characters. */
if (UNSAFE_CHAR (*p1))
const unsigned char c = *p1++;
*p2++ = XDIGIT_TO_XCHAR (c >> 4);
*p2++ = XDIGIT_TO_XCHAR (c & 0xf);
/* Sanity check: we must have written exactly the predicted length. */
assert (p2 - newstr == newlen);
/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
   given string, returning a malloc-ed %XX encoded string.  Unlike
   encode_string_maybe, the caller always owns the result.  */
encode_string (const char *s)
char *encoded = encode_string_maybe (s);
/* Encode unsafe characters in PTR to %xx.  If such encoding is done,
   the old value of PTR is freed and PTR is made to point to the newly
   allocated storage.  */
#define ENCODE(ptr) do { \
char *e_new = encode_string_maybe (ptr); \
/* Returns the protocol type if URL's protocol is supported, or
   URLUNKNOWN if not.  First tries an exact (case-insensitive) match
   against the sup_protos prefixes; failing that, inspects the text
   after the first `:' to distinguish a port number.  */
urlproto (const char *url)
for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
return sup_protos[i].ind;
/* Skip the leading scheme-or-host token, up to `:' or `/'. */
for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
/* Scan what follows the colon; all-digits suggests host:port. */
for (++i; url[i] && url[i] != '/'; i++)
if (!ISDIGIT (url[i]))
if (url[i - 1] == ':')
/* Skip the protocol part of the URL, e.g. `http://'.  If no protocol
   part is found, returns 0.  Otherwise returns the number of leading
   characters to skip.  */
skip_proto (const char *url)
for (s = protostrings; *s; s++)
if (!strncasecmp (*s, url, strlen (*s)))
/* HTTP and FTP protocols are expected to yield exact host names
   (i.e. the `//' part must be skipped, too). */
if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
/* Returns 1 if the URL begins with a protocol (supported or
   unsupported), 0 otherwise.  Matches case-insensitively against
   every entry of protostrings.  */
has_proto (const char *url)
for (s = protostrings; *s; s++)
if (strncasecmp (url, *s, strlen (*s)) == 0)
/* Skip the username and password, if present here.  The function
   should be called *not* with the complete URL, but with the part
   right after the protocol.

   If no username and password are found, return 0.  */
skip_uname (const char *url)
const char *q = NULL;
/* Remember the last `@' seen before the first `/'. */
for (p = url ; *p && *p != '/'; p++)
if (*p == '@') q = p;
/* If a `@' was found before the first occurrence of `/', skip
   past it.  */
/* Allocate a new urlinfo structure, fill it with default values and
   return a pointer to it.  All fields are zeroed except proto, which
   starts out as URLUNKNOWN.  */
u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
memset (u, 0, sizeof (*u));
u->proto = URLUNKNOWN;
/* Perform a "deep" free of the urlinfo structure.  The structure
   should have been created with newurl, but need not have been used.
   If COMPLETE is non-0, free the pointer itself.  */
freeurl (struct urlinfo *u, int complete)
FREE_MAYBE (u->host);
FREE_MAYBE (u->path);
FREE_MAYBE (u->file);
FREE_MAYBE (u->user);
FREE_MAYBE (u->passwd);
FREE_MAYBE (u->local);
FREE_MAYBE (u->referer);
/* Recursively free the chained proxy urlinfo, pointer included. */
freeurl (u->proxy, 1);
/* Extract the given URL of the form
   (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
   1. hostname (terminated with `/' or `:')
   2. port number (terminated with `/'), or chosen for the protocol
   3. dirname (everything after hostname)
   Most errors are handled.  No allocation is done, you must supply
   pointers to allocated memory.
   ...and a host of other stuff :-)

   - Recognizes hostname:dir/file for FTP and
     hostname (:portnum)?/dir/file for HTTP.
   - Parses the path to yield directory and file
   - Parses the URL to yield the username and passwd (if present)
   - Decodes the strings, in case they contain "forbidden" characters
   - Writes the result to struct urlinfo

   If the argument STRICT is set, it recognizes only the canonical
   form.  */
parseurl (const char *url, struct urlinfo *u, int strict)
int recognizable; /* Recognizable URL is the one where
                     the protocol name was explicitly
                     named, i.e. it wasn't deduced from
                     the URL format.  */
DEBUGP (("parseurl (\"%s\") -> ", url));
recognizable = has_proto (url);
if (strict && !recognizable)
/* Find which supported scheme, if any, the URL starts with. */
for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
l = strlen (sup_protos[i].name);
if (!strncasecmp (sup_protos[i].name, url, l))
/* If protocol is recognizable, but unsupported, bail out, else
   assume it is unknown for now.  */
if (recognizable && i == ARRAY_SIZE (sup_protos))
else if (i == ARRAY_SIZE (sup_protos))
u->proto = type = sup_protos[i].ind;
if (type == URLUNKNOWN)
/* Allow a username and password to be specified (i.e. just skip
   them for now).  */
l += skip_uname (url + l);
for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
/* Get the hostname. */
u->host = strdupdelim (url + l, url + i);
DEBUGP (("host %s -> ", u->host));
/* Assume no port has been given. */
/* We have a colon delimiting the hostname.  It could mean that
   a port number is following it, or a directory.  */
if (ISDIGIT (url[++i])) /* A port number */
if (type == URLUNKNOWN)
u->proto = type = URLHTTP;
/* Accumulate the decimal port number digit by digit. */
for (; url[i] && url[i] != '/'; i++)
if (ISDIGIT (url[i]))
u->port = 10 * u->port + (url[i] - '0');
DEBUGP (("port %hu -> ", u->port));
else if (type == URLUNKNOWN) /* or a directory */
u->proto = type = URLFTP;
else /* or just a misformed port number */
else if (type == URLUNKNOWN)
u->proto = type = URLHTTP;
/* No explicit port: fall back to the scheme's default port. */
for (ind = 0; ind < ARRAY_SIZE (sup_protos); ind++)
if (sup_protos[ind].ind == type)
if (ind == ARRAY_SIZE (sup_protos))
u->port = sup_protos[ind].port;
/* Some delimiter troubles... */
if (url[i] == '/' && url[i - 1] != ':')
while (url[i] && url[i] == '/')
/* The +8 slack leaves room for the "%2F"/"/" rewrite performed
   below when the path is re-created.  */
u->path = (char *)xmalloc (strlen (url + i) + 8);
strcpy (u->path, url + i);
u->ftp_type = process_ftp_type (u->path);
/* #### We don't handle type `d' correctly yet. */
if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
DEBUGP (("ftp_type %c -> ", u->ftp_type));
DEBUGP (("opath %s -> ", u->path));
/* Parse the username and password (if existing). */
parse_uname (url, &u->user, &u->passwd);
/* Decode the strings, as per RFC 1738. */
decode_string (u->host);
decode_string (u->path);
decode_string (u->user);
decode_string (u->passwd);
/* Parse the directory. */
parse_dir (u->path, &u->dir, &u->file);
DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
/* Simplify the directory. */
path_simplify (u->dir);
/* Remove the leading `/' in HTTP. */
if (type == URLHTTP && *u->dir == '/')
/* NOTE(review): strcpy with overlapping source and destination is
   undefined behavior; memmove would be the safe choice here. */
strcpy (u->dir, u->dir + 1);
DEBUGP (("ndir %s\n", u->dir));
/* Strip trailing `/'. */
if (l > 1 && u->dir[l - 1] == '/')
u->dir[l - 1] = '\0';
/* Re-create the path: */
abs_ftp = (u->proto == URLFTP && *u->dir == '/');
/* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
   abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
strcpy (u->path, abs_ftp ? "%2F" : "/");
strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
strcat (u->path, *u->dir ? "/" : "");
strcat (u->path, u->file);
DEBUGP (("newpath: %s\n", u->path));
/* Create the clean URL. */
u->url = str_url (u, 0);
/* Special versions of DOTP and DDOTP for parse_dir().  They work like
   DOTP and DDOTP, but they also recognize `?' as end-of-string
   delimiter.  This is needed for correct handling of query
   strings.  */
#define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
/* Fixed: the second conjunct must test *((x) + 1), not *(x) again.
   The previous version compared the first character twice, so any
   string starting with '.' whose third character was '\0' or '?'
   (e.g. ".a") was misclassified as "..".  */
#define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') \
                     && (!*((x) + 2) || *((x) + 2) == '?'))
/* Build the directory and filename components of the path.  Both
   components are *separately* malloc-ed strings!  It does not change
   the contents of path.

   If the path ends with "." or "..", they are (correctly) counted as
   directories.  */
parse_dir (const char *path, char **dir, char **file)
l = urlpath_length (path);
/* Find the last '/' within the path proper (query string excluded). */
for (i = l; i && path[i] != '/'; i--);
if (!i && *path != '/') /* Just filename */
if (PD_DOTP (path) || PD_DDOTP (path))
*dir = strdupdelim (path, path + l);
*file = xstrdup (path + l); /* normally empty, but could
                               contain a query string */
*dir = xstrdup (""); /* This is required because of FTP */
*file = xstrdup (path);
else if (!i) /* /filename */
if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
*dir = strdupdelim (path, path + l);
*file = xstrdup (path + l); /* normally empty, but could
                               contain a query string */
*dir = xstrdup ("/");
*file = xstrdup (path + 1);
else /* Nonempty directory with or without a filename */
if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
*dir = strdupdelim (path, path + l);
*file = xstrdup (path + l); /* normally empty, but could
                               contain a query string */
*dir = strdupdelim (path, path + i);
*file = xstrdup (path + i + 1);
/* Find the optional username and password within the URL, as per
   RFC1738.  The returned user and passwd char pointers are
   malloc-ed.  */
parse_uname (const char *url, char **user, char **passwd)
const char *p, *q, *col;
/* Look for the end of the protocol string. */
l = skip_proto (url);
/* Add protocol offset. */
/* Is there an `@' character? */
for (p = url; *p && *p != '/'; p++)
/* If not, return. */
/* Else find the username and password. */
for (p = q = col = url; *p && *p != '/'; p++)
/* The first `:' before the `@' separates user from password. */
if (*p == ':' && !*user)
*user = (char *)xmalloc (p - url + 1);
memcpy (*user, url, p - url);
(*user)[p - url] = '\0';
if (*p == '@') q = p;
/* Decide whether you have only the username or both. */
where = *user ? passwd : user;
*where = (char *)xmalloc (q - col + 1);
memcpy (*where, col, q - col);
(*where)[q - col] = '\0';
/* If PATH ends with `;type=X', return the character X.  The last 7
   characters must be ";type=X": ";type=" is compared at len - 7 and
   X sits at len - 1.  The suffix is then chopped off PATH.  */
process_ftp_type (char *path)
int len = strlen (path);
&& !memcmp (path + len - 7, ";type=", 6))
/* Truncate the ";type=X" suffix off the path. */
path[len - 7] = '\0';
return path[len - 1];
/* Return the URL as fine-formed string, with a proper protocol, optional port
   number, directory and optional user/password.  If `hide' is non-zero (as it
   is when we're calling this on a URL we plan to print, but not when calling it
   to canonicalize a URL for use within the program), password will be hidden.
   The forbidden characters in the URL will be cleansed.  */
str_url (const struct urlinfo *u, int hide)
char *res, *host, *user, *passwd, *proto_name, *dir, *file;
int i, l, ln, lu, lh, lp, lf, ld;
unsigned short proto_default_port;
/* Look for the protocol name. */
for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
if (sup_protos[i].ind == u->proto)
if (i == ARRAY_SIZE (sup_protos))
proto_name = sup_protos[i].name;
proto_default_port = sup_protos[i].port;
/* %XX-encode each component separately. */
host = encode_string (u->host);
dir = encode_string (u->dir);
file = encode_string (u->file);
user = passwd = NULL;
user = encode_string (u->user);
/* Don't output the password, or someone might see it over the user's
   shoulder (or in saved wget output).  Don't give away the number of
   characters in the password, either, as we did in past versions of
   this code, when we replaced the password characters with 'x's. */
passwd = xstrdup("<password>");
passwd = encode_string (u->passwd);
/* Absolute FTP directories get a leading "%2F" instead of "/". */
if (u->proto == URLFTP && *dir == '/')
char *tmp = (char *)xmalloc (strlen (dir) + 3);
/*sprintf (tmp, "%%2F%s", dir + 1);*/
strcpy (tmp + 3, dir + 1);
ln = strlen (proto_name);
lu = user ? strlen (user) : 0;
lp = passwd ? strlen (passwd) : 0;
/* The +20 slack covers the fixed separators (":", "@", "/", port
   digits); assumes ports fit in that budget -- as sized by the
   original author.  */
res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
/* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
   (user ? user : ""), (passwd ? ":" : ""),
   (passwd ? passwd : ""), (user ? "@" : ""),
   host, u->port, dir, *dir ? "/" : "", file); */
memcpy (res, proto_name, ln);
memcpy (res + l, user, lu);
memcpy (res + l, passwd, lp);
memcpy (res + l, host, lh);
/* Emit the port only when it differs from the scheme default. */
if (u->port != proto_default_port)
long_to_string (res + l, (long)u->port);
l += numdigit (u->port);
memcpy (res + l, dir, ld);
strcpy (res + l, file);
/* Check whether two URL-s are equivalent, i.e. pointing to the same
   location.  Uses parseurl to parse them, and compares the canonical
   forms.

   Returns 1 if the URL1 is equivalent to URL2, 0 otherwise.  Also
   return 0 on error.  */
/* Do not compile unused code. */
url_equal (const char *url1, const char *url2)
struct urlinfo *u1, *u2;
err = parseurl (url1, u1, 0);
err = parseurl (url2, u2, 0);
/* Compare the canonicalized string forms. */
res = !strcmp (u1->url, u2->url);
/* Read FILE into memory and build a urlpos entry for each
   non-empty line, trimming leading and trailing whitespace.  */
get_urls_file (const char *file)
struct file_memory *fm;
const char *text, *text_end;
fm = read_file (file);
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
text_end = fm->content + fm->length;
/* Walk the buffer line by line. */
while (text < text_end)
const char *line_beg = text;
const char *line_end = memchr (text, '\n', text_end - text);
/* Strip leading whitespace. */
while (line_beg < line_end
&& ISSPACE (*line_beg))
/* Strip trailing whitespace. */
while (line_end > line_beg + 1
&& ISSPACE (*(line_end - 1)))
if (line_end > line_beg)
urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
memset (entry, 0, sizeof (*entry));
entry->url = strdupdelim (line_beg, line_end);
/* Free the linked list of urlpos, including each node's URL-related
   strings.  */
free_urlpos (urlpos *l)
urlpos *next = l->next;
FREE_MAYBE (l->local_name);
/* Rotate FNAME opt.backups times: FNAME.(n-1) -> FNAME.n for
   descending n, then FNAME itself becomes FNAME.1.  */
rotate_backups(const char *fname)
/* Room for fname + '.' + the widest backup index + NUL. */
int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
char *from = (char *)alloca (maxlen);
char *to = (char *)alloca (maxlen);
/* Only rotate regular files. */
if (stat (fname, &sb) == 0)
if (S_ISREG (sb.st_mode) == 0)
for (i = opt.backups; i > 1; i--)
sprintf (from, "%s.%d", fname, i - 1);
sprintf (to, "%s.%d", fname, i);
/* #### This will fail on machines without the rename() system
   call.  */
sprintf (to, "%s.%d", fname, 1);
/* Create all the necessary directories for PATH (a file).  Calls
   mkdirhier() internally.  */
mkalldirs (const char *path)
/* Scan backwards for the last '/' to split off the directory part. */
p = path + strlen (path);
for (; *p != '/' && p != path; p--);
/* Don't create if it's just a file. */
if ((p == path) && (*p != '/'))
t = strdupdelim (path, p);
/* Check whether the directory exists. */
if ((stat (t, &st) == 0))
if (S_ISDIR (st.st_mode))
/* If the dir exists as a file name, remove it first.  This
   is *only* for Wget to work with buggy old CERN http
   servers.  Here is the scenario: When Wget tries to
   retrieve a directory without a slash, e.g.
   http://foo/bar (bar being a directory), CERN server will
   not redirect it too http://foo/bar/ -- it will generate a
   directory listing containing links to bar/file1,
   bar/file2, etc.  Wget will lose because it saves this
   HTML listing to a file `bar', so it cannot create the
   directory.  To work around this, if the file of the same
   name exists, we just remove it and create the directory
   anew.  */
DEBUGP (("Removing %s because of directory danger!\n", t));
res = make_directory (t);
logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
949 count_slashes (const char *s)
/* Return the path name of the URL-equivalent file name, with a
   remote-like structure of directories.  The result is a fresh
   malloc-ed string.  */
mkstruct (const struct urlinfo *u)
char *host, *dir, *file, *res, *dirpref;
assert (u->dir != NULL);
assert (u->host != NULL);
/* Honor --cut-dirs: skip the first `cut' path components. */
char *ptr = u->dir + (*u->dir == '/');
int slash_count = 1 + count_slashes (ptr);
int cut = MINVAL (opt.cut_dirs, slash_count);
for (; cut && *ptr; ptr++)
STRDUP_ALLOCA (dir, ptr);
dir = u->dir + (*u->dir == '/');
host = xstrdup (u->host);
/* Check for the true name (or at least a consistent name for saving
   to directory) of HOST, reusing the hlist if possible. */
if (opt.add_hostdir && !opt.simple_check)
char *nhost = realhost (host);
/* Add dir_prefix and hostname (if required) to the beginning of
   dir.  */
if (!DOTP (opt.dir_prefix))
dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
+ strlen (host) + 1);
sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
STRDUP_ALLOCA (dirpref, host);
else /* not add_hostdir */
if (!DOTP (opt.dir_prefix))
dirpref = opt.dir_prefix;
/* If there is a prefix, prepend it. */
char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
dir = encode_string (dir);
/* Drop a trailing '/' from the directory part. */
if (l && dir[l - 1] == '/')
file = "index.html";
/* Finally, construct the full name. */
res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
/* Return a malloced copy of S, but protect any '/' characters
   (each becomes "%2F").  Two passes: measure, then copy.  */
file_name_protect_query_string (const char *s)
for (from = s; *from; from++)
destlen += 2; /* each / gets replaced with %2F, so
                 it adds two more chars. */
dest = (char *)xmalloc (destlen + 1);
for (from = s, to = dest; *from; from++)
/* Sanity check: output length must match the first-pass count. */
assert (to - dest == destlen);
/* Create a unique filename, corresponding to a given URL.  Calls
   mkstruct if necessary.  Does *not* actually create any directories.  */
url_filename (const struct urlinfo *u)
int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
file = mkstruct (u);
file = xstrdup ("index.html");
/* If the URL came with a query string, u->file will contain
   a question mark followed by query string contents.  These
   contents can contain '/' which would make us create
   unwanted directories.  These slashes must be protected
   explicitly.  */
if (!strchr (u->file, '/'))
file = xstrdup (u->file);
/*assert (strchr (u->file, '?') != NULL);*/
file = file_name_protect_query_string (u->file);
/* Check whether the prefix directory is something other than "."
   before prepending it. */
if (!DOTP (opt.dir_prefix))
char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
+ 1 + strlen (file) + 1);
sprintf (nfile, "%s/%s", opt.dir_prefix, file);
/* DOS-ish file systems don't like `%' signs in them; we change it
   to `@'.  */
for (p = file; *p; p++)
#endif /* WINDOWS */
/* Check the cases in which the unique extensions are not used:
   1) Clobbering is turned off (-nc).
   2) Retrieval with regetting.
   3) Timestamping is used.
   4) Hierarchy is built.

   The exception is the case when file does exist and is a
   directory (actually support for bad httpd-s). */
if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
&& !(file_exists_p (file) && !file_non_directory_p (file)))
/* Find a unique name. */
name = unique_name (file);
/* Like strlen(), but allow the URL to be ended with '?': the length
   reported stops at the first '?' (start of the query string), or at
   the terminating NUL when there is none.  */
urlpath_length (const char *url)
const char *q = strchr (url, '?');
return strlen (url);
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is almost completely equivalent to
   { *e = '\0'; return strrchr(b); }, except that it doesn't change
   the contents of the string.  */
find_last_char (const char *b, const char *e, char c)
/* Resolve the result of "linking" a base URI (BASE) to a
   link-specified URI (LINK).

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to behave "reasonably" in all
   foreseeable cases.  It employs little specific knowledge about
   protocols or URL-specific stuff -- it just works on strings.

   The parameter LINKLENGTH is useful if LINK is not zero-terminated.
   See uri_merge for a gentler interface to this functionality.

   #### This function should handle `./' and `../' so that the evil
   path_simplify can go.  */
uri_merge_1 (const char *base, const char *link, int linklength, int no_proto)
/* END marks the end of BASE's path proper (query string excluded). */
const char *end = base + urlpath_length (base);
/* LINK is a relative URL: we need to replace everything
   after last slash (possibly empty) with LINK.

   So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
   our result should be "whatever/foo/qux/xyzzy".  */
int need_explicit_slash = 0;
const char *start_insert;
const char *last_slash = find_last_char (base, end, '/');
/* No slash found at all.  Append LINK to what we have,
   but we'll need a slash as a separator.

   Example: if base == "foo" and link == "qux/xyzzy", then
   we cannot just append link to base, because we'd get
   "fooqux/xyzzy", whereas what we want is
   "foo/qux/xyzzy".

   To make sure the / gets inserted, we set
   need_explicit_slash to 1.  We also set start_insert
   to end + 1, so that the length calculations work out
   correctly for one more (slash) character.  Accessing
   that character is fine, since it will be the
   delimiter, '\0' or '?'.  */
/* example: "foo?..." */
/*               ^    ('?' gets changed to '/') */
start_insert = end + 1;
need_explicit_slash = 1;
else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
/* example: http://host" */
start_insert = end + 1;
need_explicit_slash = 1;
/* example: "whatever/foo/bar" */
start_insert = last_slash + 1;
span = start_insert - base;
constr = (char *)xmalloc (span + linklength + 1);
memcpy (constr, base, span);
if (need_explicit_slash)
constr[span - 1] = '/';
memcpy (constr + span, link, linklength);
constr[span + linklength] = '\0';
else /* *link == `/' */
/* LINK is an absolute path: we need to replace everything
   after (and including) the FIRST slash with LINK.

   So, if BASE is "http://host/whatever/foo/bar", and LINK is
   "/qux/xyzzy", our result should be
   "http://host/qux/xyzzy".  */
const char *start_insert = NULL; /* for gcc to shut up. */
const char *pos = base;
int seen_slash_slash = 0;
/* We're looking for the first slash, but want to ignore
   the double slash of the scheme separator.  */
slash = memchr (pos, '/', end - pos);
if (slash && !seen_slash_slash)
if (*(slash + 1) == '/')
seen_slash_slash = 1;
/* At this point, SLASH is the location of the first / after
   "//", or the first slash altogether.  START_INSERT is the
   pointer to the location where LINK will be inserted.  When
   examining the last two examples, keep in mind that LINK
   begins with '/'.  */
if (!slash && !seen_slash_slash)
/* example: "foo" */
start_insert = base;
else if (!slash && seen_slash_slash)
/* example: "http://foo" */
else if (slash && !seen_slash_slash)
/* example: "foo/bar" */
start_insert = base;
else if (slash && seen_slash_slash)
/* example: "http://something/" */
start_insert = slash;
span = start_insert - base;
constr = (char *)xmalloc (span + linklength + 1);
memcpy (constr, base, span);
memcpy (constr + span, link, linklength);
constr[span + linklength] = '\0';
else /* !no_proto */
/* LINK is already a full URI: return a copy of it unchanged. */
constr = strdupdelim (link, link + linklength);
/* Merge BASE with LINK and return the resulting URI.  This is an
   interface to uri_merge_1 that assumes that LINK is a
   zero-terminated string.  */
uri_merge (const char *base, const char *link)
return uri_merge_1 (base, link, strlen (link), !has_proto (link));
/* Optimize URL by host, destructively replacing u->host with realhost
   (u->host).  Do this regardless of opt.simple_check.  */
opt_url (struct urlinfo *u)
/* Find the "true" host. */
char *host = realhost (u->host);
assert (u->dir != NULL); /* the URL must have been parsed */
/* Refresh the printed representation. */
u->url = str_url (u, 0);
/* Returns proxy host address, in accordance with PROTO.  Command-line
   options take precedence over the corresponding environment
   variables.  Returns NULL when no (non-empty) proxy is configured.  */
getproxy (uerr_t proto)
if (proto == URLHTTP)
proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
else if (proto == URLFTP)
proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
else if (proto == URLHTTPS)
proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
#endif /* HAVE_SSL */
/* An empty proxy setting counts as no proxy at all. */
if (!proxy || !*proxy)
/* Should a host be accessed through proxy, concerning no_proxy?
   Returns nonzero when HOST does not match any no_proxy entry
   (presumably sufmatch does suffix matching -- confirm in utils.c).  */
no_proxy_match (const char *host, const char **no_proxy)
return !sufmatch (no_proxy, host);
1365 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1366 static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
/* Change the links in an HTML document.  Accepts a structure that
   defines the positions of all the links.  */
convert_links (const char *file, urlpos *l)
struct file_memory *fm;
downloaded_file_t downloaded_file_return;
logprintf (LOG_VERBOSE, _("Converting %s... "), file);
/* First we do a "dry run": go through the list L and see whether
   any URL needs to be converted in the first place.  If not, just
   leave the file alone. */
for (dry = l; dry; dry = dry->next)
if (dry->convert != CO_NOCONVERT)
logputs (LOG_VERBOSE, _("nothing to do.\n"));
fm = read_file (file);
logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
file, strerror (errno));
downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
if (opt.backup_converted && downloaded_file_return)
write_backup_file (file, downloaded_file_return);
/* Before opening the file for writing, unlink the file.  This is
   important if the data in FM is mmaped.  In such case, nulling the
   file, which is what fopen() below does, would make us read all
   zeroes from the mmaped region. */
if (unlink (file) < 0 && errno != ENOENT)
logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
file, strerror (errno));
read_file_free (fm);
/* Now open the file for writing. */
fp = fopen (file, "wb");
logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
file, strerror (errno));
read_file_free (fm);
/* Here we loop through all the URLs in file, replacing those of
   them that are downloaded with relative references. */
for (; l; l = l->next)
char *url_start = fm->content + l->pos;
if (l->pos >= fm->length)
DEBUGP (("Something strange is going on.  Please investigate."));
/* If the URL is not to be converted, skip it. */
if (l->convert == CO_NOCONVERT)
DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
/* Echo the file contents, up to the offending URL's opening
   quote, to the outfile. */
fwrite (p, 1, url_start - p, fp);
if (l->convert == CO_CONVERT_TO_RELATIVE)
/* Convert absolute URL to relative. */
char *newname = construct_relative (file, l->local_name);
char *quoted_newname = html_quote_string (newname);
replace_attr (&p, l->size, fp, quoted_newname);
DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
l->url, newname, l->pos, file));
xfree (quoted_newname);
else if (l->convert == CO_CONVERT_TO_COMPLETE)
/* Convert the link to absolute URL. */
char *newlink = l->url;
char *quoted_newlink = html_quote_string (newlink);
replace_attr (&p, l->size, fp, quoted_newlink);
DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
newlink, l->pos, file));
xfree (quoted_newlink);
/* Output the rest of the file. */
if (p - fm->content < fm->length)
fwrite (p, 1, fm->length - (p - fm->content), fp);
read_file_free (fm);
/* NOTE(review): no fclose (fp) is visible in this chunk -- confirm
   the stream is closed in the full source.  */
logputs (LOG_VERBOSE, _("done.\n"));
/* Construct and return a malloced copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.

   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   "images/news.gif".

   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */
static char *
construct_relative (const char *s1, const char *s2)
{
  int i, cnt, sepdirs1;
  char *res;

  /* An absolute S2 needs no relativizing -- return a copy as-is.  */
  if (*s2 == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');
  i = cnt = 0;
  /* Skip the directories common to both strings.  */
  while (1)
    {
      /* Advance over the longest common run of characters that does
	 not cross a directory separator.  */
      while (s1[i] && s2[i]
	     && (s1[i] == s2[i])
	     && (s1[i] != '/')
	     && (s2[i] != '/'))
	++i;
      if (s1[i] == '/' && s2[i] == '/')
	cnt = ++i;		/* both hit a separator: common dir done */
      else
	break;			/* strings diverge inside a component */
    }
  /* Count the directory separators left in S1; each one means one
     "../" is needed to climb out of the referring file's directory.  */
  for (sepdirs1 = 0; s1[i]; i++)
    if (s1[i] == '/')
      ++sepdirs1;
  /* Now, construct the file as of:
     - ../ repeated sepdirs1 time
     - all the non-mutual directories of S2.  */
  res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
  for (i = 0; i < sepdirs1; i++)
    memcpy (res + 3 * i, "../", 3);
  strcpy (res + 3 * i, s2 + cnt);
  return res;
}
1535 /* Add URL to the head of the list L. */
1537 add_url (urlpos *l, const char *url, const char *file)
1541 t = (urlpos *)xmalloc (sizeof (urlpos));
1542 memset (t, 0, sizeof (*t));
1543 t->url = xstrdup (url);
1544 t->local_name = xstrdup (file);
1550 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1552 /* Rather than just writing over the original .html file with the
1553 converted version, save the former to *.orig. Note we only do
1554 this for files we've _successfully_ downloaded, so we don't
1555 clobber .orig files sitting around from previous invocations. */
1557 /* Construct the backup filename as the original name plus ".orig". */
1558 size_t filename_len = strlen(file);
1559 char* filename_plus_orig_suffix;
1560 boolean already_wrote_backup_file = FALSE;
1561 slist* converted_file_ptr;
1562 static slist* converted_files = NULL;
1564 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1566 /* Just write "orig" over "html". We need to do it this way
1567 because when we're checking to see if we've downloaded the
1568 file before (to see if we can skip downloading it), we don't
1569 know if it's a text/html file. Therefore we don't know yet
1570 at that stage that -E is going to cause us to tack on
1571 ".html", so we need to compare vs. the original URL plus
1572 ".orig", not the original URL plus ".html.orig". */
1573 filename_plus_orig_suffix = alloca (filename_len + 1);
1574 strcpy(filename_plus_orig_suffix, file);
1575 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1577 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1579 /* Append ".orig" to the name. */
1580 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1581 strcpy(filename_plus_orig_suffix, file);
1582 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1585 /* We can get called twice on the same URL thanks to the
1586 convert_all_links() call in main(). If we write the .orig file
1587 each time in such a case, it'll end up containing the first-pass
1588 conversion, not the original file. So, see if we've already been
1589 called on this file. */
1590 converted_file_ptr = converted_files;
1591 while (converted_file_ptr != NULL)
1592 if (strcmp(converted_file_ptr->string, file) == 0)
1594 already_wrote_backup_file = TRUE;
1598 converted_file_ptr = converted_file_ptr->next;
1600 if (!already_wrote_backup_file)
1602 /* Rename <file> to <file>.orig before former gets written over. */
1603 if (rename(file, filename_plus_orig_suffix) != 0)
1604 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1605 file, filename_plus_orig_suffix, strerror (errno));
1607 /* Remember that we've already written a .orig backup for this file.
1608 Note that we never free this memory since we need it till the
1609 convert_all_links() call, which is one of the last things the
1610 program does before terminating. BTW, I'm not sure if it would be
1611 safe to just set 'converted_file_ptr->string' to 'file' below,
1612 rather than making a copy of the string... Another note is that I
1613 thought I could just add a field to the urlpos structure saying
1614 that we'd written a .orig file for this URL, but that didn't work,
1615 so I had to make this separate list.
1616 -- Dan Harkless <wget@harkless.org>
1618 This [adding a field to the urlpos structure] didn't work
1619 because convert_file() is called twice: once after all its
1620 sublinks have been retrieved in recursive_retrieve(), and
1621 once at the end of the day in convert_all_links(). The
1622 original linked list collected in recursive_retrieve() is
1623 lost after the first invocation of convert_links(), and
1624 convert_all_links() makes a new one (it calls get_urls_html()
1625 for each file it covers.) That's why your first approach didn't
1626 work. The way to make it work is perhaps to make this flag a
1627 field in the `urls_html' list.
1628 -- Hrvoje Niksic <hniksic@arsdigita.com>
1630 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1631 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1632 converted_file_ptr->next = converted_files;
1633 converted_files = converted_file_ptr;
1637 static int find_fragment PARAMS ((const char *, int, const char **,
/* Replace the attribute value starting at *PP (RAW_SIZE bytes long,
   quotes included if present) by writing NEW_STR to FP instead,
   preserving the original quoting style and any trailing fragment
   identifier (e.g. "#section").  On return *PP is advanced past the
   old value so the caller can resume copying from there.  */
static void
replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
{
  const char *p = *pp;
  int quote_flag = 0;		/* was the original value quoted? */
  int size = raw_size;
  char quote_char = '\"';	/* quote character to emit around NEW_STR */
  const char *frag_beg, *frag_end;

  /* Structure of our string is:
       "...old-contents..."
       <--- l->size --->  (with quotes)
     OR:
       ...old-contents...
       <--- l->size -->   (no quotes)  */

  if (*p == '\"' || *p == '\'')
    {
      quote_char = *p;		/* reuse the original quote character */
      quote_flag = 1;
      ++p;
      size -= 2;		/* disregard opening and closing quote */
    }
  putc (quote_char, fp);
  fputs (new_str, fp);

  /* Look for fragment identifier, if any. */
  if (find_fragment (p, size, &frag_beg, &frag_end))
    fwrite (frag_beg, 1, frag_end - frag_beg, fp);
  p += size;
  if (quote_flag)
    ++p;			/* step over the closing quote too */
  putc (quote_char, fp);
  *pp = p;
}
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
   preceded by '&'.  If the character is not found, return zero.  If
   the character is found, return 1 and set BP and EP to point to the
   beginning and end of the region.

   This is used for finding the fragment identifiers in URLs.  */

static int
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *end = beg + size;
  int saw_amp = 0;		/* was the previous character an '&'? */
  for (; beg < end; beg++)
    {
      switch (*beg)
	{
	case '&':
	  saw_amp = 1;
	  break;
	case '#':
	  if (!saw_amp)
	    {
	      *bp = beg;
	      *ep = end;
	      return 1;
	    }
	  /* '#' right after '&' is skipped -- presumably so that
	     character references such as "&#38;" are not mistaken
	     for fragments.  Fall through to reset SAW_AMP.  */
	default:
	  saw_amp = 0;
	}
    }
  return 0;
}
1710 typedef struct _downloaded_file_list {
1712 downloaded_file_t download_type;
1713 struct _downloaded_file_list* next;
1714 } downloaded_file_list;
1716 static downloaded_file_list *downloaded_files;
1718 /* Remembers which files have been downloaded. In the standard case, should be
1719 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1720 download successfully (i.e. not for ones we have failures on or that we skip
1723 When we've downloaded a file and tacked on a ".html" extension due to -E,
1724 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1725 FILE_DOWNLOADED_NORMALLY.
1727 If you just want to check if a file has been previously added without adding
1728 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
1729 with local filenames, not remote URLs. */
1731 downloaded_file (downloaded_file_t mode, const char* file)
1733 boolean found_file = FALSE;
1734 downloaded_file_list* rover = downloaded_files;
1736 while (rover != NULL)
1737 if (strcmp(rover->file, file) == 0)
1743 rover = rover->next;
1746 return rover->download_type; /* file had already been downloaded */
1749 if (mode != CHECK_FOR_FILE)
1751 rover = xmalloc(sizeof(*rover));
1752 rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1753 rover->download_type = mode;
1754 rover->next = downloaded_files;
1755 downloaded_files = rover;
1758 return FILE_NOT_ALREADY_DOWNLOADED;
1763 downloaded_files_free (void)
1765 downloaded_file_list* rover = downloaded_files;
1768 downloaded_file_list *next = rover->next;
1769 xfree (rover->file);