sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with this program; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif
  29 #include <ctype.h>
  30 #include <sys/types.h>
  31 #ifdef HAVE_UNISTD_H
  32 # include <unistd.h>
  33 #endif
  34 #include <errno.h>
  35 #include <assert.h>
  36
  37 #include "wget.h"
  38 #include "utils.h"
  39 #include "url.h"
  40 #include "host.h"
  41 #include "html.h"
  42
  43 #ifndef errno
  44 extern int errno;
  45 #endif
  46
  47 /* Default port definitions */
  48 #define DEFAULT_HTTP_PORT 80
  49 #define DEFAULT_FTP_PORT 21
  50
  51 /* URL separator (for findurl) */
  52 #define URL_SEPARATOR "!\"#'(),>`{}|<>"
  53
  54 /* A list of unsafe characters for encoding, as per RFC1738.  '@' and
  55    ':' (not listed in RFC) were added because of user/password
  56    encoding.  */
  57
  58 #ifndef WINDOWS
  59 # define URL_UNSAFE_CHARS "<>\"#%{}|\\^~[]`@:"
  60 #else  /* WINDOWS */
  61 # define URL_UNSAFE_CHARS "<>\"%{}|\\^[]`"
  62 #endif /* WINDOWS */
  63
  64 #define UNSAFE_CHAR(c) (   ((unsigned char)(c) <= ' ')  /* ASCII 32  */  \
  65                         || ((unsigned char)(c) >  '~')  /* ASCII 127 */  \
  66                         || strchr (URL_UNSAFE_CHARS, c))
  67
  68 /* If S contains unsafe characters, free it and replace it with a
  69    version that doesn't.  */
  70 #define URL_CLEANSE(s) do                       \
  71 {                                               \
  72   if (contains_unsafe (s))                      \
  73     {                                           \
  74       char *uc_tmp = encode_string (s);         \
  75       free (s);                                 \
  76       (s) = uc_tmp;                             \
  77     }                                           \
  78 } while (0)
  79
  80 /* Is a directory "."?  */
  81 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
  82 /* Is a directory ".."?  */
  83 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  84
  85 /* NULL-terminated list of strings to be recognized as prototypes (URL
  86    schemes).  Note that recognized doesn't mean supported -- only HTTP
  87    and FTP are currently supported.
  88
  89    However, a string that does not match anything in the list will be
  90    considered a relative URL.  Thus it's important that this list has
  91    anything anyone could think of being legal.
  92
  93    There are wild things here.  :-) Take a look at
  94    <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more
  95    fun.  */
  96 static char *protostrings[] =
  97 {
  98   "cid:",
  99   "clsid:",
 100   "file:",
 101   "finger:",
 102   "ftp:",
 103   "gopher:",
 104   "hdl:",
 105   "http:",
 106   "https:",
 107   "ilu:",
 108   "ior:",
 109   "irc:",
 110   "java:",
 111   "javascript:",
 112   "lifn:",
 113   "mailto:",
 114   "mid:",
 115   "news:",
 116   "nntp:",
 117   "path:",
 118   "prospero:",
 119   "rlogin:",
 120   "service:",
 121   "shttp:",
 122   "snews:",
 123   "stanf:",
 124   "telnet:",
 125   "tn3270:",
 126   "wais:",
 127   "whois++:",
 128   NULL
 129 };
 130
 131 struct proto
 132 {
 133   char *name;
 134   uerr_t ind;
 135   unsigned short port;
 136 };
 137
 138 /* Similar to former, but for supported protocols: */
 139 static struct proto sup_protos[] =
 140 {
 141   { "http://", URLHTTP, DEFAULT_HTTP_PORT },
 142   { "ftp://", URLFTP, DEFAULT_FTP_PORT },
 143   /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
 144 };
 145
 146 static void parse_dir PARAMS ((const char *, char **, char **));
 147 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
 148 static char *construct PARAMS ((const char *, const char *, int , int));
 149 static char *construct_relative PARAMS ((const char *, const char *));
 150 static char process_ftp_type PARAMS ((char *));
 151
 152 \f
 153 /* Returns the number of characters to be skipped if the first thing
 154    in a URL is URL: (which is 0 or 4+).  The optional spaces after
 155    URL: are also skipped.  */
 156 int
 157 skip_url (const char *url)
 158 {
 159   int i;
 160
 161   if (TOUPPER (url[0]) == 'U'
 162       && TOUPPER (url[1]) == 'R'
 163       && TOUPPER (url[2]) == 'L'
 164       && url[3] == ':')
 165     {
 166       /* Skip blanks.  */
 167       for (i = 4; url[i] && ISSPACE (url[i]); i++);
 168       return i;
 169     }
 170   else
 171     return 0;
 172 }
 173
 174 /* Returns 1 if the string contains unsafe characters, 0 otherwise.  */
 175 int
 176 contains_unsafe (const char *s)
 177 {
 178   for (; *s; s++)
 179     if (UNSAFE_CHAR (*s))
 180       return 1;
 181   return 0;
 182 }
 183
 184 /* Decodes the forms %xy in a URL to the character the hexadecimal
 185    code of which is xy.  xy are hexadecimal digits from
 186    [0123456789ABCDEF] (case-insensitive).  If x or y are not
 187    hex-digits or `%' precedes `\0', the sequence is inserted
 188    literally.  */
 189
 190 static void
 191 decode_string (char *s)
 192 {
 193   char *p = s;
 194
 195   for (; *s; s++, p++)
 196     {
 197       if (*s != '%')
 198         *p = *s;
 199       else
 200         {
 201           /* Do nothing if at the end of the string, or if the chars
 202              are not hex-digits.  */
 203           if (!*(s + 1) || !*(s + 2)
 204               || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
 205             {
 206               *p = *s;
 207               continue;
 208             }
 209           *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
 210           s += 2;
 211         }
 212     }
 213   *p = '\0';
 214 }
 215
 216 /* Encodes the unsafe characters (listed in URL_UNSAFE_CHARS) in a
 217    given string, returning a malloc-ed %XX encoded string.  */
 218 char *
 219 encode_string (const char *s)
 220 {
 221   const char *b;
 222   char *p, *res;
 223   int i;
 224
 225   b = s;
 226   for (i = 0; *s; s++, i++)
 227     if (UNSAFE_CHAR (*s))
 228       i += 2; /* Two more characters (hex digits) */
 229   res = (char *)xmalloc (i + 1);
 230   s = b;
 231   for (p = res; *s; s++)
 232     if (UNSAFE_CHAR (*s))
 233       {
 234         const unsigned char c = *s;
 235         *p++ = '%';
 236         *p++ = HEXD2ASC (c >> 4);
 237         *p++ = HEXD2ASC (c & 0xf);
 238       }
 239     else
 240       *p++ = *s;
 241   *p = '\0';
 242   return res;
 243 }
 244 \f
 245 /* Returns the proto-type if URL's protocol is supported, or
 246    URLUNKNOWN if not.  */
 247 uerr_t
 248 urlproto (const char *url)
 249 {
 250   int i;
 251
 252   url += skip_url (url);
 253   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 254     if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
 255       return sup_protos[i].ind;
 256   for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
 257   if (url[i] == ':')
 258     {
 259       for (++i; url[i] && url[i] != '/'; i++)
 260         if (!ISDIGIT (url[i]))
 261           return URLBADPORT;
 262       if (url[i - 1] == ':')
 263         return URLFTP;
 264       else
 265         return URLHTTP;
 266     }
 267   else
 268     return URLHTTP;
 269 }
 270
 271 /* Skip the protocol part of the URL, e.g. `http://'.  If no protocol
 272    part is found, returns 0.  */
 273 int
 274 skip_proto (const char *url)
 275 {
 276   char **s;
 277   int l;
 278
 279   for (s = protostrings; *s; s++)
 280     if (!strncasecmp (*s, url, strlen (*s)))
 281       break;
 282   if (!*s)
 283     return 0;
 284   l = strlen (*s);
 285   /* HTTP and FTP protocols are expected to yield exact host names
 286      (i.e. the `//' part must be skipped, too).  */
 287   if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
 288     l += 2;
 289   return l;
 290 }
 291
 292 /* Returns 1 if the URL begins with a protocol (supported or
 293    unsupported), 0 otherwise.  */
 294 static int
 295 has_proto (const char *url)
 296 {
 297   char **s;
 298
 299   url += skip_url (url);
 300   for (s = protostrings; *s; s++)
 301     if (strncasecmp (url, *s, strlen (*s)) == 0)
 302       return 1;
 303   return 0;
 304 }
 305
 306 /* Skip the username and password, if present here.  The function
 307    should be called *not* with the complete URL, but with the part
 308    right after the protocol.
 309
 310    If no username and password are found, return 0.  */
 311 int
 312 skip_uname (const char *url)
 313 {
 314   const char *p;
 315   for (p = url; *p && *p != '/'; p++)
 316     if (*p == '@')
 317       break;
 318   /* If a `@' was found before the first occurrence of `/', skip
 319      it.  */
 320   if (*p == '@')
 321     return p - url + 1;
 322   else
 323     return 0;
 324 }
 325 \f
 326 /* Allocate a new urlinfo structure, fill it with default values and
 327    return a pointer to it.  */
 328 struct urlinfo *
 329 newurl (void)
 330 {
 331   struct urlinfo *u;
 332
 333   u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
 334   memset (u, 0, sizeof (*u));
 335   u->proto = URLUNKNOWN;
 336   return u;
 337 }
 338
 339 /* Perform a "deep" free of the urlinfo structure.  The structure
 340    should have been created with newurl, but need not have been used.
 341    If free_pointer is non-0, free the pointer itself.  */
 342 void
 343 freeurl (struct urlinfo *u, int complete)
 344 {
 345   assert (u != NULL);
 346   FREE_MAYBE (u->url);
 347   FREE_MAYBE (u->host);
 348   FREE_MAYBE (u->path);
 349   FREE_MAYBE (u->file);
 350   FREE_MAYBE (u->dir);
 351   FREE_MAYBE (u->user);
 352   FREE_MAYBE (u->passwd);
 353   FREE_MAYBE (u->local);
 354   FREE_MAYBE (u->referer);
 355   if (u->proxy)
 356     freeurl (u->proxy, 1);
 357   if (complete)
 358     free (u);
 359   return;
 360 }
 361 \f
 362 /* Extract the given URL of the form
 363    (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
 364    1. hostname (terminated with `/' or `:')
 365    2. port number (terminated with `/'), or chosen for the protocol
 366    3. dirname (everything after hostname)
 367    Most errors are handled.  No allocation is done, you must supply
 368    pointers to allocated memory.
 369    ...and a host of other stuff :-)
 370
 371    - Recognizes hostname:dir/file for FTP and
 372      hostname (:portnum)?/dir/file for HTTP.
 373    - Parses the path to yield directory and file
 374    - Parses the URL to yield the username and passwd (if present)
 375    - Decodes the strings, in case they contain "forbidden" characters
 376    - Writes the result to struct urlinfo
 377
 378    If the argument STRICT is set, it recognizes only the canonical
 379    form.  */
 380 uerr_t
 381 parseurl (const char *url, struct urlinfo *u, int strict)
 382 {
 383   int i, l, abs_ftp;
 384   int recognizable;            /* Recognizable URL is the one where
 385                                   the protocol name was explicitly
 386                                   named, i.e. it wasn't deduced from
 387                                   the URL format.  */
 388   uerr_t type;
 389
 390   DEBUGP (("parseurl (\"%s\") -> ", url));
 391   url += skip_url (url);
 392   recognizable = has_proto (url);
 393   if (strict && !recognizable)
 394     return URLUNKNOWN;
 395   for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
 396     {
 397       l = strlen (sup_protos[i].name);
 398       if (!strncasecmp (sup_protos[i].name, url, l))
 399         break;
 400     }
 401   /* If protocol is recognizable, but unsupported, bail out, else
 402      suppose unknown.  */
 403   if (recognizable && !sup_protos[i].name)
 404     return URLUNKNOWN;
 405   else if (i == ARRAY_SIZE (sup_protos))
 406     type = URLUNKNOWN;
 407   else
 408     u->proto = type = sup_protos[i].ind;
 409
 410   if (type == URLUNKNOWN)
 411     l = 0;
 412   /* Allow a username and password to be specified (i.e. just skip
 413      them for now).  */
 414   if (recognizable)
 415     l += skip_uname (url + l);
 416   for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
 417   if (i == l)
 418     return URLBADHOST;
 419   /* Get the hostname.  */
 420   u->host = strdupdelim (url + l, url + i);
 421   DEBUGP (("host %s -> ", u->host));
 422
 423   /* Assume no port has been given.  */
 424   u->port = 0;
 425   if (url[i] == ':')
 426     {
 427       /* We have a colon delimiting the hostname.  It could mean that
 428          a port number is following it, or a directory.  */
 429       if (ISDIGIT (url[++i]))    /* A port number */
 430         {
 431           if (type == URLUNKNOWN)
 432             u->proto = type = URLHTTP;
 433           for (; url[i] && url[i] != '/'; i++)
 434             if (ISDIGIT (url[i]))
 435               u->port = 10 * u->port + (url[i] - '0');
 436             else
 437               return URLBADPORT;
 438           if (!u->port)
 439             return URLBADPORT;
 440           DEBUGP (("port %hu -> ", u->port));
 441         }
 442       else if (type == URLUNKNOWN) /* or a directory */
 443         u->proto = type = URLFTP;
 444       else                      /* or just a misformed port number */
 445         return URLBADPORT;
 446     }
 447   else if (type == URLUNKNOWN)
 448     u->proto = type = URLHTTP;
 449   if (!u->port)
 450     {
 451       int i;
 452       for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 453         if (sup_protos[i].ind == type)
 454           break;
 455       if (i == ARRAY_SIZE (sup_protos))
 456         return URLUNKNOWN;
 457       u->port = sup_protos[i].port;
 458     }
 459   /* Some delimiter troubles...  */
 460   if (url[i] == '/' && url[i - 1] != ':')
 461     ++i;
 462   if (type == URLHTTP)
 463     while (url[i] && url[i] == '/')
 464       ++i;
 465   u->path = (char *)xmalloc (strlen (url + i) + 8);
 466   strcpy (u->path, url + i);
 467   if (type == URLFTP)
 468     {
 469       u->ftp_type = process_ftp_type (u->path);
 470       /* #### We don't handle type `d' correctly yet.  */
 471       if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
 472         u->ftp_type = 'I';
 473     }
 474   DEBUGP (("opath %s -> ", u->path));
 475   /* Parse the username and password (if existing).  */
 476   parse_uname (url, &u->user, &u->passwd);
 477   /* Decode the strings, as per RFC 1738.  */
 478   decode_string (u->host);
 479   decode_string (u->path);
 480   if (u->user)
 481     decode_string (u->user);
 482   if (u->passwd)
 483     decode_string (u->passwd);
 484   /* Parse the directory.  */
 485   parse_dir (u->path, &u->dir, &u->file);
 486   DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
 487   /* Simplify the directory.  */
 488   path_simplify (u->dir);
 489   /* Remove the leading `/' in HTTP.  */
 490   if (type == URLHTTP && *u->dir == '/')
 491     strcpy (u->dir, u->dir + 1);
 492   DEBUGP (("ndir %s\n", u->dir));
 493   /* Strip trailing `/'.  */
 494   l = strlen (u->dir);
 495   if (l && u->dir[l - 1] == '/')
 496     u->dir[l - 1] = '\0';
 497   /* Re-create the path: */
 498   abs_ftp = (u->proto == URLFTP && *u->dir == '/');
 499   /*  sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
 500       abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
 501   strcpy (u->path, abs_ftp ? "%2F" : "/");
 502   strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
 503   strcat (u->path, *u->dir ? "/" : "");
 504   strcat (u->path, u->file);
 505   URL_CLEANSE (u->path);
 506   /* Create the clean URL.  */
 507   u->url = str_url (u, 0);
 508   return URLOK;
 509 }
 510 \f
 511 /* Build the directory and filename components of the path.  Both
 512    components are *separately* malloc-ed strings!  It does not change
 513    the contents of path.
 514
 515    If the path ends with "." or "..", they are (correctly) counted as
 516    directories.  */
 517 static void
 518 parse_dir (const char *path, char **dir, char **file)
 519 {
 520   int i, l;
 521
 522   for (i = l = strlen (path); i && path[i] != '/'; i--);
 523   if (!i && *path != '/')   /* Just filename */
 524     {
 525       if (DOTP (path) || DDOTP (path))
 526         {
 527           *dir = xstrdup (path);
 528           *file = xstrdup ("");
 529         }
 530       else
 531         {
 532           *dir = xstrdup ("");     /* This is required because of FTP */
 533           *file = xstrdup (path);
 534         }
 535     }
 536   else if (!i)                 /* /filename */
 537     {
 538       if (DOTP (path + 1) || DDOTP (path + 1))
 539         {
 540           *dir = xstrdup (path);
 541           *file = xstrdup ("");
 542         }
 543       else
 544         {
 545           *dir = xstrdup ("/");
 546           *file = xstrdup (path + 1);
 547         }
 548     }
 549   else /* Nonempty directory with or without a filename */
 550     {
 551       if (DOTP (path + i + 1) || DDOTP (path + i + 1))
 552         {
 553           *dir = xstrdup (path);
 554           *file = xstrdup ("");
 555         }
 556       else
 557         {
 558           *dir = strdupdelim (path, path + i);
 559           *file = strdupdelim (path + i + 1, path + l + 1);
 560         }
 561     }
 562 }
 563
 564 /* Find the optional username and password within the URL, as per
 565    RFC1738.  The returned user and passwd char pointers are
 566    malloc-ed.  */
 567 static uerr_t
 568 parse_uname (const char *url, char **user, char **passwd)
 569 {
 570   int l;
 571   const char *p, *col;
 572   char **where;
 573
 574   *user = NULL;
 575   *passwd = NULL;
 576   url += skip_url (url);
 577   /* Look for end of protocol string.  */
 578   l = skip_proto (url);
 579   if (!l)
 580     return URLUNKNOWN;
 581   /* Add protocol offset.  */
 582   url += l;
 583   /* Is there an `@' character?  */
 584   for (p = url; *p && *p != '/'; p++)
 585     if (*p == '@')
 586       break;
 587   /* If not, return.  */
 588   if (*p != '@')
 589     return URLOK;
 590   /* Else find the username and password.  */
 591   for (p = col = url; *p != '@'; p++)
 592     {
 593       if (*p == ':' && !*user)
 594         {
 595           *user = (char *)xmalloc (p - url + 1);
 596           memcpy (*user, url, p - url);
 597           (*user)[p - url] = '\0';
 598           col = p + 1;
 599         }
 600     }
 601   /* Decide whether you have only the username or both.  */
 602   where = *user ? passwd : user;
 603   *where = (char *)xmalloc (p - col + 1);
 604   memcpy (*where, col, p - col);
 605   (*where)[p - col] = '\0';
 606   return URLOK;
 607 }
 608
 609 /* If PATH ends with `;type=X', return the character X.  */
 610 static char
 611 process_ftp_type (char *path)
 612 {
 613   int len = strlen (path);
 614
 615   if (len >= 7
 616       && !memcmp (path + len - 7, ";type=", 6))
 617     {
 618       path[len - 7] = '\0';
 619       return path[len - 1];
 620     }
 621   else
 622     return '\0';
 623 }
 624 \f
 625 /* Return the URL as fine-formed string, with a proper protocol, port
 626    number, directory and optional user/password.  If HIDE is non-zero,
 627    password will be hidden.  The forbidden characters in the URL will
 628    be cleansed.  */
 629 char *
 630 str_url (const struct urlinfo *u, int hide)
 631 {
 632   char *res, *host, *user, *passwd, *proto_name, *dir, *file;
 633   int i, l, ln, lu, lh, lp, lf, ld;
 634   unsigned short proto_default_port;
 635
 636   /* Look for the protocol name.  */
 637   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 638     if (sup_protos[i].ind == u->proto)
 639       break;
 640   if (i == ARRAY_SIZE (sup_protos))
 641     return NULL;
 642   proto_name = sup_protos[i].name;
 643   proto_default_port = sup_protos[i].port;
 644   host = CLEANDUP (u->host);
 645   dir = CLEANDUP (u->dir);
 646   file = CLEANDUP (u->file);
 647   user = passwd = NULL;
 648   if (u->user)
 649     user = CLEANDUP (u->user);
 650   if (u->passwd)
 651     {
 652       int i;
 653       passwd = CLEANDUP (u->passwd);
 654       if (hide)
 655         for (i = 0; passwd[i]; i++)
 656           passwd[i] = 'x';
 657     }
 658   if (u->proto == URLFTP && *dir == '/')
 659     {
 660       char *tmp = (char *)xmalloc (strlen (dir) + 3);
 661       /*sprintf (tmp, "%%2F%s", dir + 1);*/
 662       *tmp = '%';
 663       tmp[1] = '2';
 664       tmp[2] = 'F';
 665       strcpy (tmp + 3, dir + 1);
 666       free (dir);
 667       dir = tmp;
 668     }
 669
 670   ln = strlen (proto_name);
 671   lu = user ? strlen (user) : 0;
 672   lp = passwd ? strlen (passwd) : 0;
 673   lh = strlen (host);
 674   ld = strlen (dir);
 675   lf = strlen (file);
 676   res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
 677   /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
 678      (user ? user : ""), (passwd ? ":" : ""),
 679      (passwd ? passwd : ""), (user ? "@" : ""),
 680      host, u->port, dir, *dir ? "/" : "", file); */
 681   l = 0;
 682   memcpy (res, proto_name, ln);
 683   l += ln;
 684   if (user)
 685     {
 686       memcpy (res + l, user, lu);
 687       l += lu;
 688       if (passwd)
 689         {
 690           res[l++] = ':';
 691           memcpy (res + l, passwd, lp);
 692           l += lp;
 693         }
 694       res[l++] = '@';
 695     }
 696   memcpy (res + l, host, lh);
 697   l += lh;
 698   if (u->port != proto_default_port)
 699     {
 700       res[l++] = ':';
 701       long_to_string (res + l, (long)u->port);
 702       l += numdigit (u->port);
 703     }
 704   res[l++] = '/';
 705   memcpy (res + l, dir, ld);
 706   l += ld;
 707   if (*dir)
 708     res[l++] = '/';
 709   strcpy (res + l, file);
 710   free (host);
 711   free (dir);
 712   free (file);
 713   FREE_MAYBE (user);
 714   FREE_MAYBE (passwd);
 715   return res;
 716 }
 717
 718 /* Check whether two URL-s are equivalent, i.e. pointing to the same
 719    location.  Uses parseurl to parse them, and compares the canonical
 720    forms.
 721
 722    Returns 1 if the URL1 is equivalent to URL2, 0 otherwise.  Also
 723    return 0 on error.  */
 724 int
 725 url_equal (const char *url1, const char *url2)
 726 {
 727   struct urlinfo *u1, *u2;
 728   uerr_t err;
 729   int res;
 730
 731   u1 = newurl ();
 732   err = parseurl (url1, u1, 0);
 733   if (err != URLOK)
 734     {
 735       freeurl (u1, 1);
 736       return 0;
 737     }
 738   u2 = newurl ();
 739   err = parseurl (url2, u2, 0);
 740   if (err != URLOK)
 741     {
 742       freeurl (u2, 1);
 743       return 0;
 744     }
 745   res = !strcmp (u1->url, u2->url);
 746   freeurl (u1, 1);
 747   freeurl (u2, 1);
 748   return res;
 749 }
 750 \f
 751 /* Find URL of format scheme:hostname[:port]/dir in a buffer.  The
 752    buffer may contain pretty much anything; no errors are signaled.  */
 753 static const char *
 754 findurl (const char *buf, int howmuch, int *count)
 755 {
 756   char **prot;
 757   const char *s1, *s2;
 758
 759   for (s1 = buf; howmuch; s1++, howmuch--)
 760     for (prot = protostrings; *prot; prot++)
 761       if (howmuch <= strlen (*prot))
 762         continue;
 763       else if (!strncasecmp (*prot, s1, strlen (*prot)))
 764         {
 765           for (s2 = s1, *count = 0;
 766                howmuch && *s2 && *s2 >= 32 && *s2 < 127 && !ISSPACE (*s2) &&
 767                  !strchr (URL_SEPARATOR, *s2);
 768                s2++, (*count)++, howmuch--);
 769           return s1;
 770         }
 771   return NULL;
 772 }
 773
 774 /* Scans the file for signs of URL-s.  Returns a vector of pointers,
 775    each pointer representing a URL string.  The file is *not* assumed
 776    to be HTML.  */
 777 urlpos *
 778 get_urls_file (const char *file)
 779 {
 780   long nread;
 781   FILE *fp;
 782   char *buf;
 783   const char *pbuf;
 784   int size;
 785   urlpos *first, *current, *old;
 786
 787   if (file && !HYPHENP (file))
 788     {
 789       fp = fopen (file, "rb");
 790       if (!fp)
 791         {
 792           logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 793           return NULL;
 794         }
 795     }
 796   else
 797     fp = stdin;
 798   /* Load the file.  */
 799   load_file (fp, &buf, &nread);
 800   if (file && !HYPHENP (file))
 801     fclose (fp);
 802   DEBUGP (("Loaded %s (size %ld).\n", file, nread));
 803   first = current = NULL;
 804   /* Fill the linked list with URLs.  */
 805   for (pbuf = buf; (pbuf = findurl (pbuf, nread - (pbuf - buf), &size));
 806        pbuf += size)
 807     {
 808       /* Allocate the space.  */
 809       old = current;
 810       current = (urlpos *)xmalloc (sizeof (urlpos));
 811       if (old)
 812         old->next = current;
 813       memset (current, 0, sizeof (*current));
 814       current->next = NULL;
 815       current->url = (char *)xmalloc (size + 1);
 816       memcpy (current->url, pbuf, size);
 817       current->url[size] = '\0';
 818       if (!first)
 819         first = current;
 820     }
 821   /* Free the buffer.  */
 822   free (buf);
 823
 824   return first;
 825 }
 826
 827 /* Similar to get_urls_file, but for HTML files.  FILE is scanned as
 828    an HTML document using htmlfindurl(), which see.  get_urls_html()
 829    constructs the HTML-s from the relative href-s.
 830
 831    If SILENT is non-zero, do not barf on baseless relative links.  */
 832 urlpos *
 833 get_urls_html (const char *file, const char *this_url, int silent)
 834 {
 835   long nread;
 836   FILE *fp;
 837   char *orig_buf;
 838   const char *buf;
 839   int step, first_time;
 840   urlpos *first, *current, *old;
 841
 842   if (file && !HYPHENP (file))
 843     {
 844       fp = fopen (file, "rb");
 845       if (!fp)
 846         {
 847           logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 848           return NULL;
 849         }
 850     }
 851   else
 852     fp = stdin;
 853   /* Load the file.  */
 854   load_file (fp, &orig_buf, &nread);
 855   if (file && !HYPHENP (file))
 856     fclose (fp);
 857   DEBUGP (("Loaded HTML file %s (size %ld).\n", file, nread));
 858   first = current = NULL;
 859   first_time = 1;
 860   /* Iterate over the URLs in BUF, picked by htmlfindurl().  */
 861   for (buf = orig_buf;
 862        (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time));
 863        buf += step)
 864     {
 865       int i, no_proto;
 866       int size = step;
 867       const char *pbuf = buf;
 868       char *constr, *base;
 869       const char *cbase;
 870
 871       first_time = 0;
 872
 873       /* A frequent phenomenon that needs to be handled are pages
 874          generated by brain-damaged HTML generators, which refer to to
 875          URI-s as <a href="<spaces>URI<spaces>">.  We simply ignore
 876          any spaces at the beginning or at the end of the string.
 877          This is probably not strictly correct, but that's what the
 878          browsers do, so we may follow.  May the authors of "WYSIWYG"
 879          HTML tools burn in hell for the damage they've inflicted!  */
 880       while ((pbuf < buf + step) && ISSPACE (*pbuf))
 881         {
 882           ++pbuf;
 883           --size;
 884         }
 885       while (size && ISSPACE (pbuf[size - 1]))
 886         --size;
 887       if (!size)
 888         break;
 889
 890       for (i = 0; protostrings[i]; i++)
 891         {
 892           if (!strncasecmp (protostrings[i], pbuf,
 893                             MINVAL (strlen (protostrings[i]), size)))
 894             break;
 895         }
 896       /* Check for http:RELATIVE_URI.  See below for details.  */
 897       if (protostrings[i]
 898           && !(strncasecmp (pbuf, "http:", 5) == 0
 899                && strncasecmp (pbuf, "http://", 7) != 0))
 900         {
 901           no_proto = 0;
 902         }
 903       else
 904         {
 905           no_proto = 1;
 906           /* This is for extremely brain-damaged pages that refer to
 907              relative URI-s as <a href="http:URL">.  Just strip off the
 908              silly leading "http:" (as well as any leading blanks
 909              before it).  */
 910           if ((size > 5) && !strncasecmp ("http:", pbuf, 5))
 911             pbuf += 5, size -= 5;
 912         }
 913       if (!no_proto)
 914         {
 915           for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 916             {
 917               if (!strncasecmp (sup_protos[i].name, pbuf,
 918                                MINVAL (strlen (sup_protos[i].name), size)))
 919                 break;
 920             }
 921           /* Do *not* accept a non-supported protocol.  */
 922           if (i == ARRAY_SIZE (sup_protos))
 923             continue;
 924         }
 925       if (no_proto)
 926         {
 927           /* First, construct the base, which can be relative itself.
 928
 929              Criteria for creating the base are:
 930              1) html_base created by <base href="...">
 931              2) current URL
 932              3) base provided from the command line */
 933           cbase = html_base ();
 934           if (!cbase)
 935             cbase = this_url;
 936           if (!cbase)
 937             cbase = opt.base_href;
 938           if (!cbase)             /* Error condition -- a baseless
 939                                      relative link.  */
 940             {
 941               if (!opt.quiet && !silent)
 942                 {
 943                   /* Use malloc, not alloca because this is called in
 944                      a loop. */
 945                   char *temp = (char *)malloc (size + 1);
 946                   strncpy (temp, pbuf, size);
 947                   temp[size] = '\0';
 948                   logprintf (LOG_NOTQUIET,
 949                              _("Error (%s): Link %s without a base provided.\n"),
 950                              file, temp);
 951                   free (temp);
 952                 }
 953               continue;
 954             }
 955           if (this_url)
 956             base = construct (this_url, cbase, strlen (cbase),
 957                               !has_proto (cbase));
 958           else
 959             {
 960               /* Base must now be absolute, with host name and
 961                  protocol.  */
 962               if (!has_proto (cbase))
 963                 {
 964                   logprintf (LOG_NOTQUIET, _("\
 965 Error (%s): Base %s relative, without referer URL.\n"),
 966                              file, cbase);
 967                   continue;
 968                 }
 969               base = xstrdup (cbase);
 970             }
 971           constr = construct (base, pbuf, size, no_proto);
 972           free (base);
 973         }
 974       else /* has proto */
 975         {
 976           constr = (char *)xmalloc (size + 1);
 977           strncpy (constr, pbuf, size);
 978           constr[size] = '\0';
 979         }
 980 #ifdef DEBUG
 981       if (opt.debug)
 982         {
 983           char *tmp;
 984           const char *tmp2;
 985
 986           tmp2 = html_base ();
 987           /* Use malloc, not alloca because this is called in a loop. */
 988           tmp = (char *)xmalloc (size + 1);
 989           strncpy (tmp, pbuf, size);
 990           tmp[size] = '\0';
 991           logprintf (LOG_ALWAYS,
 992                      "file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
 993                      file, this_url ? this_url : "(null)",
 994                      tmp2 ? tmp2 : "(null)", tmp, constr);
 995           free (tmp);
 996         }
 997 #endif
 998
 999       /* Allocate the space.  */
1000       old = current;
1001       current = (urlpos *)xmalloc (sizeof (urlpos));
1002       if (old)
1003         old->next = current;
1004       if (!first)
1005         first = current;
1006       /* Fill the values.  */
1007       memset (current, 0, sizeof (*current));
1008       current->next = NULL;
1009       current->url = constr;
1010       current->size = size;
1011       current->pos = pbuf - orig_buf;
1012       /* A URL is relative if the host and protocol are not named,
1013          and the name does not start with `/'.  */
1014       if (no_proto && *pbuf != '/')
1015         current->flags |= (URELATIVE | UNOPROTO);
1016       else if (no_proto)
1017         current->flags |= UNOPROTO;
1018     }
1019   free (orig_buf);
1020
1021   return first;
1022 }
1023 \f
1024 /* Free the linked list of urlpos.  */
1025 void
1026 free_urlpos (urlpos *l)
1027 {
1028   while (l)
1029     {
1030       urlpos *next = l->next;
1031       free (l->url);
1032       FREE_MAYBE (l->local_name);
1033       free (l);
1034       l = next;
1035     }
1036 }
1037
1038 /* Rotate FNAME opt.backups times */
1039 void
1040 rotate_backups(const char *fname)
1041 {
1042   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1043   char *from = (char *)alloca (maxlen);
1044   char *to = (char *)alloca (maxlen);
1045   struct stat sb;
1046   int i;
1047
1048   if (stat (fname, &sb) == 0)
1049     if (S_ISREG (sb.st_mode) == 0)
1050       return;
1051
1052   for (i = opt.backups; i > 1; i--)
1053     {
1054       sprintf (from, "%s.%d", fname, i - 1);
1055       sprintf (to, "%s.%d", fname, i);
1056       /* #### This will fail on machines without the rename() system
1057          call.  */
1058       rename (from, to);
1059     }
1060
1061   sprintf (to, "%s.%d", fname, 1);
1062   rename(fname, to);
1063 }
1064
1065 /* Create all the necessary directories for PATH (a file).  Calls
1066    mkdirhier() internally.  */
1067 int
1068 mkalldirs (const char *path)
1069 {
1070   const char *p;
1071   char *t;
1072   struct stat st;
1073   int res;
1074
1075   p = path + strlen (path);
1076   for (; *p != '/' && p != path; p--);
1077   /* Don't create if it's just a file.  */
1078   if ((p == path) && (*p != '/'))
1079     return 0;
1080   t = strdupdelim (path, p);
1081   /* Check whether the directory exists.  */
1082   if ((stat (t, &st) == 0))
1083     {
1084       if (S_ISDIR (st.st_mode))
1085         {
1086           free (t);
1087           return 0;
1088         }
1089       else
1090         {
1091           /* If the dir exists as a file name, remove it first.  This
1092              is *only* for Wget to work with buggy old CERN http
1093              servers.  Here is the scenario: When Wget tries to
1094              retrieve a directory without a slash, e.g.
1095              http://foo/bar (bar being a directory), CERN server will
1096              not redirect it too http://foo/bar/ -- it will generate a
1097              directory listing containing links to bar/file1,
1098              bar/file2, etc.  Wget will lose because it saves this
1099              HTML listing to a file `bar', so it cannot create the
1100              directory.  To work around this, if the file of the same
1101              name exists, we just remove it and create the directory
1102              anyway.  */
1103           DEBUGP (("Removing %s because of directory danger!\n", t));
1104           unlink (t);
1105         }
1106     }
1107   res = make_directory (t);
1108   if (res != 0)
1109     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1110   free (t);
1111   return res;
1112 }
1113
/* Return the number of `/' characters in the NUL-terminated string S.  */
static int
count_slashes (const char *s)
{
  const char *p;
  int total = 0;

  for (p = s; *p != '\0'; p++)
    total += (*p == '/');
  return total;
}
1123
1124 /* Return the path name of the URL-equivalent file name, with a
1125    remote-like structure of directories.  */
1126 static char *
1127 mkstruct (const struct urlinfo *u)
1128 {
1129   char *host, *dir, *file, *res, *dirpref;
1130   int l;
1131
1132   assert (u->dir != NULL);
1133   assert (u->host != NULL);
1134
1135   if (opt.cut_dirs)
1136     {
1137       char *ptr = u->dir + (*u->dir == '/');
1138       int slash_count = 1 + count_slashes (ptr);
1139       int cut = MINVAL (opt.cut_dirs, slash_count);
1140       for (; cut && *ptr; ptr++)
1141         if (*ptr == '/')
1142           --cut;
1143       STRDUP_ALLOCA (dir, ptr);
1144     }
1145   else
1146     dir = u->dir + (*u->dir == '/');
1147
1148   host = xstrdup (u->host);
1149   /* Check for the true name (or at least a consistent name for saving
1150      to directory) of HOST, reusing the hlist if possible.  */
1151   if (opt.add_hostdir && !opt.simple_check)
1152     {
1153       char *nhost = realhost (host);
1154       free (host);
1155       host = nhost;
1156     }
1157   /* Add dir_prefix and hostname (if required) to the beginning of
1158      dir.  */
1159   if (opt.add_hostdir)
1160     {
1161       if (!DOTP (opt.dir_prefix))
1162         {
1163           dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1164                                     + strlen (host) + 1);
1165           sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
1166         }
1167       else
1168         STRDUP_ALLOCA (dirpref, host);
1169     }
1170   else                         /* not add_hostdir */
1171     {
1172       if (!DOTP (opt.dir_prefix))
1173         dirpref = opt.dir_prefix;
1174       else
1175         dirpref = "";
1176     }
1177   free (host);
1178
1179   /* If there is a prefix, prepend it.  */
1180   if (*dirpref)
1181     {
1182       char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1183       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1184       dir = newdir;
1185     }
1186   dir = xstrdup (dir);
1187   URL_CLEANSE (dir);
1188   l = strlen (dir);
1189   if (l && dir[l - 1] == '/')
1190     dir[l - 1] = '\0';
1191
1192   if (!*u->file)
1193     file = "index.html";
1194   else
1195     file = u->file;
1196
1197   /* Finally, construct the full name.  */
1198   res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1199   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1200   free (dir);
1201   return res;
1202 }
1203
1204 /* Create a unique filename, corresponding to a given URL.  Calls
1205    mkstruct if necessary.  Does *not* actually create any directories.  */
1206 char *
1207 url_filename (const struct urlinfo *u)
1208 {
1209   char *file, *name;
1210   int have_prefix = 0;          /* whether we must prepend opt.dir_prefix */
1211
1212   if (opt.dirstruct)
1213     {
1214       file = mkstruct (u);
1215       have_prefix = 1;
1216     }
1217   else
1218     {
1219       if (!*u->file)
1220         file = xstrdup ("index.html");
1221       else
1222         file = xstrdup (u->file);
1223     }
1224
1225   if (!have_prefix)
1226     {
1227       /* Check whether the prefix directory is something other than "."
1228          before prepending it.  */
1229       if (!DOTP (opt.dir_prefix))
1230         {
1231           char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1232                                          + 1 + strlen (file) + 1);
1233           sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1234           free (file);
1235           file = nfile;
1236         }
1237     }
1238   /* DOS-ish file systems don't like `%' signs in them; we change it
1239      to `@'.  */
1240 #ifdef WINDOWS
1241   {
1242     char *p = file;
1243     for (p = file; *p; p++)
1244       if (*p == '%')
1245         *p = '@';
1246   }
1247 #endif /* WINDOWS */
1248
1249   /* Check the cases in which the unique extensions are not used:
1250      1) Clobbering is turned off (-nc).
1251      2) Retrieval with regetting.
1252      3) Timestamping is used.
1253      4) Hierarchy is built.
1254
1255      The exception is the case when file does exist and is a
1256      directory (actually support for bad httpd-s).  */
1257   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1258       && !(file_exists_p (file) && !file_non_directory_p (file)))
1259     return file;
1260
1261   /* Find a unique name.  */
1262   name = unique_name (file);
1263   free (file);
1264   return name;
1265 }
1266
/* Construct an absolute URL from the base URL and the first SUBSIZE
   characters of SUB, returning it in memory obtained from xmalloc.

   If NO_PROTO is zero, SUB is taken to be absolute already and the
   result is simply a copy of its first SUBSIZE characters.

   Otherwise, if SUB does not begin with `/', it replaces whatever
   follows the last slash of URL; when URL has no slash at all, or its
   last slash is the second one of a `//' pair, SUB is appended to the
   whole of URL after a `/'.

   Otherwise (SUB begins with `/'), SUB replaces everything in URL
   from the first slash that is not part of a `//' pair; when there is
   no such slash, SUB is appended to the whole of URL.

   The base URL is never copied to the stack: the separating slash is
   written directly into the result, so the stack use of this function
   does not depend on the length of URL.  */
static char *
construct (const char *url, const char *sub, int subsize, int no_proto)
{
  char *constr;

  if (!no_proto)
    {
      constr = (char *)xmalloc (subsize + 1);
      strncpy (constr, sub, subsize);
      constr[subsize] = '\0';
      return constr;
    }

  if (*sub != '/')
    {
      int i;

      /* Find the last slash of URL (I ends up 0 if there is none).  */
      for (i = strlen (url); i && url[i] != '/'; i--)
        ;
      /* No usable slash: keep all of URL.  Otherwise url[i] is the
         slash, so in both cases the result starts with url[0..i-1]
         followed by a `/'.  */
      if (!i || (url[i] == url[i - 1]))
        i = strlen (url);
      constr = (char *)xmalloc (i + 1 + subsize + 1);
      memcpy (constr, url, i);
      constr[i] = '/';
      constr[i + 1] = '\0';
      strncat (constr, sub, subsize);
    }
  else /* *sub == `/' */
    {
      int i = 0;

      /* Advance I to the first slash that is not followed by another
         slash, skipping over each `//' pair, or to the end of URL.  */
      for (;;)
        {
          while (url[i] && url[i] != '/')
            i++;
          if (!url[i] || url[i + 1] != '/')
            break;
          i += 2;
        }
      /* SUB supplies its own leading slash, so only url[0..i-1] is
         kept.  */
      constr = (char *)xmalloc (i + 1 + subsize + 1);
      memcpy (constr, url, i);
      constr[i] = '\0';
      strncat (constr + i, sub, subsize);
    }
  return constr;
}
1335 \f
1336 /* Optimize URL by host, destructively replacing u->host with realhost
1337    (u->host).  Do this regardless of opt.simple_check.  */
1338 void
1339 opt_url (struct urlinfo *u)
1340 {
1341   /* Find the "true" host.  */
1342   char *host = realhost (u->host);
1343   free (u->host);
1344   u->host = host;
1345   assert (u->dir != NULL);      /* the URL must have been parsed */
1346   /* Refresh the printed representation.  */
1347   free (u->url);
1348   u->url = str_url (u, 0);
1349 }
1350 \f
1351 /* Returns proxy host address, in accordance with PROTO.  */
1352 char *
1353 getproxy (uerr_t proto)
1354 {
1355   if (proto == URLHTTP)
1356     return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1357   else if (proto == URLFTP)
1358     return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1359   else
1360     return NULL;
1361 }
1362
/* Should HOST be accessed through the proxy, given the NO_PROXY list?
   Returns 1 when NO_PROXY is NULL; otherwise returns the logical
   negation of sufmatch (NO_PROXY, HOST).  */
int
no_proxy_match (const char *host, const char **no_proxy)
{
  return !no_proxy || !sufmatch (no_proxy, host);
}
1372 \f
1373 /* Change the links in an HTML document.  Accepts a structure that
1374    defines the positions of all the links.  */
1375 void
1376 convert_links (const char *file, urlpos *l)
1377 {
1378   FILE *fp;
1379   char *buf, *p, *p2;
1380   long size;
1381
1382   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1383   /* Read from the file....  */
1384   fp = fopen (file, "rb");
1385   if (!fp)
1386     {
1387       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1388                  file, strerror (errno));
1389       return;
1390     }
1391   /* ...to a buffer.  */
1392   load_file (fp, &buf, &size);
1393   fclose (fp);
1394   if (opt.backup_converted && downloaded_file(CHECK_FOR_FILE, file))
1395     /* Rather than just writing over the original .html file with the converted
1396        version, save the former to *.orig.  Note we only do this for files we've
1397        _successfully_ downloaded, so we don't clobber .orig files sitting around
1398        from previous invocations. */
1399     {
1400       /* Construct the backup filename as the original name plus ".orig". */
1401       size_t         filename_len = strlen(file);
1402       char*          filename_plus_orig_suffix = malloc(filename_len +
1403                                                         sizeof(".orig"));
1404       boolean        already_wrote_backup_file = FALSE;
1405       slist*         converted_file_ptr;
1406       static slist*  converted_files = NULL;
1407
1408       /* Would a single s[n]printf() call be faster? */
1409       strcpy(filename_plus_orig_suffix, file);
1410       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1411
1412       /* We can get called twice on the same URL thanks to the
1413          convert_all_links() call in main().  If we write the .orig file each
1414          time in such a case, it'll end up containing the first-pass conversion,
1415          not the original file.  So, see if we've already been called on this
1416          file. */
1417       converted_file_ptr = converted_files;
1418       while (converted_file_ptr != NULL)
1419         if (strcmp(converted_file_ptr->string, file) == 0)
1420           {
1421             already_wrote_backup_file = TRUE;
1422             break;
1423           }
1424         else
1425           converted_file_ptr = converted_file_ptr->next;
1426
1427       if (!already_wrote_backup_file)
1428         {
1429           /* Rename <file> to <file>.orig before former gets written over. */
1430           if (rename(file, filename_plus_orig_suffix) != 0)
1431             logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1432                        file, filename_plus_orig_suffix, strerror (errno));
1433
1434           /* Remember that we've already written a .orig backup for this file.
1435              Note that we never free this memory since we need it till the
1436              convert_all_links() call, which is one of the last things the
1437              program does before terminating.  BTW, I'm not sure if it would be
1438              safe to just set 'converted_file_ptr->string' to 'file' below,
1439              rather than making a copy of the string...  Another note is that I
1440              thought I could just add a field to the urlpos structure saying
1441              that we'd written a .orig file for this URL, but that didn't work,
1442              so I had to make this separate list. */
1443           converted_file_ptr = malloc(sizeof(slist));
1444           converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
1445           converted_file_ptr->next = converted_files;
1446           converted_files = converted_file_ptr;
1447         }
1448
1449       free(filename_plus_orig_suffix);
1450     }
1451   /* Now open the file for writing.  */
1452   fp = fopen (file, "wb");
1453   if (!fp)
1454     {
1455       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1456                  file, strerror (errno));
1457       free (buf);
1458       return;
1459     }
1460   /* [If someone understands why multiple URLs can correspond to one local file,
1461      can they please add a comment here...?] */
1462   for (p = buf; l; l = l->next)
1463     {
1464       if (l->pos >= size)
1465         {
1466           DEBUGP (("Something strange is going on.  Please investigate."));
1467           break;
1468         }
1469       /* If the URL already is relative or it is not to be converted
1470          for some other reason (e.g. because of not having been
1471          downloaded in the first place), skip it.  */
1472       if ((l->flags & URELATIVE) || !(l->flags & UABS2REL))
1473         {
1474           DEBUGP (("Skipping %s at position %d (flags %d).\n", l->url,
1475                    l->pos, l->flags));
1476           continue;
1477         }
1478       /* Else, reach the position of the offending URL, echoing
1479          everything up to it to the outfile.  */
1480       for (p2 = buf + l->pos; p < p2; p++)
1481         putc (*p, fp);
1482       if (l->flags & UABS2REL)
1483         {
1484           char *newname = construct_relative (file, l->local_name);
1485           fprintf (fp, "%s", newname);
1486           DEBUGP (("ABS2REL: %s to %s at position %d in %s.\n",
1487                    l->url, newname, l->pos, file));
1488           free (newname);
1489         }
1490       p += l->size;
1491     }
1492   if (p - buf < size)
1493     {
1494       for (p2 = buf + size; p < p2; p++)
1495         putc (*p, fp);
1496     }
1497   fclose (fp);
1498   free (buf);
1499   logputs (LOG_VERBOSE, _("done.\n"));
1500 }
1501
/* Return an xmalloc'ed relative link that leads from the file with
   local name S1 to the file with local name S2.

   For S1 "jagor.srce.hr/index.html" and S2
   "jagor.srce.hr/images/news.gif" the result is "images/news.gif".

   For S1 "fly.cc.fer.hr/ioccc/index.html" and S2
   "fly.cc.fer.hr/images/fly.gif" the result is "../images/fly.gif".

   If S2 begins with `/', a copy of S2 is returned unchanged.

   Caveats: S1 should not begin with `/', unless S2 also begins with
   `/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */
static char *
construct_relative (const char *s1, const char *s2)
{
  int pos, common, ups, k;
  const char *rest;
  char *res, *out;

  if (*s2 == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');

  /* Skip the directories common to both strings.  COMMON ends up
     just past the last slash of the shared leading directories.  */
  pos = common = 0;
  for (;;)
    {
      while (s1[pos] && s1[pos] == s2[pos] && s1[pos] != '/')
        ++pos;
      if (s1[pos] != '/' || s2[pos] != '/')
        break;
      common = ++pos;
    }

  /* Every slash left in S1 is a directory that must be climbed out
     of.  */
  ups = 0;
  for (rest = s1 + pos; *rest; rest++)
    ups += (*rest == '/');

  /* The result is "../" repeated UPS times, followed by the part of
     S2 that is not shared with S1.  */
  res = (char *)xmalloc (3 * ups + strlen (s2 + common) + 1);
  out = res;
  for (k = 0; k < ups; k++)
    {
      memcpy (out, "../", 3);
      out += 3;
    }
  strcpy (out, s2 + common);
  return res;
}
1555 \f
1556 /* Add URL to the head of the list L.  */
1557 urlpos *
1558 add_url (urlpos *l, const char *url, const char *file)
1559 {
1560   urlpos *t;
1561
1562   t = (urlpos *)xmalloc (sizeof (urlpos));
1563   memset (t, 0, sizeof (*t));
1564   t->url = xstrdup (url);
1565   t->local_name = xstrdup (file);
1566   t->next = l;
1567   return t;
1568 }
1569
1570
1571 /* Remembers which files have been downloaded.  Should be called with
1572    add_or_check == ADD_FILE for each file we actually download successfully
1573    (i.e. not for ones we have failures on or that we skip due to -N).  If you
1574    just want to check if a file has been previously added without adding it,
1575    call with add_or_check == CHECK_FOR_FILE.  Please be sure to call this
1576    function with local filenames, not remote URLs -- by some means that isn't
1577    commented well enough for me understand, multiple remote URLs can apparently
1578    correspond to a single local file. */
1579 boolean
1580 downloaded_file (downloaded_file_t  add_or_check, const char*  file)
1581 {
1582   boolean        found_file = FALSE;
1583   static slist*  downloaded_files = NULL;
1584   slist*         rover = downloaded_files;
1585
1586   while (rover != NULL)
1587     if (strcmp(rover->string, file) == 0)
1588       {
1589         found_file = TRUE;
1590         break;
1591       }
1592     else
1593       rover = rover->next;
1594
1595   if (found_file)
1596     return TRUE;  /* file had already been downloaded */
1597   else
1598     {
1599       if (add_or_check == ADD_FILE)
1600         {
1601           rover = malloc(sizeof(slist));
1602           rover->string = xstrdup(file);  /* die on out-of-mem. */
1603           rover->next = downloaded_files;
1604           downloaded_files = rover;
1605         }
1606
1607       return FALSE;  /* file had not already been downloaded */
1608     }
1609 }