sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with this program; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif
  29 #include <ctype.h>
  30 #include <sys/types.h>
  31 #ifdef HAVE_UNISTD_H
  32 # include <unistd.h>
  33 #endif
  34 #include <errno.h>
  35 #include <assert.h>
  36
  37 #include "wget.h"
  38 #include "utils.h"
  39 #include "url.h"
  40 #include "host.h"
  41 #include "html.h"
  42
  43 #ifndef errno
  44 extern int errno;
  45 #endif
  46
/* Default port numbers.  They are stored in the sup_protos table
   below and used when a URL does not name a port explicitly.  */
#define DEFAULT_HTTP_PORT 80
#define DEFAULT_FTP_PORT 21

/* Characters that end a URL embedded in free text; findurl stops
   scanning at the first one of these.  ('>' is listed twice, which is
   harmless for the strchr lookup it is used with.)  */
#define URL_SEPARATOR "!\"#'(),>`{}|<>"
  53
/* A list of unsafe characters for encoding, as per RFC1738.  '@' and
   ':' (not listed in RFC) were added because of user/password
   encoding.  The WINDOWS variant leaves out '#', '~', '@' and ':'.  */

#ifndef WINDOWS
# define URL_UNSAFE_CHARS "<>\"#%{}|\\^~[]`@:"
#else  /* WINDOWS */
# define URL_UNSAFE_CHARS "<>\"%{}|\\^[]`"
#endif /* WINDOWS */

/* Nonzero if the character C has to be %-encoded: it is a control
   character or space, it lies above '~', or it is listed in
   URL_UNSAFE_CHARS.  C is evaluated three times, so it must not have
   side effects.  */
#define UNSAFE_CHAR(c) (   ((unsigned char)(c) <= ' ')  /* up to ASCII 32 */  \
                        || ((unsigned char)(c) >  '~')  /* ASCII 127 and above */  \
                        || strchr (URL_UNSAFE_CHARS, c))
  67
/* If S contains unsafe characters, free it and replace it with the
   %-encoded copy returned by encode_string.  S must be an lvalue
   holding a pointer that may be passed to free; it is evaluated
   several times, so it must not have side effects.  */
#define URL_CLEANSE(s) do                       \
{                                               \
  if (contains_unsafe (s))                      \
    {                                           \
      char *uc_tmp = encode_string (s);         \
      free (s);                                 \
      (s) = uc_tmp;                             \
    }                                           \
} while (0)
  79
  80 /* Is a directory "."?  */
  81 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
  82 /* Is a directory ".."?  */
  83 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  84
/* NULL-terminated list of strings to be recognized as protocols (URL
   schemes).  Note that recognized doesn't mean supported -- only HTTP
   and FTP are currently supported (see sup_protos).

   A string that does not start with any entry of this list is
   considered a relative URL.  Thus it's important that this list
   contains every scheme anyone could think of as legal.

   Each entry includes the trailing colon and is matched
   case-insensitively as a prefix.

   There are wild things here.  :-) Take a look at
   <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more
   fun.  */
static char *protostrings[] =
{
  "cid:",
  "clsid:",
  "file:",
  "finger:",
  "ftp:",
  "gopher:",
  "hdl:",
  "http:",
  "https:",
  "ilu:",
  "ior:",
  "irc:",
  "java:",
  "javascript:",
  "lifn:",
  "mailto:",
  "mid:",
  "news:",
  "nntp:",
  "path:",
  "prospero:",
  "rlogin:",
  "service:",
  "shttp:",
  "snews:",
  "stanf:",
  "telnet:",
  "tn3270:",
  "wais:",
  "whois++:",
  NULL
};
 130
/* Description of one supported protocol.  */
struct proto
{
  char *name;                   /* URL prefix, including the "://" */
  uerr_t ind;                   /* protocol code, e.g. URLHTTP */
  unsigned short port;          /* default port of the protocol */
};
 137
/* Similar to protostrings, but for the supported protocols only.
   Unlike protostrings, this table has no terminating sentinel entry;
   iterate over it with ARRAY_SIZE (sup_protos) and never index it
   with that value.  */
static struct proto sup_protos[] =
{
  { "http://", URLHTTP, DEFAULT_HTTP_PORT },
  { "ftp://", URLFTP, DEFAULT_FTP_PORT },
  /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
};
 145
/* Forward declarations of file-local helpers.  parse_dir, parse_uname
   and process_ftp_type are defined further down in this file.  */
static void parse_dir PARAMS ((const char *, char **, char **));
static uerr_t parse_uname PARAMS ((const char *, char **, char **));
static char *construct PARAMS ((const char *, const char *, int , int));
static char *construct_relative PARAMS ((const char *, const char *));
static char process_ftp_type PARAMS ((char *));
 151
 152 \f
 153 /* Returns the number of characters to be skipped if the first thing
 154    in a URL is URL: (which is 0 or 4+).  The optional spaces after
 155    URL: are also skipped.  */
 156 int
 157 skip_url (const char *url)
 158 {
 159   int i;
 160
 161   if (TOUPPER (url[0]) == 'U'
 162       && TOUPPER (url[1]) == 'R'
 163       && TOUPPER (url[2]) == 'L'
 164       && url[3] == ':')
 165     {
 166       /* Skip blanks.  */
 167       for (i = 4; url[i] && ISSPACE (url[i]); i++);
 168       return i;
 169     }
 170   else
 171     return 0;
 172 }
 173
 174 /* Returns 1 if the string contains unsafe characters, 0 otherwise.  */
 175 int
 176 contains_unsafe (const char *s)
 177 {
 178   for (; *s; s++)
 179     if (UNSAFE_CHAR (*s))
 180       return 1;
 181   return 0;
 182 }
 183
 184 /* Decodes the forms %xy in a URL to the character the hexadecimal
 185    code of which is xy.  xy are hexadecimal digits from
 186    [0123456789ABCDEF] (case-insensitive).  If x or y are not
 187    hex-digits or `%' precedes `\0', the sequence is inserted
 188    literally.  */
 189
 190 static void
 191 decode_string (char *s)
 192 {
 193   char *p = s;
 194
 195   for (; *s; s++, p++)
 196     {
 197       if (*s != '%')
 198         *p = *s;
 199       else
 200         {
 201           /* Do nothing if at the end of the string, or if the chars
 202              are not hex-digits.  */
 203           if (!*(s + 1) || !*(s + 2)
 204               || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
 205             {
 206               *p = *s;
 207               continue;
 208             }
 209           *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
 210           s += 2;
 211         }
 212     }
 213   *p = '\0';
 214 }
 215
 216 /* Encode the unsafe characters (as determined by URL_UNSAFE) in a
 217    given string, returning a malloc-ed %XX encoded string.  */
 218 char *
 219 encode_string (const char *s)
 220 {
 221   const char *b;
 222   char *p, *res;
 223   int i;
 224
 225   b = s;
 226   for (i = 0; *s; s++, i++)
 227     if (UNSAFE_CHAR (*s))
 228       i += 2; /* Two more characters (hex digits) */
 229   res = (char *)xmalloc (i + 1);
 230   s = b;
 231   for (p = res; *s; s++)
 232     if (UNSAFE_CHAR (*s))
 233       {
 234         const unsigned char c = *s;
 235         *p++ = '%';
 236         *p++ = HEXD2ASC (c >> 4);
 237         *p++ = HEXD2ASC (c & 0xf);
 238       }
 239     else
 240       *p++ = *s;
 241   *p = '\0';
 242   return res;
 243 }
 244 \f
 245 /* Returns the proto-type if URL's protocol is supported, or
 246    URLUNKNOWN if not.  */
 247 uerr_t
 248 urlproto (const char *url)
 249 {
 250   int i;
 251
 252   url += skip_url (url);
 253   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 254     if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
 255       return sup_protos[i].ind;
 256   for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
 257   if (url[i] == ':')
 258     {
 259       for (++i; url[i] && url[i] != '/'; i++)
 260         if (!ISDIGIT (url[i]))
 261           return URLBADPORT;
 262       if (url[i - 1] == ':')
 263         return URLFTP;
 264       else
 265         return URLHTTP;
 266     }
 267   else
 268     return URLHTTP;
 269 }
 270
 271 /* Skip the protocol part of the URL, e.g. `http://'.  If no protocol
 272    part is found, returns 0.  */
 273 int
 274 skip_proto (const char *url)
 275 {
 276   char **s;
 277   int l;
 278
 279   for (s = protostrings; *s; s++)
 280     if (!strncasecmp (*s, url, strlen (*s)))
 281       break;
 282   if (!*s)
 283     return 0;
 284   l = strlen (*s);
 285   /* HTTP and FTP protocols are expected to yield exact host names
 286      (i.e. the `//' part must be skipped, too).  */
 287   if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
 288     l += 2;
 289   return l;
 290 }
 291
 292 /* Returns 1 if the URL begins with a protocol (supported or
 293    unsupported), 0 otherwise.  */
 294 static int
 295 has_proto (const char *url)
 296 {
 297   char **s;
 298
 299   url += skip_url (url);
 300   for (s = protostrings; *s; s++)
 301     if (strncasecmp (url, *s, strlen (*s)) == 0)
 302       return 1;
 303   return 0;
 304 }
 305
 306 /* Skip the username and password, if present here.  The function
 307    should be called *not* with the complete URL, but with the part
 308    right after the protocol.
 309
 310    If no username and password are found, return 0.  */
 311 int
 312 skip_uname (const char *url)
 313 {
 314   const char *p;
 315   for (p = url; *p && *p != '/'; p++)
 316     if (*p == '@')
 317       break;
 318   /* If a `@' was found before the first occurrence of `/', skip
 319      it.  */
 320   if (*p == '@')
 321     return p - url + 1;
 322   else
 323     return 0;
 324 }
 325 \f
 326 /* Allocate a new urlinfo structure, fill it with default values and
 327    return a pointer to it.  */
 328 struct urlinfo *
 329 newurl (void)
 330 {
 331   struct urlinfo *u;
 332
 333   u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
 334   memset (u, 0, sizeof (*u));
 335   u->proto = URLUNKNOWN;
 336   return u;
 337 }
 338
 339 /* Perform a "deep" free of the urlinfo structure.  The structure
 340    should have been created with newurl, but need not have been used.
 341    If free_pointer is non-0, free the pointer itself.  */
 342 void
 343 freeurl (struct urlinfo *u, int complete)
 344 {
 345   assert (u != NULL);
 346   FREE_MAYBE (u->url);
 347   FREE_MAYBE (u->host);
 348   FREE_MAYBE (u->path);
 349   FREE_MAYBE (u->file);
 350   FREE_MAYBE (u->dir);
 351   FREE_MAYBE (u->user);
 352   FREE_MAYBE (u->passwd);
 353   FREE_MAYBE (u->local);
 354   FREE_MAYBE (u->referer);
 355   if (u->proxy)
 356     freeurl (u->proxy, 1);
 357   if (complete)
 358     free (u);
 359   return;
 360 }
 361 \f
 362 /* Extract the given URL of the form
 363    (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
 364    1. hostname (terminated with `/' or `:')
 365    2. port number (terminated with `/'), or chosen for the protocol
 366    3. dirname (everything after hostname)
 367    Most errors are handled.  No allocation is done, you must supply
 368    pointers to allocated memory.
 369    ...and a host of other stuff :-)
 370
 371    - Recognizes hostname:dir/file for FTP and
 372      hostname (:portnum)?/dir/file for HTTP.
 373    - Parses the path to yield directory and file
 374    - Parses the URL to yield the username and passwd (if present)
 375    - Decodes the strings, in case they contain "forbidden" characters
 376    - Writes the result to struct urlinfo
 377
 378    If the argument STRICT is set, it recognizes only the canonical
 379    form.  */
 380 uerr_t
 381 parseurl (const char *url, struct urlinfo *u, int strict)
 382 {
 383   int i, l, abs_ftp;
 384   int recognizable;            /* Recognizable URL is the one where
 385                                   the protocol name was explicitly
 386                                   named, i.e. it wasn't deduced from
 387                                   the URL format.  */
 388   uerr_t type;
 389
 390   DEBUGP (("parseurl (\"%s\") -> ", url));
 391   url += skip_url (url);
 392   recognizable = has_proto (url);
 393   if (strict && !recognizable)
 394     return URLUNKNOWN;
 395   for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
 396     {
 397       l = strlen (sup_protos[i].name);
 398       if (!strncasecmp (sup_protos[i].name, url, l))
 399         break;
 400     }
 401   /* If protocol is recognizable, but unsupported, bail out, else
 402      suppose unknown.  */
 403   if (recognizable && !sup_protos[i].name)
 404     return URLUNKNOWN;
 405   else if (i == ARRAY_SIZE (sup_protos))
 406     type = URLUNKNOWN;
 407   else
 408     u->proto = type = sup_protos[i].ind;
 409
 410   if (type == URLUNKNOWN)
 411     l = 0;
 412   /* Allow a username and password to be specified (i.e. just skip
 413      them for now).  */
 414   if (recognizable)
 415     l += skip_uname (url + l);
 416   for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
 417   if (i == l)
 418     return URLBADHOST;
 419   /* Get the hostname.  */
 420   u->host = strdupdelim (url + l, url + i);
 421   DEBUGP (("host %s -> ", u->host));
 422
 423   /* Assume no port has been given.  */
 424   u->port = 0;
 425   if (url[i] == ':')
 426     {
 427       /* We have a colon delimiting the hostname.  It could mean that
 428          a port number is following it, or a directory.  */
 429       if (ISDIGIT (url[++i]))    /* A port number */
 430         {
 431           if (type == URLUNKNOWN)
 432             u->proto = type = URLHTTP;
 433           for (; url[i] && url[i] != '/'; i++)
 434             if (ISDIGIT (url[i]))
 435               u->port = 10 * u->port + (url[i] - '0');
 436             else
 437               return URLBADPORT;
 438           if (!u->port)
 439             return URLBADPORT;
 440           DEBUGP (("port %hu -> ", u->port));
 441         }
 442       else if (type == URLUNKNOWN) /* or a directory */
 443         u->proto = type = URLFTP;
 444       else                      /* or just a misformed port number */
 445         return URLBADPORT;
 446     }
 447   else if (type == URLUNKNOWN)
 448     u->proto = type = URLHTTP;
 449   if (!u->port)
 450     {
 451       int i;
 452       for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 453         if (sup_protos[i].ind == type)
 454           break;
 455       if (i == ARRAY_SIZE (sup_protos))
 456         return URLUNKNOWN;
 457       u->port = sup_protos[i].port;
 458     }
 459   /* Some delimiter troubles...  */
 460   if (url[i] == '/' && url[i - 1] != ':')
 461     ++i;
 462   if (type == URLHTTP)
 463     while (url[i] && url[i] == '/')
 464       ++i;
 465   u->path = (char *)xmalloc (strlen (url + i) + 8);
 466   strcpy (u->path, url + i);
 467   if (type == URLFTP)
 468     {
 469       u->ftp_type = process_ftp_type (u->path);
 470       /* #### We don't handle type `d' correctly yet.  */
 471       if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
 472         u->ftp_type = 'I';
 473     }
 474   DEBUGP (("opath %s -> ", u->path));
 475   /* Parse the username and password (if existing).  */
 476   parse_uname (url, &u->user, &u->passwd);
 477   /* Decode the strings, as per RFC 1738.  */
 478   decode_string (u->host);
 479   decode_string (u->path);
 480   if (u->user)
 481     decode_string (u->user);
 482   if (u->passwd)
 483     decode_string (u->passwd);
 484   /* Parse the directory.  */
 485   parse_dir (u->path, &u->dir, &u->file);
 486   DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
 487   /* Simplify the directory.  */
 488   path_simplify (u->dir);
 489   /* Remove the leading `/' in HTTP.  */
 490   if (type == URLHTTP && *u->dir == '/')
 491     strcpy (u->dir, u->dir + 1);
 492   DEBUGP (("ndir %s\n", u->dir));
 493   /* Strip trailing `/'.  */
 494   l = strlen (u->dir);
 495   if (l && u->dir[l - 1] == '/')
 496     u->dir[l - 1] = '\0';
 497   /* Re-create the path: */
 498   abs_ftp = (u->proto == URLFTP && *u->dir == '/');
 499   /*  sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
 500       abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
 501   strcpy (u->path, abs_ftp ? "%2F" : "/");
 502   strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
 503   strcat (u->path, *u->dir ? "/" : "");
 504   strcat (u->path, u->file);
 505   URL_CLEANSE (u->path);
 506   /* Create the clean URL.  */
 507   u->url = str_url (u, 0);
 508   return URLOK;
 509 }
 510 \f
 511 /* Build the directory and filename components of the path.  Both
 512    components are *separately* malloc-ed strings!  It does not change
 513    the contents of path.
 514
 515    If the path ends with "." or "..", they are (correctly) counted as
 516    directories.  */
 517 static void
 518 parse_dir (const char *path, char **dir, char **file)
 519 {
 520   int i, l;
 521
 522   for (i = l = strlen (path); i && path[i] != '/'; i--);
 523   if (!i && *path != '/')   /* Just filename */
 524     {
 525       if (DOTP (path) || DDOTP (path))
 526         {
 527           *dir = xstrdup (path);
 528           *file = xstrdup ("");
 529         }
 530       else
 531         {
 532           *dir = xstrdup ("");     /* This is required because of FTP */
 533           *file = xstrdup (path);
 534         }
 535     }
 536   else if (!i)                 /* /filename */
 537     {
 538       if (DOTP (path + 1) || DDOTP (path + 1))
 539         {
 540           *dir = xstrdup (path);
 541           *file = xstrdup ("");
 542         }
 543       else
 544         {
 545           *dir = xstrdup ("/");
 546           *file = xstrdup (path + 1);
 547         }
 548     }
 549   else /* Nonempty directory with or without a filename */
 550     {
 551       if (DOTP (path + i + 1) || DDOTP (path + i + 1))
 552         {
 553           *dir = xstrdup (path);
 554           *file = xstrdup ("");
 555         }
 556       else
 557         {
 558           *dir = strdupdelim (path, path + i);
 559           *file = strdupdelim (path + i + 1, path + l + 1);
 560         }
 561     }
 562 }
 563
 564 /* Find the optional username and password within the URL, as per
 565    RFC1738.  The returned user and passwd char pointers are
 566    malloc-ed.  */
 567 static uerr_t
 568 parse_uname (const char *url, char **user, char **passwd)
 569 {
 570   int l;
 571   const char *p, *col;
 572   char **where;
 573
 574   *user = NULL;
 575   *passwd = NULL;
 576   url += skip_url (url);
 577   /* Look for end of protocol string.  */
 578   l = skip_proto (url);
 579   if (!l)
 580     return URLUNKNOWN;
 581   /* Add protocol offset.  */
 582   url += l;
 583   /* Is there an `@' character?  */
 584   for (p = url; *p && *p != '/'; p++)
 585     if (*p == '@')
 586       break;
 587   /* If not, return.  */
 588   if (*p != '@')
 589     return URLOK;
 590   /* Else find the username and password.  */
 591   for (p = col = url; *p != '@'; p++)
 592     {
 593       if (*p == ':' && !*user)
 594         {
 595           *user = (char *)xmalloc (p - url + 1);
 596           memcpy (*user, url, p - url);
 597           (*user)[p - url] = '\0';
 598           col = p + 1;
 599         }
 600     }
 601   /* Decide whether you have only the username or both.  */
 602   where = *user ? passwd : user;
 603   *where = (char *)xmalloc (p - col + 1);
 604   memcpy (*where, col, p - col);
 605   (*where)[p - col] = '\0';
 606   return URLOK;
 607 }
 608
 609 /* If PATH ends with `;type=X', return the character X.  */
 610 static char
 611 process_ftp_type (char *path)
 612 {
 613   int len = strlen (path);
 614
 615   if (len >= 7
 616       && !memcmp (path + len - 7, ";type=", 6))
 617     {
 618       path[len - 7] = '\0';
 619       return path[len - 1];
 620     }
 621   else
 622     return '\0';
 623 }
 624 \f
 625 /* Return the URL as fine-formed string, with a proper protocol, port
 626    number, directory and optional user/password.  If HIDE is non-zero,
 627    password will be hidden.  The forbidden characters in the URL will
 628    be cleansed.  */
 629 char *
 630 str_url (const struct urlinfo *u, int hide)
 631 {
 632   char *res, *host, *user, *passwd, *proto_name, *dir, *file;
 633   int i, l, ln, lu, lh, lp, lf, ld;
 634   unsigned short proto_default_port;
 635
 636   /* Look for the protocol name.  */
 637   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 638     if (sup_protos[i].ind == u->proto)
 639       break;
 640   if (i == ARRAY_SIZE (sup_protos))
 641     return NULL;
 642   proto_name = sup_protos[i].name;
 643   proto_default_port = sup_protos[i].port;
 644   host = CLEANDUP (u->host);
 645   dir = CLEANDUP (u->dir);
 646   file = CLEANDUP (u->file);
 647   user = passwd = NULL;
 648   if (u->user)
 649     user = CLEANDUP (u->user);
 650   if (u->passwd)
 651     {
 652       int i;
 653       passwd = CLEANDUP (u->passwd);
 654       if (hide)
 655         for (i = 0; passwd[i]; i++)
 656           passwd[i] = 'x';
 657     }
 658   if (u->proto == URLFTP && *dir == '/')
 659     {
 660       char *tmp = (char *)xmalloc (strlen (dir) + 3);
 661       /*sprintf (tmp, "%%2F%s", dir + 1);*/
 662       *tmp = '%';
 663       tmp[1] = '2';
 664       tmp[2] = 'F';
 665       strcpy (tmp + 3, dir + 1);
 666       free (dir);
 667       dir = tmp;
 668     }
 669
 670   ln = strlen (proto_name);
 671   lu = user ? strlen (user) : 0;
 672   lp = passwd ? strlen (passwd) : 0;
 673   lh = strlen (host);
 674   ld = strlen (dir);
 675   lf = strlen (file);
 676   res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
 677   /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
 678      (user ? user : ""), (passwd ? ":" : ""),
 679      (passwd ? passwd : ""), (user ? "@" : ""),
 680      host, u->port, dir, *dir ? "/" : "", file); */
 681   l = 0;
 682   memcpy (res, proto_name, ln);
 683   l += ln;
 684   if (user)
 685     {
 686       memcpy (res + l, user, lu);
 687       l += lu;
 688       if (passwd)
 689         {
 690           res[l++] = ':';
 691           memcpy (res + l, passwd, lp);
 692           l += lp;
 693         }
 694       res[l++] = '@';
 695     }
 696   memcpy (res + l, host, lh);
 697   l += lh;
 698   if (u->port != proto_default_port)
 699     {
 700       res[l++] = ':';
 701       long_to_string (res + l, (long)u->port);
 702       l += numdigit (u->port);
 703     }
 704   res[l++] = '/';
 705   memcpy (res + l, dir, ld);
 706   l += ld;
 707   if (*dir)
 708     res[l++] = '/';
 709   strcpy (res + l, file);
 710   free (host);
 711   free (dir);
 712   free (file);
 713   FREE_MAYBE (user);
 714   FREE_MAYBE (passwd);
 715   return res;
 716 }
 717
 718 /* Check whether two URL-s are equivalent, i.e. pointing to the same
 719    location.  Uses parseurl to parse them, and compares the canonical
 720    forms.
 721
 722    Returns 1 if the URL1 is equivalent to URL2, 0 otherwise.  Also
 723    return 0 on error.  */
 724 int
 725 url_equal (const char *url1, const char *url2)
 726 {
 727   struct urlinfo *u1, *u2;
 728   uerr_t err;
 729   int res;
 730
 731   u1 = newurl ();
 732   err = parseurl (url1, u1, 0);
 733   if (err != URLOK)
 734     {
 735       freeurl (u1, 1);
 736       return 0;
 737     }
 738   u2 = newurl ();
 739   err = parseurl (url2, u2, 0);
 740   if (err != URLOK)
 741     {
 742       freeurl (u2, 1);
 743       return 0;
 744     }
 745   res = !strcmp (u1->url, u2->url);
 746   freeurl (u1, 1);
 747   freeurl (u2, 1);
 748   return res;
 749 }
 750 \f
 751 /* Find URL of format scheme:hostname[:port]/dir in a buffer.  The
 752    buffer may contain pretty much anything; no errors are signaled.  */
 753 static const char *
 754 findurl (const char *buf, int howmuch, int *count)
 755 {
 756   char **prot;
 757   const char *s1, *s2;
 758
 759   for (s1 = buf; howmuch; s1++, howmuch--)
 760     for (prot = protostrings; *prot; prot++)
 761       if (howmuch <= strlen (*prot))
 762         continue;
 763       else if (!strncasecmp (*prot, s1, strlen (*prot)))
 764         {
 765           for (s2 = s1, *count = 0;
 766                howmuch && *s2 && *s2 >= 32 && *s2 < 127 && !ISSPACE (*s2) &&
 767                  !strchr (URL_SEPARATOR, *s2);
 768                s2++, (*count)++, howmuch--);
 769           return s1;
 770         }
 771   return NULL;
 772 }
 773
 774 /* Scans the file for signs of URL-s.  Returns a vector of pointers,
 775    each pointer representing a URL string.  The file is *not* assumed
 776    to be HTML.  */
 777 urlpos *
 778 get_urls_file (const char *file)
 779 {
 780   long nread;
 781   FILE *fp;
 782   char *buf;
 783   const char *pbuf;
 784   int size;
 785   urlpos *first, *current, *old;
 786
 787   if (file && !HYPHENP (file))
 788     {
 789       fp = fopen (file, "rb");
 790       if (!fp)
 791         {
 792           logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 793           return NULL;
 794         }
 795     }
 796   else
 797     fp = stdin;
 798   /* Load the file.  */
 799   load_file (fp, &buf, &nread);
 800   if (file && !HYPHENP (file))
 801     fclose (fp);
 802   DEBUGP (("Loaded %s (size %ld).\n", file, nread));
 803   first = current = NULL;
 804   /* Fill the linked list with URLs.  */
 805   for (pbuf = buf; (pbuf = findurl (pbuf, nread - (pbuf - buf), &size));
 806        pbuf += size)
 807     {
 808       /* Allocate the space.  */
 809       old = current;
 810       current = (urlpos *)xmalloc (sizeof (urlpos));
 811       if (old)
 812         old->next = current;
 813       memset (current, 0, sizeof (*current));
 814       current->next = NULL;
 815       current->url = (char *)xmalloc (size + 1);
 816       memcpy (current->url, pbuf, size);
 817       current->url[size] = '\0';
 818       if (!first)
 819         first = current;
 820     }
 821   /* Free the buffer.  */
 822   free (buf);
 823
 824   return first;
 825 }
 826
 827 /* Similar to get_urls_file, but for HTML files.  FILE is scanned as
 828    an HTML document using htmlfindurl(), which see.  get_urls_html()
 829    constructs the HTML-s from the relative href-s.
 830
 831    If SILENT is non-zero, do not barf on baseless relative links.  */
 832 urlpos *
 833 get_urls_html (const char *file, const char *this_url, int silent,
 834                int dash_p_leaf_HTML)
 835 {
 836   long nread;
 837   FILE *fp;
 838   char *orig_buf;
 839   const char *buf;
 840   int step, first_time;
 841   urlpos *first, *current, *old;
 842
 843   if (file && !HYPHENP (file))
 844     {
 845       fp = fopen (file, "rb");
 846       if (!fp)
 847         {
 848           logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 849           return NULL;
 850         }
 851     }
 852   else
 853     fp = stdin;
 854   /* Load the file.  */
 855   load_file (fp, &orig_buf, &nread);
 856   if (file && !HYPHENP (file))
 857     fclose (fp);
 858   DEBUGP (("Loaded HTML file %s (size %ld).\n", file, nread));
 859   first = current = NULL;
 860   first_time = 1;
 861   /* Iterate over the URLs in BUF, picked by htmlfindurl().  */
 862   for (buf = orig_buf;
 863        (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time,
 864                            dash_p_leaf_HTML));
 865        buf += step)
 866     {
 867       int i, no_proto;
 868       int size = step;
 869       const char *pbuf = buf;
 870       char *constr, *base;
 871       const char *cbase;
 872
 873       first_time = 0;
 874
 875       /* A frequent phenomenon that needs to be handled are pages
 876          generated by brain-damaged HTML generators, which refer to to
 877          URI-s as <a href="<spaces>URI<spaces>">.  We simply ignore
 878          any spaces at the beginning or at the end of the string.
 879          This is probably not strictly correct, but that's what the
 880          browsers do, so we may follow.  May the authors of "WYSIWYG"
 881          HTML tools burn in hell for the damage they've inflicted!  */
 882       while ((pbuf < buf + step) && ISSPACE (*pbuf))
 883         {
 884           ++pbuf;
 885           --size;
 886         }
 887       while (size && ISSPACE (pbuf[size - 1]))
 888         --size;
 889       if (!size)
 890         break;
 891
 892       for (i = 0; protostrings[i]; i++)
 893         {
 894           if (!strncasecmp (protostrings[i], pbuf,
 895                             MINVAL (strlen (protostrings[i]), size)))
 896             break;
 897         }
 898       /* Check for http:RELATIVE_URI.  See below for details.  */
 899       if (protostrings[i]
 900           && !(strncasecmp (pbuf, "http:", 5) == 0
 901                && strncasecmp (pbuf, "http://", 7) != 0))
 902         {
 903           no_proto = 0;
 904         }
 905       else
 906         {
 907           no_proto = 1;
 908           /* This is for extremely brain-damaged pages that refer to
 909              relative URI-s as <a href="http:URL">.  Just strip off the
 910              silly leading "http:" (as well as any leading blanks
 911              before it).  */
 912           if ((size > 5) && !strncasecmp ("http:", pbuf, 5))
 913             pbuf += 5, size -= 5;
 914         }
 915       if (!no_proto)
 916         {
 917           for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 918             {
 919               if (!strncasecmp (sup_protos[i].name, pbuf,
 920                                MINVAL (strlen (sup_protos[i].name), size)))
 921                 break;
 922             }
 923           /* Do *not* accept a non-supported protocol.  */
 924           if (i == ARRAY_SIZE (sup_protos))
 925             continue;
 926         }
 927       if (no_proto)
 928         {
 929           /* First, construct the base, which can be relative itself.
 930
 931              Criteria for creating the base are:
 932              1) html_base created by <base href="...">
 933              2) current URL
 934              3) base provided from the command line */
 935           cbase = html_base ();
 936           if (!cbase)
 937             cbase = this_url;
 938           if (!cbase)
 939             cbase = opt.base_href;
 940           if (!cbase)             /* Error condition -- a baseless
 941                                      relative link.  */
 942             {
 943               if (!opt.quiet && !silent)
 944                 {
 945                   /* Use malloc, not alloca because this is called in
 946                      a loop. */
 947                   char *temp = (char *)malloc (size + 1);
 948                   strncpy (temp, pbuf, size);
 949                   temp[size] = '\0';
 950                   logprintf (LOG_NOTQUIET,
 951                              _("Error (%s): Link %s without a base provided.\n"),
 952                              file, temp);
 953                   free (temp);
 954                 }
 955               continue;
 956             }
 957           if (this_url)
 958             base = construct (this_url, cbase, strlen (cbase),
 959                               !has_proto (cbase));
 960           else
 961             {
 962               /* Base must now be absolute, with host name and
 963                  protocol.  */
 964               if (!has_proto (cbase))
 965                 {
 966                   logprintf (LOG_NOTQUIET, _("\
 967 Error (%s): Base %s relative, without referer URL.\n"),
 968                              file, cbase);
 969                   continue;
 970                 }
 971               base = xstrdup (cbase);
 972             }
 973           constr = construct (base, pbuf, size, no_proto);
 974           free (base);
 975         }
 976       else /* has proto */
 977         {
 978           constr = (char *)xmalloc (size + 1);
 979           strncpy (constr, pbuf, size);
 980           constr[size] = '\0';
 981         }
 982 #ifdef DEBUG
 983       if (opt.debug)
 984         {
 985           char *tmp;
 986           const char *tmp2;
 987
 988           tmp2 = html_base ();
 989           /* Use malloc, not alloca because this is called in a loop. */
 990           tmp = (char *)xmalloc (size + 1);
 991           strncpy (tmp, pbuf, size);
 992           tmp[size] = '\0';
 993           logprintf (LOG_ALWAYS,
 994                      "file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
 995                      file, this_url ? this_url : "(null)",
 996                      tmp2 ? tmp2 : "(null)", tmp, constr);
 997           free (tmp);
 998         }
 999 #endif
1000
1001       /* Allocate the space.  */
1002       old = current;
1003       current = (urlpos *)xmalloc (sizeof (urlpos));
1004       if (old)
1005         old->next = current;
1006       if (!first)
1007         first = current;
1008       /* Fill the values.  */
1009       memset (current, 0, sizeof (*current));
1010       current->next = NULL;
1011       current->url = constr;
1012       current->size = size;
1013       current->pos = pbuf - orig_buf;
1014       /* A URL is relative if the host and protocol are not named,
1015          and the name does not start with `/'.  */
1016       if (no_proto && *pbuf != '/')
1017         current->flags |= (URELATIVE | UNOPROTO);
1018       else if (no_proto)
1019         current->flags |= UNOPROTO;
1020     }
1021   free (orig_buf);
1022
1023   return first;
1024 }
1025 \f
1026 /* Free the linked list of urlpos.  */
1027 void
1028 free_urlpos (urlpos *l)
1029 {
1030   while (l)
1031     {
1032       urlpos *next = l->next;
1033       free (l->url);
1034       FREE_MAYBE (l->local_name);
1035       free (l);
1036       l = next;
1037     }
1038 }
1039
1040 /* Rotate FNAME opt.backups times */
1041 void
1042 rotate_backups(const char *fname)
1043 {
1044   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1045   char *from = (char *)alloca (maxlen);
1046   char *to = (char *)alloca (maxlen);
1047   struct stat sb;
1048   int i;
1049
1050   if (stat (fname, &sb) == 0)
1051     if (S_ISREG (sb.st_mode) == 0)
1052       return;
1053
1054   for (i = opt.backups; i > 1; i--)
1055     {
1056       sprintf (from, "%s.%d", fname, i - 1);
1057       sprintf (to, "%s.%d", fname, i);
1058       /* #### This will fail on machines without the rename() system
1059          call.  */
1060       rename (from, to);
1061     }
1062
1063   sprintf (to, "%s.%d", fname, 1);
1064   rename(fname, to);
1065 }
1066
1067 /* Create all the necessary directories for PATH (a file).  Calls
1068    mkdirhier() internally.  */
1069 int
1070 mkalldirs (const char *path)
1071 {
1072   const char *p;
1073   char *t;
1074   struct stat st;
1075   int res;
1076
1077   p = path + strlen (path);
1078   for (; *p != '/' && p != path; p--);
1079   /* Don't create if it's just a file.  */
1080   if ((p == path) && (*p != '/'))
1081     return 0;
1082   t = strdupdelim (path, p);
1083   /* Check whether the directory exists.  */
1084   if ((stat (t, &st) == 0))
1085     {
1086       if (S_ISDIR (st.st_mode))
1087         {
1088           free (t);
1089           return 0;
1090         }
1091       else
1092         {
1093           /* If the dir exists as a file name, remove it first.  This
1094              is *only* for Wget to work with buggy old CERN http
1095              servers.  Here is the scenario: When Wget tries to
1096              retrieve a directory without a slash, e.g.
1097              http://foo/bar (bar being a directory), CERN server will
1098              not redirect it too http://foo/bar/ -- it will generate a
1099              directory listing containing links to bar/file1,
1100              bar/file2, etc.  Wget will lose because it saves this
1101              HTML listing to a file `bar', so it cannot create the
1102              directory.  To work around this, if the file of the same
1103              name exists, we just remove it and create the directory
1104              anyway.  */
1105           DEBUGP (("Removing %s because of directory danger!\n", t));
1106           unlink (t);
1107         }
1108     }
1109   res = make_directory (t);
1110   if (res != 0)
1111     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1112   free (t);
1113   return res;
1114 }
1115
/* Return the number of `/' characters in the NUL-terminated string S.  */
static int
count_slashes (const char *s)
{
  int total = 0;
  const char *cp;

  for (cp = s; *cp != '\0'; cp++)
    total += (*cp == '/');
  return total;
}
1125
1126 /* Return the path name of the URL-equivalent file name, with a
1127    remote-like structure of directories.  */
1128 static char *
1129 mkstruct (const struct urlinfo *u)
1130 {
1131   char *host, *dir, *file, *res, *dirpref;
1132   int l;
1133
1134   assert (u->dir != NULL);
1135   assert (u->host != NULL);
1136
1137   if (opt.cut_dirs)
1138     {
1139       char *ptr = u->dir + (*u->dir == '/');
1140       int slash_count = 1 + count_slashes (ptr);
1141       int cut = MINVAL (opt.cut_dirs, slash_count);
1142       for (; cut && *ptr; ptr++)
1143         if (*ptr == '/')
1144           --cut;
1145       STRDUP_ALLOCA (dir, ptr);
1146     }
1147   else
1148     dir = u->dir + (*u->dir == '/');
1149
1150   host = xstrdup (u->host);
1151   /* Check for the true name (or at least a consistent name for saving
1152      to directory) of HOST, reusing the hlist if possible.  */
1153   if (opt.add_hostdir && !opt.simple_check)
1154     {
1155       char *nhost = realhost (host);
1156       free (host);
1157       host = nhost;
1158     }
1159   /* Add dir_prefix and hostname (if required) to the beginning of
1160      dir.  */
1161   if (opt.add_hostdir)
1162     {
1163       if (!DOTP (opt.dir_prefix))
1164         {
1165           dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1166                                     + strlen (host) + 1);
1167           sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
1168         }
1169       else
1170         STRDUP_ALLOCA (dirpref, host);
1171     }
1172   else                         /* not add_hostdir */
1173     {
1174       if (!DOTP (opt.dir_prefix))
1175         dirpref = opt.dir_prefix;
1176       else
1177         dirpref = "";
1178     }
1179   free (host);
1180
1181   /* If there is a prefix, prepend it.  */
1182   if (*dirpref)
1183     {
1184       char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1185       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1186       dir = newdir;
1187     }
1188   dir = xstrdup (dir);
1189   URL_CLEANSE (dir);
1190   l = strlen (dir);
1191   if (l && dir[l - 1] == '/')
1192     dir[l - 1] = '\0';
1193
1194   if (!*u->file)
1195     file = "index.html";
1196   else
1197     file = u->file;
1198
1199   /* Finally, construct the full name.  */
1200   res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1201   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1202   free (dir);
1203   return res;
1204 }
1205
1206 /* Create a unique filename, corresponding to a given URL.  Calls
1207    mkstruct if necessary.  Does *not* actually create any directories.  */
1208 char *
1209 url_filename (const struct urlinfo *u)
1210 {
1211   char *file, *name;
1212   int have_prefix = 0;          /* whether we must prepend opt.dir_prefix */
1213
1214   if (opt.dirstruct)
1215     {
1216       file = mkstruct (u);
1217       have_prefix = 1;
1218     }
1219   else
1220     {
1221       if (!*u->file)
1222         file = xstrdup ("index.html");
1223       else
1224         file = xstrdup (u->file);
1225     }
1226
1227   if (!have_prefix)
1228     {
1229       /* Check whether the prefix directory is something other than "."
1230          before prepending it.  */
1231       if (!DOTP (opt.dir_prefix))
1232         {
1233           char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1234                                          + 1 + strlen (file) + 1);
1235           sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1236           free (file);
1237           file = nfile;
1238         }
1239     }
1240   /* DOS-ish file systems don't like `%' signs in them; we change it
1241      to `@'.  */
1242 #ifdef WINDOWS
1243   {
1244     char *p = file;
1245     for (p = file; *p; p++)
1246       if (*p == '%')
1247         *p = '@';
1248   }
1249 #endif /* WINDOWS */
1250
1251   /* Check the cases in which the unique extensions are not used:
1252      1) Clobbering is turned off (-nc).
1253      2) Retrieval with regetting.
1254      3) Timestamping is used.
1255      4) Hierarchy is built.
1256
1257      The exception is the case when file does exist and is a
1258      directory (actually support for bad httpd-s).  */
1259   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1260       && !(file_exists_p (file) && !file_non_directory_p (file)))
1261     return file;
1262
1263   /* Find a unique name.  */
1264   name = unique_name (file);
1265   free (file);
1266   return name;
1267 }
1268
/* Construct an absolute URL, given a (possibly) relative one.

   URL is the NUL-terminated base URL.  SUB points to the link text, of
   which only the first SUBSIZE bytes are used.  NO_PROTO is non-zero
   when SUB carries no protocol and must be resolved against URL.

   - When NO_PROTO is zero, the result is simply a copy of SUB.
   - When SUB does not start with `/', it replaces everything after
     the last `/' of URL.  If URL has no `/' at all, or its last `/'
     is the second one of a "//", SUB is appended to URL with a `/'
     in between.
   - When SUB starts with `/', it replaces everything from the first
     `/' of URL that is not part of a "//" (or is appended to URL if
     there is no such slash).

   The result is allocated with xmalloc() and belongs to the caller.
   No stack buffers proportional to the URL length are used.  */
static char *
construct (const char *url, const char *sub, int subsize, int no_proto)
{
  char *constr;
  int keep;                     /* bytes of URL copied to the result */
  int add_slash = 0;            /* insert a `/' between URL and SUB? */

  if (!no_proto)
    {
      constr = (char *)xmalloc (subsize + 1);
      strncpy (constr, sub, subsize);
      constr[subsize] = '\0';
      return constr;
    }

  if (*sub != '/')
    {
      int i;

      /* Find the last slash of URL.  */
      for (i = strlen (url); i && url[i] != '/'; i--)
        ;
      if (!i || url[i] == url[i - 1])
        {
          /* No path component in URL (no slash, or only the "//"
             after the protocol): keep all of it and add the
             separator ourselves.  */
          keep = strlen (url);
          add_slash = 1;
        }
      else
        keep = i + 1;           /* up to and including that slash */
    }
  else /* *sub == `/' */
    {
      int i = 0;

      /* Find the first slash that is not the start of a "//".  */
      for (;;)
        {
          while (url[i] && url[i] != '/')
            i++;
          if (!url[i] || url[i + 1] != '/')
            break;
          i += 2;               /* skip the "//" */
        }
      /* SUB brings its own leading slash, so stop right before the
         one in URL (or at the end of URL if there was none).  */
      keep = i;
    }

  constr = (char *)xmalloc (keep + add_slash + subsize + 1);
  memcpy (constr, url, keep);
  if (add_slash)
    constr[keep++] = '/';
  constr[keep] = '\0';
  /* strncat stops at SUBSIZE bytes (or an earlier NUL) and always
     terminates the result.  */
  strncat (constr, sub, subsize);
  return constr;
}
1337 \f
1338 /* Optimize URL by host, destructively replacing u->host with realhost
1339    (u->host).  Do this regardless of opt.simple_check.  */
1340 void
1341 opt_url (struct urlinfo *u)
1342 {
1343   /* Find the "true" host.  */
1344   char *host = realhost (u->host);
1345   free (u->host);
1346   u->host = host;
1347   assert (u->dir != NULL);      /* the URL must have been parsed */
1348   /* Refresh the printed representation.  */
1349   free (u->url);
1350   u->url = str_url (u, 0);
1351 }
1352 \f
1353 /* Returns proxy host address, in accordance with PROTO.  */
1354 char *
1355 getproxy (uerr_t proto)
1356 {
1357   if (proto == URLHTTP)
1358     return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1359   else if (proto == URLFTP)
1360     return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1361   else
1362     return NULL;
1363 }
1364
/* Should HOST be accessed through the proxy, as far as NO_PROXY is
   concerned?  Returns 1 when NO_PROXY is NULL; otherwise returns the
   negation of sufmatch (NO_PROXY, HOST) -- presumably a suffix match
   of HOST against the listed domains; confirm against sufmatch.  */
int
no_proxy_match (const char *host, const char **no_proxy)
{
  return no_proxy ? !sufmatch (no_proxy, host) : 1;
}
1374 \f
1375 /* Change the links in an HTML document.  Accepts a structure that
1376    defines the positions of all the links.  */
1377 void
1378 convert_links (const char *file, urlpos *l)
1379 {
1380   FILE               *fp;
1381   char               *buf, *p, *p2;
1382   downloaded_file_t  downloaded_file_return;
1383   long               size;
1384
1385   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1386   /* Read from the file....  */
1387   fp = fopen (file, "rb");
1388   if (!fp)
1389     {
1390       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1391                  file, strerror (errno));
1392       return;
1393     }
1394   /* ...to a buffer.  */
1395   load_file (fp, &buf, &size);
1396   fclose (fp);
1397
1398   downloaded_file_return = downloaded_file(CHECK_FOR_FILE, file);
1399
1400   if (opt.backup_converted && downloaded_file_return)
1401     /* Rather than just writing over the original .html file with the converted
1402        version, save the former to *.orig.  Note we only do this for files we've
1403        _successfully_ downloaded, so we don't clobber .orig files sitting around
1404        from previous invocations. */
1405     {
1406       /* Construct the backup filename as the original name plus ".orig". */
1407       size_t         filename_len = strlen(file);
1408       char*          filename_plus_orig_suffix;
1409       boolean        already_wrote_backup_file = FALSE;
1410       slist*         converted_file_ptr;
1411       static slist*  converted_files = NULL;
1412
1413       if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1414         {
1415           /* Just write "orig" over "html".  We need to do it this way because
1416              when we're checking to see if we've downloaded the file before (to
1417              see if we can skip downloading it), we don't know if it's a
1418              text/html file.  Therefore we don't know yet at that stage that -E
1419              is going to cause us to tack on ".html", so we need to compare
1420              vs. the original URL plus ".orig", not the original URL plus
1421              ".html.orig". */
1422           filename_plus_orig_suffix = xmalloc(filename_len + 1);
1423           strcpy(filename_plus_orig_suffix, file);
1424           strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1425         }
1426       else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1427         {
1428           /* Append ".orig" to the name. */
1429           filename_plus_orig_suffix = xmalloc(filename_len + sizeof(".orig"));
1430           strcpy(filename_plus_orig_suffix, file);
1431           strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1432         }
1433
1434       /* We can get called twice on the same URL thanks to the
1435          convert_all_links() call in main().  If we write the .orig file each
1436          time in such a case, it'll end up containing the first-pass conversion,
1437          not the original file.  So, see if we've already been called on this
1438          file. */
1439       converted_file_ptr = converted_files;
1440       while (converted_file_ptr != NULL)
1441         if (strcmp(converted_file_ptr->string, file) == 0)
1442           {
1443             already_wrote_backup_file = TRUE;
1444             break;
1445           }
1446         else
1447           converted_file_ptr = converted_file_ptr->next;
1448
1449       if (!already_wrote_backup_file)
1450         {
1451           /* Rename <file> to <file>.orig before former gets written over. */
1452           if (rename(file, filename_plus_orig_suffix) != 0)
1453             logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1454                        file, filename_plus_orig_suffix, strerror (errno));
1455
1456           /* Remember that we've already written a .orig backup for this file.
1457              Note that we never free this memory since we need it till the
1458              convert_all_links() call, which is one of the last things the
1459              program does before terminating.  BTW, I'm not sure if it would be
1460              safe to just set 'converted_file_ptr->string' to 'file' below,
1461              rather than making a copy of the string...  Another note is that I
1462              thought I could just add a field to the urlpos structure saying
1463              that we'd written a .orig file for this URL, but that didn't work,
1464              so I had to make this separate list. */
1465           converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1466           converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
1467           converted_file_ptr->next = converted_files;
1468           converted_files = converted_file_ptr;
1469         }
1470
1471       free(filename_plus_orig_suffix);
1472     }
1473   /* Now open the file for writing.  */
1474   fp = fopen (file, "wb");
1475   if (!fp)
1476     {
1477       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1478                  file, strerror (errno));
1479       free (buf);
1480       return;
1481     }
1482   /* Presumably we have to loop through multiple URLs here (even though we're
1483      only talking about a single local file) because of the -O option. */
1484   for (p = buf; l; l = l->next)
1485     {
1486       if (l->pos >= size)
1487         {
1488           DEBUGP (("Something strange is going on.  Please investigate."));
1489           break;
1490         }
1491       /* If the URL already is relative or it is not to be converted
1492          for some other reason (e.g. because of not having been
1493          downloaded in the first place), skip it.  */
1494       if ((l->flags & URELATIVE) || !(l->flags & UABS2REL))
1495         {
1496           DEBUGP (("Skipping %s at position %d (flags %d).\n", l->url,
1497                    l->pos, l->flags));
1498           continue;
1499         }
1500       /* Else, reach the position of the offending URL, echoing
1501          everything up to it to the outfile.  */
1502       for (p2 = buf + l->pos; p < p2; p++)
1503         putc (*p, fp);
1504       if (l->flags & UABS2REL)
1505         /* Convert absolute URL to relative. */
1506         {
1507           char *newname = construct_relative (file, l->local_name);
1508           fprintf (fp, "%s", newname);
1509           DEBUGP (("ABS2REL: %s to %s at position %d in %s.\n",
1510                    l->url, newname, l->pos, file));
1511           free (newname);
1512         }
1513       p += l->size;
1514     }
1515   /* Output the rest of the file. */
1516   if (p - buf < size)
1517     {
1518       for (p2 = buf + size; p < p2; p++)
1519         putc (*p, fp);
1520     }
1521   fclose (fp);
1522   free (buf);
1523   logputs (LOG_VERBOSE, _("done.\n"));
1524 }
1525
/* Return a malloced relative link leading from the file named S1 (the
   referring file) to the file named S2 (the referred file), both
   given as local names.

   Examples:
     S1 "jagor.srce.hr/index.html", S2 "jagor.srce.hr/images/news.gif"
       => "images/news.gif"
     S1 "fly.cc.fer.hr/ioccc/index.html", S2 "fly.cc.fer.hr/images/fly.gif"
       => "../images/fly.gif"

   If S2 begins with `/', a plain copy of S2 is returned.  Otherwise
   S1 must not begin with `/' either (this is asserted).  S1 should
   not contain things like ".." -- construct_relative
   ("fly/ioccc/../index.html", "fly/images/fly.gif") will fail.  (A
   workaround is to call something like path_simplify() on S1).  */
static char *
construct_relative (const char *s1, const char *s2)
{
  int pos, common, ups, k;
  char *rel, *out;

  if (*s2 == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');

  /* Skip the directories common to both strings.  COMMON ends up just
     past the last slash that closes a shared directory.  */
  pos = common = 0;
  for (;;)
    {
      /* Run over one path component for as long as both agree.  */
      while (s1[pos] && s1[pos] == s2[pos] && s1[pos] != '/')
        ++pos;
      if (s1[pos] != '/' || s2[pos] != '/')
        break;
      common = ++pos;
    }

  /* Every slash left in S1 is a directory we have to climb out of.  */
  ups = 0;
  for (; s1[pos]; pos++)
    if (s1[pos] == '/')
      ++ups;

  /* The result is "../" repeated UPS times, followed by the part of
     S2 that is not shared with S1.  */
  rel = (char *)xmalloc (3 * ups + strlen (s2 + common) + 1);
  out = rel;
  for (k = 0; k < ups; k++)
    {
      memcpy (out, "../", 3);
      out += 3;
    }
  strcpy (out, s2 + common);
  return rel;
}
1579 \f
1580 /* Add URL to the head of the list L.  */
1581 urlpos *
1582 add_url (urlpos *l, const char *url, const char *file)
1583 {
1584   urlpos *t;
1585
1586   t = (urlpos *)xmalloc (sizeof (urlpos));
1587   memset (t, 0, sizeof (*t));
1588   t->url = xstrdup (url);
1589   t->local_name = xstrdup (file);
1590   t->next = l;
1591   return t;
1592 }
1593
1594
1595 /* Remembers which files have been downloaded.  In the standard case, should be
1596    called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1597    download successfully (i.e. not for ones we have failures on or that we skip
1598    due to -N).
1599
1600    When we've downloaded a file and tacked on a ".html" extension due to -E,
1601    call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1602    FILE_DOWNLOADED_NORMALLY.
1603
1604    If you just want to check if a file has been previously added without adding
1605    it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
1606    with local filenames, not remote URLs. */
1607 downloaded_file_t
1608 downloaded_file (downloaded_file_t  mode, const char*  file)
1609 {
1610   typedef struct _downloaded_file_list
1611   {
1612     char*                          file;
1613     downloaded_file_t              download_type;
1614     struct _downloaded_file_list*  next;
1615   } downloaded_file_list;
1616
1617   boolean                       found_file = FALSE;
1618   static downloaded_file_list*  downloaded_files = NULL;
1619   downloaded_file_list*         rover = downloaded_files;
1620
1621   while (rover != NULL)
1622     if (strcmp(rover->file, file) == 0)
1623       {
1624         found_file = TRUE;
1625         break;
1626       }
1627     else
1628       rover = rover->next;
1629
1630   if (found_file)
1631     return rover->download_type;  /* file had already been downloaded */
1632   else
1633     {
1634       if (mode != CHECK_FOR_FILE)
1635         {
1636           rover = xmalloc(sizeof(*rover));
1637           rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1638           rover->download_type = mode;
1639           rover->next = downloaded_files;
1640           downloaded_files = rover;
1641         }
1642
1643       return FILE_NOT_ALREADY_DOWNLOADED;
1644     }
1645 }