sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with this program; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif
  29 #include <ctype.h>
  30 #include <sys/types.h>
  31 #ifdef HAVE_UNISTD_H
  32 # include <unistd.h>
  33 #endif
  34 #include <errno.h>
  35 #include <assert.h>
  36
  37 #include "wget.h"
  38 #include "utils.h"
  39 #include "url.h"
  40 #include "host.h"
  41 #include "html.h"
  42
  43 #ifndef errno
  44 extern int errno;
  45 #endif
  46
/* Default port definitions: the port assumed for a protocol when the
   URL does not name one explicitly.  */
#define DEFAULT_HTTP_PORT 80
#define DEFAULT_FTP_PORT 21

/* URL separators (for findurl): any of these characters ends the URL
   being scanned.  Note that `>' is listed twice; the duplicate is
   harmless.  */
#define URL_SEPARATOR "!\"#'(),>`{}|<>"
  53
/* A list of unsafe characters for encoding, as per RFC1738.  '@' and
   ':' (not listed in RFC) were added because of user/password
   encoding.  The WINDOWS variant of the list leaves out '#', '~',
   '@' and ':'.  */

#ifndef WINDOWS
# define URL_UNSAFE_CHARS "<>\"#%{}|\\^~[]`@:"
#else  /* WINDOWS */
# define URL_UNSAFE_CHARS "<>\"%{}|\\^[]`"
#endif /* WINDOWS */

/* Non-zero if C must be %XX-encoded: control characters and space,
   everything above '~' (which is ASCII 126, so 127 and all 8-bit
   values qualify), and the characters in URL_UNSAFE_CHARS.  */
#define UNSAFE_CHAR(c) (   ((unsigned char)(c) <= ' ')  /* ASCII<=32 */  \
                        || ((unsigned char)(c) >  '~')  /* ASCII>126 */  \
                        || strchr (URL_UNSAFE_CHARS, c))
  67
/* If S contains unsafe characters, free it and replace it with a
   version that doesn't.

   S is evaluated more than once and is assigned to, so it must be a
   side-effect-free lvalue.  The old value is passed to free(), and
   the new one comes from encode_string (i.e. it is malloc-ed).  */
#define URL_CLEANSE(s) do                       \
{                                               \
  if (contains_unsafe (s))                      \
    {                                           \
      char *uc_tmp = encode_string (s);         \
      free (s);                                 \
      (s) = uc_tmp;                             \
    }                                           \
} while (0)
  79
  80 /* Is a directory "."?  */
  81 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
  82 /* Is a directory ".."?  */
  83 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  84
/* NULL-terminated list of strings to be recognized as protocols (URL
   schemes).  Each entry includes the trailing colon.  Note that
   recognized doesn't mean supported -- only HTTP and FTP are
   currently supported (see sup_protos).

   However, a string that does not match anything in the list will be
   considered a relative URL.  Thus it's important that this list has
   anything anyone could think of being legal.

   There are wild things here.  :-) Take a look at
   <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more
   fun.  */
static char *protostrings[] =
{
  "cid:",
  "clsid:",
  "file:",
  "finger:",
  "ftp:",
  "gopher:",
  "hdl:",
  "http:",
  "https:",
  "ilu:",
  "ior:",
  "irc:",
  "java:",
  "javascript:",
  "lifn:",
  "mailto:",
  "mid:",
  "news:",
  "nntp:",
  "path:",
  "prospero:",
  "rlogin:",
  "service:",
  "shttp:",
  "snews:",
  "stanf:",
  "telnet:",
  "tn3270:",
  "wais:",
  "whois++:",
  NULL
};
 130
/* Description of one supported protocol (an entry of sup_protos).  */
struct proto
{
  char *name;                   /* URL prefix, e.g. "http://" */
  uerr_t ind;                   /* code identifying the protocol */
  unsigned short port;          /* port used when the URL names none */
};
 137
/* Similar to protostrings, but for supported protocols only.  Unlike
   protostrings, this table has no terminating entry: iterate over it
   with ARRAY_SIZE and never index it at ARRAY_SIZE (sup_protos).  */
static struct proto sup_protos[] =
{
  { "http://", URLHTTP, DEFAULT_HTTP_PORT },
  { "ftp://", URLFTP, DEFAULT_FTP_PORT },
  /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
};
 145
 146 static void parse_dir PARAMS ((const char *, char **, char **));
 147 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
 148 static char *construct PARAMS ((const char *, const char *, int , int));
 149 static char *construct_relative PARAMS ((const char *, const char *));
 150 static char process_ftp_type PARAMS ((char *));
 151
 152 \f
 153 /* Returns the number of characters to be skipped if the first thing
 154    in a URL is URL: (which is 0 or 4+).  The optional spaces after
 155    URL: are also skipped.  */
 156 int
 157 skip_url (const char *url)
 158 {
 159   int i;
 160
 161   if (TOUPPER (url[0]) == 'U'
 162       && TOUPPER (url[1]) == 'R'
 163       && TOUPPER (url[2]) == 'L'
 164       && url[3] == ':')
 165     {
 166       /* Skip blanks.  */
 167       for (i = 4; url[i] && ISSPACE (url[i]); i++);
 168       return i;
 169     }
 170   else
 171     return 0;
 172 }
 173
 174 /* Returns 1 if the string contains unsafe characters, 0 otherwise.  */
 175 int
 176 contains_unsafe (const char *s)
 177 {
 178   for (; *s; s++)
 179     if (UNSAFE_CHAR (*s))
 180       return 1;
 181   return 0;
 182 }
 183
 184 /* Decodes the forms %xy in a URL to the character the hexadecimal
 185    code of which is xy.  xy are hexadecimal digits from
 186    [0123456789ABCDEF] (case-insensitive).  If x or y are not
 187    hex-digits or `%' precedes `\0', the sequence is inserted
 188    literally.  */
 189
 190 static void
 191 decode_string (char *s)
 192 {
 193   char *p = s;
 194
 195   for (; *s; s++, p++)
 196     {
 197       if (*s != '%')
 198         *p = *s;
 199       else
 200         {
 201           /* Do nothing if at the end of the string, or if the chars
 202              are not hex-digits.  */
 203           if (!*(s + 1) || !*(s + 2)
 204               || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
 205             {
 206               *p = *s;
 207               continue;
 208             }
 209           *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
 210           s += 2;
 211         }
 212     }
 213   *p = '\0';
 214 }
 215
 216 /* Encode the unsafe characters (as determined by URL_UNSAFE) in a
 217    given string, returning a malloc-ed %XX encoded string.  */
 218 char *
 219 encode_string (const char *s)
 220 {
 221   const char *b;
 222   char *p, *res;
 223   int i;
 224
 225   b = s;
 226   for (i = 0; *s; s++, i++)
 227     if (UNSAFE_CHAR (*s))
 228       i += 2; /* Two more characters (hex digits) */
 229   res = (char *)xmalloc (i + 1);
 230   s = b;
 231   for (p = res; *s; s++)
 232     if (UNSAFE_CHAR (*s))
 233       {
 234         const unsigned char c = *s;
 235         *p++ = '%';
 236         *p++ = HEXD2ASC (c >> 4);
 237         *p++ = HEXD2ASC (c & 0xf);
 238       }
 239     else
 240       *p++ = *s;
 241   *p = '\0';
 242   return res;
 243 }
 244 \f
 245 /* Returns the proto-type if URL's protocol is supported, or
 246    URLUNKNOWN if not.  */
 247 uerr_t
 248 urlproto (const char *url)
 249 {
 250   int i;
 251
 252   url += skip_url (url);
 253   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 254     if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
 255       return sup_protos[i].ind;
 256   for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
 257   if (url[i] == ':')
 258     {
 259       for (++i; url[i] && url[i] != '/'; i++)
 260         if (!ISDIGIT (url[i]))
 261           return URLBADPORT;
 262       if (url[i - 1] == ':')
 263         return URLFTP;
 264       else
 265         return URLHTTP;
 266     }
 267   else
 268     return URLHTTP;
 269 }
 270
 271 /* Skip the protocol part of the URL, e.g. `http://'.  If no protocol
 272    part is found, returns 0.  */
 273 int
 274 skip_proto (const char *url)
 275 {
 276   char **s;
 277   int l;
 278
 279   for (s = protostrings; *s; s++)
 280     if (!strncasecmp (*s, url, strlen (*s)))
 281       break;
 282   if (!*s)
 283     return 0;
 284   l = strlen (*s);
 285   /* HTTP and FTP protocols are expected to yield exact host names
 286      (i.e. the `//' part must be skipped, too).  */
 287   if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
 288     l += 2;
 289   return l;
 290 }
 291
 292 /* Returns 1 if the URL begins with a protocol (supported or
 293    unsupported), 0 otherwise.  */
 294 static int
 295 has_proto (const char *url)
 296 {
 297   char **s;
 298
 299   url += skip_url (url);
 300   for (s = protostrings; *s; s++)
 301     if (strncasecmp (url, *s, strlen (*s)) == 0)
 302       return 1;
 303   return 0;
 304 }
 305
 306 /* Skip the username and password, if present here.  The function
 307    should be called *not* with the complete URL, but with the part
 308    right after the protocol.
 309
 310    If no username and password are found, return 0.  */
 311 int
 312 skip_uname (const char *url)
 313 {
 314   const char *p;
 315   for (p = url; *p && *p != '/'; p++)
 316     if (*p == '@')
 317       break;
 318   /* If a `@' was found before the first occurrence of `/', skip
 319      it.  */
 320   if (*p == '@')
 321     return p - url + 1;
 322   else
 323     return 0;
 324 }
 325 \f
 326 /* Allocate a new urlinfo structure, fill it with default values and
 327    return a pointer to it.  */
 328 struct urlinfo *
 329 newurl (void)
 330 {
 331   struct urlinfo *u;
 332
 333   u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
 334   memset (u, 0, sizeof (*u));
 335   u->proto = URLUNKNOWN;
 336   return u;
 337 }
 338
 339 /* Perform a "deep" free of the urlinfo structure.  The structure
 340    should have been created with newurl, but need not have been used.
 341    If free_pointer is non-0, free the pointer itself.  */
 342 void
 343 freeurl (struct urlinfo *u, int complete)
 344 {
 345   assert (u != NULL);
 346   FREE_MAYBE (u->url);
 347   FREE_MAYBE (u->host);
 348   FREE_MAYBE (u->path);
 349   FREE_MAYBE (u->file);
 350   FREE_MAYBE (u->dir);
 351   FREE_MAYBE (u->user);
 352   FREE_MAYBE (u->passwd);
 353   FREE_MAYBE (u->local);
 354   FREE_MAYBE (u->referer);
 355   if (u->proxy)
 356     freeurl (u->proxy, 1);
 357   if (complete)
 358     free (u);
 359   return;
 360 }
 361 \f
 362 /* Extract the given URL of the form
 363    (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
 364    1. hostname (terminated with `/' or `:')
 365    2. port number (terminated with `/'), or chosen for the protocol
 366    3. dirname (everything after hostname)
 367    Most errors are handled.  No allocation is done, you must supply
 368    pointers to allocated memory.
 369    ...and a host of other stuff :-)
 370
 371    - Recognizes hostname:dir/file for FTP and
 372      hostname (:portnum)?/dir/file for HTTP.
 373    - Parses the path to yield directory and file
 374    - Parses the URL to yield the username and passwd (if present)
 375    - Decodes the strings, in case they contain "forbidden" characters
 376    - Writes the result to struct urlinfo
 377
 378    If the argument STRICT is set, it recognizes only the canonical
 379    form.  */
 380 uerr_t
 381 parseurl (const char *url, struct urlinfo *u, int strict)
 382 {
 383   int i, l, abs_ftp;
 384   int recognizable;            /* Recognizable URL is the one where
 385                                   the protocol name was explicitly
 386                                   named, i.e. it wasn't deduced from
 387                                   the URL format.  */
 388   uerr_t type;
 389
 390   DEBUGP (("parseurl (\"%s\") -> ", url));
 391   url += skip_url (url);
 392   recognizable = has_proto (url);
 393   if (strict && !recognizable)
 394     return URLUNKNOWN;
 395   for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
 396     {
 397       l = strlen (sup_protos[i].name);
 398       if (!strncasecmp (sup_protos[i].name, url, l))
 399         break;
 400     }
 401   /* If protocol is recognizable, but unsupported, bail out, else
 402      suppose unknown.  */
 403   if (recognizable && !sup_protos[i].name)
 404     return URLUNKNOWN;
 405   else if (i == ARRAY_SIZE (sup_protos))
 406     type = URLUNKNOWN;
 407   else
 408     u->proto = type = sup_protos[i].ind;
 409
 410   if (type == URLUNKNOWN)
 411     l = 0;
 412   /* Allow a username and password to be specified (i.e. just skip
 413      them for now).  */
 414   if (recognizable)
 415     l += skip_uname (url + l);
 416   for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
 417   if (i == l)
 418     return URLBADHOST;
 419   /* Get the hostname.  */
 420   u->host = strdupdelim (url + l, url + i);
 421   DEBUGP (("host %s -> ", u->host));
 422
 423   /* Assume no port has been given.  */
 424   u->port = 0;
 425   if (url[i] == ':')
 426     {
 427       /* We have a colon delimiting the hostname.  It could mean that
 428          a port number is following it, or a directory.  */
 429       if (ISDIGIT (url[++i]))    /* A port number */
 430         {
 431           if (type == URLUNKNOWN)
 432             u->proto = type = URLHTTP;
 433           for (; url[i] && url[i] != '/'; i++)
 434             if (ISDIGIT (url[i]))
 435               u->port = 10 * u->port + (url[i] - '0');
 436             else
 437               return URLBADPORT;
 438           if (!u->port)
 439             return URLBADPORT;
 440           DEBUGP (("port %hu -> ", u->port));
 441         }
 442       else if (type == URLUNKNOWN) /* or a directory */
 443         u->proto = type = URLFTP;
 444       else                      /* or just a misformed port number */
 445         return URLBADPORT;
 446     }
 447   else if (type == URLUNKNOWN)
 448     u->proto = type = URLHTTP;
 449   if (!u->port)
 450     {
 451       int i;
 452       for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 453         if (sup_protos[i].ind == type)
 454           break;
 455       if (i == ARRAY_SIZE (sup_protos))
 456         return URLUNKNOWN;
 457       u->port = sup_protos[i].port;
 458     }
 459   /* Some delimiter troubles...  */
 460   if (url[i] == '/' && url[i - 1] != ':')
 461     ++i;
 462   if (type == URLHTTP)
 463     while (url[i] && url[i] == '/')
 464       ++i;
 465   u->path = (char *)xmalloc (strlen (url + i) + 8);
 466   strcpy (u->path, url + i);
 467   if (type == URLFTP)
 468     {
 469       u->ftp_type = process_ftp_type (u->path);
 470       /* #### We don't handle type `d' correctly yet.  */
 471       if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
 472         u->ftp_type = 'I';
 473     }
 474   DEBUGP (("opath %s -> ", u->path));
 475   /* Parse the username and password (if existing).  */
 476   parse_uname (url, &u->user, &u->passwd);
 477   /* Decode the strings, as per RFC 1738.  */
 478   decode_string (u->host);
 479   decode_string (u->path);
 480   if (u->user)
 481     decode_string (u->user);
 482   if (u->passwd)
 483     decode_string (u->passwd);
 484   /* Parse the directory.  */
 485   parse_dir (u->path, &u->dir, &u->file);
 486   DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
 487   /* Simplify the directory.  */
 488   path_simplify (u->dir);
 489   /* Remove the leading `/' in HTTP.  */
 490   if (type == URLHTTP && *u->dir == '/')
 491     strcpy (u->dir, u->dir + 1);
 492   DEBUGP (("ndir %s\n", u->dir));
 493   /* Strip trailing `/'.  */
 494   l = strlen (u->dir);
 495   if (l && u->dir[l - 1] == '/')
 496     u->dir[l - 1] = '\0';
 497   /* Re-create the path: */
 498   abs_ftp = (u->proto == URLFTP && *u->dir == '/');
 499   /*  sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
 500       abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
 501   strcpy (u->path, abs_ftp ? "%2F" : "/");
 502   strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
 503   strcat (u->path, *u->dir ? "/" : "");
 504   strcat (u->path, u->file);
 505   URL_CLEANSE (u->path);
 506   /* Create the clean URL.  */
 507   u->url = str_url (u, 0);
 508   return URLOK;
 509 }
 510 \f
 511 /* Build the directory and filename components of the path.  Both
 512    components are *separately* malloc-ed strings!  It does not change
 513    the contents of path.
 514
 515    If the path ends with "." or "..", they are (correctly) counted as
 516    directories.  */
 517 static void
 518 parse_dir (const char *path, char **dir, char **file)
 519 {
 520   int i, l;
 521
 522   for (i = l = strlen (path); i && path[i] != '/'; i--);
 523   if (!i && *path != '/')   /* Just filename */
 524     {
 525       if (DOTP (path) || DDOTP (path))
 526         {
 527           *dir = xstrdup (path);
 528           *file = xstrdup ("");
 529         }
 530       else
 531         {
 532           *dir = xstrdup ("");     /* This is required because of FTP */
 533           *file = xstrdup (path);
 534         }
 535     }
 536   else if (!i)                 /* /filename */
 537     {
 538       if (DOTP (path + 1) || DDOTP (path + 1))
 539         {
 540           *dir = xstrdup (path);
 541           *file = xstrdup ("");
 542         }
 543       else
 544         {
 545           *dir = xstrdup ("/");
 546           *file = xstrdup (path + 1);
 547         }
 548     }
 549   else /* Nonempty directory with or without a filename */
 550     {
 551       if (DOTP (path + i + 1) || DDOTP (path + i + 1))
 552         {
 553           *dir = xstrdup (path);
 554           *file = xstrdup ("");
 555         }
 556       else
 557         {
 558           *dir = strdupdelim (path, path + i);
 559           *file = strdupdelim (path + i + 1, path + l + 1);
 560         }
 561     }
 562 }
 563
 564 /* Find the optional username and password within the URL, as per
 565    RFC1738.  The returned user and passwd char pointers are
 566    malloc-ed.  */
 567 static uerr_t
 568 parse_uname (const char *url, char **user, char **passwd)
 569 {
 570   int l;
 571   const char *p, *col;
 572   char **where;
 573
 574   *user = NULL;
 575   *passwd = NULL;
 576   url += skip_url (url);
 577   /* Look for end of protocol string.  */
 578   l = skip_proto (url);
 579   if (!l)
 580     return URLUNKNOWN;
 581   /* Add protocol offset.  */
 582   url += l;
 583   /* Is there an `@' character?  */
 584   for (p = url; *p && *p != '/'; p++)
 585     if (*p == '@')
 586       break;
 587   /* If not, return.  */
 588   if (*p != '@')
 589     return URLOK;
 590   /* Else find the username and password.  */
 591   for (p = col = url; *p != '@'; p++)
 592     {
 593       if (*p == ':' && !*user)
 594         {
 595           *user = (char *)xmalloc (p - url + 1);
 596           memcpy (*user, url, p - url);
 597           (*user)[p - url] = '\0';
 598           col = p + 1;
 599         }
 600     }
 601   /* Decide whether you have only the username or both.  */
 602   where = *user ? passwd : user;
 603   *where = (char *)xmalloc (p - col + 1);
 604   memcpy (*where, col, p - col);
 605   (*where)[p - col] = '\0';
 606   return URLOK;
 607 }
 608
 609 /* If PATH ends with `;type=X', return the character X.  */
 610 static char
 611 process_ftp_type (char *path)
 612 {
 613   int len = strlen (path);
 614
 615   if (len >= 7
 616       && !memcmp (path + len - 7, ";type=", 6))
 617     {
 618       path[len - 7] = '\0';
 619       return path[len - 1];
 620     }
 621   else
 622     return '\0';
 623 }
 624 \f
 625 /* Return the URL as fine-formed string, with a proper protocol, port
 626    number, directory and optional user/password.  If HIDE is non-zero,
 627    password will be hidden.  The forbidden characters in the URL will
 628    be cleansed.  */
 629 char *
 630 str_url (const struct urlinfo *u, int hide)
 631 {
 632   char *res, *host, *user, *passwd, *proto_name, *dir, *file;
 633   int i, l, ln, lu, lh, lp, lf, ld;
 634   unsigned short proto_default_port;
 635
 636   /* Look for the protocol name.  */
 637   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 638     if (sup_protos[i].ind == u->proto)
 639       break;
 640   if (i == ARRAY_SIZE (sup_protos))
 641     return NULL;
 642   proto_name = sup_protos[i].name;
 643   proto_default_port = sup_protos[i].port;
 644   host = CLEANDUP (u->host);
 645   dir = CLEANDUP (u->dir);
 646   file = CLEANDUP (u->file);
 647   user = passwd = NULL;
 648   if (u->user)
 649     user = CLEANDUP (u->user);
 650   if (u->passwd)
 651     {
 652       int i;
 653       passwd = CLEANDUP (u->passwd);
 654       if (hide)
 655         for (i = 0; passwd[i]; i++)
 656           passwd[i] = 'x';
 657     }
 658   if (u->proto == URLFTP && *dir == '/')
 659     {
 660       char *tmp = (char *)xmalloc (strlen (dir) + 3);
 661       /*sprintf (tmp, "%%2F%s", dir + 1);*/
 662       *tmp = '%';
 663       tmp[1] = '2';
 664       tmp[2] = 'F';
 665       strcpy (tmp + 3, dir + 1);
 666       free (dir);
 667       dir = tmp;
 668     }
 669
 670   ln = strlen (proto_name);
 671   lu = user ? strlen (user) : 0;
 672   lp = passwd ? strlen (passwd) : 0;
 673   lh = strlen (host);
 674   ld = strlen (dir);
 675   lf = strlen (file);
 676   res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
 677   /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
 678      (user ? user : ""), (passwd ? ":" : ""),
 679      (passwd ? passwd : ""), (user ? "@" : ""),
 680      host, u->port, dir, *dir ? "/" : "", file); */
 681   l = 0;
 682   memcpy (res, proto_name, ln);
 683   l += ln;
 684   if (user)
 685     {
 686       memcpy (res + l, user, lu);
 687       l += lu;
 688       if (passwd)
 689         {
 690           res[l++] = ':';
 691           memcpy (res + l, passwd, lp);
 692           l += lp;
 693         }
 694       res[l++] = '@';
 695     }
 696   memcpy (res + l, host, lh);
 697   l += lh;
 698   if (u->port != proto_default_port)
 699     {
 700       res[l++] = ':';
 701       long_to_string (res + l, (long)u->port);
 702       l += numdigit (u->port);
 703     }
 704   res[l++] = '/';
 705   memcpy (res + l, dir, ld);
 706   l += ld;
 707   if (*dir)
 708     res[l++] = '/';
 709   strcpy (res + l, file);
 710   free (host);
 711   free (dir);
 712   free (file);
 713   FREE_MAYBE (user);
 714   FREE_MAYBE (passwd);
 715   return res;
 716 }
 717
 718 /* Check whether two URL-s are equivalent, i.e. pointing to the same
 719    location.  Uses parseurl to parse them, and compares the canonical
 720    forms.
 721
 722    Returns 1 if the URL1 is equivalent to URL2, 0 otherwise.  Also
 723    return 0 on error.  */
 724 int
 725 url_equal (const char *url1, const char *url2)
 726 {
 727   struct urlinfo *u1, *u2;
 728   uerr_t err;
 729   int res;
 730
 731   u1 = newurl ();
 732   err = parseurl (url1, u1, 0);
 733   if (err != URLOK)
 734     {
 735       freeurl (u1, 1);
 736       return 0;
 737     }
 738   u2 = newurl ();
 739   err = parseurl (url2, u2, 0);
 740   if (err != URLOK)
 741     {
 742       freeurl (u2, 1);
 743       return 0;
 744     }
 745   res = !strcmp (u1->url, u2->url);
 746   freeurl (u1, 1);
 747   freeurl (u2, 1);
 748   return res;
 749 }
 750 \f
 751 /* Find URL of format scheme:hostname[:port]/dir in a buffer.  The
 752    buffer may contain pretty much anything; no errors are signaled.  */
 753 static const char *
 754 findurl (const char *buf, int howmuch, int *count)
 755 {
 756   char **prot;
 757   const char *s1, *s2;
 758
 759   for (s1 = buf; howmuch; s1++, howmuch--)
 760     for (prot = protostrings; *prot; prot++)
 761       if (howmuch <= strlen (*prot))
 762         continue;
 763       else if (!strncasecmp (*prot, s1, strlen (*prot)))
 764         {
 765           for (s2 = s1, *count = 0;
 766                howmuch && *s2 && *s2 >= 32 && *s2 < 127 && !ISSPACE (*s2) &&
 767                  !strchr (URL_SEPARATOR, *s2);
 768                s2++, (*count)++, howmuch--);
 769           return s1;
 770         }
 771   return NULL;
 772 }
 773
 774 /* Scans the file for signs of URL-s.  Returns a vector of pointers,
 775    each pointer representing a URL string.  The file is *not* assumed
 776    to be HTML.  */
 777 urlpos *
 778 get_urls_file (const char *file)
 779 {
 780   long nread;
 781   FILE *fp;
 782   char *buf;
 783   const char *pbuf;
 784   int size;
 785   urlpos *first, *current, *old;
 786
 787   if (file && !HYPHENP (file))
 788     {
 789       fp = fopen (file, "rb");
 790       if (!fp)
 791         {
 792           logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 793           return NULL;
 794         }
 795     }
 796   else
 797     fp = stdin;
 798   /* Load the file.  */
 799   load_file (fp, &buf, &nread);
 800   if (file && !HYPHENP (file))
 801     fclose (fp);
 802   DEBUGP (("Loaded %s (size %ld).\n", file, nread));
 803   first = current = NULL;
 804   /* Fill the linked list with URLs.  */
 805   for (pbuf = buf; (pbuf = findurl (pbuf, nread - (pbuf - buf), &size));
 806        pbuf += size)
 807     {
 808       /* Allocate the space.  */
 809       old = current;
 810       current = (urlpos *)xmalloc (sizeof (urlpos));
 811       if (old)
 812         old->next = current;
 813       memset (current, 0, sizeof (*current));
 814       current->next = NULL;
 815       current->url = (char *)xmalloc (size + 1);
 816       memcpy (current->url, pbuf, size);
 817       current->url[size] = '\0';
 818       if (!first)
 819         first = current;
 820     }
 821   /* Free the buffer.  */
 822   free (buf);
 823
 824   return first;
 825 }
 826
 827 /* Similar to get_urls_file, but for HTML files.  FILE is scanned as
 828    an HTML document using htmlfindurl(), which see.  get_urls_html()
 829    constructs the HTML-s from the relative href-s.
 830
 831    If SILENT is non-zero, do not barf on baseless relative links.  */
 832 urlpos *
 833 get_urls_html (const char *file, const char *this_url, int silent,
 834                int dash_p_leaf_HTML)
 835 {
 836   long nread;
 837   FILE *fp;
 838   char *orig_buf;
 839   const char *buf;
 840   int step, first_time;
 841   urlpos *first, *current, *old;
 842
 843   if (file && !HYPHENP (file))
 844     {
 845       fp = fopen (file, "rb");
 846       if (!fp)
 847         {
 848           logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 849           return NULL;
 850         }
 851     }
 852   else
 853     fp = stdin;
 854   /* Load the file.  */
 855   load_file (fp, &orig_buf, &nread);
 856   if (file && !HYPHENP (file))
 857     fclose (fp);
 858   DEBUGP (("Loaded HTML file %s (size %ld).\n", file, nread));
 859   first = current = NULL;
 860   first_time = 1;
 861   /* Iterate over the URLs in BUF, picked by htmlfindurl().  */
 862   for (buf = orig_buf;
 863        (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time,
 864                            dash_p_leaf_HTML));
 865        buf += step)
 866     {
 867       int i, no_proto;
 868       int size = step;
 869       const char *pbuf = buf;
 870       char *constr, *base;
 871       const char *cbase;
 872
 873       first_time = 0;
 874
 875       /* A frequent phenomenon that needs to be handled are pages
 876          generated by brain-damaged HTML generators, which refer to to
 877          URI-s as <a href="<spaces>URI<spaces>">.  We simply ignore
 878          any spaces at the beginning or at the end of the string.
 879          This is probably not strictly correct, but that's what the
 880          browsers do, so we may follow.  May the authors of "WYSIWYG"
 881          HTML tools burn in hell for the damage they've inflicted!  */
 882       while ((pbuf < buf + step) && ISSPACE (*pbuf))
 883         {
 884           ++pbuf;
 885           --size;
 886         }
 887       while (size && ISSPACE (pbuf[size - 1]))
 888         --size;
 889       if (!size)
 890         break;
 891
 892       for (i = 0; protostrings[i]; i++)
 893         {
 894           if (!strncasecmp (protostrings[i], pbuf,
 895                             MINVAL (strlen (protostrings[i]), size)))
 896             break;
 897         }
 898       /* Check for http:RELATIVE_URI.  See below for details.  */
 899       if (protostrings[i]
 900           && !(strncasecmp (pbuf, "http:", 5) == 0
 901                && strncasecmp (pbuf, "http://", 7) != 0))
 902         {
 903           no_proto = 0;
 904         }
 905       else
 906         {
 907           no_proto = 1;
 908           /* This is for extremely brain-damaged pages that refer to
 909              relative URI-s as <a href="http:URL">.  Just strip off the
 910              silly leading "http:" (as well as any leading blanks
 911              before it).  */
 912           if ((size > 5) && !strncasecmp ("http:", pbuf, 5))
 913             pbuf += 5, size -= 5;
 914         }
 915       if (!no_proto)
 916         {
 917           for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 918             {
 919               if (!strncasecmp (sup_protos[i].name, pbuf,
 920                                MINVAL (strlen (sup_protos[i].name), size)))
 921                 break;
 922             }
 923           /* Do *not* accept a non-supported protocol.  */
 924           if (i == ARRAY_SIZE (sup_protos))
 925             continue;
 926         }
 927       if (no_proto)
 928         {
 929           /* First, construct the base, which can be relative itself.
 930
 931              Criteria for creating the base are:
 932              1) html_base created by <base href="...">
 933              2) current URL
 934              3) base provided from the command line */
 935           cbase = html_base ();
 936           if (!cbase)
 937             cbase = this_url;
 938           if (!cbase)
 939             cbase = opt.base_href;
 940           if (!cbase)             /* Error condition -- a baseless
 941                                      relative link.  */
 942             {
 943               if (!opt.quiet && !silent)
 944                 {
 945                   /* Use malloc, not alloca because this is called in
 946                      a loop. */
 947                   char *temp = (char *)malloc (size + 1);
 948                   strncpy (temp, pbuf, size);
 949                   temp[size] = '\0';
 950                   logprintf (LOG_NOTQUIET,
 951                              _("Error (%s): Link %s without a base provided.\n"),
 952                              file, temp);
 953                   free (temp);
 954                 }
 955               continue;
 956             }
 957           if (this_url)
 958             base = construct (this_url, cbase, strlen (cbase),
 959                               !has_proto (cbase));
 960           else
 961             {
 962               /* Base must now be absolute, with host name and
 963                  protocol.  */
 964               if (!has_proto (cbase))
 965                 {
 966                   logprintf (LOG_NOTQUIET, _("\
 967 Error (%s): Base %s relative, without referer URL.\n"),
 968                              file, cbase);
 969                   continue;
 970                 }
 971               base = xstrdup (cbase);
 972             }
 973           constr = construct (base, pbuf, size, no_proto);
 974           free (base);
 975         }
 976       else /* has proto */
 977         {
 978           constr = (char *)xmalloc (size + 1);
 979           strncpy (constr, pbuf, size);
 980           constr[size] = '\0';
 981         }
 982 #ifdef DEBUG
 983       if (opt.debug)
 984         {
 985           char *tmp;
 986           const char *tmp2;
 987
 988           tmp2 = html_base ();
 989           /* Use malloc, not alloca because this is called in a loop. */
 990           tmp = (char *)xmalloc (size + 1);
 991           strncpy (tmp, pbuf, size);
 992           tmp[size] = '\0';
 993           logprintf (LOG_ALWAYS,
 994                      "file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
 995                      file, this_url ? this_url : "(null)",
 996                      tmp2 ? tmp2 : "(null)", tmp, constr);
 997           free (tmp);
 998         }
 999 #endif
1000
1001       /* Allocate the space.  */
1002       old = current;
1003       current = (urlpos *)xmalloc (sizeof (urlpos));
1004       if (old)
1005         old->next = current;
1006       if (!first)
1007         first = current;
1008       /* Fill the values.  */
1009       memset (current, 0, sizeof (*current));
1010       current->next = NULL;
1011       current->url = constr;
1012       current->size = size;
1013       current->pos = pbuf - orig_buf;
1014       /* A URL is relative if the host and protocol are not named,
1015          and the name does not start with `/'.  */
1016       if (no_proto && *pbuf != '/')
1017         current->flags |= (URELATIVE | UNOPROTO);
1018       else if (no_proto)
1019         current->flags |= UNOPROTO;
1020     }
1021   free (orig_buf);
1022
1023   return first;
1024 }
1025 \f
1026 /* Free the linked list of urlpos.  */
1027 void
1028 free_urlpos (urlpos *l)
1029 {
1030   while (l)
1031     {
1032       urlpos *next = l->next;
1033       free (l->url);
1034       FREE_MAYBE (l->local_name);
1035       free (l);
1036       l = next;
1037     }
1038 }
1039
1040 /* Rotate FNAME opt.backups times */
1041 void
1042 rotate_backups(const char *fname)
1043 {
1044   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1045   char *from = (char *)alloca (maxlen);
1046   char *to = (char *)alloca (maxlen);
1047   struct stat sb;
1048   int i;
1049
1050   if (stat (fname, &sb) == 0)
1051     if (S_ISREG (sb.st_mode) == 0)
1052       return;
1053
1054   for (i = opt.backups; i > 1; i--)
1055     {
1056       sprintf (from, "%s.%d", fname, i - 1);
1057       sprintf (to, "%s.%d", fname, i);
1058       /* #### This will fail on machines without the rename() system
1059          call.  */
1060       rename (from, to);
1061     }
1062
1063   sprintf (to, "%s.%d", fname, 1);
1064   rename(fname, to);
1065 }
1066
1067 /* Create all the necessary directories for PATH (a file).  Calls
1068    mkdirhier() internally.  */
1069 int
1070 mkalldirs (const char *path)
1071 {
1072   const char *p;
1073   char *t;
1074   struct stat st;
1075   int res;
1076
1077   p = path + strlen (path);
1078   for (; *p != '/' && p != path; p--);
1079   /* Don't create if it's just a file.  */
1080   if ((p == path) && (*p != '/'))
1081     return 0;
1082   t = strdupdelim (path, p);
1083   /* Check whether the directory exists.  */
1084   if ((stat (t, &st) == 0))
1085     {
1086       if (S_ISDIR (st.st_mode))
1087         {
1088           free (t);
1089           return 0;
1090         }
1091       else
1092         {
1093           /* If the dir exists as a file name, remove it first.  This
1094              is *only* for Wget to work with buggy old CERN http
1095              servers.  Here is the scenario: When Wget tries to
1096              retrieve a directory without a slash, e.g.
1097              http://foo/bar (bar being a directory), CERN server will
1098              not redirect it too http://foo/bar/ -- it will generate a
1099              directory listing containing links to bar/file1,
1100              bar/file2, etc.  Wget will lose because it saves this
1101              HTML listing to a file `bar', so it cannot create the
1102              directory.  To work around this, if the file of the same
1103              name exists, we just remove it and create the directory
1104              anyway.  */
1105           DEBUGP (("Removing %s because of directory danger!\n", t));
1106           unlink (t);
1107         }
1108     }
1109   res = make_directory (t);
1110   if (res != 0)
1111     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1112   free (t);
1113   return res;
1114 }
1115
/* Return the number of `/' characters in the NUL-terminated string S.  */
static int
count_slashes (const char *s)
{
  const char *p;
  int total = 0;

  for (p = s; *p != '\0'; p++)
    total += (*p == '/');
  return total;
}
1125
1126 /* Return the path name of the URL-equivalent file name, with a
1127    remote-like structure of directories.  */
1128 static char *
1129 mkstruct (const struct urlinfo *u)
1130 {
1131   char *host, *dir, *file, *res, *dirpref;
1132   int l;
1133
1134   assert (u->dir != NULL);
1135   assert (u->host != NULL);
1136
1137   if (opt.cut_dirs)
1138     {
1139       char *ptr = u->dir + (*u->dir == '/');
1140       int slash_count = 1 + count_slashes (ptr);
1141       int cut = MINVAL (opt.cut_dirs, slash_count);
1142       for (; cut && *ptr; ptr++)
1143         if (*ptr == '/')
1144           --cut;
1145       STRDUP_ALLOCA (dir, ptr);
1146     }
1147   else
1148     dir = u->dir + (*u->dir == '/');
1149
1150   host = xstrdup (u->host);
1151   /* Check for the true name (or at least a consistent name for saving
1152      to directory) of HOST, reusing the hlist if possible.  */
1153   if (opt.add_hostdir && !opt.simple_check)
1154     {
1155       char *nhost = realhost (host);
1156       free (host);
1157       host = nhost;
1158     }
1159   /* Add dir_prefix and hostname (if required) to the beginning of
1160      dir.  */
1161   if (opt.add_hostdir)
1162     {
1163       if (!DOTP (opt.dir_prefix))
1164         {
1165           dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1166                                     + strlen (host) + 1);
1167           sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
1168         }
1169       else
1170         STRDUP_ALLOCA (dirpref, host);
1171     }
1172   else                         /* not add_hostdir */
1173     {
1174       if (!DOTP (opt.dir_prefix))
1175         dirpref = opt.dir_prefix;
1176       else
1177         dirpref = "";
1178     }
1179   free (host);
1180
1181   /* If there is a prefix, prepend it.  */
1182   if (*dirpref)
1183     {
1184       char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1185       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1186       dir = newdir;
1187     }
1188   dir = xstrdup (dir);
1189   URL_CLEANSE (dir);
1190   l = strlen (dir);
1191   if (l && dir[l - 1] == '/')
1192     dir[l - 1] = '\0';
1193
1194   if (!*u->file)
1195     file = "index.html";
1196   else
1197     file = u->file;
1198
1199   /* Finally, construct the full name.  */
1200   res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1201   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1202   free (dir);
1203   return res;
1204 }
1205
1206 /* Create a unique filename, corresponding to a given URL.  Calls
1207    mkstruct if necessary.  Does *not* actually create any directories.  */
1208 char *
1209 url_filename (const struct urlinfo *u)
1210 {
1211   char *file, *name;
1212   int have_prefix = 0;          /* whether we must prepend opt.dir_prefix */
1213
1214   if (opt.dirstruct)
1215     {
1216       file = mkstruct (u);
1217       have_prefix = 1;
1218     }
1219   else
1220     {
1221       if (!*u->file)
1222         file = xstrdup ("index.html");
1223       else
1224         file = xstrdup (u->file);
1225     }
1226
1227   if (!have_prefix)
1228     {
1229       /* Check whether the prefix directory is something other than "."
1230          before prepending it.  */
1231       if (!DOTP (opt.dir_prefix))
1232         {
1233           char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1234                                          + 1 + strlen (file) + 1);
1235           sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1236           free (file);
1237           file = nfile;
1238         }
1239     }
1240   /* DOS-ish file systems don't like `%' signs in them; we change it
1241      to `@'.  */
1242 #ifdef WINDOWS
1243   {
1244     char *p = file;
1245     for (p = file; *p; p++)
1246       if (*p == '%')
1247         *p = '@';
1248   }
1249 #endif /* WINDOWS */
1250
1251   /* Check the cases in which the unique extensions are not used:
1252      1) Clobbering is turned off (-nc).
1253      2) Retrieval with regetting.
1254      3) Timestamping is used.
1255      4) Hierarchy is built.
1256
1257      The exception is the case when file does exist and is a
1258      directory (actually support for bad httpd-s).  */
1259   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1260       && !(file_exists_p (file) && !file_non_directory_p (file)))
1261     return file;
1262
1263   /* Find a unique name.  */
1264   name = unique_name (file);
1265   free (file);
1266   return name;
1267 }
1268
/* Like strlen(), except that if URL starts with "http://" or
   "https://" and contains a `?', the length up to (not including)
   the first `?' is returned, i.e. the query part is treated as if it
   were not there.  Needed for the correct implementation of
   `construct' below, at least until we code up proper parsing of
   URLs.

   The scheme prefixes are compared in full; their lengths are taken
   from the literals themselves so the two cannot drift apart.  */
static int
urllen_http_hack (const char *url)
{
  static const char http_prefix[] = "http://";
  static const char https_prefix[] = "https://";

  if (!strncmp (url, http_prefix, sizeof (http_prefix) - 1)
      || !strncmp (url, https_prefix, sizeof (https_prefix) - 1))
    {
      const char *q = strchr (url, '?');
      if (q)
        return q - url;
    }
  return strlen (url);
}
1285
/* Construct an absolute URL from the base URL and the first SUBSIZE
   characters of SUB, returning a string allocated with xmalloc().

   NO_PROTO says whether SUB lacks a protocol:
   - If NO_PROTO is zero, SUB is already absolute and the result is
     simply a copy of it; URL is not examined.
   - If SUB does not begin with `/', it replaces everything after the
     last `/' of URL.  When URL has no such slash (or the last slash
     is the second one of a `//'), a `/' is appended to URL first.
   - If SUB begins with `/', it replaces everything from the first
     single `/' of URL (the one not part of a `//' pair) onwards.

   In both relative cases only the part of URL reported by
   urllen_http_hack() is considered, so the query of an HTTP base
   never takes part in the scan and never ends up in the result.  */
static char *
construct (const char *url, const char *sub, int subsize, int no_proto)
{
  char *constr;
  int keep = 0;                 /* leading chars of URL to keep */
  int add_slash = 0;            /* whether to put `/' after them */

  if (no_proto)
    {
      int base_end = urllen_http_hack (url);
      int i;

      if (*sub != '/')
        {
          /* Find the last slash of the base.  */
          for (i = base_end; i && url[i] != '/'; i--)
            ;
          if (!i || url[i] == url[i - 1])
            {
              /* Base like "http://host": keep all of it and supply
                 the missing slash.  */
              keep = base_end;
              add_slash = 1;
            }
          else
            keep = i + 1;       /* up to and including the slash */
        }
      else /* *sub == `/' */
        {
          /* Find the first slash that is not the start of a `//',
             without looking past BASE_END.  */
          i = 0;
          for (;;)
            {
              while (i < base_end && url[i] != '/')
                i++;
              if (i >= base_end || url[i + 1] != '/')
                break;
              i += 2;
            }
          /* No path slash at all: keep the whole (query-less) base;
             SUB brings its own leading slash.  */
          keep = i < base_end ? i : base_end;
        }
    }

  constr = (char *)xmalloc (keep + add_slash + subsize + 1);
  memcpy (constr, url, keep);
  if (add_slash)
    constr[keep++] = '/';
  constr[keep] = '\0';
  /* strncat stops at SUBSIZE characters or at a NUL in SUB, whichever
     comes first, and always terminates the result.  */
  strncat (constr, sub, subsize);
  return constr;
}
1354
/* Convenience front end to construct(): resolve the NUL-terminated
   NEW_URL against BASE_URL.  NEW_URL is treated as relative exactly
   when has_proto() reports that it carries no protocol.  */
char *
url_concat (const char *base_url, const char *new_url)
{
  int new_len = strlen (new_url);
  int lacks_proto = !has_proto (new_url);

  return construct (base_url, new_url, new_len, lacks_proto);
}
1361 \f
1362 /* Optimize URL by host, destructively replacing u->host with realhost
1363    (u->host).  Do this regardless of opt.simple_check.  */
1364 void
1365 opt_url (struct urlinfo *u)
1366 {
1367   /* Find the "true" host.  */
1368   char *host = realhost (u->host);
1369   free (u->host);
1370   u->host = host;
1371   assert (u->dir != NULL);      /* the URL must have been parsed */
1372   /* Refresh the printed representation.  */
1373   free (u->url);
1374   u->url = str_url (u, 0);
1375 }
1376 \f
1377 /* Returns proxy host address, in accordance with PROTO.  */
1378 char *
1379 getproxy (uerr_t proto)
1380 {
1381   if (proto == URLHTTP)
1382     return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1383   else if (proto == URLFTP)
1384     return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1385   else
1386     return NULL;
1387 }
1388
/* Should HOST be accessed through the proxy, given the NO_PROXY list?
   Returns 1 when NO_PROXY is NULL; otherwise returns the negation of
   sufmatch (NO_PROXY, HOST).  */
int
no_proxy_match (const char *host, const char **no_proxy)
{
  if (no_proxy)
    return !sufmatch (no_proxy, host);
  return 1;
}
1398 \f
1399 /* Change the links in an HTML document.  Accepts a structure that
1400    defines the positions of all the links.  */
1401 void
1402 convert_links (const char *file, urlpos *l)
1403 {
1404   FILE               *fp;
1405   char               *buf, *p, *p2;
1406   downloaded_file_t  downloaded_file_return;
1407   long               size;
1408
1409   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1410   /* Read from the file....  */
1411   fp = fopen (file, "rb");
1412   if (!fp)
1413     {
1414       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1415                  file, strerror (errno));
1416       return;
1417     }
1418   /* ...to a buffer.  */
1419   load_file (fp, &buf, &size);
1420   fclose (fp);
1421
1422   downloaded_file_return = downloaded_file(CHECK_FOR_FILE, file);
1423
1424   if (opt.backup_converted && downloaded_file_return)
1425     /* Rather than just writing over the original .html file with the converted
1426        version, save the former to *.orig.  Note we only do this for files we've
1427        _successfully_ downloaded, so we don't clobber .orig files sitting around
1428        from previous invocations. */
1429     {
1430       /* Construct the backup filename as the original name plus ".orig". */
1431       size_t         filename_len = strlen(file);
1432       char*          filename_plus_orig_suffix;
1433       boolean        already_wrote_backup_file = FALSE;
1434       slist*         converted_file_ptr;
1435       static slist*  converted_files = NULL;
1436
1437       if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1438         {
1439           /* Just write "orig" over "html".  We need to do it this way because
1440              when we're checking to see if we've downloaded the file before (to
1441              see if we can skip downloading it), we don't know if it's a
1442              text/html file.  Therefore we don't know yet at that stage that -E
1443              is going to cause us to tack on ".html", so we need to compare
1444              vs. the original URL plus ".orig", not the original URL plus
1445              ".html.orig". */
1446           filename_plus_orig_suffix = xmalloc(filename_len + 1);
1447           strcpy(filename_plus_orig_suffix, file);
1448           strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1449         }
1450       else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1451         {
1452           /* Append ".orig" to the name. */
1453           filename_plus_orig_suffix = xmalloc(filename_len + sizeof(".orig"));
1454           strcpy(filename_plus_orig_suffix, file);
1455           strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1456         }
1457
1458       /* We can get called twice on the same URL thanks to the
1459          convert_all_links() call in main().  If we write the .orig file each
1460          time in such a case, it'll end up containing the first-pass conversion,
1461          not the original file.  So, see if we've already been called on this
1462          file. */
1463       converted_file_ptr = converted_files;
1464       while (converted_file_ptr != NULL)
1465         if (strcmp(converted_file_ptr->string, file) == 0)
1466           {
1467             already_wrote_backup_file = TRUE;
1468             break;
1469           }
1470         else
1471           converted_file_ptr = converted_file_ptr->next;
1472
1473       if (!already_wrote_backup_file)
1474         {
1475           /* Rename <file> to <file>.orig before former gets written over. */
1476           if (rename(file, filename_plus_orig_suffix) != 0)
1477             logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1478                        file, filename_plus_orig_suffix, strerror (errno));
1479
1480           /* Remember that we've already written a .orig backup for this file.
1481              Note that we never free this memory since we need it till the
1482              convert_all_links() call, which is one of the last things the
1483              program does before terminating.  BTW, I'm not sure if it would be
1484              safe to just set 'converted_file_ptr->string' to 'file' below,
1485              rather than making a copy of the string...  Another note is that I
1486              thought I could just add a field to the urlpos structure saying
1487              that we'd written a .orig file for this URL, but that didn't work,
1488              so I had to make this separate list. */
1489           converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1490           converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
1491           converted_file_ptr->next = converted_files;
1492           converted_files = converted_file_ptr;
1493         }
1494
1495       free(filename_plus_orig_suffix);
1496     }
1497   /* Now open the file for writing.  */
1498   fp = fopen (file, "wb");
1499   if (!fp)
1500     {
1501       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1502                  file, strerror (errno));
1503       free (buf);
1504       return;
1505     }
1506   /* Presumably we have to loop through multiple URLs here (even though we're
1507      only talking about a single local file) because of the -O option. */
1508   for (p = buf; l; l = l->next)
1509     {
1510       if (l->pos >= size)
1511         {
1512           DEBUGP (("Something strange is going on.  Please investigate."));
1513           break;
1514         }
1515       /* If the URL already is relative or it is not to be converted
1516          for some other reason (e.g. because of not having been
1517          downloaded in the first place), skip it.  */
1518       if ((l->flags & URELATIVE) || !(l->flags & UABS2REL))
1519         {
1520           DEBUGP (("Skipping %s at position %d (flags %d).\n", l->url,
1521                    l->pos, l->flags));
1522           continue;
1523         }
1524       /* Else, reach the position of the offending URL, echoing
1525          everything up to it to the outfile.  */
1526       for (p2 = buf + l->pos; p < p2; p++)
1527         putc (*p, fp);
1528       if (l->flags & UABS2REL)
1529         /* Convert absolute URL to relative. */
1530         {
1531           char *newname = construct_relative (file, l->local_name);
1532           fprintf (fp, "%s", newname);
1533           DEBUGP (("ABS2REL: %s to %s at position %d in %s.\n",
1534                    l->url, newname, l->pos, file));
1535           free (newname);
1536         }
1537       p += l->size;
1538     }
1539   /* Output the rest of the file. */
1540   if (p - buf < size)
1541     {
1542       for (p2 = buf + size; p < p2; p++)
1543         putc (*p, fp);
1544     }
1545   fclose (fp);
1546   free (buf);
1547   logputs (LOG_VERBOSE, _("done.\n"));
1548 }
1549
/* Return a malloced relative link that leads from the local file S1
   (the referring file) to the local file S2 (the referred file).

   Examples:
     S1 "jagor.srce.hr/index.html", S2 "jagor.srce.hr/images/news.gif"
       -> "images/news.gif"
     S1 "fly.cc.fer.hr/ioccc/index.html", S2 "fly.cc.fer.hr/images/fly.gif"
       -> "../images/fly.gif"

   If S2 begins with `/', a copy of S2 is returned as is.

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */
static char *
construct_relative (const char *s1, const char *s2)
{
  int pos = 0;                  /* scan index shared by S1 and S2 */
  int common = 0;               /* offset just past the shared dirs */
  int ups = 0;                  /* number of "../" needed */
  int k;
  char *out, *w;

  if (*s2 == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');

  /* Skip the directories common to both strings: advance over one
     matching component at a time, and commit it only if both strings
     reach a `/' at the same place.  */
  for (;;)
    {
      while (s1[pos] && s1[pos] == s2[pos] && s1[pos] != '/')
        ++pos;
      if (s1[pos] != '/' || s2[pos] != '/')
        break;
      common = ++pos;
    }

  /* Every directory separator left in S1 is one level to climb.  */
  for (; s1[pos]; pos++)
    if (s1[pos] == '/')
      ++ups;

  /* The result is "../" repeated UPS times followed by the part of
     S2 that is not shared with S1.  */
  out = (char *)xmalloc (3 * ups + strlen (s2 + common) + 1);
  w = out;
  for (k = 0; k < ups; k++)
    {
      memcpy (w, "../", 3);
      w += 3;
    }
  strcpy (w, s2 + common);
  return out;
}
1603 \f
1604 /* Add URL to the head of the list L.  */
1605 urlpos *
1606 add_url (urlpos *l, const char *url, const char *file)
1607 {
1608   urlpos *t;
1609
1610   t = (urlpos *)xmalloc (sizeof (urlpos));
1611   memset (t, 0, sizeof (*t));
1612   t->url = xstrdup (url);
1613   t->local_name = xstrdup (file);
1614   t->next = l;
1615   return t;
1616 }
1617
1618
1619 /* Remembers which files have been downloaded.  In the standard case, should be
1620    called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1621    download successfully (i.e. not for ones we have failures on or that we skip
1622    due to -N).
1623
1624    When we've downloaded a file and tacked on a ".html" extension due to -E,
1625    call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1626    FILE_DOWNLOADED_NORMALLY.
1627
1628    If you just want to check if a file has been previously added without adding
1629    it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
1630    with local filenames, not remote URLs. */
1631 downloaded_file_t
1632 downloaded_file (downloaded_file_t  mode, const char*  file)
1633 {
1634   typedef struct _downloaded_file_list
1635   {
1636     char*                          file;
1637     downloaded_file_t              download_type;
1638     struct _downloaded_file_list*  next;
1639   } downloaded_file_list;
1640
1641   boolean                       found_file = FALSE;
1642   static downloaded_file_list*  downloaded_files = NULL;
1643   downloaded_file_list*         rover = downloaded_files;
1644
1645   while (rover != NULL)
1646     if (strcmp(rover->file, file) == 0)
1647       {
1648         found_file = TRUE;
1649         break;
1650       }
1651     else
1652       rover = rover->next;
1653
1654   if (found_file)
1655     return rover->download_type;  /* file had already been downloaded */
1656   else
1657     {
1658       if (mode != CHECK_FOR_FILE)
1659         {
1660           rover = xmalloc(sizeof(*rover));
1661           rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1662           rover->download_type = mode;
1663           rover->next = downloaded_files;
1664           downloaded_files = rover;
1665         }
1666
1667       return FILE_NOT_ALREADY_DOWNLOADED;
1668     }
1669 }