   Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.

   This file is part of Wget.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#include <sys/types.h>

/* Default port definitions */
#define DEFAULT_HTTP_PORT 80
#define DEFAULT_FTP_PORT 21

/* URL separator (for findurl) */
#define URL_SEPARATOR "!\"#'(),>`{}|<>"
/* A list of unsafe characters for encoding, as per RFC1738.  '@' and
   ':' (not listed in the RFC) were added because of user/password
   encoding.  */
# define URL_UNSAFE_CHARS "<>\"#%{}|\\^~[]`@:"
# define URL_UNSAFE_CHARS "<>\"%{}|\\^[]`"

#define UNSAFE_CHAR(c) (   ((unsigned char)(c) <= ' ')  /* ASCII 32 */  \
                        || ((unsigned char)(c) > '~')   /* ASCII 127 */ \
                        || strchr (URL_UNSAFE_CHARS, c))
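/* For illustration, with either definition of URL_UNSAFE_CHARS above
   the macro behaves roughly like this:

     UNSAFE_CHAR (' ')   =>  non-zero   (control characters and space, <= ASCII 32)
     UNSAFE_CHAR ('<')   =>  non-zero   (listed in URL_UNSAFE_CHARS)
     UNSAFE_CHAR ('a')   =>  0          (printable and not listed)  */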
/* If S contains unsafe characters, free it and replace it with a
   version that doesn't.  */
#define URL_CLEANSE(s) do                       \
  if (contains_unsafe (s))                      \
      char *uc_tmp = encode_string (s);         \

/* Is a directory "."?  */
#define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
/* Is a directory ".."?  */
#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
/* NULL-terminated list of strings to be recognized as prototypes (URL
   schemes).  Note that recognized doesn't mean supported -- only HTTP
   and FTP are currently supported.

   However, a string that does not match anything in the list will be
   considered a relative URL.  Thus it's important that this list
   contain anything anyone could think of as being legal.

   There are wild things here. :-)  Take a look at
   <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more
   information.  */
static char *protostrings[] =

/* Similar to the former, but for supported protocols: */
static struct proto sup_protos[] =
  { "http://", URLHTTP, DEFAULT_HTTP_PORT },
  { "ftp://", URLFTP, DEFAULT_FTP_PORT },
  /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
static void parse_dir PARAMS ((const char *, char **, char **));
static uerr_t parse_uname PARAMS ((const char *, char **, char **));
static char *construct PARAMS ((const char *, const char *, int, int));
static char *construct_relative PARAMS ((const char *, const char *));
static char process_ftp_type PARAMS ((char *));
/* Return the number of characters to be skipped if a URL begins with
   "URL:" -- 0 if it does not, otherwise at least 4.  The optional
   spaces after "URL:" are also skipped.  */
skip_url (const char *url)
  if (TOUPPER (url[0]) == 'U'
      && TOUPPER (url[1]) == 'R'
      && TOUPPER (url[2]) == 'L'
    for (i = 4; url[i] && ISSPACE (url[i]); i++);
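/* For instance (assuming the elided check for the `:' as well), one
   would expect roughly:

     skip_url ("URL:http://www.gnu.org/")    =>  4
     skip_url ("URL:  http://www.gnu.org/")  =>  6
     skip_url ("http://www.gnu.org/")        =>  0  */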
/* Returns 1 if the string contains unsafe characters, 0 otherwise.  */
contains_unsafe (const char *s)
    if (UNSAFE_CHAR (*s))
/* Decode each %XY form in a URL to the character whose hexadecimal
   code is XY.  X and Y are hexadecimal digits from [0123456789ABCDEF]
   (case-insensitive).  If X or Y is not a hex digit, or `%' precedes
   `\0', the sequence is inserted literally.  */
decode_string (char *s)
      /* Do nothing if at the end of the string, or if the chars
         are not hex-digits.  */
      if (!*(s + 1) || !*(s + 2)
          || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
      *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
   given string, returning a malloc-ed %XX encoded string.  */
encode_string (const char *s)
  for (i = 0; *s; s++, i++)
    if (UNSAFE_CHAR (*s))
      i += 2;                   /* Two more characters (hex digits) */
  res = (char *)xmalloc (i + 1);
  for (p = res; *s; s++)
    if (UNSAFE_CHAR (*s))
        const unsigned char c = *s;
        *p++ = HEXD2ASC (c >> 4);
        *p++ = HEXD2ASC (c & 0xf);
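/* Roughly speaking, encode_string and decode_string are inverses of
   one another.  For example:

     char *enc = encode_string ("foo bar");   =>  "foo%20bar", malloc-ed
     decode_string (enc);                     =>  enc now holds "foo bar"

   The caller is responsible for freeing the returned string.  */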
/* Returns the proto-type if URL's protocol is supported, or
   URLUNKNOWN if not.  */
urlproto (const char *url)
  url += skip_url (url);
  for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
    if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
      return sup_protos[i].ind;
  for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
      for (++i; url[i] && url[i] != '/'; i++)
        if (!ISDIGIT (url[i]))
      if (url[i - 1] == ':')
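/* From the loop over sup_protos above, one would expect e.g.:

     urlproto ("http://www.gnu.org/")  =>  URLHTTP
     urlproto ("ftp://ftp.gnu.org/")   =>  URLFTP

   Anything else falls through to the colon/digit heuristic that
   inspects what follows the first `:'.  */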
/* Skip the protocol part of the URL, e.g. `http://'.  If no protocol
   part is found, returns 0.  */
skip_proto (const char *url)
  for (s = protostrings; *s; s++)
    if (!strncasecmp (*s, url, strlen (*s)))
  /* HTTP and FTP protocols are expected to yield exact host names
     (i.e. the `//' part must be skipped, too).  */
  if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))

/* Returns 1 if the URL begins with a protocol (supported or
   unsupported), 0 otherwise.  */
has_proto (const char *url)
  url += skip_url (url);
  for (s = protostrings; *s; s++)
    if (strncasecmp (url, *s, strlen (*s)) == 0)
/* Skip the username and password, if present here.  The function
   should be called *not* with the complete URL, but with the part
   right after the protocol.

   If no username and password are found, return 0.  */
skip_uname (const char *url)
  for (p = url; *p && *p != '/'; p++)
  /* If a `@' was found before the first occurrence of `/', skip
     it.  */
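/* For example, called on the part right after "ftp://" (illustrative
   values):

     skip_uname ("jan:secret@ftp.gnu.org/pub/")  =>  11, the length of "jan:secret@"
     skip_uname ("ftp.gnu.org/pub/")             =>  0  */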
/* Allocate a new urlinfo structure, fill it with default values and
   return a pointer to it.  */
  u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
  memset (u, 0, sizeof (*u));
  u->proto = URLUNKNOWN;

/* Perform a "deep" free of the urlinfo structure.  The structure
   should have been created with newurl, but need not have been used.
   If COMPLETE is non-zero, free the pointer itself.  */
freeurl (struct urlinfo *u, int complete)
  FREE_MAYBE (u->host);
  FREE_MAYBE (u->path);
  FREE_MAYBE (u->file);
  FREE_MAYBE (u->user);
  FREE_MAYBE (u->passwd);
  FREE_MAYBE (u->local);
  FREE_MAYBE (u->referer);
    freeurl (u->proxy, 1);
/* Extract the given URL of the form
   (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
   1. hostname (terminated with `/' or `:')
   2. port number (terminated with `/'), or chosen for the protocol
   3. dirname (everything after hostname)
   Most errors are handled.  No allocation is done, you must supply
   pointers to allocated memory.
   ...and a host of other stuff :-)

   - Recognizes hostname:dir/file for FTP and
     hostname (:portnum)?/dir/file for HTTP.
   - Parses the path to yield directory and file
   - Parses the URL to yield the username and passwd (if present)
   - Decodes the strings, in case they contain "forbidden" characters
   - Writes the result to struct urlinfo

   If the argument STRICT is set, it recognizes only the canonical
   form.  */
parseurl (const char *url, struct urlinfo *u, int strict)
  int recognizable;            /* Recognizable URL is the one where
                                  the protocol name was explicitly
                                  named, i.e. it wasn't deduced from
                                  the URL format.  */

  DEBUGP (("parseurl (\"%s\") -> ", url));
  url += skip_url (url);
  recognizable = has_proto (url);
  if (strict && !recognizable)
  for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
      l = strlen (sup_protos[i].name);
      if (!strncasecmp (sup_protos[i].name, url, l))
  /* If protocol is recognizable, but unsupported, bail out, else
     assume it is unknown.  */
  if (recognizable && !sup_protos[i].name)
  else if (i == ARRAY_SIZE (sup_protos))
    u->proto = type = sup_protos[i].ind;

  if (type == URLUNKNOWN)
  /* Allow a username and password to be specified (i.e. just skip
     them here).  */
  l += skip_uname (url + l);
  for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);

  /* Get the hostname.  */
  u->host = strdupdelim (url + l, url + i);
  DEBUGP (("host %s -> ", u->host));

  /* Assume no port has been given.  */
      /* We have a colon delimiting the hostname.  It could mean that
         a port number is following it, or a directory.  */
      if (ISDIGIT (url[++i]))    /* A port number */
          if (type == URLUNKNOWN)
            u->proto = type = URLHTTP;
          for (; url[i] && url[i] != '/'; i++)
            if (ISDIGIT (url[i]))
              u->port = 10 * u->port + (url[i] - '0');
          DEBUGP (("port %hu -> ", u->port));
      else if (type == URLUNKNOWN)    /* or a directory */
        u->proto = type = URLFTP;
      else                            /* or just a malformed port number */
  else if (type == URLUNKNOWN)
    u->proto = type = URLHTTP;
      for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
        if (sup_protos[i].ind == type)
      if (i == ARRAY_SIZE (sup_protos))
      u->port = sup_protos[i].port;
  /* Some delimiter troubles...  */
  if (url[i] == '/' && url[i - 1] != ':')
  while (url[i] && url[i] == '/')
  u->path = (char *)xmalloc (strlen (url + i) + 8);
  strcpy (u->path, url + i);
      u->ftp_type = process_ftp_type (u->path);
      /* #### We don't handle type `d' correctly yet.  */
      if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
  DEBUGP (("opath %s -> ", u->path));
  /* Parse the username and password (if existing).  */
  parse_uname (url, &u->user, &u->passwd);
  /* Decode the strings, as per RFC 1738.  */
  decode_string (u->host);
  decode_string (u->path);
    decode_string (u->user);
    decode_string (u->passwd);
  /* Parse the directory.  */
  parse_dir (u->path, &u->dir, &u->file);
  DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
  /* Simplify the directory.  */
  path_simplify (u->dir);
  /* Remove the leading `/' in HTTP.  */
  if (type == URLHTTP && *u->dir == '/')
    strcpy (u->dir, u->dir + 1);
  DEBUGP (("ndir %s\n", u->dir));
  /* Strip trailing `/'.  */
  if (l && u->dir[l - 1] == '/')
    u->dir[l - 1] = '\0';
  /* Re-create the path: */
  abs_ftp = (u->proto == URLFTP && *u->dir == '/');
  /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
     abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
  strcpy (u->path, abs_ftp ? "%2F" : "/");
  strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
  strcat (u->path, *u->dir ? "/" : "");
  strcat (u->path, u->file);
  URL_CLEANSE (u->path);
  /* Create the clean URL.  */
  u->url = str_url (u, 0);
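/* As a rough illustration (host and credentials below are made-up
   values; the exact struct layout is in url.h), after

     parseurl ("http://jan:secret@www.gnu.org:8000/software/wget/manual.html",
               u, 0);

   one would expect something like:

     u->proto == URLHTTP           u->host == "www.gnu.org"
     u->port  == 8000              u->user == "jan", u->passwd == "secret"
     u->dir   == "software/wget"   u->file == "manual.html"
     u->path  == "/software/wget/manual.html"
     u->url   == the cleaned-up printable URL produced by str_url ()  */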
/* Build the directory and filename components of the path.  Both
   components are *separately* malloc-ed strings!  It does not change
   the contents of path.

   If the path ends with "." or "..", they are (correctly) counted as
   directories.  */
parse_dir (const char *path, char **dir, char **file)
  for (i = l = strlen (path); i && path[i] != '/'; i--);
  if (!i && *path != '/')      /* Just filename */
      if (DOTP (path) || DDOTP (path))
          *dir = xstrdup (path);
          *file = xstrdup ("");
          *dir = xstrdup ("");  /* This is required because of FTP */
          *file = xstrdup (path);
  else if (!i)                 /* /filename */
      if (DOTP (path + 1) || DDOTP (path + 1))
          *dir = xstrdup (path);
          *file = xstrdup ("");
          *dir = xstrdup ("/");
          *file = xstrdup (path + 1);
  else                         /* Nonempty directory with or without a filename */
      if (DOTP (path + i + 1) || DDOTP (path + i + 1))
          *dir = xstrdup (path);
          *file = xstrdup ("");
          *dir = strdupdelim (path, path + i);
          *file = strdupdelim (path + i + 1, path + l + 1);
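/* A few representative splits, following the branches above (d and f
   stand for the caller's char * variables):

     parse_dir ("/software/wget/manual.html", &d, &f)  =>  d = "/software/wget", f = "manual.html"
     parse_dir ("manual.html", &d, &f)                 =>  d = "",  f = "manual.html"
     parse_dir ("/manual.html", &d, &f)                =>  d = "/", f = "manual.html"
     parse_dir ("/software/..", &d, &f)                =>  d = "/software/..", f = ""  */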
/* Find the optional username and password within the URL, as per
   RFC1738.  The returned user and passwd char pointers are
   malloc-ed.  */
parse_uname (const char *url, char **user, char **passwd)
  url += skip_url (url);
  /* Look for end of protocol string.  */
  l = skip_proto (url);
  /* Add protocol offset.  */
  /* Is there an `@' character?  */
  for (p = url; *p && *p != '/'; p++)
  /* If not, return.  */
  /* Else find the username and password.  */
  for (p = col = url; *p != '@'; p++)
      if (*p == ':' && !*user)
          *user = (char *)xmalloc (p - url + 1);
          memcpy (*user, url, p - url);
          (*user)[p - url] = '\0';
  /* Decide whether you have only the username or both.  */
  where = *user ? passwd : user;
  *where = (char *)xmalloc (p - col + 1);
  memcpy (*where, col, p - col);
  (*where)[p - col] = '\0';
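/* For instance, for the following inputs (illustrative values) one
   would expect:

     "ftp://jan:secret@ftp.gnu.org/pub/"  =>  *user = "jan", *passwd = "secret"
     "ftp://jan@ftp.gnu.org/pub/"         =>  *user = "jan", *passwd untouched
     "http://www.gnu.org/"                =>  neither is set (no `@' before `/')  */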
/* If PATH ends with `;type=X', return the character X.  */
process_ftp_type (char *path)
  int len = strlen (path);
      && !memcmp (path + len - 7, ";type=", 6))
      path[len - 7] = '\0';
      return path[len - 1];
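/* E.g. for a path of "/pub/gnu/README;type=a" this returns 'a' and
   truncates the path, in place, to "/pub/gnu/README".  Without such a
   suffix a zero character is returned (see the check in parseurl).  */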
/* Return the URL as a well-formed string, with a proper protocol,
   port number, directory and optional user/password.  If HIDE is
   non-zero, the password will be hidden.  The forbidden characters in
   the URL will be cleansed.  */
str_url (const struct urlinfo *u, int hide)
  char *res, *host, *user, *passwd, *proto_name, *dir, *file;
  int i, l, ln, lu, lh, lp, lf, ld;

  /* Look for the protocol name.  */
  for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
    if (sup_protos[i].ind == u->proto)
  if (i == ARRAY_SIZE (sup_protos))
  proto_name = sup_protos[i].name;
  host = CLEANDUP (u->host);
  dir = CLEANDUP (u->dir);
  file = CLEANDUP (u->file);
  user = passwd = NULL;
      user = CLEANDUP (u->user);
          passwd = CLEANDUP (u->passwd);
            for (i = 0; passwd[i]; i++)
  if (u->proto == URLFTP && *dir == '/')
      char *tmp = (char *)xmalloc (strlen (dir) + 3);
      /*sprintf (tmp, "%%2F%s", dir + 1);*/
      strcpy (tmp + 3, dir + 1);
  ln = strlen (proto_name);
  lu = user ? strlen (user) : 0;
  lp = passwd ? strlen (passwd) : 0;
  res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
  /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
     (user ? user : ""), (passwd ? ":" : ""),
     (passwd ? passwd : ""), (user ? "@" : ""),
     host, u->port, dir, *dir ? "/" : "", file); */
  memcpy (res, proto_name, ln);
  memcpy (res + l, user, lu);
  memcpy (res + l, passwd, lp);
  memcpy (res + l, host, lh);
  long_to_string (res + l, (long)u->port);
  l += numdigit (u->port);
  memcpy (res + l, dir, ld);
  strcpy (res + l, file);
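/* Following the layout of the commented-out sprintf above, the result
   looks roughly like (illustrative values):

     "http://jan:secret@www.gnu.org:8000/software/wget/manual.html"

   and, with HIDE non-zero, the password characters are blotted out.  */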
/* Check whether two URL-s are equivalent, i.e. pointing to the same
   location.  Uses parseurl to parse them, and compares the canonical
   forms.

   Returns 1 if URL1 is equivalent to URL2, 0 otherwise.  Also
   returns 0 on error.  */
url_equal (const char *url1, const char *url2)
  struct urlinfo *u1, *u2;
  err = parseurl (url1, u1, 0);
  err = parseurl (url2, u2, 0);
  res = !strcmp (u1->url, u2->url);
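/* Since the comparison is done on the canonical forms, something like

     url_equal ("http://www.gnu.org", "http://www.gnu.org/")  =>  1

   should hold, while a URL that fails to parse yields 0.  */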
/* Find URL of format scheme:hostname[:port]/dir in a buffer.  The
   buffer may contain pretty much anything; no errors are signaled.  */
findurl (const char *buf, int howmuch, int *count)
  for (s1 = buf; howmuch; s1++, howmuch--)
    for (prot = protostrings; *prot; prot++)
      if (howmuch <= strlen (*prot))
      else if (!strncasecmp (*prot, s1, strlen (*prot)))
          for (s2 = s1, *count = 0;
               howmuch && *s2 && *s2 >= 32 && *s2 < 127 && !ISSPACE (*s2) &&
               !strchr (URL_SEPARATOR, *s2);
               s2++, (*count)++, howmuch--);
/* Scan the file for signs of URL-s, returning them as a linked list
   of urlpos elements, one per URL found.  The file is *not* assumed
   to be HTML.  */
get_urls_file (const char *file)
  urlpos *first, *current, *old;

  if (file && !HYPHENP (file))
      fp = fopen (file, "rb");
          logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
  load_file (fp, &buf, &nread);
  if (file && !HYPHENP (file))
  DEBUGP (("Loaded %s (size %ld).\n", file, nread));
  first = current = NULL;
  /* Fill the linked list with URLs.  */
  for (pbuf = buf; (pbuf = findurl (pbuf, nread - (pbuf - buf), &size));
      /* Allocate the space.  */
      current = (urlpos *)xmalloc (sizeof (urlpos));
      memset (current, 0, sizeof (*current));
      current->next = NULL;
      current->url = (char *)xmalloc (size + 1);
      memcpy (current->url, pbuf, size);
      current->url[size] = '\0';
  /* Free the buffer.  */
/* Similar to get_urls_file, but for HTML files.  FILE is scanned as
   an HTML document using htmlfindurl(), which see.  get_urls_html()
   constructs the URL-s from the relative href-s.

   If SILENT is non-zero, do not barf on baseless relative links.  */
get_urls_html (const char *file, const char *this_url, int silent)
  int step, first_time;
  urlpos *first, *current, *old;

  if (file && !HYPHENP (file))
      fp = fopen (file, "rb");
          logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
  load_file (fp, &orig_buf, &nread);
  if (file && !HYPHENP (file))
  DEBUGP (("Loaded HTML file %s (size %ld).\n", file, nread));
  first = current = NULL;

  /* Iterate over the URLs in BUF, picked by htmlfindurl().  */
       (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time));
      const char *pbuf = buf;

      /* A frequent phenomenon that needs to be handled are pages
         generated by brain-damaged HTML generators, which refer to
         URI-s as <a href="<spaces>URI<spaces>">.  We simply ignore
         any spaces at the beginning or at the end of the string.
         This is probably not strictly correct, but that's what the
         browsers do, so we may follow.  May the authors of "WYSIWYG"
         HTML tools burn in hell for the damage they've inflicted!  */
      while ((pbuf < buf + step) && ISSPACE (*pbuf))
      while (size && ISSPACE (pbuf[size - 1]))
      for (i = 0; protostrings[i]; i++)
          if (!strncasecmp (protostrings[i], pbuf,
                            MINVAL (strlen (protostrings[i]), size)))
      /* Check for http:RELATIVE_URI.  See below for details.  */
          && !(strncasecmp (pbuf, "http:", 5) == 0
               && strncasecmp (pbuf, "http://", 7) != 0))
      /* This is for extremely brain-damaged pages that refer to
         relative URI-s as <a href="http:URL">.  Just strip off the
         silly leading "http:" (as well as any leading blanks).  */
      if ((size > 5) && !strncasecmp ("http:", pbuf, 5))
        pbuf += 5, size -= 5;
          for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
              if (!strncasecmp (sup_protos[i].name, pbuf,
                                MINVAL (strlen (sup_protos[i].name), size)))
          /* Do *not* accept a non-supported protocol.  */
          if (i == ARRAY_SIZE (sup_protos))
          /* First, construct the base, which can be relative itself.

             Criteria for creating the base are:
             1) html_base created by <base href="...">
             3) base provided from the command line */
          cbase = html_base ();
            cbase = opt.base_href;
          if (!cbase)           /* Error condition -- a baseless
                                   relative link.  */
              if (!opt.quiet && !silent)
                  /* Use malloc, not alloca because this is called in
                     a loop.  */
                  char *temp = (char *)malloc (size + 1);
                  strncpy (temp, pbuf, size);
                  logprintf (LOG_NOTQUIET,
                             _("Error (%s): Link %s without a base provided.\n"),
            base = construct (this_url, cbase, strlen (cbase),
              /* Base must now be absolute, with host name and
                 absolute directory.  */
              if (!has_proto (cbase))
                  logprintf (LOG_NOTQUIET, _("\
Error (%s): Base %s relative, without referer URL.\n"),
              base = xstrdup (cbase);
          constr = construct (base, pbuf, size, no_proto);
          constr = (char *)xmalloc (size + 1);
          strncpy (constr, pbuf, size);
          /* Use malloc, not alloca because this is called in a loop.  */
          tmp = (char *)xmalloc (size + 1);
          strncpy (tmp, pbuf, size);
          logprintf (LOG_ALWAYS,
                     "file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
                     file, this_url ? this_url : "(null)",
                     tmp2 ? tmp2 : "(null)", tmp, constr);
      /* Allocate the space.  */
      current = (urlpos *)xmalloc (sizeof (urlpos));
      /* Fill the values.  */
      memset (current, 0, sizeof (*current));
      current->next = NULL;
      current->url = constr;
      current->size = size;
      current->pos = pbuf - orig_buf;
      /* A URL is relative if the host and protocol are not named,
         and the name does not start with `/'.  */
      if (no_proto && *pbuf != '/')
        current->flags |= (URELATIVE | UNOPROTO);
        current->flags |= UNOPROTO;
/* Free the linked list of urlpos.  */
free_urlpos (urlpos *l)
      urlpos *next = l->next;
      FREE_MAYBE (l->local_name);

/* Rotate FNAME opt.backups times */
rotate_backups (const char *fname)
  int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
  char *from = (char *)alloca (maxlen);
  char *to = (char *)alloca (maxlen);

  if (stat (fname, &sb) == 0)
    if (S_ISREG (sb.st_mode) == 0)
  for (i = opt.backups; i > 1; i--)
      sprintf (from, "%s.%d", fname, i - 1);
      sprintf (to, "%s.%d", fname, i);
      /* #### This will fail on machines without the rename() system
         call.  */
  sprintf (to, "%s.%d", fname, 1);
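/* For example, with opt.backups == 3 and FNAME "wget.log"
   (illustrative name), the loop above renames wget.log.2 to
   wget.log.3 and wget.log.1 to wget.log.2, leaving room for wget.log
   itself to be renamed to wget.log.1.  */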
/* Create all the necessary directories for PATH (a file).  Calls
   mkdirhier() internally.  */
mkalldirs (const char *path)
  p = path + strlen (path);
  for (; *p != '/' && p != path; p--);
  /* Don't create if it's just a file.  */
  if ((p == path) && (*p != '/'))
  t = strdupdelim (path, p);
  /* Check whether the directory exists.  */
  if ((stat (t, &st) == 0))
      if (S_ISDIR (st.st_mode))
          /* If the dir exists as a file name, remove it first.  This
             is *only* for Wget to work with buggy old CERN http
             servers.  Here is the scenario: When Wget tries to
             retrieve a directory without a slash, e.g.
             http://foo/bar (bar being a directory), the CERN server
             will not redirect it to http://foo/bar/ -- it will
             generate a directory listing containing links to
             bar/file1, bar/file2, etc.  Wget will lose because it
             saves this HTML listing to a file `bar', so it cannot
             create the directory.  To work around this, if a file of
             the same name exists, we just remove it and create the
             directory.  */
          DEBUGP (("Removing %s because of directory danger!\n", t));
  res = make_directory (t);
    logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
count_slashes (const char *s)

/* Return the path name of the URL-equivalent file name, with a
   remote-like structure of directories.  */
mkstruct (const struct urlinfo *u)
  char *host, *dir, *file, *res, *dirpref;

  assert (u->dir != NULL);
  assert (u->host != NULL);

      char *ptr = u->dir + (*u->dir == '/');
      int slash_count = 1 + count_slashes (ptr);
      int cut = MINVAL (opt.cut_dirs, slash_count);
      for (; cut && *ptr; ptr++)
      STRDUP_ALLOCA (dir, ptr);
    dir = u->dir + (*u->dir == '/');

  host = xstrdup (u->host);
  /* Check for the true name (or at least a consistent name for saving
     to directory) of HOST, reusing the hlist if possible.  */
  if (opt.add_hostdir && !opt.simple_check)
      char *nhost = realhost (host);
  /* Add dir_prefix and hostname (if required) to the beginning of
     the local directory.  */
  if (opt.add_hostdir)
      if (!DOTP (opt.dir_prefix))
          dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
                                    + strlen (host) + 1);
          sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
        STRDUP_ALLOCA (dirpref, host);
  else                          /* not add_hostdir */
      if (!DOTP (opt.dir_prefix))
        dirpref = opt.dir_prefix;

  /* If there is a prefix, prepend it.  */
      char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
      sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
  dir = xstrdup (dir);
  if (l && dir[l - 1] == '/')
    file = "index.html";
  /* Finally, construct the full name.  */
  res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
  sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
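/* A rough sketch of the result, assuming opt.dir_prefix is "." and
   opt.add_hostdir is set: for a URL parsed into host "www.gnu.org",
   dir "software/wget" and an empty file, mkstruct() should yield

     "www.gnu.org/software/wget/index.html"

   i.e. a local path mirroring the remote directory structure.  */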
/* Create a unique filename, corresponding to a given URL.  Calls
   mkstruct if necessary.  Does *not* actually create any directories.  */
url_filename (const struct urlinfo *u)
  int have_prefix = 0;          /* whether we must prepend opt.dir_prefix */

    file = mkstruct (u);
        file = xstrdup ("index.html");
        file = xstrdup (u->file);
      /* Check whether the prefix directory is something other than "."
         before prepending it.  */
      if (!DOTP (opt.dir_prefix))
          char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
                                         + 1 + strlen (file) + 1);
          sprintf (nfile, "%s/%s", opt.dir_prefix, file);
  /* DOS-ish file systems don't like `%' signs in them; we change it
  for (p = file; *p; p++)
#endif /* WINDOWS */

  /* Check the cases in which the unique extensions are not used:
     1) Clobbering is turned off (-nc).
     2) Retrieval with regetting.
     3) Timestamping is used.
     4) Hierarchy is built.

     The exception is the case when file does exist and is a
     directory (actually support for bad httpd-s).  */
  if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
      && !(file_exists_p (file) && !file_non_directory_p (file)))
  /* Find a unique name.  */
  name = unique_name (file);
/* Construct an absolute URL, given a (possibly) relative one.  This
   is more tricky than it might seem, but it works.  */
construct (const char *url, const char *sub, int subsize, int no_proto)
          for (i = strlen (url); i && url[i] != '/'; i--);
          if (!i || (url[i] == url[i - 1]))
              int l = strlen (url);
              char *t = (char *)alloca (l + 2);
          constr = (char *)xmalloc (i + 1 + subsize + 1);
          strncpy (constr, url, i + 1);
          constr[i + 1] = '\0';
          strncat (constr, sub, subsize);
      else                      /* *sub == `/' */
          for (; url[i] && url[i] != '/'; i++);
          fl = (url[i] == url[i + 1] && url[i + 1] == '/');
              int l = strlen (url);
              char *t = (char *)alloca (l + 2);
          constr = (char *)xmalloc (i + 1 + subsize + 1);
          strncpy (constr, url, i);
          strncat (constr + i, sub, subsize);
          constr[i + subsize] = '\0';
  else                          /* !no_proto */
      constr = (char *)xmalloc (subsize + 1);
      strncpy (constr, sub, subsize);
      constr[subsize] = '\0';
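/* A few hedged examples of what the branches above produce
   (illustrative URLs):

     construct ("http://www.gnu.org/software/wget/", "manual.html", 11, 1)
         =>  "http://www.gnu.org/software/wget/manual.html"
     construct ("http://www.gnu.org/software/wget/x.html", "../index.html", 13, 1)
         =>  "http://www.gnu.org/software/wget/../index.html"
             (no simplification here; parseurl's path_simplify handles that)
     construct ("http://www.gnu.org/software/wget/", "/gnu/", 5, 1)
         =>  "http://www.gnu.org/gnu/"   (SUB absolute: keep only scheme://host)
     construct (anything, "ftp://ftp.gnu.org/", 18, 0)
         =>  a plain copy of SUB, since NO_PROTO is 0  */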
/* Optimize URL by host, destructively replacing u->host with realhost
   (u->host).  Do this regardless of opt.simple_check.  */
opt_url (struct urlinfo *u)
  /* Find the "true" host.  */
  char *host = realhost (u->host);
  assert (u->dir != NULL);      /* the URL must have been parsed */
  /* Refresh the printed representation.  */
  u->url = str_url (u, 0);

/* Returns proxy host address, in accordance with PROTO.  */
getproxy (uerr_t proto)
  if (proto == URLHTTP)
    return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
  else if (proto == URLFTP)
    return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
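/* So, for HTTP, a proxy configured in Wget's own options takes
   precedence and the http_proxy environment variable is the fallback;
   likewise ftp_proxy for FTP.  Roughly:

     getproxy (URLHTTP)  =>  opt.http_proxy, or else getenv ("http_proxy")  */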
/* Should a host be accessed through proxy, concerning no_proxy?  */
no_proxy_match (const char *host, const char **no_proxy)
    return !sufmatch (no_proxy, host);
/* Change the links in an HTML document.  Accepts a structure that
   defines the positions of all the links.  */
convert_links (const char *file, urlpos *l)
  logprintf (LOG_VERBOSE, _("Converting %s... "), file);
  /* Read from the file....  */
  fp = fopen (file, "rb");
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
  /* ...to a buffer.  */
  load_file (fp, &buf, &size);

  if (opt.backup_converted && downloaded_file (CHECK_FOR_FILE, file))
    /* Rather than just writing over the original .html file with the
       converted version, save the former to *.orig.  Note we only do
       this for files we've _successfully_ downloaded, so we don't
       clobber .orig files sitting around from previous invocations.  */
      /* Construct the backup filename as the original name plus ".orig".  */
      size_t filename_len = strlen (file);
      char *filename_plus_orig_suffix = malloc (filename_len +
      boolean already_wrote_backup_file = FALSE;
      slist *converted_file_ptr;
      static slist *converted_files = NULL;

      /* Would a single s[n]printf() call be faster?  */
      strcpy (filename_plus_orig_suffix, file);
      strcpy (filename_plus_orig_suffix + filename_len, ".orig");

      /* We can get called twice on the same URL thanks to the
         convert_all_links() call in main().  If we write the .orig
         file each time in such a case, it'll end up containing the
         first-pass conversion, not the original file.  So, see if
         we've already been called on this file.  */
      converted_file_ptr = converted_files;
      while (converted_file_ptr != NULL)
        if (strcmp (converted_file_ptr->string, file) == 0)
            already_wrote_backup_file = TRUE;
        converted_file_ptr = converted_file_ptr->next;

      if (!already_wrote_backup_file)
          /* Rename <file> to <file>.orig before the former gets
             written over.  */
          if (rename (file, filename_plus_orig_suffix) != 0)
            logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
                       file, filename_plus_orig_suffix, strerror (errno));

          /* Remember that we've already written a .orig backup for
             this file.  Note that we never free this memory since we
             need it till the convert_all_links() call, which is one of
             the last things the program does before terminating.  BTW,
             I'm not sure if it would be safe to just set
             'converted_file_ptr->string' to 'file' below, rather than
             making a copy of the string...  Another note is that I
             thought I could just add a field to the urlpos structure
             saying that we'd written a .orig file for this URL, but
             that didn't work, so I had to make this separate list.  */
          converted_file_ptr = malloc (sizeof (slist));
          converted_file_ptr->string = xstrdup (file);  /* die on out-of-mem. */
          converted_file_ptr->next = converted_files;
          converted_files = converted_file_ptr;
      free (filename_plus_orig_suffix);

  /* Now open the file for writing.  */
  fp = fopen (file, "wb");
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
  /* [If someone understands why multiple URLs can correspond to one
     local file, can they please add a comment here...?]  */
  for (p = buf; l; l = l->next)
          DEBUGP (("Something strange is going on.  Please investigate."));
      /* If the URL already is relative or it is not to be converted
         for some other reason (e.g. because of not having been
         downloaded in the first place), skip it.  */
      if ((l->flags & URELATIVE) || !(l->flags & UABS2REL))
          DEBUGP (("Skipping %s at position %d (flags %d).\n", l->url,
      /* Else, reach the position of the offending URL, echoing
         everything up to it to the outfile.  */
      for (p2 = buf + l->pos; p < p2; p++)
      if (l->flags & UABS2REL)
          char *newname = construct_relative (file, l->local_name);
          fprintf (fp, "%s", newname);
          DEBUGP (("ABS2REL: %s to %s at position %d in %s.\n",
                   l->url, newname, l->pos, file));
  for (p2 = buf + size; p < p2; p++)
  logputs (LOG_VERBOSE, _("done.\n"));
/* Construct and return a malloc-ed copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.

   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   "images/news.gif".

   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".

   Caveats: S1 should not begin with `/', unless S2 also begins with
   `/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1.)  */
construct_relative (const char *s1, const char *s2)
  int i, cnt, sepdirs1;

    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');
  /* Skip the directories common to both strings.  */
  while (s1[i] && s2[i]
      if (s1[i] == '/' && s2[i] == '/')
  for (sepdirs1 = 0; s1[i]; i++)
  /* Now, construct the file as of:
     - ../ repeated sepdirs1 times
     - all the non-mutual directories of S2.  */
  res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
  for (i = 0; i < sepdirs1; i++)
    memcpy (res + 3 * i, "../", 3);
  strcpy (res + 3 * i, s2 + cnt);
/* Add URL to the head of the list L.  */
add_url (urlpos *l, const char *url, const char *file)
  t = (urlpos *)xmalloc (sizeof (urlpos));
  memset (t, 0, sizeof (*t));
  t->url = xstrdup (url);
  t->local_name = xstrdup (file);
/* Remembers which files have been downloaded.  Should be called with
   add_or_check == ADD_FILE for each file we actually download
   successfully (i.e. not for ones we have failures on or that we skip
   due to -N).  If you just want to check whether a file has been
   previously added without adding it, call with add_or_check ==
   CHECK_FOR_FILE.  Please be sure to call this function with local
   filenames, not remote URLs -- by some means that isn't commented
   well enough for me to understand, multiple remote URLs can
   apparently correspond to a single local file.  */
downloaded_file (downloaded_file_t add_or_check, const char *file)
  boolean found_file = FALSE;
  static slist *downloaded_files = NULL;
  slist *rover = downloaded_files;

  while (rover != NULL)
    if (strcmp (rover->string, file) == 0)
    rover = rover->next;
    return TRUE;                /* file had already been downloaded */
  if (add_or_check == ADD_FILE)
      rover = malloc (sizeof (slist));
      rover->string = xstrdup (file);   /* die on out-of-mem. */
      rover->next = downloaded_files;
      downloaded_files = rover;
  return FALSE;                 /* file had not already been downloaded */
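/* Typical usage, with an illustrative file name:

     downloaded_file (CHECK_FOR_FILE, "www.gnu.org/index.html")  =>  FALSE
     downloaded_file (ADD_FILE, "www.gnu.org/index.html")        =>  FALSE, but records the name
     downloaded_file (CHECK_FOR_FILE, "www.gnu.org/index.html")  =>  TRUE  */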