2 Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
30 #include <sys/types.h>
47 /* Default port definitions */
48 #define DEFAULT_HTTP_PORT 80
49 #define DEFAULT_FTP_PORT 21
51 /* URL separator (for findurl) */
/* Characters that terminate a URL candidate when scanning raw buffers
   in findurl().  */
52 #define URL_SEPARATOR "!\"#'(),>`{}|<>"
54 /* A list of unsafe characters for encoding, as per RFC1738. '@' and
55 ':' (not listed in RFC) were added because of user/password
/* NOTE(review): the #ifdef WINDOWS / #else / #endif lines selecting
   between these two variants are elided in this fragment — the first
   (stricter) set presumably applies on Windows; confirm against the
   full source.  */
59 # define URL_UNSAFE_CHARS "<>\"#%{}|\\^~[]`@:"
61 # define URL_UNSAFE_CHARS "<>\"%{}|\\^[]`"
/* A character is "unsafe" if it is a control char or space (0..32) or
   one of the reserved characters above.
   NOTE(review): if `c' is a negative signed char, the range test is
   false and the raw value is passed to strchr — presumably benign on
   the platforms of the day, but worth confirming.  */
64 #define UNSAFE_CHAR(c) (((c) >= 0 && (c) <= 32) \
65 || strchr (URL_UNSAFE_CHARS, c))
67 /* If S contains unsafe characters, free it and replace it with a
68 version that doesn't. */
/* NOTE(review): multi-statement macro body is truncated in this
   fragment; the do/while(0) closing lines and the free/reassign of S
   are elided.  */
69 #define URL_CLEANSE(s) do \
71 if (contains_unsafe (s)) \
73 char *uc_tmp = encode_string (s); \
79 /* Is a directory "."? */
80 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
81 /* Is a directory ".."? */
82 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
84 /* NULL-terminated list of strings to be recognized as prototypes (URL
85 schemes). Note that recognized doesn't mean supported -- only HTTP
86 and FTP are currently supported.
88 However, a string that does not match anything in the list will be
89 considered a relative URL. Thus it's important that this list has
90 anything anyone could think of being legal.
92 There are wild things here. :-) Take a look at
93 <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more
/* NOTE(review): the table initializer itself (dozens of scheme
   strings ending in a NULL sentinel) is elided in this fragment.  */
95 static char *protostrings[] =
137 /* Similar to former, but for supported protocols: */
/* Each entry maps a scheme prefix to its uerr_t proto tag and default
   port; searched linearly with ARRAY_SIZE elsewhere in this file.  */
138 static struct proto sup_protos[] =
140 { "http://", URLHTTP, DEFAULT_HTTP_PORT },
141 { "ftp://", URLFTP, DEFAULT_FTP_PORT },
142 /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
/* Forward declarations for the static helpers defined below.  */
145 static void parse_dir PARAMS ((const char *, char **, char **));
146 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
147 static char *construct PARAMS ((const char *, const char *, int , int));
148 static char *construct_relative PARAMS ((const char *, const char *));
149 static char process_ftp_type PARAMS ((char *));
/* Return the number of characters to be skipped if the first thing in
   a URL is the redundant "URL:" marker -- 0 if absent, otherwise 4
   plus however many spaces follow it.  The comparison is
   case-insensitive ("url:" matches too).  */
int
skip_url (const char *url)
{
  int i;

  if (TOUPPER (url[0]) == 'U'
      && TOUPPER (url[1]) == 'R'
      && TOUPPER (url[2]) == 'L'
      && url[3] == ':')
    {
      /* Skip the optional whitespace following "URL:".  */
      for (i = 4; url[i] && ISSPACE (url[i]); i++);
      return i;
    }
  else
    return 0;
}
/* Return 1 if the string contains at least one character classified
   as unsafe by UNSAFE_CHAR (controls, space, or the RFC1738 reserved
   set), 0 otherwise.  */
int
contains_unsafe (const char *s)
{
  for (; *s; s++)
    if (UNSAFE_CHAR (*s))
      return 1;
  return 0;
}
/* Decode the forms %xy in a URL, in place, to the character whose
   hexadecimal code is xy.  X and Y are hex digits (case-insensitive);
   if either is missing or not a hex digit -- including `%' right
   before the terminating '\0' -- the sequence is copied through
   verbatim.  The decoded string is never longer than the input, so
   in-place rewriting via a trailing write pointer is safe.  */
void
decode_string (char *s)
{
  char *p = s;

  for (; *s; s++, p++)
    {
      if (*s != '%')
	*p = *s;
      else
	{
	  /* Do nothing if at the end of the string, or if the chars
	     are not hex-digits.  */
	  if (!*(s + 1) || !*(s + 2)
	      || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
	    *p = *s;
	  else
	    {
	      *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
	      s += 2;       /* Consumed the two hex digits as well.  */
	    }
	}
    }
  *p = '\0';
}
/* Encode the unsafe characters (as classified by UNSAFE_CHAR) in a
   given string, returning a freshly xmalloc-ed %XX-encoded copy.
   Two passes: the first measures the result (each unsafe byte costs
   two extra characters), the second writes it.  The caller owns and
   must free the returned string.  */
char *
encode_string (const char *s)
{
  const char *b;
  char *p, *res;
  int i;

  b = s;
  /* First pass: compute the encoded length.  */
  for (i = 0; *s; s++, i++)
    if (UNSAFE_CHAR (*s))
      i += 2;		/* Two more characters (hex digits).  */
  res = (char *)xmalloc (i + 1);
  s = b;
  /* Second pass: emit either "%XY" or the byte itself.  */
  for (p = res; *s; s++)
    if (UNSAFE_CHAR (*s))
      {
	const unsigned char c = *s;
	*p++ = '%';
	*p++ = HEXD2ASC (c >> 4);
	*p++ = HEXD2ASC (c & 0xf);
      }
    else
      *p++ = *s;
  *p = '\0';
  return res;
}
244 /* Returns the proto-type if URL's protocol is supported, or
245 URLUNKNOWN if not. */
/* NOTE(review): fragment -- the return type line, the declarations,
   and several branch/return lines are elided.  The visible logic:
   first match the URL against the supported-scheme table, then, if a
   colon follows the host part, decide between a port number (HTTP)
   and an FTP-style `host:path' form by inspecting the characters
   after the colon.  */
247 urlproto (const char *url)
251 url += skip_url (url);
252 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
253 if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
254 return sup_protos[i].ind;
255 for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
/* Presumably inside an `if (url[i] == ':')' branch (elided): scan the
   characters after the colon for non-digits.  */
258 for (++i; url[i] && url[i] != '/'; i++)
259 if (!ISDIGIT (url[i]))
261 if (url[i - 1] == ':')
270 /* Skip the protocol part of the URL, e.g. `http://'. If no protocol
271 part is found, returns 0. */
/* NOTE(review): fragment -- the return type, the `char **s'
   declaration, and the length bookkeeping after a match are elided.
   Matches URL against each recognized scheme prefix,
   case-insensitively.  */
273 skip_proto (const char *url)
278 for (s = protostrings; *s; s++)
279 if (!strncasecmp (*s, url, strlen (*s)))
284 /* HTTP and FTP protocols are expected to yield exact host names
285 (i.e. the `//' part must be skipped, too). */
286 if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
291 /* Returns 1 if the URL begins with a protocol (supported or
292 unsupported), 0 otherwise. */
294 has_proto (const char *url)
298 url += skip_url (url);
299 for (s = protostrings; *s; s++)
300 if (strncasecmp (url, *s, strlen (*s)) == 0)
305 /* Skip the username and password, if present here. The function
306 should be called *not* with the complete URL, but with the part
307 right after the protocol.
309 If no username and password are found, return 0. */
/* NOTE(review): fragment -- the return type, the `const char *p'
   declaration, and the returns are elided.  Scans up to the first
   `/' looking for a `@'; on success presumably returns the offset
   just past the `@'.  */
311 skip_uname (const char *url)
314 for (p = url; *p && *p != '/'; p++)
317 /* If a `@' was found before the first occurrence of `/', skip
325 /* Allocate a new urlinfo structure, fill it with default values and
326 return a pointer to it. */
332 u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
333 memset (u, 0, sizeof (*u));
334 u->proto = URLUNKNOWN;
338 /* Perform a "deep" free of the urlinfo structure. The structure
339 should have been created with newurl, but need not have been used.
340 If free_pointer is non-0, free the pointer itself. */
342 freeurl (struct urlinfo *u, int complete)
346 FREE_MAYBE (u->host);
347 FREE_MAYBE (u->path);
348 FREE_MAYBE (u->file);
350 FREE_MAYBE (u->user);
351 FREE_MAYBE (u->passwd);
352 FREE_MAYBE (u->local);
353 FREE_MAYBE (u->referer);
355 freeurl (u->proxy, 1);
361 /* Extract the given URL of the form
362 (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
363 1. hostname (terminated with `/' or `:')
364 2. port number (terminated with `/'), or chosen for the protocol
365 3. dirname (everything after hostname)
366 Most errors are handled. No allocation is done, you must supply
367 pointers to allocated memory.
368 ...and a host of other stuff :-)
370 - Recognizes hostname:dir/file for FTP and
371 hostname (:portnum)?/dir/file for HTTP.
372 - Parses the path to yield directory and file
373 - Parses the URL to yield the username and passwd (if present)
374 - Decodes the strings, in case they contain "forbidden" characters
375 - Writes the result to struct urlinfo
377 If the argument STRICT is set, it recognizes only the canonical
/* NOTE(review): large fragment -- the return type, declarations of
   i/l/type/abs_ftp, braces, and many error-return lines are elided
   throughout this function.  */
380 parseurl (const char *url, struct urlinfo *u, int strict)
383 int recognizable; /* Recognizable URL is the one where
384 the protocol name was explicitly
385 named, i.e. it wasn't deduced from
389 DEBUGP (("parseurl (\"%s\") -> ", url));
390 url += skip_url (url);
391 recognizable = has_proto (url);
392 if (strict && !recognizable)
/* Match the URL against the supported-scheme table; L is left as the
   length of the matched scheme prefix (offset of what follows).  */
394 for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
396 l = strlen (sup_protos[i].name);
397 if (!strncasecmp (sup_protos[i].name, url, l))
400 /* If protocol is recognizable, but unsupported, bail out, else
402 if (recognizable && !sup_protos[i].name)
404 else if (i == ARRAY_SIZE (sup_protos))
407 u->proto = type = sup_protos[i].ind;
409 if (type == URLUNKNOWN)
411 /* Allow a username and password to be specified (i.e. just skip
414 l += skip_uname (url + l);
415 for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
418 /* Get the hostname. */
419 u->host = strdupdelim (url + l, url + i);
420 DEBUGP (("host %s -> ", u->host));
422 /* Assume no port has been given. */
426 /* We have a colon delimiting the hostname. It could mean that
427 a port number is following it, or a directory. */
428 if (ISDIGIT (url[++i])) /* A port number */
430 if (type == URLUNKNOWN)
431 u->proto = type = URLHTTP;
432 for (; url[i] && url[i] != '/'; i++)
433 if (ISDIGIT (url[i]))
434 u->port = 10 * u->port + (url[i] - '0')
435;
439 DEBUGP (("port %hu -> ", u->port));
441 else if (type == URLUNKNOWN) /* or a directory */
442 u->proto = type = URLFTP;
443 else /* or just a misformed port number */
446 else if (type == URLUNKNOWN)
447 u->proto = type = URLHTTP;
/* No explicit port: pick the scheme's default from the table.  */
451 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
452 if (sup_protos[i].ind == type)
454 if (i == ARRAY_SIZE (sup_protos))
456 u->port = sup_protos[i].port;
458 /* Some delimiter troubles... */
459 if (url[i] == '/' && url[i - 1] != ':')
462 while (url[i] && url[i] == '/')
464 u->path = (char *)xmalloc (strlen (url + i) + 8);
465 strcpy (u->path, url + i);
/* FTP only: strip and record a trailing ";type=X" specifier.  */
468 u->ftp_type = process_ftp_type (u->path);
469 /* #### We don't handle type `d' correctly yet. */
470 if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
473 DEBUGP (("opath %s -> ", u->path));
474 /* Parse the username and password (if existing). */
475 parse_uname (url, &u->user, &u->passwd);
476 /* Decode the strings, as per RFC 1738. */
477 decode_string (u->host);
478 decode_string (u->path);
480 decode_string (u->user);
482 decode_string (u->passwd);
483 /* Parse the directory. */
484 parse_dir (u->path, &u->dir, &u->file);
485 DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
486 /* Simplify the directory. */
487 path_simplify (u->dir);
488 /* Remove the leading `/' in HTTP. */
/* NOTE(review): strcpy with overlapping source and destination is
   undefined behavior per ISO C; memmove would be correct here.  */
489 if (type == URLHTTP && *u->dir == '/')
490 strcpy (u->dir, u->dir + 1);
491 DEBUGP (("ndir %s\n", u->dir));
492 /* Strip trailing `/'. */
494 if (l && u->dir[l - 1] == '/')
495 u->dir[l - 1] = '\0';
496 /* Re-create the path: */
497 abs_ftp = (u->proto == URLFTP && *u->dir == '/');
498 /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
499 abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
500 strcpy (u->path, abs_ftp ? "%2F" : "/");
501 strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
502 strcat (u->path, *u->dir ? "/" : "");
503 strcat (u->path, u->file);
504 URL_CLEANSE (u->path);
505 /* Create the clean URL. */
506 u->url = str_url (u, 0);
510 /* Build the directory and filename components of the path. Both
511 components are *separately* malloc-ed strings! It does not change
512 the contents of path.
514 If the path ends with "." or "..", they are (correctly) counted as
/* NOTE(review): fragment -- the `static void' line, declarations of
   i/l, and the brace structure are elided.  Three cases: bare
   filename, "/filename", and a nonempty directory.  */
517 parse_dir (const char *path, char **dir, char **file)
521 for (i = l = strlen (path); i && path[i] != '/'; i--);
522 if (!i && *path != '/') /* Just filename */
524 if (DOTP (path) || DDOTP (path))
/* "." and ".." are treated as directories, with an empty file part. */
526 *dir = xstrdup (path);
527 *file = xstrdup ("");
531 *dir = xstrdup (""); /* This is required because of FTP */
532 *file = xstrdup (path);
535 else if (!i) /* /filename */
537 if (DOTP (path + 1) || DDOTP (path + 1))
539 *dir = xstrdup (path);
540 *file = xstrdup ("");
544 *dir = xstrdup ("/");
545 *file = xstrdup (path + 1);
548 else /* Nonempty directory with or without a filename */
550 if (DOTP (path + i + 1) || DDOTP (path + i + 1))
552 *dir = xstrdup (path);
553 *file = xstrdup ("");
557 *dir = strdupdelim (path, path + i);
558 *file = strdupdelim (path + i + 1, path + l + 1);
563 /* Find the optional username and password within the URL, as per
564 RFC1738. The returned user and passwd char pointers are
/* NOTE(review): fragment -- the `static uerr_t' line, declarations of
   l/p/col/where, initialization of *user/*passwd to NULL, and the
   returns are elided.  */
567 parse_uname (const char *url, char **user, char **passwd)
575 url += skip_url (url);
576 /* Look for end of protocol string. */
577 l = skip_proto (url);
580 /* Add protocol offset. */
582 /* Is there an `@' character? */
583 for (p = url; *p && *p != '/'; p++)
586 /* If not, return. */
589 /* Else find the username and password. */
/* COL tracks the start of the current component; when a `:' is seen
   the username part is captured and COL presumably advances past it. */
590 for (p = col = url; *p != '@'; p++)
592 if (*p == ':' && !*user)
594 *user = (char *)xmalloc (p - url + 1);
595 memcpy (*user, url, p - url);
596 (*user)[p - url] = '\0';
600 /* Decide whether you have only the username or both. */
601 where = *user ? passwd : user;
602 *where = (char *)xmalloc (p - col + 1);
603 memcpy (*where, col, p - col);
604 (*where)[p - col] = '\0';
/* If PATH ends with `;type=X', truncate that suffix off PATH in place
   and return the character X; otherwise return '\0' and leave PATH
   untouched.  The length guard keeps the memcmp from reading before
   the start of short paths.  */
static char
process_ftp_type (char *path)
{
  int len = strlen (path);

  if (len >= 7
      && !memcmp (path + len - 7, ";type=", 6))
    {
      path[len - 7] = '\0';
      /* The type character is still in the buffer past the new
	 terminator.  */
      return path[len - 1];
    }
  else
    return '\0';
}
624 /* Return the URL as fine-formed string, with a proper protocol, port
625 number, directory and optional user/password. If HIDE is non-zero,
626 password will be hidden. The forbidden characters in the URL will
/* NOTE(review): fragment -- the return type, braces, the lh/ld/lf
   length assignments, several `l += ...' updates, the separator
   characters written between components, and the frees/return are
   elided.  The result is assembled piecewise with memcpy into a
   single xmalloc-ed buffer.  */
629 str_url (const struct urlinfo *u, int hide)
631 char *res, *host, *user, *passwd, *proto_name, *dir, *file;
632 int i, l, ln, lu, lh, lp, lf, ld;
634 /* Look for the protocol name. */
635 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
636 if (sup_protos[i].ind == u->proto)
638 if (i == ARRAY_SIZE (sup_protos))
640 proto_name = sup_protos[i].name;
641 host = CLEANDUP (u->host);
642 dir = CLEANDUP (u->dir);
643 file = CLEANDUP (u->file);
644 user = passwd = NULL;
646 user = CLEANDUP (u->user);
650 passwd = CLEANDUP (u->passwd);
/* With HIDE set, presumably each password character is overwritten
   (e.g. with 'x') in this loop -- body elided.  */
652 for (i = 0; passwd[i]; i++)
/* An absolute FTP directory gets its leading `/' re-encoded as %2F.  */
655 if (u->proto == URLFTP && *dir == '/')
657 char *tmp = (char *)xmalloc (strlen (dir) + 3);
658 /*sprintf (tmp, "%%2F%s", dir + 1);*/
662 strcpy (tmp + 3, dir + 1);
667 ln = strlen (proto_name);
668 lu = user ? strlen (user) : 0;
669 lp = passwd ? strlen (passwd) : 0;
673 res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
674 /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
675 (user ? user : ""), (passwd ? ":" : ""),
676 (passwd ? passwd : ""), (user ? "@" : ""),
677 host, u->port, dir, *dir ? "/" : "", file); */
679 memcpy (res, proto_name, ln);
683 memcpy (res + l, user, lu);
688 memcpy (res + l, passwd, lp);
693 memcpy (res + l, host, lh);
696 long_to_string (res + l, (long)u->port);
697 l += numdigit (u->port);
699 memcpy (res + l, dir, ld);
703 strcpy (res + l, file);
712 /* Check whether two URL-s are equivalent, i.e. pointing to the same
713 location. Uses parseurl to parse them, and compares the canonical
716 Returns 1 if the URL1 is equivalent to URL2, 0 otherwise. Also
717 return 0 on error. */
/* NOTE(review): fragment -- the return type, `uerr_t err' and `int
   res' declarations, the newurl() calls, the error-path freeurl()s,
   and the final cleanup/return are elided.  */
719 url_equal (const char *url1, const char *url2)
721 struct urlinfo *u1, *u2;
726 err = parseurl (url1, u1, 0);
733 err = parseurl (url2, u2, 0);
/* Equality is decided by comparing the canonicalized u->url forms.  */
739 res = !strcmp (u1->url, u2->url);
745 /* Find URL of format scheme:hostname[:port]/dir in a buffer. The
746 buffer may contain pretty much anything; no errors are signaled. */
/* NOTE(review): fragment -- the return type, s1/s2/prot declarations,
   and the return statements are elided.  Scans the buffer for any
   recognized scheme prefix, then measures the URL's extent in *COUNT.
   NOTE(review): `howmuch <= strlen (*prot)' compares signed int to
   size_t -- presumably benign for positive HOWMUCH, but worth
   confirming; also `*s2 >= 32' assumes non-negative char values.  */
748 findurl (const char *buf, int howmuch, int *count)
753 for (s1 = buf; howmuch; s1++, howmuch--)
754 for (prot = protostrings; *prot; prot++)
755 if (howmuch <= strlen (*prot))
757 else if (!strncasecmp (*prot, s1, strlen (*prot)))
/* Extend the match until a separator, control char, or space.  */
759 for (s2 = s1, *count = 0;
760 howmuch && *s2 && *s2 >= 32 && *s2 < 127 && !ISSPACE (*s2) &&
761 !strchr (URL_SEPARATOR, *s2);
762 s2++, (*count)++, howmuch--);
768 /* Scans the file for signs of URL-s. Returns a vector of pointers,
769 each pointer representing a URL string. The file is *not* assumed
/* NOTE(review): fragment -- the return type (urlpos *), fp/buf/pbuf/
   nread/size declarations, fclose calls, list-linking statements, the
   free of BUF, and the return are elided.  HYPHENP presumably detects
   "-" meaning stdin.  */
772 get_urls_file (const char *file)
779 urlpos *first, *current, *old;
781 if (file && !HYPHENP (file))
783 fp = fopen (file, "rb");
786 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
/* Slurp the whole file (or stdin) into BUF.  */
793 load_file (fp, &buf, &nread);
794 if (file && !HYPHENP (file))
796 DEBUGP (("Loaded %s (size %ld).\n", file, nread));
797 first = current = NULL;
798 /* Fill the linked list with URLs. */
799 for (pbuf = buf; (pbuf = findurl (pbuf, nread - (pbuf - buf), &size));
802 /* Allocate the space. */
804 current = (urlpos *)xmalloc (sizeof (urlpos));
807 memset (current, 0, sizeof (*current));
808 current->next = NULL;
809 current->url = (char *)xmalloc (size + 1);
810 memcpy (current->url, pbuf, size);
811 current->url[size] = '\0';
815 /* Free the buffer. */
821 /* Similar to get_urls_file, but for HTML files. FILE is scanned as
822 an HTML document using htmlfindurl(), which see. get_urls_html()
823 constructs the HTML-s from the relative href-s.
825 If SILENT is non-zero, do not barf on baseless relative links. */
/* NOTE(review): large fragment -- the return type (urlpos *), many
   declarations (fp, orig_buf, buf, nread, size, no_proto, base,
   cbase, constr, i), braces, frees, and the trailing return are
   elided throughout.  */
827 get_urls_html (const char *file, const char *this_url, int silent)
833 int step, first_time;
834 urlpos *first, *current, *old;
836 if (file && !HYPHENP (file))
838 fp = fopen (file, "rb");
841 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
848 load_file (fp, &orig_buf, &nread);
849 if (file && !HYPHENP (file))
851 DEBUGP (("Loaded HTML file %s (size %ld).\n", file, nread));
852 first = current = NULL;
854 /* Iterate over the URLs in BUF, picked by htmlfindurl(). */
856 (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time));
861 const char *pbuf = buf;
867 /* A frequent phenomenon that needs to be handled are pages
868 generated by brain-damaged HTML generators, which refer to to
869 URI-s as <a href="<spaces>URI<spaces>">. We simply ignore
870 any spaces at the beginning or at the end of the string.
871 This is probably not strictly correct, but that's what the
872 browsers do, so we may follow. May the authors of "WYSIWYG"
873 HTML tools burn in hell for the damage they've inflicted! */
874 while ((pbuf < buf + step) && ISSPACE (*pbuf))
879 while (size && ISSPACE (pbuf[size - 1]))
/* Decide whether this link carries any recognized scheme at all.  */
884 for (i = 0; protostrings[i]; i++)
886 if (!strncasecmp (protostrings[i], pbuf,
887 MINVAL (strlen (protostrings[i]), size)))
890 /* Check for http:RELATIVE_URI. See below for details. */
892 && !(strncasecmp (pbuf, "http:", 5) == 0
893 && strncasecmp (pbuf, "http://", 7) != 0))
900 /* This is for extremely brain-damaged pages that refer to
901 relative URI-s as <a href="http:URL">. Just strip off the
902 silly leading "http:" (as well as any leading blanks
904 if ((size > 5) && !strncasecmp ("http:", pbuf, 5))
905 pbuf += 5, size -= 5;
/* A scheme is present: verify it is one we actually support.  */
909 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
911 if (!strncasecmp (sup_protos[i].name, pbuf,
912 MINVAL (strlen (sup_protos[i].name), size)))
915 /* Do *not* accept a non-supported protocol. */
916 if (i == ARRAY_SIZE (sup_protos))
921 /* First, construct the base, which can be relative itself.
923 Criteria for creating the base are:
924 1) html_base created by <base href="...">
926 3) base provided from the command line */
927 cbase = html_base ();
931 cbase = opt.base_href;
932 if (!cbase) /* Error condition -- a baseless
935 if (!opt.quiet && !silent)
937 /* Use malloc, not alloca because this is called in
/* NOTE(review): plain malloc result is used unchecked here --
   presumably tolerated for a diagnostic-only buffer; confirm.  */
939 char *temp = (char *)malloc (size + 1);
940 strncpy (temp, pbuf, size);
942 logprintf (LOG_NOTQUIET,
943 _("Error (%s): Link %s without a base provided.\n"),
950 base = construct (this_url, cbase, strlen (cbase),
954 /* Base must now be absolute, with host name and
956 if (!has_proto (cbase))
958 logprintf (LOG_NOTQUIET, _("\
959 Error (%s): Base %s relative, without referer URL.\n"),
963 base = xstrdup (cbase);
/* Resolve the (possibly relative) link against the base.  */
965 constr = construct (base, pbuf, size, no_proto);
970 constr = (char *)xmalloc (size + 1);
971 strncpy (constr, pbuf, size);
981 /* Use malloc, not alloca because this is called in a loop. */
982 tmp = (char *)xmalloc (size + 1);
983 strncpy (tmp, pbuf, size);
985 logprintf (LOG_ALWAYS,
986 "file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
987 file, this_url ? this_url : "(null)",
988 tmp2 ? tmp2 : "(null)", tmp, constr);
993 /* Allocate the space. */
995 current = (urlpos *)xmalloc (sizeof (urlpos));
1000 /* Fill the values. */
1001 memset (current, 0, sizeof (*current));
1002 current->next = NULL;
1003 current->url = constr;
1004 current->size = size;
1005 current->pos = pbuf - orig_buf;
1006 /* A URL is relative if the host and protocol are not named,
1007 and the name does not start with `/'. */
1008 if (no_proto && *pbuf != '/')
1009 current->flags |= (URELATIVE | UNOPROTO);
1011 current->flags |= UNOPROTO;
1018 /* Free the linked list of urlpos. */
1020 free_urlpos (urlpos *l)
1024 urlpos *next = l->next;
1026 FREE_MAYBE (l->local_name);
1032 /* Rotate FNAME opt.backups times */
/* NOTE(review): fragment -- the return type, `int i', `struct stat
   sb', braces, the rename() calls for both the loop and the final
   fname -> fname.1 step, and the early return for non-regular files
   are elided.  Shifts fname.1 .. fname.(backups-1) up by one, then
   moves fname itself to fname.1.  */
1034 rotate_backups(const char *fname)
1036 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1037 char *from = (char *)alloca (maxlen);
1038 char *to = (char *)alloca (maxlen);
1042 if (stat (fname, &sb) == 0)
1043 if (S_ISREG (sb.st_mode) == 0)
1046 for (i = opt.backups; i > 1; i--)
1048 sprintf (from, "%s.%d", fname, i - 1);
1049 sprintf (to, "%s.%d", fname, i);
1050 /* #### This will fail on machines without the rename() system
1055 sprintf (to, "%s.%d", fname, 1);
1059 /* Create all the necessary directories for PATH (a file). Calls
1060 mkdirhier() internally. */
/* NOTE(review): fragment -- the return type, `char *t', `const char
   *p', `struct stat st', `int res', the free(t)/returns, and the
   unlink of a blocking plain file are elided.  */
1062 mkalldirs (const char *path)
/* Find the last `/' to split off the directory portion of PATH.  */
1069 p = path + strlen (path);
1070 for (; *p != '/' && p != path; p--);
1071 /* Don't create if it's just a file. */
1072 if ((p == path) && (*p != '/'))
1074 t = strdupdelim (path, p);
1075 /* Check whether the directory exists. */
1076 if ((stat (t, &st) == 0))
1078 if (S_ISDIR (st.st_mode))
1085 /* If the dir exists as a file name, remove it first. This
1086 is *only* for Wget to work with buggy old CERN http
1087 servers. Here is the scenario: When Wget tries to
1088 retrieve a directory without a slash, e.g.
1089 http://foo/bar (bar being a directory), CERN server will
1090 not redirect it too http://foo/bar/ -- it will generate a
1091 directory listing containing links to bar/file1,
1092 bar/file2, etc. Wget will lose because it saves this
1093 HTML listing to a file `bar', so it cannot create the
1094 directory. To work around this, if the file of the same
1095 name exists, we just remove it and create the directory
1097 DEBUGP (("Removing %s because of directory danger!\n", t));
1101 res = make_directory (t);
1103 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
/* Return the number of `/' characters in S.  */
static int
count_slashes (const char *s)
{
  int i = 0;

  while (*s)
    if (*s++ == '/')
      ++i;
  return i;
}
1118 /* Return the path name of the URL-equivalent file name, with a
1119 remote-like structure of directories. */
/* NOTE(review): fragment -- the return type (char *), `int l',
   braces, the --cut-dirs loop body, the free of the old host after
   realhost(), and the free/return at the end are elided.  */
1121 mkstruct (const struct urlinfo *u)
1123 char *host, *dir, *file, *res, *dirpref;
1126 assert (u->dir != NULL);
1127 assert (u->host != NULL);
/* Honor --cut-dirs: drop the first CUT path components of u->dir.  */
1131 char *ptr = u->dir + (*u->dir == '/');
1132 int slash_count = 1 + count_slashes (ptr);
1133 int cut = MINVAL (opt.cut_dirs, slash_count);
1134 for (; cut && *ptr; ptr++)
1137 STRDUP_ALLOCA (dir, ptr);
1140 dir = u->dir + (*u->dir == '/');
1142 host = xstrdup (u->host);
1143 /* Check for the true name (or at least a consistent name for saving
1144 to directory) of HOST, reusing the hlist if possible. */
1145 if (opt.add_hostdir && !opt.simple_check)
1147 char *nhost = realhost (host);
1151 /* Add dir_prefix and hostname (if required) to the beginning of
1153 if (opt.add_hostdir)
1155 if (!DOTP (opt.dir_prefix))
1157 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1158 + strlen (host) + 1);
1159 sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
1162 STRDUP_ALLOCA (dirpref, host);
1164 else /* not add_hostdir */
1166 if (!DOTP (opt.dir_prefix))
1167 dirpref = opt.dir_prefix;
1173 /* If there is a prefix, prepend it. */
1176 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1177 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1180 dir = xstrdup (dir);
1183 if (l && dir[l - 1] == '/')
/* A URL with an empty file part maps to "index.html".  */
1187 file = "index.html";
1191 /* Finally, construct the full name. */
1192 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1193 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1198 /* Create a unique filename, corresponding to a given URL. Calls
1199 mkstruct if necessary. Does *not* actually create any directories. */
/* NOTE(review): fragment -- the return type (char *), `char *file,
   *name, *p', the opt.dirstruct branch selection, the free of the
   un-prefixed name, the WINDOWS `%' -> `@' (presumably) substitution
   body, and the returns are elided.  */
1201 url_filename (const struct urlinfo *u)
1204 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
1208 file = mkstruct (u);
1214 file = xstrdup ("index.html");
1216 file = xstrdup (u->file);
1221 /* Check whether the prefix directory is something other than "."
1222 before prepending it. */
1223 if (!DOTP (opt.dir_prefix))
1225 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1226 + 1 + strlen (file) + 1);
1227 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1232 /* DOS-ish file systems don't like `%' signs in them; we change it
1237 for (p = file; *p; p++)
1241 #endif /* WINDOWS */
1243 /* Check the cases in which the unique extensions are not used:
1244 1) Clobbering is turned off (-nc).
1245 2) Retrieval with regetting.
1246 3) Timestamping is used.
1247 4) Hierarchy is built.
1249 The exception is the case when file does exist and is a
1250 directory (actually support for bad httpd-s). */
1251 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1252 && !(file_exists_p (file) && !file_non_directory_p (file)))
1255 /* Find a unique name. */
1256 name = unique_name (file);
1261 /* Construct an absolute URL, given a (possibly) relative one. This
1262 is more tricky than it might seem, but it works. */
/* NOTE(review): fragment -- the `static char *' line, declarations of
   i/constr/fl/t, braces, the dispatch on NO_PROTO and on whether SUB
   starts with `/', and the return are elided.  Three visible cases:
   relative SUB appended after URL's last `/'; host-absolute SUB
   (leading `/') appended after URL's host part; and SUB already
   carrying its own protocol, returned as-is.  */
1264 construct (const char *url, const char *sub, int subsize, int no_proto)
/* Find the last `/' in URL: the directory context for a relative SUB. */
1274 for (i = strlen (url); i && url[i] != '/'; i--);
1275 if (!i || (url[i] == url[i - 1]))
1277 int l = strlen (url);
1278 char *t = (char *)alloca (l + 2);
1285 constr = (char *)xmalloc (i + 1 + subsize + 1);
1286 strncpy (constr, url, i + 1);
1287 constr[i + 1] = '\0';
1288 strncat (constr, sub, subsize);
1290 else /* *sub == `/' */
1297 for (; url[i] && url[i] != '/'; i++);
/* FL is non-zero for the `//' of a scheme, meaning the host part
   still follows and must be skipped too.  */
1300 fl = (url[i] == url[i + 1] && url[i + 1] == '/');
1307 int l = strlen (url);
1308 char *t = (char *)alloca (l + 2);
1314 constr = (char *)xmalloc (i + 1 + subsize + 1);
1315 strncpy (constr, url, i);
1317 strncat (constr + i, sub, subsize);
1318 constr[i + subsize] = '\0';
1321 else /* !no_proto */
1323 constr = (char *)xmalloc (subsize + 1);
1324 strncpy (constr, sub, subsize);
1325 constr[subsize] = '\0';
1330 /* Optimize URL by host, destructively replacing u->host with realhost
1331 (u->host). Do this regardless of opt.simple_check. */
1333 opt_url (struct urlinfo *u)
1335 /* Find the "true" host. */
1336 char *host = realhost (u->host);
1339 assert (u->dir != NULL); /* the URL must have been parsed */
1340 /* Refresh the printed representation. */
1342 u->url = str_url (u, 0);
1345 /* Returns proxy host address, in accordance with PROTO. */
1347 getproxy (uerr_t proto)
1349 if (proto == URLHTTP)
1350 return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1351 else if (proto == URLFTP)
1352 return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
/* Should a host be accessed through proxy, concerning no_proxy?
   Returns non-zero ("use the proxy") when NO_PROXY is unset or HOST
   matches none of its suffixes.  */
int
no_proxy_match (const char *host, const char **no_proxy)
{
  if (!no_proxy)
    return 1;
  else
    return !sufmatch (no_proxy, host);
}
1367 /* Change the links in an HTML document. Accepts a structure that
1368 defines the positions of all the links. */
/* NOTE(review): large fragment -- the return type, fp/buf/size/p/p2
   declarations, braces, fclose calls, the putc() copy loops' bodies,
   and the trailing free(buf) are elided throughout.  */
1370 convert_links (const char *file, urlpos *l)
1376 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1377 /* Read from the file.... */
1378 fp = fopen (file, "rb");
1381 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1382 file, strerror (errno));
1385 /* ...to a buffer. */
1386 load_file (fp, &buf, &size);
1388 if (opt.backup_converted && downloaded_file(CHECK_FOR_FILE, file))
1389 /* Rather than just writing over the original .html file with the converted
1390 version, save the former to *.orig. Note we only do this for files we've
1391 _successfully_ downloaded, so we don't clobber .orig files sitting around
1392 from previous invocations. */
1394 /* Construct the backup filename as the original name plus ".orig". */
1395 size_t filename_len = strlen(file);
/* NOTE(review): plain malloc, result used below without a NULL check
   -- inconsistent with the xmalloc convention used elsewhere.  */
1396 char* filename_plus_orig_suffix = malloc(filename_len +
1398 boolean already_wrote_backup_file = FALSE;
1399 slist* converted_file_ptr;
1400 static slist* converted_files = NULL;
1402 /* Would a single s[n]printf() call be faster? */
1403 strcpy(filename_plus_orig_suffix, file);
1404 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1406 /* We can get called twice on the same URL thanks to the
1407 convert_all_links() call in main(). If we write the .orig file each
1408 time in such a case, it'll end up containing the first-pass conversion,
1409 not the original file. So, see if we've already been called on this
1411 converted_file_ptr = converted_files;
1412 while (converted_file_ptr != NULL)
1413 if (strcmp(converted_file_ptr->string, file) == 0)
1415 already_wrote_backup_file = TRUE;
1419 converted_file_ptr = converted_file_ptr->next;
1421 if (!already_wrote_backup_file)
1423 /* Rename <file> to <file>.orig before former gets written over. */
1424 if (rename(file, filename_plus_orig_suffix) != 0)
1425 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1426 file, filename_plus_orig_suffix, strerror (errno));
1428 /* Remember that we've already written a .orig backup for this file.
1429 Note that we never free this memory since we need it till the
1430 convert_all_links() call, which is one of the last things the
1431 program does before terminating. BTW, I'm not sure if it would be
1432 safe to just set 'converted_file_ptr->string' to 'file' below,
1433 rather than making a copy of the string... Another note is that I
1434 thought I could just add a field to the urlpos structure saying
1435 that we'd written a .orig file for this URL, but that didn't work,
1436 so I had to make this separate list. */
1437 converted_file_ptr = malloc(sizeof(slist));
1438 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1439 converted_file_ptr->next = converted_files;
1440 converted_files = converted_file_ptr;
1443 free(filename_plus_orig_suffix);
1445 /* Now open the file for writing. */
1446 fp = fopen (file, "wb");
1449 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1450 file, strerror (errno));
1454 /* [If someone understands why multiple URLs can correspond to one local file,
1455 can they please add a comment here...?] */
1456 for (p = buf; l; l = l->next)
1460 DEBUGP (("Something strange is going on. Please investigate."));
1463 /* If the URL already is relative or it is not to be converted
1464 for some other reason (e.g. because of not having been
1465 downloaded in the first place), skip it. */
1466 if ((l->flags & URELATIVE) || !(l->flags & UABS2REL))
1468 DEBUGP (("Skipping %s at position %d (flags %d).\n", l->url,
1472 /* Else, reach the position of the offending URL, echoing
1473 everything up to it to the outfile. */
1474 for (p2 = buf + l->pos; p < p2; p++)
1476 if (l->flags & UABS2REL)
/* Replace the absolute URL in the output with a relative one
   computed from the two local file names.  */
1478 char *newname = construct_relative (file, l->local_name);
1479 fprintf (fp, "%s", newname);
1480 DEBUGP (("ABS2REL: %s to %s at position %d in %s.\n",
1481 l->url, newname, l->pos, file));
/* Copy the remainder of the buffer after the last converted link.  */
1488 for (p2 = buf + size; p < p2; p++)
1493 logputs (LOG_VERBOSE, _("done.\n"));
/* Construct and return a malloced copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.

   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   "images/news.gif".

   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */
static char *
construct_relative (const char *s1, const char *s2)
{
  int i, cnt, sepdirs1;
  char *res;

  if (*s2 == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');
  i = cnt = 0;
  /* Skip the directories common to both strings.  */
  while (1)
    {
      while (s1[i] && s2[i]
	     && (s1[i] == s2[i])
	     && (s1[i] != '/')
	     && (s2[i] != '/'))
	++i;
      if (s1[i] == '/' && s2[i] == '/')
	cnt = ++i;	/* CNT marks the start past the shared prefix.  */
      else
	break;
    }
  /* Count the directory components remaining in S1; each one needs a
     "../" to climb out of.  */
  for (sepdirs1 = 0; s1[i]; i++)
    if (s1[i] == '/')
      ++sepdirs1;
  /* Now, construct the file as of:
     - ../ repeated sepdirs1 time
     - all the non-mutual directories of S2.  */
  res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
  for (i = 0; i < sepdirs1; i++)
    memcpy (res + 3 * i, "../", 3);
  strcpy (res + 3 * i, s2 + cnt);
  return res;
}
1550 /* Add URL to the head of the list L. */
1552 add_url (urlpos *l, const char *url, const char *file)
1556 t = (urlpos *)xmalloc (sizeof (urlpos));
1557 memset (t, 0, sizeof (*t));
1558 t->url = xstrdup (url);
1559 t->local_name = xstrdup (file);
1565 /* Remembers which files have been downloaded. Should be called with
1566 add_or_check == ADD_FILE for each file we actually download successfully
1567 (i.e. not for ones we have failures on or that we skip due to -N). If you
1568 just want to check if a file has been previously added without adding it,
1569 call with add_or_check == CHECK_FOR_FILE. Please be sure to call this
1570 function with local filenames, not remote URLs -- by some means that isn't
1571 commented well enough for me understand, multiple remote URLs can apparently
1572 correspond to a single local file. */
/* NOTE(review): fragment -- the return type (boolean), braces, the
   loop-break on a match, and intermediate returns are elided.  Uses a
   function-static linked list, so the memory intentionally persists
   for the life of the process.  */
1574 downloaded_file (downloaded_file_t add_or_check, const char* file)
1576 boolean found_file = FALSE;
1577 static slist* downloaded_files = NULL;
1578 slist* rover = downloaded_files;
1580 while (rover != NULL)
1581 if (strcmp(rover->string, file) == 0)
1587 rover = rover->next;
1590 return TRUE; /* file had already been downloaded */
/* Not seen before: record it (prepend) only when asked to ADD.
   NOTE(review): plain malloc result used unchecked, unlike the
   xmalloc convention used elsewhere in this file.  */
1593 if (add_or_check == ADD_FILE)
1595 rover = malloc(sizeof(slist));
1596 rover->string = xstrdup(file); /* die on out-of-mem. */
1597 rover->next = downloaded_files;
1598 downloaded_files = rover;
1601 return FALSE; /* file had not already been downloaded */