sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with this program; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif
  29 #include <ctype.h>
  30 #include <sys/types.h>
  31 #ifdef HAVE_UNISTD_H
  32 # include <unistd.h>
  33 #endif
  34 #include <errno.h>
  35 #include <assert.h>
  36
  37 #include "wget.h"
  38 #include "utils.h"
  39 #include "url.h"
  40 #include "host.h"
  41 #include "html.h"
  42
  43 #ifndef errno
  44 extern int errno;
  45 #endif
  46
  47 /* Default port definitions */
  48 #define DEFAULT_HTTP_PORT 80
  49 #define DEFAULT_FTP_PORT 21
  50
  51 /* URL separator (for findurl) */
  52 #define URL_SEPARATOR "!\"#'(),>`{}|<>"
  53
  54 /* A list of unsafe characters for encoding, as per RFC1738.  '@' and
  55    ':' (not listed in RFC) were added because of user/password
  56    encoding, and \033 for safe printing.  */
  57
  58 #ifndef WINDOWS
  59 # define URL_UNSAFE " <>\"#%{}|\\^~[]`@:\033"
  60 #else  /* WINDOWS */
  61 # define URL_UNSAFE " <>\"%{}|\\^[]`\033"
  62 #endif /* WINDOWS */
  63
  64 /* If S contains unsafe characters, free it and replace it with a
  65    version that doesn't.  */
  66 #define URL_CLEANSE(s) do                       \
  67 {                                               \
  68   if (contains_unsafe (s))                      \
  69     {                                           \
  70       char *uc_tmp = encode_string (s);         \
  71       free (s);                                 \
  72       (s) = uc_tmp;                             \
  73     }                                           \
  74 } while (0)
  75
  76 /* Is a directory "."?  */
  77 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
  78 /* Is a directory ".."?  */
  79 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  80
  81 /* NULL-terminated list of strings to be recognized as prototypes (URL
  82    schemes).  Note that recognized doesn't mean supported -- only HTTP
  83    and FTP are currently supported.
  84
  85    However, a string that does not match anything in the list will be
  86    considered a relative URL.  Thus it's important that this list has
  87    anything anyone could think of being legal.
  88
  89    There are wild things here.  :-) Take a look at
  90    <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more
  91    fun.  */
  92 static char *protostrings[] =
  93 {
  94   "cid:",
  95   "clsid:",
  96   "file:",
  97   "finger:",
  98   "ftp:",
  99   "gopher:",
 100   "hdl:",
 101   "http:",
 102   "https:",
 103   "ilu:",
 104   "ior:",
 105   "irc:",
 106   "java:",
 107   "javascript:",
 108   "lifn:",
 109   "mailto:",
 110   "mid:",
 111   "news:",
 112   "nntp:",
 113   "path:",
 114   "prospero:",
 115   "rlogin:",
 116   "service:",
 117   "shttp:",
 118   "snews:",
 119   "stanf:",
 120   "telnet:",
 121   "tn3270:",
 122   "wais:",
 123   "whois++:",
 124   NULL
 125 };
 126
 127 struct proto
 128 {
 129   char *name;
 130   uerr_t ind;
 131   unsigned short port;
 132 };
 133
 134 /* Similar to former, but for supported protocols: */
 135 static struct proto sup_protos[] =
 136 {
 137   { "http://", URLHTTP, DEFAULT_HTTP_PORT },
 138   { "ftp://", URLFTP, DEFAULT_FTP_PORT },
 139   /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
 140 };
 141
 142 static void parse_dir PARAMS ((const char *, char **, char **));
 143 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
 144 static char *construct PARAMS ((const char *, const char *, int , int));
 145 static char *construct_relative PARAMS ((const char *, const char *));
 146 static char process_ftp_type PARAMS ((char *));
 147
 148 \f
 149 /* Returns the number of characters to be skipped if the first thing
 150    in a URL is URL: (which is 0 or 4+).  The optional spaces after
 151    URL: are also skipped.  */
 152 int
 153 skip_url (const char *url)
 154 {
 155   int i;
 156
 157   if (toupper (url[0]) == 'U'
 158       && toupper (url[1]) == 'R'
 159       && toupper (url[2]) == 'L'
 160       && url[3] == ':')
 161     {
 162       /* Skip blanks.  */
 163       for (i = 4; url[i] && ISSPACE (url[i]); i++);
 164       return i;
 165     }
 166   else
 167     return 0;
 168 }
 169
 170 /* Returns 1 if the string contains unsafe characters, 0 otherwise.  */
 171 int
 172 contains_unsafe (const char *s)
 173 {
 174   for (; *s; s++)
 175     if (strchr (URL_UNSAFE, *s))
 176       return 1;
 177   return 0;
 178 }
 179
 180 /* Decodes the forms %xy in a URL to the character the hexadecimal
 181    code of which is xy.  xy are hexadecimal digits from
 182    [0123456789ABCDEF] (case-insensitive).  If x or y are not
 183    hex-digits or `%' precedes `\0', the sequence is inserted
 184    literally.  */
 185
 186 static void
 187 decode_string (char *s)
 188 {
 189   char *p = s;
 190
 191   for (; *s; s++, p++)
 192     {
 193       if (*s != '%')
 194         *p = *s;
 195       else
 196         {
 197           /* Do nothing if at the end of the string, or if the chars
 198              are not hex-digits.  */
 199           if (!*(s + 1) || !*(s + 2)
 200               || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
 201             {
 202               *p = *s;
 203               continue;
 204             }
 205           *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
 206           s += 2;
 207         }
 208     }
 209   *p = '\0';
 210 }
 211
 212 /* Encodes the unsafe characters (listed in URL_UNSAFE) in a given
 213    string, returning a malloc-ed %XX encoded string.  */
 214 char *
 215 encode_string (const char *s)
 216 {
 217   const char *b;
 218   char *p, *res;
 219   int i;
 220
 221   b = s;
 222   for (i = 0; *s; s++, i++)
 223     if (strchr (URL_UNSAFE, *s))
 224       i += 2; /* Two more characters (hex digits) */
 225   res = (char *)xmalloc (i + 1);
 226   s = b;
 227   for (p = res; *s; s++)
 228     if (strchr (URL_UNSAFE, *s))
 229       {
 230         const unsigned char c = *s;
 231         *p++ = '%';
 232         *p++ = HEXD2ASC (c >> 4);
 233         *p++ = HEXD2ASC (c & 0xf);
 234       }
 235     else
 236       *p++ = *s;
 237   *p = '\0';
 238   return res;
 239 }
 240 \f
 241 /* Returns the proto-type if URL's protocol is supported, or
 242    URLUNKNOWN if not.  */
 243 uerr_t
 244 urlproto (const char *url)
 245 {
 246   int i;
 247
 248   url += skip_url (url);
 249   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 250     if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
 251       return sup_protos[i].ind;
 252   for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
 253   if (url[i] == ':')
 254     {
 255       for (++i; url[i] && url[i] != '/'; i++)
 256         if (!ISDIGIT (url[i]))
 257           return URLBADPORT;
 258       if (url[i - 1] == ':')
 259         return URLFTP;
 260       else
 261         return URLHTTP;
 262     }
 263   else
 264     return URLHTTP;
 265 }
 266
 267 /* Skip the protocol part of the URL, e.g. `http://'.  If no protocol
 268    part is found, returns 0.  */
 269 int
 270 skip_proto (const char *url)
 271 {
 272   char **s;
 273   int l;
 274
 275   for (s = protostrings; *s; s++)
 276     if (!strncasecmp (*s, url, strlen (*s)))
 277       break;
 278   if (!*s)
 279     return 0;
 280   l = strlen (*s);
 281   /* HTTP and FTP protocols are expected to yield exact host names
 282      (i.e. the `//' part must be skipped, too).  */
 283   if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
 284     l += 2;
 285   return l;
 286 }
 287
 288 /* Returns 1 if the URL begins with a protocol (supported or
 289    unsupported), 0 otherwise.  */
 290 static int
 291 has_proto (const char *url)
 292 {
 293   char **s;
 294
 295   url += skip_url (url);
 296   for (s = protostrings; *s; s++)
 297     if (strncasecmp (url, *s, strlen (*s)) == 0)
 298       return 1;
 299   return 0;
 300 }
 301
 302 /* Skip the username and password, if present here.  The function
 303    should be called *not* with the complete URL, but with the part
 304    right after the protocol.
 305
 306    If no username and password are found, return 0.  */
 307 int
 308 skip_uname (const char *url)
 309 {
 310   const char *p;
 311   for (p = url; *p && *p != '/'; p++)
 312     if (*p == '@')
 313       break;
 314   /* If a `@' was found before the first occurrence of `/', skip
 315      it.  */
 316   if (*p == '@')
 317     return p - url + 1;
 318   else
 319     return 0;
 320 }
 321 \f
 322 /* Allocate a new urlinfo structure, fill it with default values and
 323    return a pointer to it.  */
 324 struct urlinfo *
 325 newurl (void)
 326 {
 327   struct urlinfo *u;
 328
 329   u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
 330   memset (u, 0, sizeof (*u));
 331   u->proto = URLUNKNOWN;
 332   return u;
 333 }
 334
 335 /* Perform a "deep" free of the urlinfo structure.  The structure
 336    should have been created with newurl, but need not have been used.
 337    If free_pointer is non-0, free the pointer itself.  */
 338 void
 339 freeurl (struct urlinfo *u, int complete)
 340 {
 341   assert (u != NULL);
 342   FREE_MAYBE (u->url);
 343   FREE_MAYBE (u->host);
 344   FREE_MAYBE (u->path);
 345   FREE_MAYBE (u->file);
 346   FREE_MAYBE (u->dir);
 347   FREE_MAYBE (u->user);
 348   FREE_MAYBE (u->passwd);
 349   FREE_MAYBE (u->local);
 350   FREE_MAYBE (u->referer);
 351   if (u->proxy)
 352     freeurl (u->proxy, 1);
 353   if (complete)
 354     free (u);
 355   return;
 356 }
 357 \f
 358 /* Extract the given URL of the form
 359    (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
 360    1. hostname (terminated with `/' or `:')
 361    2. port number (terminated with `/'), or chosen for the protocol
 362    3. dirname (everything after hostname)
 363    Most errors are handled.  No allocation is done, you must supply
 364    pointers to allocated memory.
 365    ...and a host of other stuff :-)
 366
 367    - Recognizes hostname:dir/file for FTP and
 368      hostname (:portnum)?/dir/file for HTTP.
 369    - Parses the path to yield directory and file
 370    - Parses the URL to yield the username and passwd (if present)
 371    - Decodes the strings, in case they contain "forbidden" characters
 372    - Writes the result to struct urlinfo
 373
 374    If the argument STRICT is set, it recognizes only the canonical
 375    form.  */
 376 uerr_t
 377 parseurl (const char *url, struct urlinfo *u, int strict)
 378 {
 379   int i, l, abs_ftp;
 380   int recognizable;            /* Recognizable URL is the one where
 381                                   the protocol name was explicitly
 382                                   named, i.e. it wasn't deduced from
 383                                   the URL format.  */
 384   uerr_t type;
 385
 386   DEBUGP (("parseurl (\"%s\") -> ", url));
 387   url += skip_url (url);
 388   recognizable = has_proto (url);
 389   if (strict && !recognizable)
 390     return URLUNKNOWN;
 391   for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
 392     {
 393       l = strlen (sup_protos[i].name);
 394       if (!strncasecmp (sup_protos[i].name, url, l))
 395         break;
 396     }
 397   /* If protocol is recognizable, but unsupported, bail out, else
 398      suppose unknown.  */
 399   if (recognizable && !sup_protos[i].name)
 400     return URLUNKNOWN;
 401   else if (i == ARRAY_SIZE (sup_protos))
 402     type = URLUNKNOWN;
 403   else
 404     u->proto = type = sup_protos[i].ind;
 405
 406   if (type == URLUNKNOWN)
 407     l = 0;
 408   /* Allow a username and password to be specified (i.e. just skip
 409      them for now).  */
 410   if (recognizable)
 411     l += skip_uname (url + l);
 412   for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
 413   if (i == l)
 414     return URLBADHOST;
 415   /* Get the hostname.  */
 416   u->host = strdupdelim (url + l, url + i);
 417   DEBUGP (("host %s -> ", u->host));
 418
 419   /* Assume no port has been given.  */
 420   u->port = 0;
 421   if (url[i] == ':')
 422     {
 423       /* We have a colon delimiting the hostname.  It could mean that
 424          a port number is following it, or a directory.  */
 425       if (ISDIGIT (url[++i]))    /* A port number */
 426         {
 427           if (type == URLUNKNOWN)
 428             u->proto = type = URLHTTP;
 429           for (; url[i] && url[i] != '/'; i++)
 430             if (ISDIGIT (url[i]))
 431               u->port = 10 * u->port + (url[i] - '0');
 432             else
 433               return URLBADPORT;
 434           if (!u->port)
 435             return URLBADPORT;
 436           DEBUGP (("port %hu -> ", u->port));
 437         }
 438       else if (type == URLUNKNOWN) /* or a directory */
 439         u->proto = type = URLFTP;
 440       else                      /* or just a misformed port number */
 441         return URLBADPORT;
 442     }
 443   else if (type == URLUNKNOWN)
 444     u->proto = type = URLHTTP;
 445   if (!u->port)
 446     {
 447       int i;
 448       for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 449         if (sup_protos[i].ind == type)
 450           break;
 451       if (i == ARRAY_SIZE (sup_protos))
 452         return URLUNKNOWN;
 453       u->port = sup_protos[i].port;
 454     }
 455   /* Some delimiter troubles...  */
 456   if (url[i] == '/' && url[i - 1] != ':')
 457     ++i;
 458   if (type == URLHTTP)
 459     while (url[i] && url[i] == '/')
 460       ++i;
 461   u->path = (char *)xmalloc (strlen (url + i) + 8);
 462   strcpy (u->path, url + i);
 463   if (type == URLFTP)
 464     {
 465       u->ftp_type = process_ftp_type (u->path);
 466       /* #### We don't handle type `d' correctly yet.  */
 467       if (!u->ftp_type || toupper (u->ftp_type) == 'D')
 468         u->ftp_type = 'I';
 469     }
 470   DEBUGP (("opath %s -> ", u->path));
 471   /* Parse the username and password (if existing).  */
 472   parse_uname (url, &u->user, &u->passwd);
 473   /* Decode the strings, as per RFC 1738.  */
 474   decode_string (u->host);
 475   decode_string (u->path);
 476   if (u->user)
 477     decode_string (u->user);
 478   if (u->passwd)
 479     decode_string (u->passwd);
 480   /* Parse the directory.  */
 481   parse_dir (u->path, &u->dir, &u->file);
 482   DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
 483   /* Simplify the directory.  */
 484   path_simplify (u->dir);
 485   /* Remove the leading `/' in HTTP.  */
 486   if (type == URLHTTP && *u->dir == '/')
 487     strcpy (u->dir, u->dir + 1);
 488   DEBUGP (("ndir %s\n", u->dir));
 489   /* Strip trailing `/'.  */
 490   l = strlen (u->dir);
 491   if (l && u->dir[l - 1] == '/')
 492     u->dir[l - 1] = '\0';
 493   /* Re-create the path: */
 494   abs_ftp = (u->proto == URLFTP && *u->dir == '/');
 495   /*  sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
 496       abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
 497   strcpy (u->path, abs_ftp ? "%2F" : "/");
 498   strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
 499   strcat (u->path, *u->dir ? "/" : "");
 500   strcat (u->path, u->file);
 501   URL_CLEANSE (u->path);
 502   /* Create the clean URL.  */
 503   u->url = str_url (u, 0);
 504   return URLOK;
 505 }
 506 \f
 507 /* Build the directory and filename components of the path.  Both
 508    components are *separately* malloc-ed strings!  It does not change
 509    the contents of path.
 510
 511    If the path ends with "." or "..", they are (correctly) counted as
 512    directories.  */
 513 static void
 514 parse_dir (const char *path, char **dir, char **file)
 515 {
 516   int i, l;
 517
 518   for (i = l = strlen (path); i && path[i] != '/'; i--);
 519   if (!i && *path != '/')   /* Just filename */
 520     {
 521       if (DOTP (path) || DDOTP (path))
 522         {
 523           *dir = xstrdup (path);
 524           *file = xstrdup ("");
 525         }
 526       else
 527         {
 528           *dir = xstrdup ("");     /* This is required because of FTP */
 529           *file = xstrdup (path);
 530         }
 531     }
 532   else if (!i)                 /* /filename */
 533     {
 534       if (DOTP (path + 1) || DDOTP (path + 1))
 535         {
 536           *dir = xstrdup (path);
 537           *file = xstrdup ("");
 538         }
 539       else
 540         {
 541           *dir = xstrdup ("/");
 542           *file = xstrdup (path + 1);
 543         }
 544     }
 545   else /* Nonempty directory with or without a filename */
 546     {
 547       if (DOTP (path + i + 1) || DDOTP (path + i + 1))
 548         {
 549           *dir = xstrdup (path);
 550           *file = xstrdup ("");
 551         }
 552       else
 553         {
 554           *dir = strdupdelim (path, path + i);
 555           *file = strdupdelim (path + i + 1, path + l + 1);
 556         }
 557     }
 558 }
 559
 560 /* Find the optional username and password within the URL, as per
 561    RFC1738.  The returned user and passwd char pointers are
 562    malloc-ed.  */
 563 static uerr_t
 564 parse_uname (const char *url, char **user, char **passwd)
 565 {
 566   int l;
 567   const char *p, *col;
 568   char **where;
 569
 570   *user = NULL;
 571   *passwd = NULL;
 572   url += skip_url (url);
 573   /* Look for end of protocol string.  */
 574   l = skip_proto (url);
 575   if (!l)
 576     return URLUNKNOWN;
 577   /* Add protocol offset.  */
 578   url += l;
 579   /* Is there an `@' character?  */
 580   for (p = url; *p && *p != '/'; p++)
 581     if (*p == '@')
 582       break;
 583   /* If not, return.  */
 584   if (*p != '@')
 585     return URLOK;
 586   /* Else find the username and password.  */
 587   for (p = col = url; *p != '@'; p++)
 588     {
 589       if (*p == ':' && !*user)
 590         {
 591           *user = (char *)xmalloc (p - url + 1);
 592           memcpy (*user, url, p - url);
 593           (*user)[p - url] = '\0';
 594           col = p + 1;
 595         }
 596     }
 597   /* Decide whether you have only the username or both.  */
 598   where = *user ? passwd : user;
 599   *where = (char *)xmalloc (p - col + 1);
 600   memcpy (*where, col, p - col);
 601   (*where)[p - col] = '\0';
 602   return URLOK;
 603 }
 604
 605 /* If PATH ends with `;type=X', return the character X.  */
 606 static char
 607 process_ftp_type (char *path)
 608 {
 609   int len = strlen (path);
 610
 611   if (len >= 7
 612       && !memcmp (path + len - 7, ";type=", 6))
 613     {
 614       path[len - 7] = '\0';
 615       return path[len - 1];
 616     }
 617   else
 618     return '\0';
 619 }
 620 \f
 621 /* Return the URL as fine-formed string, with a proper protocol, port
 622    number, directory and optional user/password.  If HIDE is non-zero,
 623    password will be hidden.  The forbidden characters in the URL will
 624    be cleansed.  */
 625 char *
 626 str_url (const struct urlinfo *u, int hide)
 627 {
 628   char *res, *host, *user, *passwd, *proto_name, *dir, *file;
 629   int i, l, ln, lu, lh, lp, lf, ld;
 630
 631   /* Look for the protocol name.  */
 632   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 633     if (sup_protos[i].ind == u->proto)
 634       break;
 635   if (i == ARRAY_SIZE (sup_protos))
 636     return NULL;
 637   proto_name = sup_protos[i].name;
 638   host = CLEANDUP (u->host);
 639   dir = CLEANDUP (u->dir);
 640   file = CLEANDUP (u->file);
 641   user = passwd = NULL;
 642   if (u->user)
 643     user = CLEANDUP (u->user);
 644   if (u->passwd)
 645     {
 646       int i;
 647       passwd = CLEANDUP (u->passwd);
 648       if (hide)
 649         for (i = 0; passwd[i]; i++)
 650           passwd[i] = 'x';
 651     }
 652   if (u->proto == URLFTP && *dir == '/')
 653     {
 654       char *tmp = (char *)xmalloc (strlen (dir) + 3);
 655       /*sprintf (tmp, "%%2F%s", dir + 1);*/
 656       *tmp = '%';
 657       tmp[1] = '2';
 658       tmp[2] = 'F';
 659       strcpy (tmp + 3, dir + 1);
 660       free (dir);
 661       dir = tmp;
 662     }
 663
 664   ln = strlen (proto_name);
 665   lu = user ? strlen (user) : 0;
 666   lp = passwd ? strlen (passwd) : 0;
 667   lh = strlen (host);
 668   ld = strlen (dir);
 669   lf = strlen (file);
 670   res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
 671   /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
 672      (user ? user : ""), (passwd ? ":" : ""),
 673      (passwd ? passwd : ""), (user ? "@" : ""),
 674      host, u->port, dir, *dir ? "/" : "", file); */
 675   l = 0;
 676   memcpy (res, proto_name, ln);
 677   l += ln;
 678   if (user)
 679     {
 680       memcpy (res + l, user, lu);
 681       l += lu;
 682       if (passwd)
 683         {
 684           res[l++] = ':';
 685           memcpy (res + l, passwd, lp);
 686           l += lp;
 687         }
 688       res[l++] = '@';
 689     }
 690   memcpy (res + l, host, lh);
 691   l += lh;
 692   res[l++] = ':';
 693   long_to_string (res + l, (long)u->port);
 694   l += numdigit (u->port);
 695   res[l++] = '/';
 696   memcpy (res + l, dir, ld);
 697   l += ld;
 698   if (*dir)
 699     res[l++] = '/';
 700   strcpy (res + l, file);
 701   free (host);
 702   free (dir);
 703   free (file);
 704   FREE_MAYBE (user);
 705   FREE_MAYBE (passwd);
 706   return res;
 707 }
 708
 709 /* Check whether two URL-s are equivalent, i.e. pointing to the same
 710    location.  Uses parseurl to parse them, and compares the canonical
 711    forms.
 712
 713    Returns 1 if the URL1 is equivalent to URL2, 0 otherwise.  Also
 714    return 0 on error.  */
 715 int
 716 url_equal (const char *url1, const char *url2)
 717 {
 718   struct urlinfo *u1, *u2;
 719   uerr_t err;
 720   int res;
 721
 722   u1 = newurl ();
 723   err = parseurl (url1, u1, 0);
 724   if (err != URLOK)
 725     {
 726       freeurl (u1, 1);
 727       return 0;
 728     }
 729   u2 = newurl ();
 730   err = parseurl (url2, u2, 0);
 731   if (err != URLOK)
 732     {
 733       freeurl (u2, 1);
 734       return 0;
 735     }
 736   res = !strcmp (u1->url, u2->url);
 737   freeurl (u1, 1);
 738   freeurl (u2, 1);
 739   return res;
 740 }
 741 \f
 742 /* Find URL of format scheme:hostname[:port]/dir in a buffer.  The
 743    buffer may contain pretty much anything; no errors are signaled.  */
 744 static const char *
 745 findurl (const char *buf, int howmuch, int *count)
 746 {
 747   char **prot;
 748   const char *s1, *s2;
 749
 750   for (s1 = buf; howmuch; s1++, howmuch--)
 751     for (prot = protostrings; *prot; prot++)
 752       if (howmuch <= strlen (*prot))
 753         continue;
 754       else if (!strncasecmp (*prot, s1, strlen (*prot)))
 755         {
 756           for (s2 = s1, *count = 0;
 757                howmuch && *s2 && *s2 >= 32 && *s2 < 127 && !ISSPACE (*s2) &&
 758                  !strchr (URL_SEPARATOR, *s2);
 759                s2++, (*count)++, howmuch--);
 760           return s1;
 761         }
 762   return NULL;
 763 }
 764
 765 /* Scans the file for signs of URL-s.  Returns a vector of pointers,
 766    each pointer representing a URL string.  The file is *not* assumed
 767    to be HTML.  */
 768 urlpos *
 769 get_urls_file (const char *file)
 770 {
 771   long nread;
 772   FILE *fp;
 773   char *buf;
 774   const char *pbuf;
 775   int size;
 776   urlpos *first, *current, *old;
 777
 778   if (file && !HYPHENP (file))
 779     {
 780       fp = fopen (file, "rb");
 781       if (!fp)
 782         {
 783           logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 784           return NULL;
 785         }
 786     }
 787   else
 788     fp = stdin;
 789   /* Load the file.  */
 790   load_file (fp, &buf, &nread);
 791   if (file && !HYPHENP (file))
 792     fclose (fp);
 793   DEBUGP (("Loaded %s (size %ld).\n", file, nread));
 794   first = current = NULL;
 795   /* Fill the linked list with URLs.  */
 796   for (pbuf = buf; (pbuf = findurl (pbuf, nread - (pbuf - buf), &size));
 797        pbuf += size)
 798     {
 799       /* Allocate the space.  */
 800       old = current;
 801       current = (urlpos *)xmalloc (sizeof (urlpos));
 802       if (old)
 803         old->next = current;
 804       memset (current, 0, sizeof (*current));
 805       current->next = NULL;
 806       current->url = (char *)xmalloc (size + 1);
 807       memcpy (current->url, pbuf, size);
 808       current->url[size] = '\0';
 809       if (!first)
 810         first = current;
 811     }
 812   /* Free the buffer.  */
 813   free (buf);
 814
 815   return first;
 816 }
 817
 818 /* Similar to get_urls_file, but for HTML files.  FILE is scanned as
 819    an HTML document using htmlfindurl(), which see.  get_urls_html()
 820    constructs the HTML-s from the relative href-s.
 821
 822    If SILENT is non-zero, do not barf on baseless relative links.  */
 823 urlpos *
 824 get_urls_html (const char *file, const char *this_url, int silent)
 825 {
 826   long nread;
 827   FILE *fp;
 828   char *orig_buf;
 829   const char *buf;
 830   int step, first_time;
 831   urlpos *first, *current, *old;
 832
 833   if (file && !HYPHENP (file))
 834     {
 835       fp = fopen (file, "rb");
 836       if (!fp)
 837         {
 838           logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 839           return NULL;
 840         }
 841     }
 842   else
 843     fp = stdin;
 844   /* Load the file.  */
 845   load_file (fp, &orig_buf, &nread);
 846   if (file && !HYPHENP (file))
 847     fclose (fp);
 848   DEBUGP (("Loaded HTML file %s (size %ld).\n", file, nread));
 849   first = current = NULL;
 850   first_time = 1;
 851   /* Iterate over the URLs in BUF, picked by htmlfindurl().  */
 852   for (buf = orig_buf;
 853        (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time));
 854        buf += step)
 855     {
 856       int i, no_proto;
 857       int size = step;
 858       const char *pbuf = buf;
 859       char *constr, *base;
 860       const char *cbase;
 861
 862       first_time = 0;
 863
 864       /* A frequent phenomenon that needs to be handled are pages
 865          generated by brain-damaged HTML generators, which refer to to
 866          URI-s as <a href="<spaces>URI<spaces>">.  We simply ignore
 867          any spaces at the beginning or at the end of the string.
 868          This is probably not strictly correct, but that's what the
 869          browsers do, so we may follow.  May the authors of "WYSIWYG"
 870          HTML tools burn in hell for the damage they've inflicted!  */
 871       while ((pbuf < buf + step) && ISSPACE (*pbuf))
 872         {
 873           ++pbuf;
 874           --size;
 875         }
 876       while (size && ISSPACE (pbuf[size - 1]))
 877         --size;
 878       if (!size)
 879         break;
 880
 881       for (i = 0; protostrings[i]; i++)
 882         {
 883           if (!strncasecmp (protostrings[i], pbuf,
 884                             MINVAL (strlen (protostrings[i]), size)))
 885             break;
 886         }
 887       /* Check for http:RELATIVE_URI.  See below for details.  */
 888       if (protostrings[i]
 889           && !(strncasecmp (pbuf, "http:", 5) == 0
 890                && strncasecmp (pbuf, "http://", 7) != 0))
 891         {
 892           no_proto = 0;
 893         }
 894       else
 895         {
 896           no_proto = 1;
 897           /* This is for extremely brain-damaged pages that refer to
 898              relative URI-s as <a href="http:URL">.  Just strip off the
 899              silly leading "http:" (as well as any leading blanks
 900              before it).  */
 901           if ((size > 5) && !strncasecmp ("http:", pbuf, 5))
 902             pbuf += 5, size -= 5;
 903         }
 904       if (!no_proto)
 905         {
 906           for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 907             {
 908               if (!strncasecmp (sup_protos[i].name, pbuf,
 909                                MINVAL (strlen (sup_protos[i].name), size)))
 910                 break;
 911             }
 912           /* Do *not* accept a non-supported protocol.  */
 913           if (i == ARRAY_SIZE (sup_protos))
 914             continue;
 915         }
 916       if (no_proto)
 917         {
 918           /* First, construct the base, which can be relative itself.
 919
 920              Criteria for creating the base are:
 921              1) html_base created by <base href="...">
 922              2) current URL
 923              3) base provided from the command line */
 924           cbase = html_base ();
 925           if (!cbase)
 926             cbase = this_url;
 927           if (!cbase)
 928             cbase = opt.base_href;
 929           if (!cbase)             /* Error condition -- a baseless
 930                                      relative link.  */
 931             {
 932               if (!opt.quiet && !silent)
 933                 {
 934                   /* Use malloc, not alloca because this is called in
 935                      a loop. */
 936                   char *temp = (char *)malloc (size + 1);
 937                   strncpy (temp, pbuf, size);
 938                   temp[size] = '\0';
 939                   logprintf (LOG_NOTQUIET,
 940                              _("Error (%s): Link %s without a base provided.\n"),
 941                              file, temp);
 942                   free (temp);
 943                 }
 944               continue;
 945             }
 946           if (this_url)
 947             base = construct (this_url, cbase, strlen (cbase),
 948                               !has_proto (cbase));
 949           else
 950             {
 951               /* Base must now be absolute, with host name and
 952                  protocol.  */
 953               if (!has_proto (cbase))
 954                 {
 955                   logprintf (LOG_NOTQUIET, _("\
 956 Error (%s): Base %s relative, without referer URL.\n"),
 957                              file, cbase);
 958                   continue;
 959                 }
 960               base = xstrdup (cbase);
 961             }
 962           constr = construct (base, pbuf, size, no_proto);
 963           free (base);
 964         }
 965       else /* has proto */
 966         {
 967           constr = (char *)xmalloc (size + 1);
 968           strncpy (constr, pbuf, size);
 969           constr[size] = '\0';
 970         }
 971 #ifdef DEBUG
 972       if (opt.debug)
 973         {
 974           char *tmp;
 975           const char *tmp2;
 976
 977           tmp2 = html_base ();
 978           /* Use malloc, not alloca because this is called in a loop. */
 979           tmp = (char *)xmalloc (size + 1);
 980           strncpy (tmp, pbuf, size);
 981           tmp[size] = '\0';
 982           logprintf (LOG_ALWAYS,
 983                      "file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
 984                      file, this_url ? this_url : "(null)",
 985                      tmp2 ? tmp2 : "(null)", tmp, constr);
 986           free (tmp);
 987         }
 988 #endif
 989
 990       /* Allocate the space.  */
 991       old = current;
 992       current = (urlpos *)xmalloc (sizeof (urlpos));
 993       if (old)
 994         old->next = current;
 995       if (!first)
 996         first = current;
 997       /* Fill the values.  */
 998       memset (current, 0, sizeof (*current));
 999       current->next = NULL;
1000       current->url = constr;
1001       current->size = size;
1002       current->pos = pbuf - orig_buf;
1003       /* A URL is relative if the host and protocol are not named,
1004          and the name does not start with `/'.  */
1005       if (no_proto && *pbuf != '/')
1006         current->flags |= (URELATIVE | UNOPROTO);
1007       else if (no_proto)
1008         current->flags |= UNOPROTO;
1009     }
1010   free (orig_buf);
1011
1012   return first;
1013 }
1014 \f
1015 /* Free the linked list of urlpos.  */
1016 void
1017 free_urlpos (urlpos *l)
1018 {
1019   while (l)
1020     {
1021       urlpos *next = l->next;
1022       free (l->url);
1023       FREE_MAYBE (l->local_name);
1024       free (l);
1025       l = next;
1026     }
1027 }
1028
1029 /* Rotate FNAME opt.backups times */
1030 void
1031 rotate_backups(const char *fname)
1032 {
1033   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1034   char *from = (char *)alloca (maxlen);
1035   char *to = (char *)alloca (maxlen);
1036   struct stat sb;
1037   int i;
1038
1039   if (stat (fname, &sb) == 0)
1040     if (S_ISREG (sb.st_mode) == 0)
1041       return;
1042
1043   for (i = opt.backups; i > 1; i--)
1044     {
1045       sprintf (from, "%s.%d", fname, i - 1);
1046       sprintf (to, "%s.%d", fname, i);
1047       /* #### This will fail on machines without the rename() system
1048          call.  */
1049       rename (from, to);
1050     }
1051
1052   sprintf (to, "%s.%d", fname, 1);
1053   rename(fname, to);
1054 }
1055
1056 /* Create all the necessary directories for PATH (a file).  Calls
1057    mkdirhier() internally.  */
1058 int
1059 mkalldirs (const char *path)
1060 {
1061   const char *p;
1062   char *t;
1063   struct stat st;
1064   int res;
1065
1066   p = path + strlen (path);
1067   for (; *p != '/' && p != path; p--);
1068   /* Don't create if it's just a file.  */
1069   if ((p == path) && (*p != '/'))
1070     return 0;
1071   t = strdupdelim (path, p);
1072   /* Check whether the directory exists.  */
1073   if ((stat (t, &st) == 0))
1074     {
1075       if (S_ISDIR (st.st_mode))
1076         {
1077           free (t);
1078           return 0;
1079         }
1080       else
1081         {
1082           /* If the dir exists as a file name, remove it first.  This
1083              is *only* for Wget to work with buggy old CERN http
1084              servers.  Here is the scenario: When Wget tries to
1085              retrieve a directory without a slash, e.g.
1086              http://foo/bar (bar being a directory), CERN server will
1087              not redirect it too http://foo/bar/ -- it will generate a
1088              directory listing containing links to bar/file1,
1089              bar/file2, etc.  Wget will lose because it saves this
1090              HTML listing to a file `bar', so it cannot create the
1091              directory.  To work around this, if the file of the same
1092              name exists, we just remove it and create the directory
1093              anyway.  */
1094           DEBUGP (("Removing %s because of directory danger!\n", t));
1095           unlink (t);
1096         }
1097     }
1098   res = make_directory (t);
1099   if (res != 0)
1100     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1101   free (t);
1102   return res;
1103 }
1104
1105 static int
1106 count_slashes (const char *s)
1107 {
1108   int i = 0;
1109   while (*s)
1110     if (*s++ == '/')
1111       ++i;
1112   return i;
1113 }
1114
1115 /* Return the path name of the URL-equivalent file name, with a
1116    remote-like structure of directories.  */
1117 static char *
1118 mkstruct (const struct urlinfo *u)
1119 {
1120   char *host, *dir, *file, *res, *dirpref;
1121   int l;
1122
1123   assert (u->dir != NULL);
1124   assert (u->host != NULL);
1125
1126   if (opt.cut_dirs)
1127     {
1128       char *ptr = u->dir + (*u->dir == '/');
1129       int slash_count = 1 + count_slashes (ptr);
1130       int cut = MINVAL (opt.cut_dirs, slash_count);
1131       for (; cut && *ptr; ptr++)
1132         if (*ptr == '/')
1133           --cut;
1134       STRDUP_ALLOCA (dir, ptr);
1135     }
1136   else
1137     dir = u->dir + (*u->dir == '/');
1138
1139   host = xstrdup (u->host);
1140   /* Check for the true name (or at least a consistent name for saving
1141      to directory) of HOST, reusing the hlist if possible.  */
1142   if (opt.add_hostdir && !opt.simple_check)
1143     {
1144       char *nhost = realhost (host);
1145       free (host);
1146       host = nhost;
1147     }
1148   /* Add dir_prefix and hostname (if required) to the beginning of
1149      dir.  */
1150   if (opt.add_hostdir)
1151     {
1152       if (!DOTP (opt.dir_prefix))
1153         {
1154           dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1155                                     + strlen (host) + 1);
1156           sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
1157         }
1158       else
1159         STRDUP_ALLOCA (dirpref, host);
1160     }
1161   else                         /* not add_hostdir */
1162     {
1163       if (!DOTP (opt.dir_prefix))
1164         dirpref = opt.dir_prefix;
1165       else
1166         dirpref = "";
1167     }
1168   free (host);
1169
1170   /* If there is a prefix, prepend it.  */
1171   if (*dirpref)
1172     {
1173       char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1174       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1175       dir = newdir;
1176     }
1177   dir = xstrdup (dir);
1178   URL_CLEANSE (dir);
1179   l = strlen (dir);
1180   if (l && dir[l - 1] == '/')
1181     dir[l - 1] = '\0';
1182
1183   if (!*u->file)
1184     file = "index.html";
1185   else
1186     file = u->file;
1187
1188   /* Finally, construct the full name.  */
1189   res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1190   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1191   free (dir);
1192   return res;
1193 }
1194
1195 /* Create a unique filename, corresponding to a given URL.  Calls
1196    mkstruct if necessary.  Does *not* actually create any directories.  */
1197 char *
1198 url_filename (const struct urlinfo *u)
1199 {
1200   char *file, *name;
1201   int have_prefix = 0;          /* whether we must prepend opt.dir_prefix */
1202
1203   if (opt.dirstruct)
1204     {
1205       file = mkstruct (u);
1206       have_prefix = 1;
1207     }
1208   else
1209     {
1210       if (!*u->file)
1211         file = xstrdup ("index.html");
1212       else
1213         file = xstrdup (u->file);
1214     }
1215
1216   if (!have_prefix)
1217     {
1218       /* Check whether the prefix directory is something other than "."
1219          before prepending it.  */
1220       if (!DOTP (opt.dir_prefix))
1221         {
1222           char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1223                                          + 1 + strlen (file) + 1);
1224           sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1225           free (file);
1226           file = nfile;
1227         }
1228     }
1229   /* DOS-ish file systems don't like `%' signs in them; we change it
1230      to `@'.  */
1231 #ifdef WINDOWS
1232   {
1233     char *p = file;
1234     for (p = file; *p; p++)
1235       if (*p == '%')
1236         *p = '@';
1237   }
1238 #endif /* WINDOWS */
1239
1240   /* Check the cases in which the unique extensions are not used:
1241      1) Clobbering is turned off (-nc).
1242      2) Retrieval with regetting.
1243      3) Timestamping is used.
1244      4) Hierarchy is built.
1245
1246      The exception is the case when file does exist and is a
1247      directory (actually support for bad httpd-s).  */
1248   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1249       && !(file_exists_p (file) && !file_non_directory_p (file)))
1250     return file;
1251
1252   /* Find a unique name.  */
1253   name = unique_name (file);
1254   free (file);
1255   return name;
1256 }
1257
1258 /* Construct an absolute URL, given a (possibly) relative one.  This
1259    is more tricky than it might seem, but it works.  */
1260 static char *
1261 construct (const char *url, const char *sub, int subsize, int no_proto)
1262 {
1263   char *constr;
1264
1265   if (no_proto)
1266     {
1267       int i;
1268
1269       if (*sub != '/')
1270         {
1271           for (i = strlen (url); i && url[i] != '/'; i--);
1272           if (!i || (url[i] == url[i - 1]))
1273             {
1274               int l = strlen (url);
1275               char *t = (char *)alloca (l + 2);
1276               strcpy (t, url);
1277               t[l] = '/';
1278               t[l + 1] = '\0';
1279               url = t;
1280               i = l;
1281             }
1282           constr = (char *)xmalloc (i + 1 + subsize + 1);
1283           strncpy (constr, url, i + 1);
1284           constr[i + 1] = '\0';
1285           strncat (constr, sub, subsize);
1286         }
1287       else /* *sub == `/' */
1288         {
1289           int fl;
1290
1291           i = 0;
1292           do
1293             {
1294               for (; url[i] && url[i] != '/'; i++);
1295               if (!url[i])
1296                 break;
1297               fl = (url[i] == url[i + 1] && url[i + 1] == '/');
1298               if (fl)
1299                 i += 2;
1300             }
1301           while (fl);
1302           if (!url[i])
1303             {
1304               int l = strlen (url);
1305               char *t = (char *)alloca (l + 2);
1306               strcpy (t, url);
1307               t[l] = '/';
1308               t[l + 1] = '\0';
1309               url = t;
1310             }
1311           constr = (char *)xmalloc (i + 1 + subsize + 1);
1312           strncpy (constr, url, i);
1313           constr[i] = '\0';
1314           strncat (constr + i, sub, subsize);
1315           constr[i + subsize] = '\0';
1316         } /* *sub == `/' */
1317     }
1318   else /* !no_proto */
1319     {
1320       constr = (char *)xmalloc (subsize + 1);
1321       strncpy (constr, sub, subsize);
1322       constr[subsize] = '\0';
1323     }
1324   return constr;
1325 }
1326 \f
1327 /* Optimize URL by host, destructively replacing u->host with realhost
1328    (u->host).  Do this regardless of opt.simple_check.  */
1329 void
1330 opt_url (struct urlinfo *u)
1331 {
1332   /* Find the "true" host.  */
1333   char *host = realhost (u->host);
1334   free (u->host);
1335   u->host = host;
1336   assert (u->dir != NULL);      /* the URL must have been parsed */
1337   /* Refresh the printed representation.  */
1338   free (u->url);
1339   u->url = str_url (u, 0);
1340 }
1341 \f
1342 /* Returns proxy host address, in accordance with PROTO.  */
1343 char *
1344 getproxy (uerr_t proto)
1345 {
1346   if (proto == URLHTTP)
1347     return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1348   else if (proto == URLFTP)
1349     return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1350   else
1351     return NULL;
1352 }
1353
1354 /* Should a host be accessed through proxy, concerning no_proxy?  */
1355 int
1356 no_proxy_match (const char *host, const char **no_proxy)
1357 {
1358   if (!no_proxy)
1359     return 1;
1360   else
1361     return !sufmatch (no_proxy, host);
1362 }
1363 \f
1364 /* Change the links in an HTML document.  Accepts a structure that
1365    defines the positions of all the links.  */
1366 void
1367 convert_links (const char *file, urlpos *l)
1368 {
1369   FILE *fp;
1370   char *buf, *p, *p2;
1371   long size;
1372
1373   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1374   /* Read from the file....  */
1375   fp = fopen (file, "rb");
1376   if (!fp)
1377     {
1378       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1379                  file, strerror (errno));
1380       return;
1381     }
1382   /* ...to a buffer.  */
1383   load_file (fp, &buf, &size);
1384   fclose (fp);
1385   if (opt.backup_converted && downloaded_file(CHECK_FOR_FILE, file))
1386     /* Rather than just writing over the original .html file with the converted
1387        version, save the former to *.orig.  Note we only do this for files we've
1388        _successfully_ downloaded, so we don't clobber .orig files sitting around
1389        from previous invocations. */
1390     {
1391       /* Construct the backup filename as the original name plus ".orig". */
1392       size_t         filename_len = strlen(file);
1393       char*          filename_plus_orig_suffix = malloc(filename_len +
1394                                                         sizeof(".orig"));
1395       boolean        already_wrote_backup_file = FALSE;
1396       slist*         converted_file_ptr;
1397       static slist*  converted_files = NULL;
1398
1399       /* Would a single s[n]printf() call be faster? */
1400       strcpy(filename_plus_orig_suffix, file);
1401       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1402
1403       /* We can get called twice on the same URL thanks to the
1404          convert_all_links() call in main().  If we write the .orig file each
1405          time in such a case, it'll end up containing the first-pass conversion,
1406          not the original file.  So, see if we've already been called on this
1407          file. */
1408       converted_file_ptr = converted_files;
1409       while (converted_file_ptr != NULL)
1410         if (strcmp(converted_file_ptr->string, file) == 0)
1411           {
1412             already_wrote_backup_file = TRUE;
1413             break;
1414           }
1415         else
1416           converted_file_ptr = converted_file_ptr->next;
1417
1418       if (!already_wrote_backup_file)
1419         {
1420           /* Rename <file> to <file>.orig before former gets written over. */
1421           if (rename(file, filename_plus_orig_suffix) != 0)
1422             logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1423                        file, filename_plus_orig_suffix, strerror (errno));
1424
1425           /* Remember that we've already written a .orig backup for this file.
1426              Note that we never free this memory since we need it till the
1427              convert_all_links() call, which is one of the last things the
1428              program does before terminating.  BTW, I'm not sure if it would be
1429              safe to just set 'converted_file_ptr->string' to 'file' below,
1430              rather than making a copy of the string...  Another note is that I
1431              thought I could just add a field to the urlpos structure saying
1432              that we'd written a .orig file for this URL, but that didn't work,
1433              so I had to make this separate list. */
1434           converted_file_ptr = malloc(sizeof(slist));
1435           converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
1436           converted_file_ptr->next = converted_files;
1437           converted_files = converted_file_ptr;
1438         }
1439
1440       free(filename_plus_orig_suffix);
1441     }
1442   /* Now open the file for writing.  */
1443   fp = fopen (file, "wb");
1444   if (!fp)
1445     {
1446       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1447                  file, strerror (errno));
1448       free (buf);
1449       return;
1450     }
1451   /* [If someone understands why multiple URLs can correspond to one local file,
1452      can they please add a comment here...?] */
1453   for (p = buf; l; l = l->next)
1454     {
1455       if (l->pos >= size)
1456         {
1457           DEBUGP (("Something strange is going on.  Please investigate."));
1458           break;
1459         }
1460       /* If the URL already is relative or it is not to be converted
1461          for some other reason (e.g. because of not having been
1462          downloaded in the first place), skip it.  */
1463       if ((l->flags & URELATIVE) || !(l->flags & UABS2REL))
1464         {
1465           DEBUGP (("Skipping %s at position %d (flags %d).\n", l->url,
1466                    l->pos, l->flags));
1467           continue;
1468         }
1469       /* Else, reach the position of the offending URL, echoing
1470          everything up to it to the outfile.  */
1471       for (p2 = buf + l->pos; p < p2; p++)
1472         putc (*p, fp);
1473       if (l->flags & UABS2REL)
1474         {
1475           char *newname = construct_relative (file, l->local_name);
1476           fprintf (fp, "%s", newname);
1477           DEBUGP (("ABS2REL: %s to %s at position %d in %s.\n",
1478                    l->url, newname, l->pos, file));
1479           free (newname);
1480         }
1481       p += l->size;
1482     }
1483   if (p - buf < size)
1484     {
1485       for (p2 = buf + size; p < p2; p++)
1486         putc (*p, fp);
1487     }
1488   fclose (fp);
1489   free (buf);
1490   logputs (LOG_VERBOSE, _("done.\n"));
1491 }
1492
1493 /* Construct and return a malloced copy of the relative link from two
1494    pieces of information: local name S1 of the referring file and
1495    local name S2 of the referred file.
1496
1497    So, if S1 is "jagor.srce.hr/index.html" and S2 is
1498    "jagor.srce.hr/images/news.gif", the function will return
1499    "images/news.gif".
1500
1501    Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1502    "fly.cc.fer.hr/images/fly.gif", the function will return
1503    "../images/fly.gif".
1504
1505    Caveats: S1 should not begin with `/', unless S2 also begins with
1506    '/'.  S1 should not contain things like ".." and such --
1507    construct_relative ("fly/ioccc/../index.html",
1508    "fly/images/fly.gif") will fail.  (A workaround is to call
1509    something like path_simplify() on S1).  */
1510 static char *
1511 construct_relative (const char *s1, const char *s2)
1512 {
1513   int i, cnt, sepdirs1;
1514   char *res;
1515
1516   if (*s2 == '/')
1517     return xstrdup (s2);
1518   /* S1 should *not* be absolute, if S2 wasn't.  */
1519   assert (*s1 != '/');
1520   i = cnt = 0;
1521   /* Skip the directories common to both strings.  */
1522   while (1)
1523     {
1524       while (s1[i] && s2[i]
1525              && (s1[i] == s2[i])
1526              && (s1[i] != '/')
1527              && (s2[i] != '/'))
1528         ++i;
1529       if (s1[i] == '/' && s2[i] == '/')
1530         cnt = ++i;
1531       else
1532         break;
1533     }
1534   for (sepdirs1 = 0; s1[i]; i++)
1535     if (s1[i] == '/')
1536       ++sepdirs1;
1537   /* Now, construct the file as of:
1538      - ../ repeated sepdirs1 time
1539      - all the non-mutual directories of S2.  */
1540   res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1541   for (i = 0; i < sepdirs1; i++)
1542     memcpy (res + 3 * i, "../", 3);
1543   strcpy (res + 3 * i, s2 + cnt);
1544   return res;
1545 }
1546 \f
1547 /* Add URL to the head of the list L.  */
1548 urlpos *
1549 add_url (urlpos *l, const char *url, const char *file)
1550 {
1551   urlpos *t;
1552
1553   t = (urlpos *)xmalloc (sizeof (urlpos));
1554   memset (t, 0, sizeof (*t));
1555   t->url = xstrdup (url);
1556   t->local_name = xstrdup (file);
1557   t->next = l;
1558   return t;
1559 }
1560
1561
1562 /* Remembers which files have been downloaded.  Should be called with
1563    add_or_check == ADD_FILE for each file we actually download successfully
1564    (i.e. not for ones we have failures on or that we skip due to -N).  If you
1565    just want to check if a file has been previously added without adding it,
1566    call with add_or_check == CHECK_FOR_FILE.  Please be sure to call this
1567    function with local filenames, not remote URLs -- by some means that isn't
1568    commented well enough for me understand, multiple remote URLs can apparently
1569    correspond to a single local file. */
1570 boolean
1571 downloaded_file (downloaded_file_t  add_or_check, const char*  file)
1572 {
1573   boolean        found_file = FALSE;
1574   static slist*  downloaded_files = NULL;
1575   slist*         rover = downloaded_files;
1576
1577   while (rover != NULL)
1578     if (strcmp(rover->string, file) == 0)
1579       {
1580         found_file = TRUE;
1581         break;
1582       }
1583     else
1584       rover = rover->next;
1585
1586   if (found_file)
1587     return TRUE;  /* file had already been downloaded */
1588   else
1589     {
1590       if (add_or_check == ADD_FILE)
1591         {
1592           rover = malloc(sizeof(slist));
1593           rover->string = xstrdup(file);  /* die on out-of-mem. */
1594           rover->next = downloaded_files;
1595           downloaded_files = rover;
1596         }
1597
1598       return FALSE;  /* file had not already been downloaded */
1599     }
1600 }