sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with this program; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif
  29 #include <ctype.h>
  30 #include <sys/types.h>
  31 #ifdef HAVE_UNISTD_H
  32 # include <unistd.h>
  33 #endif
  34 #include <errno.h>
  35 #include <assert.h>
  36
  37 #include "wget.h"
  38 #include "utils.h"
  39 #include "url.h"
  40 #include "host.h"
  41 #include "html.h"
  42
  43 #ifndef errno
  44 extern int errno;
  45 #endif
  46
  47 /* Default port definitions */
  48 #define DEFAULT_HTTP_PORT 80
  49 #define DEFAULT_FTP_PORT 21
  50
  51 /* URL separator (for findurl) */
  52 #define URL_SEPARATOR "!\"#'(),>`{}|<>"
  53
  54 /* A list of unsafe characters for encoding, as per RFC1738.  '@' and
  55    ':' (not listed in RFC) were added because of user/password
  56    encoding, and \033 for safe printing.  */
  57
  58 #ifndef WINDOWS
  59 # define URL_UNSAFE " <>\"#%{}|\\^~[]`@:\033"
  60 #else  /* WINDOWS */
  61 # define URL_UNSAFE " <>\"%{}|\\^[]`\033"
  62 #endif /* WINDOWS */
  63
  64 /* If S contains unsafe characters, free it and replace it with a
  65    version that doesn't.  */
  66 #define URL_CLEANSE(s) do                       \
  67 {                                               \
  68   if (contains_unsafe (s))                      \
  69     {                                           \
  70       char *uc_tmp = encode_string (s);         \
  71       free (s);                                 \
  72       (s) = uc_tmp;                             \
  73     }                                           \
  74 } while (0)
  75
  76 /* Is a directory "."?  */
  77 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
  78 /* Is a directory ".."?  */
  79 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  80
  81 /* NULL-terminated list of strings to be recognized as prototypes (URL
  82    schemes).  Note that recognized doesn't mean supported -- only HTTP
  83    and FTP are currently supported.
  84
  85    However, a string that does not match anything in the list will be
  86    considered a relative URL.  Thus it's important that this list has
  87    anything anyone could think of being legal.
  88
  89    There are wild things here.  :-) Take a look at
  90    <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more
  91    fun.  */
  92 static char *protostrings[] =
  93 {
  94   "cid:",
  95   "clsid:",
  96   "file:",
  97   "finger:",
  98   "ftp:",
  99   "gopher:",
 100   "hdl:",
 101   "http:",
 102   "https:",
 103   "ilu:",
 104   "ior:",
 105   "irc:",
 106   "java:",
 107   "javascript:",
 108   "lifn:",
 109   "mailto:",
 110   "mid:",
 111   "news:",
 112   "nntp:",
 113   "path:",
 114   "prospero:",
 115   "rlogin:",
 116   "service:",
 117   "shttp:",
 118   "snews:",
 119   "stanf:",
 120   "telnet:",
 121   "tn3270:",
 122   "wais:",
 123   "whois++:",
 124   NULL
 125 };
 126
 127 struct proto
 128 {
 129   char *name;
 130   uerr_t ind;
 131   unsigned short port;
 132 };
 133
 134 /* Similar to former, but for supported protocols: */
 135 static struct proto sup_protos[] =
 136 {
 137   { "http://", URLHTTP, DEFAULT_HTTP_PORT },
 138   { "ftp://", URLFTP, DEFAULT_FTP_PORT },
 139   /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
 140 };
 141
 142 static void parse_dir PARAMS ((const char *, char **, char **));
 143 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
 144 static char *construct PARAMS ((const char *, const char *, int , int));
 145 static char *construct_relative PARAMS ((const char *, const char *));
 146 static char process_ftp_type PARAMS ((char *));
 147
 148 \f
 149 /* Returns the number of characters to be skipped if the first thing
 150    in a URL is URL: (which is 0 or 4+).  The optional spaces after
 151    URL: are also skipped.  */
 152 int
 153 skip_url (const char *url)
 154 {
 155   int i;
 156
 157   if (toupper (url[0]) == 'U'
 158       && toupper (url[1]) == 'R'
 159       && toupper (url[2]) == 'L'
 160       && url[3] == ':')
 161     {
 162       /* Skip blanks.  */
 163       for (i = 4; url[i] && ISSPACE (url[i]); i++);
 164       return i;
 165     }
 166   else
 167     return 0;
 168 }
 169
 170 /* Returns 1 if the string contains unsafe characters, 0 otherwise.  */
 171 int
 172 contains_unsafe (const char *s)
 173 {
 174   for (; *s; s++)
 175     if (strchr (URL_UNSAFE, *s))
 176       return 1;
 177   return 0;
 178 }
 179
 180 /* Decodes the forms %xy in a URL to the character the hexadecimal
 181    code of which is xy.  xy are hexadecimal digits from
 182    [0123456789ABCDEF] (case-insensitive).  If x or y are not
 183    hex-digits or `%' precedes `\0', the sequence is inserted
 184    literally.  */
 185
 186 static void
 187 decode_string (char *s)
 188 {
 189   char *p = s;
 190
 191   for (; *s; s++, p++)
 192     {
 193       if (*s != '%')
 194         *p = *s;
 195       else
 196         {
 197           /* Do nothing if at the end of the string, or if the chars
 198              are not hex-digits.  */
 199           if (!*(s + 1) || !*(s + 2)
 200               || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
 201             {
 202               *p = *s;
 203               continue;
 204             }
 205           *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
 206           s += 2;
 207         }
 208     }
 209   *p = '\0';
 210 }
 211
 212 /* Encodes the unsafe characters (listed in URL_UNSAFE) in a given
 213    string, returning a malloc-ed %XX encoded string.  */
 214 char *
 215 encode_string (const char *s)
 216 {
 217   const char *b;
 218   char *p, *res;
 219   int i;
 220
 221   b = s;
 222   for (i = 0; *s; s++, i++)
 223     if (strchr (URL_UNSAFE, *s))
 224       i += 2; /* Two more characters (hex digits) */
 225   res = (char *)xmalloc (i + 1);
 226   s = b;
 227   for (p = res; *s; s++)
 228     if (strchr (URL_UNSAFE, *s))
 229       {
 230         const unsigned char c = *s;
 231         *p++ = '%';
 232         *p++ = HEXD2ASC (c >> 4);
 233         *p++ = HEXD2ASC (c & 0xf);
 234       }
 235     else
 236       *p++ = *s;
 237   *p = '\0';
 238   return res;
 239 }
 240 \f
 241 /* Returns the proto-type if URL's protocol is supported, or
 242    URLUNKNOWN if not.  */
 243 uerr_t
 244 urlproto (const char *url)
 245 {
 246   int i;
 247
 248   url += skip_url (url);
 249   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 250     if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
 251       return sup_protos[i].ind;
 252   for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
 253   if (url[i] == ':')
 254     {
 255       for (++i; url[i] && url[i] != '/'; i++)
 256         if (!ISDIGIT (url[i]))
 257           return URLBADPORT;
 258       if (url[i - 1] == ':')
 259         return URLFTP;
 260       else
 261         return URLHTTP;
 262     }
 263   else
 264     return URLHTTP;
 265 }
 266
 267 /* Skip the protocol part of the URL, e.g. `http://'.  If no protocol
 268    part is found, returns 0.  */
 269 int
 270 skip_proto (const char *url)
 271 {
 272   char **s;
 273   int l;
 274
 275   for (s = protostrings; *s; s++)
 276     if (!strncasecmp (*s, url, strlen (*s)))
 277       break;
 278   if (!*s)
 279     return 0;
 280   l = strlen (*s);
 281   /* HTTP and FTP protocols are expected to yield exact host names
 282      (i.e. the `//' part must be skipped, too).  */
 283   if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
 284     l += 2;
 285   return l;
 286 }
 287
 288 /* Returns 1 if the URL begins with a protocol (supported or
 289    unsupported), 0 otherwise.  */
 290 static int
 291 has_proto (const char *url)
 292 {
 293   char **s;
 294
 295   url += skip_url (url);
 296   for (s = protostrings; *s; s++)
 297     if (strncasecmp (url, *s, strlen (*s)) == 0)
 298       return 1;
 299   return 0;
 300 }
 301
 302 /* Skip the username and password, if present here.  The function
 303    should be called *not* with the complete URL, but with the part
 304    right after the protocol.
 305
 306    If no username and password are found, return 0.  */
 307 int
 308 skip_uname (const char *url)
 309 {
 310   const char *p;
 311   for (p = url; *p && *p != '/'; p++)
 312     if (*p == '@')
 313       break;
 314   /* If a `@' was found before the first occurrence of `/', skip
 315      it.  */
 316   if (*p == '@')
 317     return p - url + 1;
 318   else
 319     return 0;
 320 }
 321 \f
 322 /* Allocate a new urlinfo structure, fill it with default values and
 323    return a pointer to it.  */
 324 struct urlinfo *
 325 newurl (void)
 326 {
 327   struct urlinfo *u;
 328
 329   u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
 330   memset (u, 0, sizeof (*u));
 331   u->proto = URLUNKNOWN;
 332   return u;
 333 }
 334
 335 /* Perform a "deep" free of the urlinfo structure.  The structure
 336    should have been created with newurl, but need not have been used.
 337    If free_pointer is non-0, free the pointer itself.  */
 338 void
 339 freeurl (struct urlinfo *u, int complete)
 340 {
 341   assert (u != NULL);
 342   FREE_MAYBE (u->url);
 343   FREE_MAYBE (u->host);
 344   FREE_MAYBE (u->path);
 345   FREE_MAYBE (u->file);
 346   FREE_MAYBE (u->dir);
 347   FREE_MAYBE (u->user);
 348   FREE_MAYBE (u->passwd);
 349   FREE_MAYBE (u->local);
 350   FREE_MAYBE (u->referer);
 351   if (u->proxy)
 352     freeurl (u->proxy, 1);
 353   if (complete)
 354     free (u);
 355   return;
 356 }
 357 \f
 358 /* Extract the given URL of the form
 359    (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
 360    1. hostname (terminated with `/' or `:')
 361    2. port number (terminated with `/'), or chosen for the protocol
 362    3. dirname (everything after hostname)
 363    Most errors are handled.  No allocation is done, you must supply
 364    pointers to allocated memory.
 365    ...and a host of other stuff :-)
 366
 367    - Recognizes hostname:dir/file for FTP and
 368      hostname (:portnum)?/dir/file for HTTP.
 369    - Parses the path to yield directory and file
 370    - Parses the URL to yield the username and passwd (if present)
 371    - Decodes the strings, in case they contain "forbidden" characters
 372    - Writes the result to struct urlinfo
 373
 374    If the argument STRICT is set, it recognizes only the canonical
 375    form.  */
 376 uerr_t
 377 parseurl (const char *url, struct urlinfo *u, int strict)
 378 {
 379   int i, l, abs_ftp;
 380   int recognizable;            /* Recognizable URL is the one where
 381                                   the protocol name was explicitly
 382                                   named, i.e. it wasn't deduced from
 383                                   the URL format.  */
 384   uerr_t type;
 385
 386   DEBUGP (("parseurl (\"%s\") -> ", url));
 387   url += skip_url (url);
 388   recognizable = has_proto (url);
 389   if (strict && !recognizable)
 390     return URLUNKNOWN;
 391   for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
 392     {
 393       l = strlen (sup_protos[i].name);
 394       if (!strncasecmp (sup_protos[i].name, url, l))
 395         break;
 396     }
 397   /* If protocol is recognizable, but unsupported, bail out, else
 398      suppose unknown.  */
 399   if (recognizable && !sup_protos[i].name)
 400     return URLUNKNOWN;
 401   else if (i == ARRAY_SIZE (sup_protos))
 402     type = URLUNKNOWN;
 403   else
 404     u->proto = type = sup_protos[i].ind;
 405
 406   if (type == URLUNKNOWN)
 407     l = 0;
 408   /* Allow a username and password to be specified (i.e. just skip
 409      them for now).  */
 410   if (recognizable)
 411     l += skip_uname (url + l);
 412   for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
 413   if (i == l)
 414     return URLBADHOST;
 415   /* Get the hostname.  */
 416   u->host = strdupdelim (url + l, url + i);
 417   DEBUGP (("host %s -> ", u->host));
 418
 419   /* Assume no port has been given.  */
 420   u->port = 0;
 421   if (url[i] == ':')
 422     {
 423       /* We have a colon delimiting the hostname.  It could mean that
 424          a port number is following it, or a directory.  */
 425       if (ISDIGIT (url[++i]))    /* A port number */
 426         {
 427           if (type == URLUNKNOWN)
 428             u->proto = type = URLHTTP;
 429           for (; url[i] && url[i] != '/'; i++)
 430             if (ISDIGIT (url[i]))
 431               u->port = 10 * u->port + (url[i] - '0');
 432             else
 433               return URLBADPORT;
 434           if (!u->port)
 435             return URLBADPORT;
 436           DEBUGP (("port %hu -> ", u->port));
 437         }
 438       else if (type == URLUNKNOWN) /* or a directory */
 439         u->proto = type = URLFTP;
 440       else                      /* or just a misformed port number */
 441         return URLBADPORT;
 442     }
 443   else if (type == URLUNKNOWN)
 444     u->proto = type = URLHTTP;
 445   if (!u->port)
 446     {
 447       int i;
 448       for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 449         if (sup_protos[i].ind == type)
 450           break;
 451       if (i == ARRAY_SIZE (sup_protos))
 452         return URLUNKNOWN;
 453       u->port = sup_protos[i].port;
 454     }
 455   /* Some delimiter troubles...  */
 456   if (url[i] == '/' && url[i - 1] != ':')
 457     ++i;
 458   if (type == URLHTTP)
 459     while (url[i] && url[i] == '/')
 460       ++i;
 461
 462   /* dfb: break "path" into "path" and "qstring" if the URL is HTTP
 463      if it's not an HTTP url, set l to the last character, so the
 464      xmalloc and strncpy work as desired */
 465   if (type == URLHTTP) {
 466     for (l = i; url[l] && url[l] != '?'; l++);
 467     if (l != strlen(url)) {
 468       /* copy the query string, including the '?' into u->qstring */
 469       u->qstring = (char *)xmalloc (strlen (url + l) + 8);
 470       strcpy (u->qstring, url + l);
 471     }
 472   } else {
 473     l = strlen(url);
 474   }
 475
 476
 477   u->path = strdupdelim (url + i, url + l);
 478   if (type == URLFTP)
 479     {
 480       u->ftp_type = process_ftp_type (u->path);
 481       /* #### We don't handle type `d' correctly yet.  */
 482       if (!u->ftp_type || toupper (u->ftp_type) == 'D')
 483         u->ftp_type = 'I';
 484     }
 485   DEBUGP (("opath %s -> ", u->path));
 486   /* Parse the username and password (if existing).  */
 487   parse_uname (url, &u->user, &u->passwd);
 488   /* Decode the strings, as per RFC 1738.  */
 489   decode_string (u->host);
 490   decode_string (u->path);
 491   if (u->user)
 492     decode_string (u->user);
 493   if (u->passwd)
 494     decode_string (u->passwd);
 495   /* Parse the directory.  */
 496   parse_dir (u->path, &u->dir, &u->file);
 497   DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
 498   if (type == URLHTTP && u->qstring)
 499     DEBUGP (("query-string %s -> ", u->qstring));
 500   /* Simplify the directory.  */
 501   path_simplify (u->dir);
 502   /* Remove the leading `/' in HTTP.  */
 503   if (type == URLHTTP && *u->dir == '/')
 504     strcpy (u->dir, u->dir + 1);
 505   DEBUGP (("ndir %s\n", u->dir));
 506   /* Strip trailing `/'.  */
 507   l = strlen (u->dir);
 508   if (l && u->dir[l - 1] == '/')
 509     u->dir[l - 1] = '\0';
 510   /* Re-create the path: */
 511   abs_ftp = (u->proto == URLFTP && *u->dir == '/');
 512   /*  sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
 513       abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
 514   strcpy (u->path, abs_ftp ? "%2F" : "/");
 515   strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
 516   strcat (u->path, *u->dir ? "/" : "");
 517   strcat (u->path, u->file);
 518   URL_CLEANSE (u->path);
 519   /* Create the clean URL.  */
 520   u->url = str_url (u, 0);
 521   return URLOK;
 522 }
 523 \f
 524 /* Build the directory and filename components of the path.  Both
 525    components are *separately* malloc-ed strings!  It does not change
 526    the contents of path.
 527
 528    If the path ends with "." or "..", they are (correctly) counted as
 529    directories.  */
 530 static void
 531 parse_dir (const char *path, char **dir, char **file)
 532 {
 533   int i, l;
 534
 535   for (i = l = strlen (path); i && path[i] != '/'; i--);
 536   if (!i && *path != '/')   /* Just filename */
 537     {
 538       if (DOTP (path) || DDOTP (path))
 539         {
 540           *dir = xstrdup (path);
 541           *file = xstrdup ("");
 542         }
 543       else
 544         {
 545           *dir = xstrdup ("");     /* This is required because of FTP */
 546           *file = xstrdup (path);
 547         }
 548     }
 549   else if (!i)                 /* /filename */
 550     {
 551       if (DOTP (path + 1) || DDOTP (path + 1))
 552         {
 553           *dir = xstrdup (path);
 554           *file = xstrdup ("");
 555         }
 556       else
 557         {
 558           *dir = xstrdup ("/");
 559           *file = xstrdup (path + 1);
 560         }
 561     }
 562   else /* Nonempty directory with or without a filename */
 563     {
 564       if (DOTP (path + i + 1) || DDOTP (path + i + 1))
 565         {
 566           *dir = xstrdup (path);
 567           *file = xstrdup ("");
 568         }
 569       else
 570         {
 571           *dir = strdupdelim (path, path + i);
 572           *file = strdupdelim (path + i + 1, path + l + 1);
 573         }
 574     }
 575 }
 576
 577 /* Find the optional username and password within the URL, as per
 578    RFC1738.  The returned user and passwd char pointers are
 579    malloc-ed.  */
 580 static uerr_t
 581 parse_uname (const char *url, char **user, char **passwd)
 582 {
 583   int l;
 584   const char *p, *col;
 585   char **where;
 586
 587   *user = NULL;
 588   *passwd = NULL;
 589   url += skip_url (url);
 590   /* Look for end of protocol string.  */
 591   l = skip_proto (url);
 592   if (!l)
 593     return URLUNKNOWN;
 594   /* Add protocol offset.  */
 595   url += l;
 596   /* Is there an `@' character?  */
 597   for (p = url; *p && *p != '/'; p++)
 598     if (*p == '@')
 599       break;
 600   /* If not, return.  */
 601   if (*p != '@')
 602     return URLOK;
 603   /* Else find the username and password.  */
 604   for (p = col = url; *p != '@'; p++)
 605     {
 606       if (*p == ':' && !*user)
 607         {
 608           *user = (char *)xmalloc (p - url + 1);
 609           memcpy (*user, url, p - url);
 610           (*user)[p - url] = '\0';
 611           col = p + 1;
 612         }
 613     }
 614   /* Decide whether you have only the username or both.  */
 615   where = *user ? passwd : user;
 616   *where = (char *)xmalloc (p - col + 1);
 617   memcpy (*where, col, p - col);
 618   (*where)[p - col] = '\0';
 619   return URLOK;
 620 }
 621
 622 /* If PATH ends with `;type=X', return the character X.  */
 623 static char
 624 process_ftp_type (char *path)
 625 {
 626   int len = strlen (path);
 627
 628   if (len >= 7
 629       && !memcmp (path + len - 7, ";type=", 6))
 630     {
 631       path[len - 7] = '\0';
 632       return path[len - 1];
 633     }
 634   else
 635     return '\0';
 636 }
 637 \f
 638 /* Return the URL as fine-formed string, with a proper protocol, port
 639    number, directory and optional user/password.  If HIDE is non-zero,
 640    password will be hidden.  The forbidden characters in the URL will
 641    be cleansed.  */
 642 char *
 643 str_url (const struct urlinfo *u, int hide)
 644 {
 645   char *res, *host, *user, *passwd, *proto_name, *dir, *file;
 646   int i, l, ln, lu, lh, lp, lf, ld, lq;
 647
 648   /* Look for the protocol name.  */
 649   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 650     if (sup_protos[i].ind == u->proto)
 651       break;
 652   if (i == ARRAY_SIZE (sup_protos))
 653     return NULL;
 654   proto_name = sup_protos[i].name;
 655   host = CLEANDUP (u->host);
 656   dir = CLEANDUP (u->dir);
 657   file = CLEANDUP (u->file);
 658   user = passwd = NULL;
 659   if (u->user)
 660     user = CLEANDUP (u->user);
 661   if (u->passwd)
 662     {
 663       int i;
 664       passwd = CLEANDUP (u->passwd);
 665       if (hide)
 666         for (i = 0; passwd[i]; i++)
 667           passwd[i] = 'x';
 668     }
 669   if (u->proto == URLFTP && *dir == '/')
 670     {
 671       char *tmp = (char *)xmalloc (strlen (dir) + 3);
 672       /*sprintf (tmp, "%%2F%s", dir + 1);*/
 673       *tmp = '%';
 674       tmp[1] = '2';
 675       tmp[2] = 'F';
 676       strcpy (tmp + 3, dir + 1);
 677       free (dir);
 678       dir = tmp;
 679     }
 680
 681   ln = strlen (proto_name);
 682   lu = user ? strlen (user) : 0;
 683   lp = passwd ? strlen (passwd) : 0;
 684   lh = strlen (host);
 685   ld = strlen (dir);
 686   lf = strlen (file);
 687   lq = (u->proto == URLHTTP && u->qstring) ? strlen (u->qstring) : 0;
 688   res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + lq + 20); /* safe sex */
 689   /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
 690      (user ? user : ""), (passwd ? ":" : ""),
 691      (passwd ? passwd : ""), (user ? "@" : ""),
 692      host, u->port, dir, *dir ? "/" : "", file); */
 693   l = 0;
 694   memcpy (res, proto_name, ln);
 695   l += ln;
 696   if (user)
 697     {
 698       memcpy (res + l, user, lu);
 699       l += lu;
 700       if (passwd)
 701         {
 702           res[l++] = ':';
 703           memcpy (res + l, passwd, lp);
 704           l += lp;
 705         }
 706       res[l++] = '@';
 707     }
 708   memcpy (res + l, host, lh);
 709   l += lh;
 710   res[l++] = ':';
 711   long_to_string (res + l, (long)u->port);
 712   l += numdigit (u->port);
 713   res[l++] = '/';
 714   memcpy (res + l, dir, ld);
 715   l += ld;
 716   if (*dir)
 717     res[l++] = '/';
 718   strcpy (res + l, file);
 719   l += lf;
 720   free (host);
 721   free (dir);
 722   free (file);
 723   if (u->qstring)
 724     {
 725       /* copy in the raw query string to avoid munging arguments */
 726       memcpy (res + l, u->qstring, lq);
 727     }
 728   FREE_MAYBE (user);
 729   FREE_MAYBE (passwd);
 730   return res;
 731 }
 732
 733 /* Check whether two URL-s are equivalent, i.e. pointing to the same
 734    location.  Uses parseurl to parse them, and compares the canonical
 735    forms.
 736
 737    Returns 1 if the URL1 is equivalent to URL2, 0 otherwise.  Also
 738    return 0 on error.  */
 739 int
 740 url_equal (const char *url1, const char *url2)
 741 {
 742   struct urlinfo *u1, *u2;
 743   uerr_t err;
 744   int res;
 745
 746   u1 = newurl ();
 747   err = parseurl (url1, u1, 0);
 748   if (err != URLOK)
 749     {
 750       freeurl (u1, 1);
 751       return 0;
 752     }
 753   u2 = newurl ();
 754   err = parseurl (url2, u2, 0);
 755   if (err != URLOK)
 756     {
 757       freeurl (u2, 1);
 758       return 0;
 759     }
 760   res = !strcmp (u1->url, u2->url);
 761   freeurl (u1, 1);
 762   freeurl (u2, 1);
 763   return res;
 764 }
 765 \f
 766 /* Find URL of format scheme:hostname[:port]/dir in a buffer.  The
 767    buffer may contain pretty much anything; no errors are signaled.  */
 768 static const char *
 769 findurl (const char *buf, int howmuch, int *count)
 770 {
 771   char **prot;
 772   const char *s1, *s2;
 773
 774   for (s1 = buf; howmuch; s1++, howmuch--)
 775     for (prot = protostrings; *prot; prot++)
 776       if (howmuch <= strlen (*prot))
 777         continue;
 778       else if (!strncasecmp (*prot, s1, strlen (*prot)))
 779         {
 780           for (s2 = s1, *count = 0;
 781                howmuch && *s2 && *s2 >= 32 && *s2 < 127 && !ISSPACE (*s2) &&
 782                  !strchr (URL_SEPARATOR, *s2);
 783                s2++, (*count)++, howmuch--);
 784           return s1;
 785         }
 786   return NULL;
 787 }
 788
 789 /* Scans the file for signs of URL-s.  Returns a vector of pointers,
 790    each pointer representing a URL string.  The file is *not* assumed
 791    to be HTML.  */
 792 urlpos *
 793 get_urls_file (const char *file)
 794 {
 795   long nread;
 796   FILE *fp;
 797   char *buf;
 798   const char *pbuf;
 799   int size;
 800   urlpos *first, *current, *old;
 801
 802   if (file && !HYPHENP (file))
 803     {
 804       fp = fopen (file, "rb");
 805       if (!fp)
 806         {
 807           logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 808           return NULL;
 809         }
 810     }
 811   else
 812     fp = stdin;
 813   /* Load the file.  */
 814   load_file (fp, &buf, &nread);
 815   if (file && !HYPHENP (file))
 816     fclose (fp);
 817   DEBUGP (("Loaded %s (size %ld).\n", file, nread));
 818   first = current = NULL;
 819   /* Fill the linked list with URLs.  */
 820   for (pbuf = buf; (pbuf = findurl (pbuf, nread - (pbuf - buf), &size));
 821        pbuf += size)
 822     {
 823       /* Allocate the space.  */
 824       old = current;
 825       current = (urlpos *)xmalloc (sizeof (urlpos));
 826       if (old)
 827         old->next = current;
 828       memset (current, 0, sizeof (*current));
 829       current->next = NULL;
 830       current->url = (char *)xmalloc (size + 1);
 831       memcpy (current->url, pbuf, size);
 832       current->url[size] = '\0';
 833       if (!first)
 834         first = current;
 835     }
 836   /* Free the buffer.  */
 837   free (buf);
 838
 839   return first;
 840 }
 841
 842 /* Similar to get_urls_file, but for HTML files.  FILE is scanned as
 843    an HTML document using htmlfindurl(), which see.  get_urls_html()
 844    constructs the HTML-s from the relative href-s.
 845
 846    If SILENT is non-zero, do not barf on baseless relative links.  */
 847 urlpos *
 848 get_urls_html (const char *file, const char *this_url, int silent)
 849 {
 850   long nread;
 851   FILE *fp;
 852   char *orig_buf;
 853   const char *buf;
 854   int step, first_time;
 855   urlpos *first, *current, *old;
 856
 857   if (file && !HYPHENP (file))
 858     {
 859       fp = fopen (file, "rb");
 860       if (!fp)
 861         {
 862           logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 863           return NULL;
 864         }
 865     }
 866   else
 867     fp = stdin;
 868   /* Load the file.  */
 869   load_file (fp, &orig_buf, &nread);
 870   if (file && !HYPHENP (file))
 871     fclose (fp);
 872   DEBUGP (("Loaded HTML file %s (size %ld).\n", file, nread));
 873   first = current = NULL;
 874   first_time = 1;
 875   /* Iterate over the URLs in BUF, picked by htmlfindurl().  */
 876   for (buf = orig_buf;
 877        (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time));
 878        buf += step)
 879     {
 880       int i, no_proto;
 881       int size = step;
 882       const char *pbuf = buf;
 883       char *constr, *base;
 884       const char *cbase;
 885
 886       first_time = 0;
 887
 888       /* A frequent phenomenon that needs to be handled are pages
 889          generated by brain-damaged HTML generators, which refer to to
 890          URI-s as <a href="<spaces>URI<spaces>">.  We simply ignore
 891          any spaces at the beginning or at the end of the string.
 892          This is probably not strictly correct, but that's what the
 893          browsers do, so we may follow.  May the authors of "WYSIWYG"
 894          HTML tools burn in hell for the damage they've inflicted!  */
 895       while ((pbuf < buf + step) && ISSPACE (*pbuf))
 896         {
 897           ++pbuf;
 898           --size;
 899         }
 900       while (size && ISSPACE (pbuf[size - 1]))
 901         --size;
 902       if (!size)
 903         break;
 904
 905       for (i = 0; protostrings[i]; i++)
 906         {
 907           if (!strncasecmp (protostrings[i], pbuf,
 908                             MINVAL (strlen (protostrings[i]), size)))
 909             break;
 910         }
 911       /* Check for http:RELATIVE_URI.  See below for details.  */
 912       if (protostrings[i]
 913           && !(strncasecmp (pbuf, "http:", 5) == 0
 914                && strncasecmp (pbuf, "http://", 7) != 0))
 915         {
 916           no_proto = 0;
 917         }
 918       else
 919         {
 920           no_proto = 1;
 921           /* This is for extremely brain-damaged pages that refer to
 922              relative URI-s as <a href="http:URL">.  Just strip off the
 923              silly leading "http:" (as well as any leading blanks
 924              before it).  */
 925           if ((size > 5) && !strncasecmp ("http:", pbuf, 5))
 926             pbuf += 5, size -= 5;
 927         }
 928       if (!no_proto)
 929         {
 930           for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 931             {
 932               if (!strncasecmp (sup_protos[i].name, pbuf,
 933                                MINVAL (strlen (sup_protos[i].name), size)))
 934                 break;
 935             }
 936           /* Do *not* accept a non-supported protocol.  */
 937           if (i == ARRAY_SIZE (sup_protos))
 938             continue;
 939         }
 940       if (no_proto)
 941         {
 942           /* First, construct the base, which can be relative itself.
 943
 944              Criteria for creating the base are:
 945              1) html_base created by <base href="...">
 946              2) current URL
 947              3) base provided from the command line */
 948           cbase = html_base ();
 949           if (!cbase)
 950             cbase = this_url;
 951           if (!cbase)
 952             cbase = opt.base_href;
 953           if (!cbase)             /* Error condition -- a baseless
 954                                      relative link.  */
 955             {
 956               if (!opt.quiet && !silent)
 957                 {
 958                   /* Use malloc, not alloca because this is called in
 959                      a loop. */
 960                   char *temp = (char *)malloc (size + 1);
 961                   strncpy (temp, pbuf, size);
 962                   temp[size] = '\0';
 963                   logprintf (LOG_NOTQUIET,
 964                              _("Error (%s): Link %s without a base provided.\n"),
 965                              file, temp);
 966                   free (temp);
 967                 }
 968               continue;
 969             }
 970           if (this_url)
 971             base = construct (this_url, cbase, strlen (cbase),
 972                               !has_proto (cbase));
 973           else
 974             {
 975               /* Base must now be absolute, with host name and
 976                  protocol.  */
 977               if (!has_proto (cbase))
 978                 {
 979                   logprintf (LOG_NOTQUIET, _("\
 980 Error (%s): Base %s relative, without referer URL.\n"),
 981                              file, cbase);
 982                   continue;
 983                 }
 984               base = xstrdup (cbase);
 985             }
 986           constr = construct (base, pbuf, size, no_proto);
 987           free (base);
 988         }
 989       else /* has proto */
 990         {
 991           constr = (char *)xmalloc (size + 1);
 992           strncpy (constr, pbuf, size);
 993           constr[size] = '\0';
 994         }
 995 #ifdef DEBUG
 996       if (opt.debug)
 997         {
 998           char *tmp;
 999           const char *tmp2;
1000
1001           tmp2 = html_base ();
1002           /* Use malloc, not alloca because this is called in a loop. */
1003           tmp = (char *)xmalloc (size + 1);
1004           strncpy (tmp, pbuf, size);
1005           tmp[size] = '\0';
1006           logprintf (LOG_ALWAYS,
1007                      "file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
1008                      file, this_url ? this_url : "(null)",
1009                      tmp2 ? tmp2 : "(null)", tmp, constr);
1010           free (tmp);
1011         }
1012 #endif
1013
1014       /* Allocate the space.  */
1015       old = current;
1016       current = (urlpos *)xmalloc (sizeof (urlpos));
1017       if (old)
1018         old->next = current;
1019       if (!first)
1020         first = current;
1021       /* Fill the values.  */
1022       memset (current, 0, sizeof (*current));
1023       current->next = NULL;
1024       current->url = constr;
1025       current->size = size;
1026       current->pos = pbuf - orig_buf;
1027       /* A URL is relative if the host and protocol are not named,
1028          and the name does not start with `/'.  */
1029       if (no_proto && *pbuf != '/')
1030         current->flags |= (URELATIVE | UNOPROTO);
1031       else if (no_proto)
1032         current->flags |= UNOPROTO;
1033     }
1034   free (orig_buf);
1035
1036   return first;
1037 }
1038 \f
1039 /* Free the linked list of urlpos.  */
1040 void
1041 free_urlpos (urlpos *l)
1042 {
1043   while (l)
1044     {
1045       urlpos *next = l->next;
1046       free (l->url);
1047       FREE_MAYBE (l->local_name);
1048       free (l);
1049       l = next;
1050     }
1051 }
1052
1053 /* Rotate FNAME opt.backups times */
1054 void
1055 rotate_backups(const char *fname)
1056 {
1057   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1058   char *from = (char *)alloca (maxlen);
1059   char *to = (char *)alloca (maxlen);
1060   struct stat sb;
1061   int i;
1062
1063   if (stat (fname, &sb) == 0)
1064     if (S_ISREG (sb.st_mode) == 0)
1065       return;
1066
1067   for (i = opt.backups; i > 1; i--)
1068     {
1069       sprintf (from, "%s.%d", fname, i - 1);
1070       sprintf (to, "%s.%d", fname, i);
1071       /* #### This will fail on machines without the rename() system
1072          call.  */
1073       rename (from, to);
1074     }
1075
1076   sprintf (to, "%s.%d", fname, 1);
1077   rename(fname, to);
1078 }
1079
1080 /* Create all the necessary directories for PATH (a file).  Calls
1081    mkdirhier() internally.  */
1082 int
1083 mkalldirs (const char *path)
1084 {
1085   const char *p;
1086   char *t;
1087   struct stat st;
1088   int res;
1089
1090   p = path + strlen (path);
1091   for (; *p != '/' && p != path; p--);
1092   /* Don't create if it's just a file.  */
1093   if ((p == path) && (*p != '/'))
1094     return 0;
1095   t = strdupdelim (path, p);
1096   /* Check whether the directory exists.  */
1097   if ((stat (t, &st) == 0))
1098     {
1099       if (S_ISDIR (st.st_mode))
1100         {
1101           free (t);
1102           return 0;
1103         }
1104       else
1105         {
1106           /* If the dir exists as a file name, remove it first.  This
1107              is *only* for Wget to work with buggy old CERN http
1108              servers.  Here is the scenario: When Wget tries to
1109              retrieve a directory without a slash, e.g.
1110              http://foo/bar (bar being a directory), CERN server will
1111              not redirect it too http://foo/bar/ -- it will generate a
1112              directory listing containing links to bar/file1,
1113              bar/file2, etc.  Wget will lose because it saves this
1114              HTML listing to a file `bar', so it cannot create the
1115              directory.  To work around this, if the file of the same
1116              name exists, we just remove it and create the directory
1117              anyway.  */
1118           DEBUGP (("Removing %s because of directory danger!\n", t));
1119           unlink (t);
1120         }
1121     }
1122   res = make_directory (t);
1123   if (res != 0)
1124     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1125   free (t);
1126   return res;
1127 }
1128
1129 static int
1130 count_slashes (const char *s)
1131 {
1132   int i = 0;
1133   while (*s)
1134     if (*s++ == '/')
1135       ++i;
1136   return i;
1137 }
1138
1139 /* Return the path name of the URL-equivalent file name, with a
1140    remote-like structure of directories.  */
1141 static char *
1142 mkstruct (const struct urlinfo *u)
1143 {
1144   char *host, *dir, *file, *res, *dirpref;
1145   int l;
1146
1147   assert (u->dir != NULL);
1148   assert (u->host != NULL);
1149
1150   if (opt.cut_dirs)
1151     {
1152       char *ptr = u->dir + (*u->dir == '/');
1153       int slash_count = 1 + count_slashes (ptr);
1154       int cut = MINVAL (opt.cut_dirs, slash_count);
1155       for (; cut && *ptr; ptr++)
1156         if (*ptr == '/')
1157           --cut;
1158       STRDUP_ALLOCA (dir, ptr);
1159     }
1160   else
1161     dir = u->dir + (*u->dir == '/');
1162
1163   host = xstrdup (u->host);
1164   /* Check for the true name (or at least a consistent name for saving
1165      to directory) of HOST, reusing the hlist if possible.  */
1166   if (opt.add_hostdir && !opt.simple_check)
1167     {
1168       char *nhost = realhost (host);
1169       free (host);
1170       host = nhost;
1171     }
1172   /* Add dir_prefix and hostname (if required) to the beginning of
1173      dir.  */
1174   if (opt.add_hostdir)
1175     {
1176       if (!DOTP (opt.dir_prefix))
1177         {
1178           dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1179                                     + strlen (host) + 1);
1180           sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
1181         }
1182       else
1183         STRDUP_ALLOCA (dirpref, host);
1184     }
1185   else                         /* not add_hostdir */
1186     {
1187       if (!DOTP (opt.dir_prefix))
1188         dirpref = opt.dir_prefix;
1189       else
1190         dirpref = "";
1191     }
1192   free (host);
1193
1194   /* If there is a prefix, prepend it.  */
1195   if (*dirpref)
1196     {
1197       char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1198       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1199       dir = newdir;
1200     }
1201   dir = xstrdup (dir);
1202   URL_CLEANSE (dir);
1203   l = strlen (dir);
1204   if (l && dir[l - 1] == '/')
1205     dir[l - 1] = '\0';
1206
1207   if (!*u->file)
1208     file = "index.html";
1209   else
1210     file = u->file;
1211
1212   /* Finally, construct the full name.  */
1213   res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1214   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1215   free (dir);
1216   return res;
1217 }
1218
1219 /* Create a unique filename, corresponding to a given URL.  Calls
1220    mkstruct if necessary.  Does *not* actually create any directories.  */
1221 char *
1222 url_filename (const struct urlinfo *u)
1223 {
1224   char *file, *name;
1225   int have_prefix = 0;          /* whether we must prepend opt.dir_prefix */
1226
1227   if (opt.dirstruct)
1228     {
1229       file = mkstruct (u);
1230       have_prefix = 1;
1231     }
1232   else
1233     {
1234       if (!*u->file)
1235         file = xstrdup ("index.html");
1236       else
1237         file = xstrdup (u->file);
1238     }
1239
1240   if (!have_prefix)
1241     {
1242       /* Check whether the prefix directory is something other than "."
1243          before prepending it.  */
1244       if (!DOTP (opt.dir_prefix))
1245         {
1246           char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1247                                          + 1 + strlen (file) + 1);
1248           sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1249           free (file);
1250           file = nfile;
1251         }
1252     }
1253   /* DOS-ish file systems don't like `%' signs in them; we change it
1254      to `@'.  */
1255 #ifdef WINDOWS
1256   {
1257     char *p = file;
1258     for (p = file; *p; p++)
1259       if (*p == '%')
1260         *p = '@';
1261   }
1262 #endif /* WINDOWS */
1263
1264   /* Check the cases in which the unique extensions are not used:
1265      1) Clobbering is turned off (-nc).
1266      2) Retrieval with regetting.
1267      3) Timestamping is used.
1268      4) Hierarchy is built.
1269
1270      The exception is the case when file does exist and is a
1271      directory (actually support for bad httpd-s).  */
1272   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1273       && !(file_exists_p (file) && !file_non_directory_p (file)))
1274     return file;
1275
1276   /* Find a unique name.  */
1277   name = unique_name (file);
1278   free (file);
1279   return name;
1280 }
1281
1282 /* Construct an absolute URL, given a (possibly) relative one.  This
1283    is more tricky than it might seem, but it works.  */
1284 static char *
1285 construct (const char *url, const char *sub, int subsize, int no_proto)
1286 {
1287   char *constr;
1288
1289   if (no_proto)
1290     {
1291       int i;
1292
1293       if (*sub != '/')
1294         {
1295           for (i = strlen (url); i && url[i] != '/'; i--);
1296           if (!i || (url[i] == url[i - 1]))
1297             {
1298               int l = strlen (url);
1299               char *t = (char *)alloca (l + 2);
1300               strcpy (t, url);
1301               t[l] = '/';
1302               t[l + 1] = '\0';
1303               url = t;
1304               i = l;
1305             }
1306           constr = (char *)xmalloc (i + 1 + subsize + 1);
1307           strncpy (constr, url, i + 1);
1308           constr[i + 1] = '\0';
1309           strncat (constr, sub, subsize);
1310         }
1311       else /* *sub == `/' */
1312         {
1313           int fl;
1314
1315           i = 0;
1316           do
1317             {
1318               for (; url[i] && url[i] != '/'; i++);
1319               if (!url[i])
1320                 break;
1321               fl = (url[i] == url[i + 1] && url[i + 1] == '/');
1322               if (fl)
1323                 i += 2;
1324             }
1325           while (fl);
1326           if (!url[i])
1327             {
1328               int l = strlen (url);
1329               char *t = (char *)alloca (l + 2);
1330               strcpy (t, url);
1331               t[l] = '/';
1332               t[l + 1] = '\0';
1333               url = t;
1334             }
1335           constr = (char *)xmalloc (i + 1 + subsize + 1);
1336           strncpy (constr, url, i);
1337           constr[i] = '\0';
1338           strncat (constr + i, sub, subsize);
1339           constr[i + subsize] = '\0';
1340         } /* *sub == `/' */
1341     }
1342   else /* !no_proto */
1343     {
1344       constr = (char *)xmalloc (subsize + 1);
1345       strncpy (constr, sub, subsize);
1346       constr[subsize] = '\0';
1347     }
1348   return constr;
1349 }
1350 \f
1351 /* Optimize URL by host, destructively replacing u->host with realhost
1352    (u->host).  Do this regardless of opt.simple_check.  */
1353 void
1354 opt_url (struct urlinfo *u)
1355 {
1356   /* Find the "true" host.  */
1357   char *host = realhost (u->host);
1358   free (u->host);
1359   u->host = host;
1360   assert (u->dir != NULL);      /* the URL must have been parsed */
1361   /* Refresh the printed representation.  */
1362   free (u->url);
1363   u->url = str_url (u, 0);
1364 }
1365 \f
1366 /* Returns proxy host address, in accordance with PROTO.  */
1367 char *
1368 getproxy (uerr_t proto)
1369 {
1370   if (proto == URLHTTP)
1371     return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1372   else if (proto == URLFTP)
1373     return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1374   else
1375     return NULL;
1376 }
1377
1378 /* Should a host be accessed through proxy, concerning no_proxy?  */
1379 int
1380 no_proxy_match (const char *host, const char **no_proxy)
1381 {
1382   if (!no_proxy)
1383     return 1;
1384   else
1385     return !sufmatch (no_proxy, host);
1386 }
1387 \f
1388 /* Change the links in an HTML document.  Accepts a structure that
1389    defines the positions of all the links.  */
1390 void
1391 convert_links (const char *file, urlpos *l)
1392 {
1393   FILE *fp;
1394   char *buf, *p, *p2;
1395   long size;
1396
1397   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1398   /* Read from the file....  */
1399   fp = fopen (file, "rb");
1400   if (!fp)
1401     {
1402       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1403                  file, strerror (errno));
1404       return;
1405     }
1406   /* ...to a buffer.  */
1407   load_file (fp, &buf, &size);
1408   fclose (fp);
1409   if (opt.backup_converted && downloaded_file(CHECK_FOR_FILE, file))
1410     /* Rather than just writing over the original .html file with the converted
1411        version, save the former to *.orig.  Note we only do this for files we've
1412        _successfully_ downloaded, so we don't clobber .orig files sitting around
1413        from previous invocations. */
1414     {
1415       /* Construct the backup filename as the original name plus ".orig". */
1416       size_t         filename_len = strlen(file);
1417       char*          filename_plus_orig_suffix = malloc(filename_len +
1418                                                         sizeof(".orig"));
1419       boolean        already_wrote_backup_file = FALSE;
1420       slist*         converted_file_ptr;
1421       static slist*  converted_files = NULL;
1422
1423       /* Would a single s[n]printf() call be faster? */
1424       strcpy(filename_plus_orig_suffix, file);
1425       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1426
1427       /* We can get called twice on the same URL thanks to the
1428          convert_all_links() call in main().  If we write the .orig file each
1429          time in such a case, it'll end up containing the first-pass conversion,
1430          not the original file.  So, see if we've already been called on this
1431          file. */
1432       converted_file_ptr = converted_files;
1433       while (converted_file_ptr != NULL)
1434         if (strcmp(converted_file_ptr->string, file) == 0)
1435           {
1436             already_wrote_backup_file = TRUE;
1437             break;
1438           }
1439         else
1440           converted_file_ptr = converted_file_ptr->next;
1441
1442       if (!already_wrote_backup_file)
1443         {
1444           /* Rename <file> to <file>.orig before former gets written over. */
1445           if (rename(file, filename_plus_orig_suffix) != 0)
1446             logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1447                        file, filename_plus_orig_suffix, strerror (errno));
1448
1449           /* Remember that we've already written a .orig backup for this file.
1450              Note that we never free this memory since we need it till the
1451              convert_all_links() call, which is one of the last things the
1452              program does before terminating.  BTW, I'm not sure if it would be
1453              safe to just set 'converted_file_ptr->string' to 'file' below,
1454              rather than making a copy of the string...  Another note is that I
1455              thought I could just add a field to the urlpos structure saying
1456              that we'd written a .orig file for this URL, but that didn't work,
1457              so I had to make this separate list. */
1458           converted_file_ptr = malloc(sizeof(slist));
1459           converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
1460           converted_file_ptr->next = converted_files;
1461           converted_files = converted_file_ptr;
1462         }
1463
1464       free(filename_plus_orig_suffix);
1465     }
1466   /* Now open the file for writing.  */
1467   fp = fopen (file, "wb");
1468   if (!fp)
1469     {
1470       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1471                  file, strerror (errno));
1472       free (buf);
1473       return;
1474     }
1475   /* [If someone understands why multiple URLs can correspond to one local file,
1476      can they please add a comment here...?] */
1477   for (p = buf; l; l = l->next)
1478     {
1479       if (l->pos >= size)
1480         {
1481           DEBUGP (("Something strange is going on.  Please investigate."));
1482           break;
1483         }
1484       /* If the URL already is relative or it is not to be converted
1485          for some other reason (e.g. because of not having been
1486          downloaded in the first place), skip it.  */
1487       if ((l->flags & URELATIVE) || !(l->flags & UABS2REL))
1488         {
1489           DEBUGP (("Skipping %s at position %d (flags %d).\n", l->url,
1490                    l->pos, l->flags));
1491           continue;
1492         }
1493       /* Else, reach the position of the offending URL, echoing
1494          everything up to it to the outfile.  */
1495       for (p2 = buf + l->pos; p < p2; p++)
1496         putc (*p, fp);
1497       if (l->flags & UABS2REL)
1498         {
1499           char *newname = construct_relative (file, l->local_name);
1500           fprintf (fp, "%s", newname);
1501           DEBUGP (("ABS2REL: %s to %s at position %d in %s.\n",
1502                    l->url, newname, l->pos, file));
1503           free (newname);
1504         }
1505       p += l->size;
1506     }
1507   if (p - buf < size)
1508     {
1509       for (p2 = buf + size; p < p2; p++)
1510         putc (*p, fp);
1511     }
1512   fclose (fp);
1513   free (buf);
1514   logputs (LOG_VERBOSE, _("done.\n"));
1515 }
1516
1517 /* Construct and return a malloced copy of the relative link from two
1518    pieces of information: local name S1 of the referring file and
1519    local name S2 of the referred file.
1520
1521    So, if S1 is "jagor.srce.hr/index.html" and S2 is
1522    "jagor.srce.hr/images/news.gif", the function will return
1523    "images/news.gif".
1524
1525    Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1526    "fly.cc.fer.hr/images/fly.gif", the function will return
1527    "../images/fly.gif".
1528
1529    Caveats: S1 should not begin with `/', unless S2 also begins with
1530    '/'.  S1 should not contain things like ".." and such --
1531    construct_relative ("fly/ioccc/../index.html",
1532    "fly/images/fly.gif") will fail.  (A workaround is to call
1533    something like path_simplify() on S1).  */
1534 static char *
1535 construct_relative (const char *s1, const char *s2)
1536 {
1537   int i, cnt, sepdirs1;
1538   char *res;
1539
1540   if (*s2 == '/')
1541     return xstrdup (s2);
1542   /* S1 should *not* be absolute, if S2 wasn't.  */
1543   assert (*s1 != '/');
1544   i = cnt = 0;
1545   /* Skip the directories common to both strings.  */
1546   while (1)
1547     {
1548       while (s1[i] && s2[i]
1549              && (s1[i] == s2[i])
1550              && (s1[i] != '/')
1551              && (s2[i] != '/'))
1552         ++i;
1553       if (s1[i] == '/' && s2[i] == '/')
1554         cnt = ++i;
1555       else
1556         break;
1557     }
1558   for (sepdirs1 = 0; s1[i]; i++)
1559     if (s1[i] == '/')
1560       ++sepdirs1;
1561   /* Now, construct the file as of:
1562      - ../ repeated sepdirs1 time
1563      - all the non-mutual directories of S2.  */
1564   res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1565   for (i = 0; i < sepdirs1; i++)
1566     memcpy (res + 3 * i, "../", 3);
1567   strcpy (res + 3 * i, s2 + cnt);
1568   return res;
1569 }
1570 \f
1571 /* Add URL to the head of the list L.  */
1572 urlpos *
1573 add_url (urlpos *l, const char *url, const char *file)
1574 {
1575   urlpos *t;
1576
1577   t = (urlpos *)xmalloc (sizeof (urlpos));
1578   memset (t, 0, sizeof (*t));
1579   t->url = xstrdup (url);
1580   t->local_name = xstrdup (file);
1581   t->next = l;
1582   return t;
1583 }
1584
1585
1586 /* Remembers which files have been downloaded.  Should be called with
1587    add_or_check == ADD_FILE for each file we actually download successfully
1588    (i.e. not for ones we have failures on or that we skip due to -N).  If you
1589    just want to check if a file has been previously added without adding it,
1590    call with add_or_check == CHECK_FOR_FILE.  Please be sure to call this
1591    function with local filenames, not remote URLs -- by some means that isn't
1592    commented well enough for me understand, multiple remote URLs can apparently
1593    correspond to a single local file. */
1594 boolean
1595 downloaded_file (downloaded_file_t  add_or_check, const char*  file)
1596 {
1597   boolean        found_file = FALSE;
1598   static slist*  downloaded_files = NULL;
1599   slist*         rover = downloaded_files;
1600
1601   while (rover != NULL)
1602     if (strcmp(rover->string, file) == 0)
1603       {
1604         found_file = TRUE;
1605         break;
1606       }
1607     else
1608       rover = rover->next;
1609
1610   if (found_file)
1611     return TRUE;  /* file had already been downloaded */
1612   else
1613     {
1614       if (add_or_check == ADD_FILE)
1615         {
1616           rover = malloc(sizeof(slist));
1617           rover->string = xstrdup(file);  /* die on out-of-mem. */
1618           rover->next = downloaded_files;
1619           downloaded_files = rover;
1620         }
1621
1622       return FALSE;  /* file had not already been downloaded */
1623     }
1624 }