Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.

This file is part of Wget.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#include <sys/types.h>

/* Default port definitions */
#define DEFAULT_HTTP_PORT 80
#define DEFAULT_FTP_PORT 21
/* Table of Unsafe chars.  This is initialized in
   init_unsafe_char_table.  */

static char unsafe_char_table[256];

#define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)])
/* If S contains unsafe characters, free it and replace it with a
   version that doesn't.  */
#define URL_CLEANSE(s) do {                             \
  if (contains_unsafe (s))                              \
    { char *uc_tmp = encode_string (s);                 \
      xfree (s); (s) = uc_tmp; } } while (0)
/* Is a directory "."?  */
#define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
/* Is a directory ".."?  */
#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))

static void path_simplify_with_kludge PARAMS ((char *));

static int urlpath_length PARAMS ((const char *));
/* NULL-terminated list of strings to be recognized as prototypes (URL
   schemes).  Note that recognized doesn't mean supported -- only HTTP
   and FTP are currently supported.

   However, a string that does not match anything in the list will be
   considered a relative URL.  Thus it's important that this list has
   anything anyone could think of being legal.

   There are wild things here.  :-) Take a look at
   <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more.  */

static char *protostrings[] =
/* Similar to the former, but for supported protocols: */
static struct proto sup_protos[] =
{
  { "http://", URLHTTP, DEFAULT_HTTP_PORT },
  { "ftp://", URLFTP, DEFAULT_FTP_PORT },
  /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
};

static void parse_dir PARAMS ((const char *, char **, char **));
static uerr_t parse_uname PARAMS ((const char *, char **, char **));
static char *construct PARAMS ((const char *, const char *, int, int));
static char *construct_relative PARAMS ((const char *, const char *));
static char process_ftp_type PARAMS ((char *));
/* Returns the number of characters to be skipped if the first thing
   in a URL is URL: (which is 0 or 4+).  The optional spaces after
   URL: are also skipped.  */
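/* Illustrative values (not part of the original source), assuming the
   behavior described above:

     skip_url ("URL:http://host/")    ->  4   (skip "URL:")
     skip_url ("URL:  http://host/")  ->  6   (skip "URL:" and the spaces)
     skip_url ("http://host/")        ->  0   (nothing to skip)  */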
skip_url (const char *url)
  if (TOUPPER (url[0]) == 'U'
      && TOUPPER (url[1]) == 'R'
      && TOUPPER (url[2]) == 'L'
  for (i = 4; url[i] && ISSPACE (url[i]); i++);
   - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
   - @ and :, for user/password encoding.
   - everything over 127 (but we don't bother recording those).  */
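/* For illustration (example values only): with a table built from the
   rules above, UNSAFE_CHAR ('a') and UNSAFE_CHAR ('/') are 0, while
   UNSAFE_CHAR ('%'), UNSAFE_CHAR ('@') and UNSAFE_CHAR (':') are 1, so a
   string such as "a@b:c" is the kind of input contains_unsafe() reports
   and encode_string() rewrites.  */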
init_unsafe_char_table (void)
  for (i = 0; i < 256; i++)
    if (i < 32 || i >= 127
      unsafe_char_table[i] = 1;

/* Returns 1 if the string contains unsafe characters, 0 otherwise.  */
contains_unsafe (const char *s)
  if (UNSAFE_CHAR (*s))
/* Decodes the forms %xy in a URL to the character the hexadecimal
   code of which is xy.  xy are hexadecimal digits from
   [0123456789ABCDEF] (case-insensitive).  If x or y are not
   hex-digits or `%' precedes `\0', the sequence is inserted literally.  */
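/* Illustrative behavior (example values, not from the original source):
   a buffer holding "foo%20bar%2Fbaz" is rewritten in place to
   "foo bar/baz", while malformed sequences such as "%zz" or a trailing
   "%2" are left exactly as they were.  */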
decode_string (char *s)
      /* Do nothing if at the end of the string, or if the chars
         are not hex-digits.  */
      if (!*(s + 1) || !*(s + 2)
          || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
      *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));

/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
   given string, returning a malloc-ed %XX encoded string.  */
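/* Illustrative behavior: encode_string ("foo bar@baz") would return a
   freshly malloc-ed "foo%20bar%40baz" (assuming ' ' and '@' are flagged
   in unsafe_char_table), leaving the input string untouched.  */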
encode_string (const char *s)
  for (i = 0; *s; s++, i++)
    if (UNSAFE_CHAR (*s))
      i += 2;               /* Two more characters (hex digits) */
  res = (char *)xmalloc (i + 1);
  for (p = res; *s; s++)
    if (UNSAFE_CHAR (*s))
        const unsigned char c = *s;
        *p++ = HEXD2ASC (c >> 4);
        *p++ = HEXD2ASC (c & 0xf);
/* Returns the proto-type if URL's protocol is supported, or
   URLUNKNOWN if not.  */
urlproto (const char *url)
  url += skip_url (url);
  for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
    if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
      return sup_protos[i].ind;
  for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
  for (++i; url[i] && url[i] != '/'; i++)
    if (!ISDIGIT (url[i]))
  if (url[i - 1] == ':')
/* Skip the protocol part of the URL, e.g. `http://'.  If no protocol
   part is found, returns 0.  */
skip_proto (const char *url)
  for (s = protostrings; *s; s++)
    if (!strncasecmp (*s, url, strlen (*s)))
  /* HTTP and FTP protocols are expected to yield exact host names
     (i.e. the `//' part must be skipped, too).  */
  if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))

/* Returns 1 if the URL begins with a protocol (supported or
   unsupported), 0 otherwise.  */
has_proto (const char *url)
  url += skip_url (url);
  for (s = protostrings; *s; s++)
    if (strncasecmp (url, *s, strlen (*s)) == 0)

/* Skip the username and password, if present here.  The function
   should be called *not* with the complete URL, but with the part
   right after the protocol.

   If no username and password are found, return 0.  */
skip_uname (const char *url)
  for (p = url; *p && *p != '/'; p++)
  /* If a `@' was found before the first occurrence of `/', skip
/* Allocate a new urlinfo structure, fill it with default values and
   return a pointer to it.  */
  u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
  memset (u, 0, sizeof (*u));
  u->proto = URLUNKNOWN;

/* Perform a "deep" free of the urlinfo structure.  The structure
   should have been created with newurl, but need not have been used.
   If COMPLETE is non-0, free the pointer itself.  */
freeurl (struct urlinfo *u, int complete)
  FREE_MAYBE (u->host);
  FREE_MAYBE (u->path);
  FREE_MAYBE (u->file);
  FREE_MAYBE (u->user);
  FREE_MAYBE (u->passwd);
  FREE_MAYBE (u->local);
  FREE_MAYBE (u->referer);
  freeurl (u->proxy, 1);
/* Extract the given URL of the form
   (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
   1. hostname (terminated with `/' or `:')
   2. port number (terminated with `/'), or chosen for the protocol
   3. dirname (everything after hostname)
   Most errors are handled.  No allocation is done, you must supply
   pointers to allocated memory.
   ...and a host of other stuff :-)

   - Recognizes hostname:dir/file for FTP and
     hostname (:portnum)?/dir/file for HTTP.
   - Parses the path to yield directory and file
   - Parses the URL to yield the username and passwd (if present)
   - Decodes the strings, in case they contain "forbidden" characters
   - Writes the result to struct urlinfo

   If the argument STRICT is set, it recognizes only the canonical
   form.  */
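/* Worked example (illustrative, not from the original source): parsing
   "http://john:doe@www.server.com:8000/dir/sub/file.html?q=1" with
   STRICT == 0 should leave roughly this in *u:

     proto = URLHTTP            port   = 8000
     host  = "www.server.com"   user   = "john"      passwd = "doe"
     dir   = "dir/sub"          file   = "file.html?q=1"
     path  = "/dir/sub/file.html?q=1"

   (the leading `/' of DIR is stripped for HTTP, and anything after `?'
   stays attached to FILE).  */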
parseurl (const char *url, struct urlinfo *u, int strict)
  int recognizable;            /* Recognizable URL is the one where
                                  the protocol name was explicitly
                                  named, i.e. it wasn't deduced from
  DEBUGP (("parseurl (\"%s\") -> ", url));
  url += skip_url (url);
  recognizable = has_proto (url);
  if (strict && !recognizable)
  for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
      l = strlen (sup_protos[i].name);
      if (!strncasecmp (sup_protos[i].name, url, l))
  /* If protocol is recognizable, but unsupported, bail out, else
  if (recognizable && i == ARRAY_SIZE (sup_protos))
  else if (i == ARRAY_SIZE (sup_protos))
  u->proto = type = sup_protos[i].ind;
  if (type == URLUNKNOWN)
  /* Allow a username and password to be specified (i.e. just skip
  l += skip_uname (url + l);
  for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
  /* Get the hostname.  */
  u->host = strdupdelim (url + l, url + i);
  DEBUGP (("host %s -> ", u->host));
  /* Assume no port has been given.  */
  /* We have a colon delimiting the hostname.  It could mean that
     a port number is following it, or a directory.  */
  if (ISDIGIT (url[++i]))    /* A port number */
      if (type == URLUNKNOWN)
        u->proto = type = URLHTTP;
      for (; url[i] && url[i] != '/'; i++)
        if (ISDIGIT (url[i]))
          u->port = 10 * u->port + (url[i] - '0');
      DEBUGP (("port %hu -> ", u->port));
  else if (type == URLUNKNOWN)  /* or a directory */
    u->proto = type = URLFTP;
  else                          /* or just a malformed port number */
  else if (type == URLUNKNOWN)
    u->proto = type = URLHTTP;
  for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
    if (sup_protos[i].ind == type)
  if (i == ARRAY_SIZE (sup_protos))
  u->port = sup_protos[i].port;
  /* Some delimiter troubles...  */
  if (url[i] == '/' && url[i - 1] != ':')
  while (url[i] && url[i] == '/')
  u->path = (char *)xmalloc (strlen (url + i) + 8);
  strcpy (u->path, url + i);
      u->ftp_type = process_ftp_type (u->path);
      /* #### We don't handle type `d' correctly yet.  */
      if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
  DEBUGP (("opath %s -> ", u->path));
  /* Parse the username and password (if existing).  */
  parse_uname (url, &u->user, &u->passwd);
  /* Decode the strings, as per RFC 1738.  */
  decode_string (u->host);
  decode_string (u->path);
    decode_string (u->user);
    decode_string (u->passwd);
  /* Parse the directory.  */
  parse_dir (u->path, &u->dir, &u->file);
  DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
  /* Simplify the directory.  */
  path_simplify (u->dir);
  /* Remove the leading `/' in HTTP.  */
  if (type == URLHTTP && *u->dir == '/')
    strcpy (u->dir, u->dir + 1);
  DEBUGP (("ndir %s\n", u->dir));
  /* Strip trailing `/'.  */
  if (l && u->dir[l - 1] == '/')
    u->dir[l - 1] = '\0';
  /* Re-create the path: */
  abs_ftp = (u->proto == URLFTP && *u->dir == '/');
  /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
     abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
  strcpy (u->path, abs_ftp ? "%2F" : "/");
  strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
  strcat (u->path, *u->dir ? "/" : "");
  strcat (u->path, u->file);
  URL_CLEANSE (u->path);
  DEBUGP (("newpath: %s\n", u->path));
  /* Create the clean URL.  */
  u->url = str_url (u, 0);
/* Special versions of DOTP and DDOTP for parse_dir().  */

#define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
#define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') \
                     && (!*((x) + 2) || *((x) + 2) == '?'))
/* Build the directory and filename components of the path.  Both
   components are *separately* malloc-ed strings!  It does not change
   the contents of path.

   If the path ends with "." or "..", they are (correctly) counted as
   directories.  */
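/* Illustrative splits (example values, not from the original source):

     "foo/bar/baz.gif"  ->  dir "foo/bar",  file "baz.gif"
     "/fly.gif"         ->  dir "/",        file "fly.gif"
     "baz.gif"          ->  dir "",         file "baz.gif"
     "foo/bar/"         ->  dir "foo/bar",  file ""

   A trailing "." or ".." (optionally followed by `?') is treated as a
   directory: DIR gets the whole path and FILE is left (normally) empty.  */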
parse_dir (const char *path, char **dir, char **file)
  l = urlpath_length (path);
  for (i = l; i && path[i] != '/'; i--);

  if (!i && *path != '/')   /* Just filename */
      if (PD_DOTP (path) || PD_DDOTP (path))
          *dir = strdupdelim (path, path + l);
          *file = xstrdup (path + l); /* normally empty, but could
          *dir = xstrdup (""); /* This is required because of FTP */
          *file = xstrdup (path);
  else if (!i)              /* /filename */
      if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
          *dir = strdupdelim (path, path + l);
          *file = xstrdup (path + l); /* normally empty, but could
          *dir = xstrdup ("/");
          *file = xstrdup (path + 1);
  else                      /* Nonempty directory with or without a filename */
      if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
          *dir = strdupdelim (path, path + l);
          *file = xstrdup (path + l); /* normally empty, but could
          *dir = strdupdelim (path, path + i);
          *file = xstrdup (path + i + 1);
/* Find the optional username and password within the URL, as per
   RFC1738.  The returned user and passwd char pointers are
   malloc-ed.  */
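/* Illustrative results (example values only):

     "ftp://john:secret@host/dir/"  ->  *user = "john", *passwd = "secret"
     "ftp://john@host/dir/"         ->  *user = "john", *passwd left unset
     "http://host/dir/"             ->  neither USER nor PASSWD is set  */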
parse_uname (const char *url, char **user, char **passwd)
  url += skip_url (url);
  /* Look for end of protocol string.  */
  l = skip_proto (url);
  /* Add protocol offset.  */
  /* Is there an `@' character?  */
  for (p = url; *p && *p != '/'; p++)
  /* If not, return.  */
  /* Else find the username and password.  */
  for (p = col = url; *p != '@'; p++)
    if (*p == ':' && !*user)
        *user = (char *)xmalloc (p - url + 1);
        memcpy (*user, url, p - url);
        (*user)[p - url] = '\0';
  /* Decide whether you have only the username or both.  */
  where = *user ? passwd : user;
  *where = (char *)xmalloc (p - col + 1);
  memcpy (*where, col, p - col);
  (*where)[p - col] = '\0';
/* If PATH ends with `;type=X', return the character X.  */
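/* Illustrative behavior: given a PATH of "/pub/README;type=a", the
   function truncates PATH to "/pub/README" and returns 'a'; a path
   without the suffix is left untouched and a "no type" value
   (presumably '\0') is returned instead.  */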
process_ftp_type (char *path)
  int len = strlen (path);
      && !memcmp (path + len - 7, ";type=", 6))
      path[len - 7] = '\0';
      return path[len - 1];
/* Return the URL as a well-formed string, with a proper protocol,
   optional port number, directory and optional user/password.  If
   HIDE is non-zero, the password will be hidden.  The forbidden
   characters in the URL will be cleansed.  */
str_url (const struct urlinfo *u, int hide)
  char *res, *host, *user, *passwd, *proto_name, *dir, *file;
  int i, l, ln, lu, lh, lp, lf, ld;
  unsigned short proto_default_port;

  /* Look for the protocol name.  */
  for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
    if (sup_protos[i].ind == u->proto)
  if (i == ARRAY_SIZE (sup_protos))
  proto_name = sup_protos[i].name;
  proto_default_port = sup_protos[i].port;
  host = CLEANDUP (u->host);
  dir = CLEANDUP (u->dir);
  file = CLEANDUP (u->file);
  user = passwd = NULL;
    user = CLEANDUP (u->user);
    passwd = CLEANDUP (u->passwd);
    for (i = 0; passwd[i]; i++)
  if (u->proto == URLFTP && *dir == '/')
      char *tmp = (char *)xmalloc (strlen (dir) + 3);
      /*sprintf (tmp, "%%2F%s", dir + 1);*/
      strcpy (tmp + 3, dir + 1);
  ln = strlen (proto_name);
  lu = user ? strlen (user) : 0;
  lp = passwd ? strlen (passwd) : 0;
  res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
  /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
     (user ? user : ""), (passwd ? ":" : ""),
     (passwd ? passwd : ""), (user ? "@" : ""),
     host, u->port, dir, *dir ? "/" : "", file); */
  memcpy (res, proto_name, ln);
      memcpy (res + l, user, lu);
          memcpy (res + l, passwd, lp);
  memcpy (res + l, host, lh);
  if (u->port != proto_default_port)
      long_to_string (res + l, (long)u->port);
      l += numdigit (u->port);
  memcpy (res + l, dir, ld);
  strcpy (res + l, file);
/* Check whether two URL-s are equivalent, i.e. pointing to the same
   location.  Uses parseurl to parse them, and compares the canonical
   forms.

   Returns 1 if the URL1 is equivalent to URL2, 0 otherwise.  Also
   return 0 on error.  */
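/* Illustrative use (example values only): url_equal ("http://host/a/../b.html",
   "http://host/b.html") should yield 1, since both should parse to the
   same canonical u->url, while any URL that parseurl() rejects makes the
   function return 0.  */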
url_equal (const char *url1, const char *url2)
  struct urlinfo *u1, *u2;
  err = parseurl (url1, u1, 0);
  err = parseurl (url2, u2, 0);
  res = !strcmp (u1->url, u2->url);

get_urls_file (const char *file)
  struct file_memory *fm;
  const char *text, *text_end;

  fm = read_file (file);
      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
  DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
  text_end = fm->content + fm->length;
  while (text < text_end)
      const char *line_beg = text;
      const char *line_end = memchr (text, '\n', text_end - text);
      while (line_beg < line_end
             && ISSPACE (*line_beg))
      while (line_end > line_beg + 1
             && ISSPACE (*(line_end - 1)))
      if (line_end > line_beg)
          urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
          memset (entry, 0, sizeof (*entry));
          entry->url = strdupdelim (line_beg, line_end);

/* Free the linked list of urlpos.  */
free_urlpos (urlpos *l)
      urlpos *next = l->next;
      FREE_MAYBE (l->local_name);
/* Rotate FNAME opt.backups times */
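/* Illustrative effect with opt.backups == 3 and FNAME "log": an existing
   "log.2" is renamed to "log.3", "log.1" to "log.2", and finally "log"
   itself is (presumably) renamed to "log.1", so the freshest backup is
   always the ".1" file.  */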
rotate_backups (const char *fname)
  int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
  char *from = (char *)alloca (maxlen);
  char *to = (char *)alloca (maxlen);

  if (stat (fname, &sb) == 0)
    if (S_ISREG (sb.st_mode) == 0)
  for (i = opt.backups; i > 1; i--)
      sprintf (from, "%s.%d", fname, i - 1);
      sprintf (to, "%s.%d", fname, i);
      /* #### This will fail on machines without the rename() system
  sprintf (to, "%s.%d", fname, 1);
/* Create all the necessary directories for PATH (a file).  Calls
   mkdirhier() internally.  */
mkalldirs (const char *path)
  p = path + strlen (path);
  for (; *p != '/' && p != path; p--);
  /* Don't create if it's just a file.  */
  if ((p == path) && (*p != '/'))
  t = strdupdelim (path, p);
  /* Check whether the directory exists.  */
  if ((stat (t, &st) == 0))
      if (S_ISDIR (st.st_mode))
          /* If the dir exists as a file name, remove it first.  This
             is *only* for Wget to work with buggy old CERN http
             servers.  Here is the scenario: When Wget tries to
             retrieve a directory without a slash, e.g.
             http://foo/bar (bar being a directory), CERN server will
             not redirect it to http://foo/bar/ -- it will generate a
             directory listing containing links to bar/file1,
             bar/file2, etc.  Wget will lose because it saves this
             HTML listing to a file `bar', so it cannot create the
             directory.  To work around this, if the file of the same
             name exists, we just remove it and create the directory
             anyway.  */
          DEBUGP (("Removing %s because of directory danger!\n", t));
  res = make_directory (t);
      logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
count_slashes (const char *s)

/* Return the path name of the URL-equivalent file name, with a
   remote-like structure of directories.  */
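/* Illustrative result (example values, not from the original source): for
   a URL parsed from "http://www.server.com/pub/gnu/index.html", mkstruct
   would typically return "www.server.com/pub/gnu/index.html" when
   opt.add_hostdir is set and "pub/gnu/index.html" when it is not, with
   opt.dir_prefix prepended when it is something other than "." and
   opt.cut_dirs leading path components removed.  */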
mkstruct (const struct urlinfo *u)
  char *host, *dir, *file, *res, *dirpref;

  assert (u->dir != NULL);
  assert (u->host != NULL);
      char *ptr = u->dir + (*u->dir == '/');
      int slash_count = 1 + count_slashes (ptr);
      int cut = MINVAL (opt.cut_dirs, slash_count);
      for (; cut && *ptr; ptr++)
      STRDUP_ALLOCA (dir, ptr);
    dir = u->dir + (*u->dir == '/');
  host = xstrdup (u->host);
  /* Check for the true name (or at least a consistent name for saving
     to directory) of HOST, reusing the hlist if possible.  */
  if (opt.add_hostdir && !opt.simple_check)
      char *nhost = realhost (host);
  /* Add dir_prefix and hostname (if required) to the beginning of
      if (!DOTP (opt.dir_prefix))
          dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
                                    + strlen (host) + 1);
          sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
        STRDUP_ALLOCA (dirpref, host);
  else  /* not add_hostdir */
      if (!DOTP (opt.dir_prefix))
        dirpref = opt.dir_prefix;
  /* If there is a prefix, prepend it.  */
      char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
      sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
  if (l && dir[l - 1] == '/')
  file = "index.html";
  /* Finally, construct the full name.  */
  res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
  sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);

/* Create a unique filename, corresponding to a given URL.  Calls
   mkstruct if necessary.  Does *not* actually create any directories.  */
url_filename (const struct urlinfo *u)
  int have_prefix = 0;  /* whether we must prepend opt.dir_prefix */
    file = mkstruct (u);
        file = xstrdup ("index.html");
        file = xstrdup (u->file);
      /* Check whether the prefix directory is something other than "."
         before prepending it.  */
      if (!DOTP (opt.dir_prefix))
          char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
                                         + 1 + strlen (file) + 1);
          sprintf (nfile, "%s/%s", opt.dir_prefix, file);
  /* DOS-ish file systems don't like `%' signs in them; we change it
  for (p = file; *p; p++)
#endif /* WINDOWS */
  /* Check the cases in which the unique extensions are not used:
     1) Clobbering is turned off (-nc).
     2) Retrieval with regetting.
     3) Timestamping is used.
     4) Hierarchy is built.

     The exception is the case when file does exist and is a
     directory (actually support for bad httpd-s).  */
  if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
      && !(file_exists_p (file) && !file_non_directory_p (file)))
  /* Find a unique name.  */
  name = unique_name (file);
/* Like strlen(), but allow the URL to be ended with '?'.  */
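/* Illustrative values: urlpath_length ("dir/file.html?foo=bar") -> 13
   (the part before `?'), and urlpath_length ("dir/file.html") -> 13 as
   well, i.e. plain strlen() when no `?' is present.  */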
urlpath_length (const char *url)
  const char *q = strchr (url, '?');
  return strlen (url);
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is almost completely equivalent to
   { *e = '\0'; return strrchr (b, c); }, except that it doesn't change
   the contents of the string.  */
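/* Illustrative use: with B pointing at "foo/bar/baz" and E at B + 7 (the
   range "foo/bar"), find_last_char (B, E, '/') returns a pointer to the
   slash at B + 3, and NULL if C does not occur anywhere in [B, E).  */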
find_last_char (const char *b, const char *e, char c)
/* Construct a URL by concatenating an absolute URL and a path, which
   may or may not be absolute.  This tries to behave "reasonably" in
   all foreseeable cases.  It employs little specific knowledge about
   protocols or URL-specific stuff -- it just works on strings.  */
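/* Illustrative results (example values only), with no_proto set:

     construct ("http://host/dir/page.html", "img/x.gif", 9, 1)
       -> "http://host/dir/img/x.gif"
     construct ("http://host/dir/page.html", "/top.html", 9, 1)
       -> "http://host/top.html"

   With no_proto == 0, SUB is taken to be a complete URL in its own right
   and is simply copied out.  */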
construct (const char *url, const char *sub, int subsize, int no_proto)
  const char *end = url + urlpath_length (url);

      /* SUB is a relative URL: we need to replace everything
         after last slash (possibly empty) with SUB.

         So, if URL is "whatever/foo/bar", and SUB is "qux/xyzzy",
         our result should be "whatever/foo/qux/xyzzy".  */
      int need_explicit_slash = 0;
      const char *start_insert;
      const char *last_slash = find_last_char (url, end, '/'); /* the last slash. */

          /* No slash found at all.  Append SUB to what we have,
             but we'll need a slash as a separator.

             Example: if url == "foo" and sub == "qux/xyzzy", then
             we cannot just append sub to url, because we'd get
             "fooqux/xyzzy", whereas what we want is
             "foo/qux/xyzzy".

             To make sure the / gets inserted, we set
             need_explicit_slash to 1.  We also set start_insert
             to end + 1, so that the length calculations work out
             correctly for one more (slash) character.  Accessing
             that character is fine, since it will be the
             delimiter, '\0' or '?'.  */
          /* example: "foo?..."                       */
          /*               ^  ('?' gets changed to '/') */
          start_insert = end + 1;
          need_explicit_slash = 1;
      else if (last_slash && last_slash != url && *(last_slash - 1) == '/')
          /* example: "http://host" */
          start_insert = end + 1;
          need_explicit_slash = 1;
          /* example: "whatever/foo/bar" */
          start_insert = last_slash + 1;
      span = start_insert - url;
      constr = (char *)xmalloc (span + subsize + 1);
      memcpy (constr, url, span);
      if (need_explicit_slash)
        constr[span - 1] = '/';
      memcpy (constr + span, sub, subsize);
      constr[span + subsize] = '\0';
  else /* *sub == `/' */
      /* SUB is an absolute path: we need to replace everything
         after (and including) the FIRST slash with SUB.

         So, if URL is "http://host/whatever/foo/bar", and SUB is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      const char *start_insert = NULL; /* for gcc to shut up. */
      const char *pos = url;
      int seen_slash_slash = 0;
      /* We're looking for the first slash, but want to ignore
          slash = memchr (pos, '/', end - pos);
          if (slash && !seen_slash_slash)
            if (*(slash + 1) == '/')
                seen_slash_slash = 1;
      /* At this point, SLASH is the location of the first / after
         "//", or the first slash altogether.  START_INSERT is the
         pointer to the location where SUB will be inserted.  When
         examining the last two examples, keep in mind that SUB
      if (!slash && !seen_slash_slash)
        /* example: "foo" */
      else if (!slash && seen_slash_slash)
        /* example: "http://foo" */
      else if (slash && !seen_slash_slash)
        /* example: "foo/bar" */
      else if (slash && seen_slash_slash)
        /* example: "http://something/" */
        start_insert = slash;
      span = start_insert - url;
      constr = (char *)xmalloc (span + subsize + 1);
      memcpy (constr, url, span);
      memcpy (constr + span, sub, subsize);
      constr[span + subsize] = '\0';
  else /* !no_proto */
      constr = strdupdelim (sub, sub + subsize);
/* Like the function above, but with a saner caller interface.  */
url_concat (const char *base_url, const char *new_url)
  return construct (base_url, new_url, strlen (new_url), !has_proto (new_url));

/* Optimize URL by host, destructively replacing u->host with realhost
   (u->host).  Do this regardless of opt.simple_check.  */
opt_url (struct urlinfo *u)
  /* Find the "true" host.  */
  char *host = realhost (u->host);
  assert (u->dir != NULL);      /* the URL must have been parsed */
  /* Refresh the printed representation.  */
  u->url = str_url (u, 0);

/* This beautiful kludge is fortunately not needed, as I've made
   parse_dir do the (almost) right thing, so that a query can never
   become a part of directory.  */
/* Call path_simplify, but make sure that the part after the
   question-mark, if any, is not destroyed by path_simplify's
path_simplify_with_kludge (char *path)
  char *query = strchr (path, '?');
  /* path_simplify also works destructively, so we also have the
     license to write.  */
  path_simplify (path);
      char *newend = path + strlen (path);
      if (newend != query)
        memmove (newend, query, strlen (query) + 1);

/* Returns proxy host address, in accordance with PROTO.  */
getproxy (uerr_t proto)
  if (proto == URLHTTP)
    return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
  else if (proto == URLFTP)
    return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");

/* Should a host be accessed through proxy, concerning no_proxy?  */
no_proxy_match (const char *host, const char **no_proxy)
  return !sufmatch (no_proxy, host);
static void write_backup_file PARAMS ((const char *, downloaded_file_t));

/* Change the links in an HTML document.  Accepts a structure that
   defines the positions of all the links.  */
convert_links (const char *file, urlpos *l)
  struct file_memory *fm;
  downloaded_file_t downloaded_file_return;

  logprintf (LOG_VERBOSE, _("Converting %s... "), file);

  /* First we do a "dry run": go through the list L and see whether
     any URL needs to be converted in the first place.  If not, just
     leave the file alone.  */
  for (dry = l; dry; dry = dry->next)
    if (dry->convert != CO_NOCONVERT)
      logputs (LOG_VERBOSE, _("nothing to do.\n"));
  fm = read_file (file);
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
  downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
  if (opt.backup_converted && downloaded_file_return)
    write_backup_file (file, downloaded_file_return);

  /* Before opening the file for writing, unlink the file.  This is
     important if the data in FM is mmaped.  In such case, nulling the
     file, which is what fopen() below does, would make us read all
     zeroes from the mmaped region.  */
  if (unlink (file) < 0 && errno != ENOENT)
      logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
                 file, strerror (errno));
      read_file_free (fm);
  /* Now open the file for writing.  */
  fp = fopen (file, "wb");
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
      read_file_free (fm);
  /* Here we loop through all the URLs in file, replacing those of
     them that are downloaded with relative references.  */
  for (; l; l = l->next)
      char *url_start = fm->content + l->pos;
      if (l->pos >= fm->length)
          DEBUGP (("Something strange is going on.  Please investigate."));
      /* If the URL is not to be converted, skip it.  */
      if (l->convert == CO_NOCONVERT)
          DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
      /* Echo the file contents, up to the offending URL's opening
         quote, to the outfile.  */
      fwrite (p, 1, url_start - p, fp);
      if (l->convert == CO_CONVERT_TO_RELATIVE)
          /* Convert absolute URL to relative.  */
          char *newname = construct_relative (file, l->local_name);
          char *quoted_newname = html_quote_string (newname);
          putc (*p, fp);        /* quoting char */
          fputs (quoted_newname, fp);
          putc (*p, fp);        /* close quote */
          xfree (quoted_newname);
          DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
                   l->url, newname, l->pos, file));
      else if (l->convert == CO_CONVERT_TO_COMPLETE)
          /* Convert the link to absolute URL.  */
          char *newlink = l->url;
          char *quoted_newlink = html_quote_string (newlink);
          putc (*p, fp);        /* quoting char */
          fputs (quoted_newlink, fp);
          putc (*p, fp);        /* close quote */
          xfree (quoted_newlink);
          DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
                   newlink, l->pos, file));
  /* Output the rest of the file.  */
  if (p - fm->content < fm->length)
    fwrite (p, 1, fm->length - (p - fm->content), fp);
  read_file_free (fm);
  logputs (LOG_VERBOSE, _("done.\n"));
/* Construct and return a malloced copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.

   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   "images/news.gif".

   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */
construct_relative (const char *s1, const char *s2)
  int i, cnt, sepdirs1;
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');
  /* Skip the directories common to both strings.  */
  while (s1[i] && s2[i]
  if (s1[i] == '/' && s2[i] == '/')
  for (sepdirs1 = 0; s1[i]; i++)
  /* Now, construct the file as of:
     - ../ repeated sepdirs1 times
     - all the non-mutual directories of S2.  */
  res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
  for (i = 0; i < sepdirs1; i++)
    memcpy (res + 3 * i, "../", 3);
  strcpy (res + 3 * i, s2 + cnt);
/* Add URL to the head of the list L.  */
add_url (urlpos *l, const char *url, const char *file)
  t = (urlpos *)xmalloc (sizeof (urlpos));
  memset (t, 0, sizeof (*t));
  t->url = xstrdup (url);
  t->local_name = xstrdup (file);
write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
  /* Rather than just writing over the original .html file with the
     converted version, save the former to *.orig.  Note we only do
     this for files we've _successfully_ downloaded, so we don't
     clobber .orig files sitting around from previous invocations.  */

  /* Construct the backup filename as the original name plus ".orig".  */
  size_t filename_len = strlen(file);
  char* filename_plus_orig_suffix;
  boolean already_wrote_backup_file = FALSE;
  slist* converted_file_ptr;
  static slist* converted_files = NULL;

  if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
      /* Just write "orig" over "html".  We need to do it this way
         because when we're checking to see if we've downloaded the
         file before (to see if we can skip downloading it), we don't
         know if it's a text/html file.  Therefore we don't know yet
         at that stage that -E is going to cause us to tack on
         ".html", so we need to compare vs. the original URL plus
         ".orig", not the original URL plus ".html.orig".  */
      filename_plus_orig_suffix = alloca (filename_len + 1);
      strcpy(filename_plus_orig_suffix, file);
      strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
  else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
      /* Append ".orig" to the name.  */
      filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
      strcpy(filename_plus_orig_suffix, file);
      strcpy(filename_plus_orig_suffix + filename_len, ".orig");
  /* We can get called twice on the same URL thanks to the
     convert_all_links() call in main().  If we write the .orig file
     each time in such a case, it'll end up containing the first-pass
     conversion, not the original file.  So, see if we've already been
     called on this file.  */
  converted_file_ptr = converted_files;
  while (converted_file_ptr != NULL)
    if (strcmp(converted_file_ptr->string, file) == 0)
        already_wrote_backup_file = TRUE;
    converted_file_ptr = converted_file_ptr->next;

  if (!already_wrote_backup_file)
      /* Rename <file> to <file>.orig before former gets written over.  */
      if (rename(file, filename_plus_orig_suffix) != 0)
        logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
                   file, filename_plus_orig_suffix, strerror (errno));

      /* Remember that we've already written a .orig backup for this file.
         Note that we never free this memory since we need it till the
         convert_all_links() call, which is one of the last things the
         program does before terminating.  BTW, I'm not sure if it would be
         safe to just set 'converted_file_ptr->string' to 'file' below,
         rather than making a copy of the string...  Another note is that I
         thought I could just add a field to the urlpos structure saying
         that we'd written a .orig file for this URL, but that didn't work,
         so I had to make this separate list.

         This [adding a field to the urlpos structure] didn't work
         because convert_file() is called twice: once after all its
         sublinks have been retrieved in recursive_retrieve(), and
         once at the end of the day in convert_all_links().  The
         original linked list collected in recursive_retrieve() is
         lost after the first invocation of convert_links(), and
         convert_all_links() makes a new one (it calls get_urls_html()
         for each file it covers.)  That's why your approach didn't
         work.  The way to make it work is perhaps to make this flag a
         field in the `urls_html' list.  */
      converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
      converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
      converted_file_ptr->next = converted_files;
      converted_files = converted_file_ptr;
typedef struct _downloaded_file_list {
  char* file;
  downloaded_file_t download_type;
  struct _downloaded_file_list* next;
} downloaded_file_list;

static downloaded_file_list *downloaded_files;
/* Remembers which files have been downloaded.  In the standard case, should be
   called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
   download successfully (i.e. not for ones we have failures on or that we skip
   due to -nc).

   When we've downloaded a file and tacked on a ".html" extension due to -E,
   call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
   FILE_DOWNLOADED_NORMALLY.

   If you just want to check if a file has been previously added without adding
   it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
   with local filenames, not remote URLs.  */
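/* Illustrative call sequence (example values only):

     downloaded_file (FILE_DOWNLOADED_NORMALLY, "idx.html")
       records "idx.html" and returns FILE_NOT_ALREADY_DOWNLOADED;
     downloaded_file (CHECK_FOR_FILE, "idx.html")
       now returns FILE_DOWNLOADED_NORMALLY;
     downloaded_file (CHECK_FOR_FILE, "other.html")
       returns FILE_NOT_ALREADY_DOWNLOADED without recording anything.  */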
downloaded_file (downloaded_file_t mode, const char* file)
  boolean found_file = FALSE;
  downloaded_file_list* rover = downloaded_files;

  while (rover != NULL)
    if (strcmp(rover->file, file) == 0)
    rover = rover->next;
    return rover->download_type;  /* file had already been downloaded */
  if (mode != CHECK_FOR_FILE)
      rover = xmalloc(sizeof(*rover));
      rover->file = xstrdup(file);  /* use xstrdup() so die on out-of-mem. */
      rover->download_type = mode;
      rover->next = downloaded_files;
      downloaded_files = rover;
  return FILE_NOT_ALREADY_DOWNLOADED;
downloaded_files_free (void)
  downloaded_file_list* rover = downloaded_files;
      downloaded_file_list *next = rover->next;
      xfree (rover->file);

/* Initialization of static stuff.  */
  init_unsafe_char_table ();