2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
29 #include <sys/types.h>
45 /* Table of unsafe chars. This is initialized in
46 init_unsafe_char_table. */
48 static char unsafe_char_table[256];
50 #define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)])
52 /* rfc1738 reserved chars. This is too short to warrant a table. We
53 don't use this yet; preservation of reserved chars will be
54 implemented when I integrate the new `reencode_string' function. */
56 #define RESERVED_CHAR(c) ( (c) == ';' || (c) == '/' || (c) == '?' \
57 || (c) == '@' || (c) == '=' || (c) == '&' \
61 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
63 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
65 static int urlpath_length PARAMS ((const char *));
67 /* A NULL-terminated list of strings to be recognized as protocol
68 types (URL schemes). Note that recognized doesn't mean supported
69 -- only HTTP, HTTPS and FTP are currently supported.
71 However, a string that does not match anything in the list will be
72 considered a relative URL. Thus it's important that this list
73 contain anything anyone could think of as being legal.
75 #### This is probably broken. Wget should use other means to
76 distinguish between absolute and relative URIs in HTML links.
78 Take a look at <http://www.w3.org/pub/WWW/Addressing/schemes.html>
80 static char *protostrings[] =
122 /* Similar to the former, but for supported protocols: */
123 static struct proto sup_protos[] =
125 { "http://", URLHTTP, DEFAULT_HTTP_PORT },
127 { "https://",URLHTTPS, DEFAULT_HTTPS_PORT},
129 { "ftp://", URLFTP, DEFAULT_FTP_PORT }
132 static void parse_dir PARAMS ((const char *, char **, char **));
133 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
134 static char *construct_relative PARAMS ((const char *, const char *));
135 static char process_ftp_type PARAMS ((char *));
140 - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
141 - @ and :, for user/password encoding.
142 - everything over 127 (but we don't bother with recording those). */
144 init_unsafe_char_table (void)
147 for (i = 0; i < 256; i++)
148 if (i < 32 || i >= 127
164 unsafe_char_table[i] = 1;
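/* Illustrative sketch (kept out of the build): once the table above has
   been initialized, UNSAFE_CHAR() is a plain lookup.  The characters
   tested below follow the set described in the comment above. */
#if 0
  init_unsafe_char_table ();
  assert (UNSAFE_CHAR ('~'));           /* in the rfc1738 set */
  assert (UNSAFE_CHAR ((char) 0x80));   /* everything over 127 */
  assert (!UNSAFE_CHAR ('a'));          /* ordinary ASCII letters are safe */
#endif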
167 /* Decodes the forms %xy in a URL to the character the hexadecimal
168 code of which is xy. xy are hexadecimal digits from
169 [0123456789ABCDEF] (case-insensitive). If x or y are not
170 hex-digits or `%' precedes `\0', the sequence is inserted literally. */
174 decode_string (char *s)
184 /* Do nothing if at the end of the string, or if the chars
185 are not hex-digits. */
186 if (!*(s + 1) || !*(s + 2)
187 || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
192 *p = (XCHAR_TO_XDIGIT (*(s + 1)) << 4) + XCHAR_TO_XDIGIT (*(s + 2));
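/* Illustrative sketch (kept out of the build): decode_string() rewrites
   its argument in place, per the rules above; the path below is made up.
   A malformed escape such as "%zz" or a trailing "%" is left untouched. */
#if 0
  char buf[] = "/pub/%7Euser/file%20name";
  decode_string (buf);            /* buf now holds "/pub/~user/file name" */
#endif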
199 /* Like encode_string, but return S if there are no unsafe chars. */
202 encode_string_maybe (const char *s)
209 for (p1 = s; *p1; p1++)
210 if (UNSAFE_CHAR (*p1))
211 addition += 2; /* Two more characters (hex digits) */
216 newlen = (p1 - s) + addition;
217 newstr = (char *)xmalloc (newlen + 1);
223 if (UNSAFE_CHAR (*p1))
225 const unsigned char c = *p1++;
227 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
228 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
234 assert (p2 - newstr == newlen);
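/* Illustrative sketch (kept out of the build): encode_string_maybe()
   returns its argument unchanged when nothing needs quoting and a fresh
   malloc-ed copy otherwise; the ENCODE macro below relies on that to
   know whether the old pointer must be freed.  '~' is unsafe per the
   set described above. */
#if 0
  const char *clean = "/index.html";
  assert (encode_string_maybe (clean) == clean);        /* nothing to quote */
  assert (!strcmp (encode_string_maybe ("~user"), "%7Euser"));
#endif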
239 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
240 given string, returning a malloc-ed %XX encoded string. */
243 encode_string (const char *s)
245 char *encoded = encode_string_maybe (s);
252 /* Encode unsafe characters in PTR to %xx. If such encoding is done,
253 the old value of PTR is freed and PTR is made to point to the newly
254 allocated storage. */
256 #define ENCODE(ptr) do { \
257 char *e_new = encode_string_maybe (ptr); \
265 /* Returns the protocol type if URL's protocol is supported, or
266 URLUNKNOWN if not. */
268 urlproto (const char *url)
272 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
273 if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
274 return sup_protos[i].ind;
275 for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
278 for (++i; url[i] && url[i] != '/'; i++)
279 if (!ISDIGIT (url[i]))
281 if (url[i - 1] == ':')
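/* Illustrative sketch (kept out of the build): expected results of
   urlproto() on made-up URLs.  The first two follow the sup_protos
   prefix match; the third relies on the digit-scanning fallback above,
   which lets a bare host:port spelling default to HTTP. */
#if 0
  assert (urlproto ("http://www.gnu.org/") == URLHTTP);
  assert (urlproto ("ftp://ftp.gnu.org/pub/") == URLFTP);
  assert (urlproto ("www.gnu.org:8080/dir/") == URLHTTP);   /* numeric port */
#endif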
290 /* Skip the protocol part of the URL, e.g. `http://'. If no protocol
291 part is found, returns 0. */
293 skip_proto (const char *url)
298 for (s = protostrings; *s; s++)
299 if (!strncasecmp (*s, url, strlen (*s)))
304 /* HTTP and FTP protocols are expected to yield exact host names
305 (i.e. the `//' part must be skipped, too). */
306 if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
311 /* Returns 1 if the URL begins with a protocol (supported or
312 unsupported), 0 otherwise. */
314 has_proto (const char *url)
318 for (s = protostrings; *s; s++)
319 if (strncasecmp (url, *s, strlen (*s)) == 0)
324 /* Skip the username and password, if present here. The function
325 should be called *not* with the complete URL, but with the part
326 right after the protocol.
328 If no username and password are found, return 0. */
330 skip_uname (const char *url)
333 const char *q = NULL;
334 for (p = url ; *p && *p != '/'; p++)
335 if (*p == '@') q = p;
336 /* If a `@' was found before the first occurrence of `/', skip everything up to and including it. */
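/* Illustrative sketch (kept out of the build): skip_uname() is handed
   the part after the scheme and returns how many characters the
   user/password prefix occupies, or 0 when there is none.  The host and
   user names are made up. */
#if 0
  assert (skip_uname ("user:secret@host.example/dir/") == 12);
  assert (skip_uname ("host.example/mail@page.html") == 0);  /* '@' after '/' */
#endif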
344 /* Allocate a new urlinfo structure, fill it with default values and
345 return a pointer to it. */
351 u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
352 memset (u, 0, sizeof (*u));
353 u->proto = URLUNKNOWN;
357 /* Perform a "deep" free of the urlinfo structure. The structure
358 should have been created with newurl, but need not have been used.
359 If COMPLETE is non-zero, free the pointer itself. */
361 freeurl (struct urlinfo *u, int complete)
365 FREE_MAYBE (u->host);
366 FREE_MAYBE (u->path);
367 FREE_MAYBE (u->file);
369 FREE_MAYBE (u->user);
370 FREE_MAYBE (u->passwd);
371 FREE_MAYBE (u->local);
372 FREE_MAYBE (u->referer);
374 freeurl (u->proxy, 1);
380 /* Extract the given URL of the form
381 (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
382 1. hostname (terminated with `/' or `:')
383 2. port number (terminated with `/'), or chosen for the protocol
384 3. dirname (everything after hostname)
385 Most errors are handled. No allocation is done; you must supply
386 pointers to allocated memory.
387 ...and a host of other stuff :-)
389 - Recognizes hostname:dir/file for FTP and
390 hostname (:portnum)?/dir/file for HTTP.
391 - Parses the path to yield directory and file
392 - Parses the URL to yield the username and passwd (if present)
393 - Decodes the strings, in case they contain "forbidden" characters
394 - Writes the result to struct urlinfo
396 If the argument STRICT is set, it recognizes only the canonical form. */
399 parseurl (const char *url, struct urlinfo *u, int strict)
402 int recognizable; /* A recognizable URL is one where the
403 protocol name was explicitly named,
404 i.e. it wasn't deduced from the URL format. */
408 DEBUGP (("parseurl (\"%s\") -> ", url));
409 recognizable = has_proto (url);
410 if (strict && !recognizable)
412 for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
414 l = strlen (sup_protos[i].name);
415 if (!strncasecmp (sup_protos[i].name, url, l))
418 /* If protocol is recognizable, but unsupported, bail out, else
420 if (recognizable && i == ARRAY_SIZE (sup_protos))
422 else if (i == ARRAY_SIZE (sup_protos))
425 u->proto = type = sup_protos[i].ind;
427 if (type == URLUNKNOWN)
429 /* Allow a username and password to be specified (i.e. just skip them for now). */
432 l += skip_uname (url + l);
433 for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
436 /* Get the hostname. */
437 u->host = strdupdelim (url + l, url + i);
438 DEBUGP (("host %s -> ", u->host));
440 /* Assume no port has been given. */
444 /* We have a colon delimiting the hostname. It could mean that
445 a port number is following it, or a directory. */
446 if (ISDIGIT (url[++i])) /* A port number */
448 if (type == URLUNKNOWN)
449 u->proto = type = URLHTTP;
450 for (; url[i] && url[i] != '/'; i++)
451 if (ISDIGIT (url[i]))
452 u->port = 10 * u->port + (url[i] - '0');
457 DEBUGP (("port %hu -> ", u->port));
459 else if (type == URLUNKNOWN) /* or a directory */
460 u->proto = type = URLFTP;
461 else /* or just a malformed port number */
464 else if (type == URLUNKNOWN)
465 u->proto = type = URLHTTP;
469 for (ind = 0; ind < ARRAY_SIZE (sup_protos); ind++)
470 if (sup_protos[ind].ind == type)
472 if (ind == ARRAY_SIZE (sup_protos))
474 u->port = sup_protos[ind].port;
476 /* Some delimiter troubles... */
477 if (url[i] == '/' && url[i - 1] != ':')
480 while (url[i] && url[i] == '/')
482 u->path = (char *)xmalloc (strlen (url + i) + 8);
483 strcpy (u->path, url + i);
486 u->ftp_type = process_ftp_type (u->path);
487 /* #### We don't handle type `d' correctly yet. */
488 if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
490 DEBUGP (("ftp_type %c -> ", u->ftp_type));
492 DEBUGP (("opath %s -> ", u->path));
493 /* Parse the username and password (if existing). */
494 parse_uname (url, &u->user, &u->passwd);
495 /* Decode the strings, as per RFC 1738. */
496 decode_string (u->host);
497 decode_string (u->path);
499 decode_string (u->user);
501 decode_string (u->passwd);
502 /* Parse the directory. */
503 parse_dir (u->path, &u->dir, &u->file);
504 DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
505 /* Simplify the directory. */
506 path_simplify (u->dir);
507 /* Remove the leading `/' in HTTP. */
508 if (type == URLHTTP && *u->dir == '/')
509 memmove (u->dir, u->dir + 1, strlen (u->dir)); /* regions overlap, so not strcpy */
510 DEBUGP (("ndir %s\n", u->dir));
511 /* Strip trailing `/'. */
513 if (l > 1 && u->dir[l - 1] == '/')
514 u->dir[l - 1] = '\0';
515 /* Re-create the path: */
516 abs_ftp = (u->proto == URLFTP && *u->dir == '/');
517 /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
518 abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
519 strcpy (u->path, abs_ftp ? "%2F" : "/");
520 strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
521 strcat (u->path, *u->dir ? "/" : "");
522 strcat (u->path, u->file);
524 DEBUGP (("newpath: %s\n", u->path));
525 /* Create the clean URL. */
526 u->url = str_url (u, 0);
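/* Illustrative sketch (kept out of the build): how the urlinfo fields
   come out for a typical HTTP URL after the steps above.  The URL is
   made up; the field values follow the parsing and canonicalization
   performed by this function. */
#if 0
  struct urlinfo *u = newurl ();
  if (parseurl ("http://www.gnu.org/software/wget/index.html", u, 0) == URLOK)
    {
      /* u->proto == URLHTTP, u->port == DEFAULT_HTTP_PORT,
         u->host == "www.gnu.org",
         u->dir  == "software/wget"   (leading `/' stripped for HTTP),
         u->file == "index.html",
         u->url  == "http://www.gnu.org/software/wget/index.html" */
    }
  freeurl (u, 1);
#endif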
530 /* Special versions of DOTP and DDOTP for parse_dir(). They work like
531 DOTP and DDOTP, but they also recognize `?' as end-of-string
532 delimiter. This is needed for correct handling of query strings. */
535 #define PD_DOTP(x) ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
536 #define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') \
537 && (!*((x) + 2) || *((x) + 2) == '?'))
539 /* Build the directory and filename components of the path. Both
540 components are *separately* malloc-ed strings! It does not change
541 the contents of path.
543 If the path ends with "." or "..", they are (correctly) counted as directories. */
546 parse_dir (const char *path, char **dir, char **file)
550 l = urlpath_length (path);
551 for (i = l; i && path[i] != '/'; i--);
553 if (!i && *path != '/') /* Just filename */
555 if (PD_DOTP (path) || PD_DDOTP (path))
557 *dir = strdupdelim (path, path + l);
558 *file = xstrdup (path + l); /* normally empty, but could
563 *dir = xstrdup (""); /* This is required because of FTP */
564 *file = xstrdup (path);
567 else if (!i) /* /filename */
569 if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
571 *dir = strdupdelim (path, path + l);
572 *file = xstrdup (path + l); /* normally empty, but could
577 *dir = xstrdup ("/");
578 *file = xstrdup (path + 1);
581 else /* Nonempty directory with or without a filename */
583 if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
585 *dir = strdupdelim (path, path + l);
586 *file = xstrdup (path + l); /* normally empty, but could
591 *dir = strdupdelim (path, path + i);
592 *file = xstrdup (path + i + 1);
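/* Illustrative sketch (kept out of the build): the three branches above
   applied to made-up paths. */
#if 0
  char *d, *f;
  parse_dir ("/software/wget/index.html", &d, &f); /* d="/software/wget", f="index.html" */
  parse_dir ("/index.html", &d, &f);               /* d="/", f="index.html" */
  parse_dir ("index.html", &d, &f);                /* d="",  f="index.html" */
#endif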
597 /* Find the optional username and password within the URL, as per
598 RFC 1738. The returned user and passwd char pointers are malloc-ed. */
601 parse_uname (const char *url, char **user, char **passwd)
604 const char *p, *q, *col;
610 /* Look for the end of the protocol string. */
611 l = skip_proto (url);
614 /* Add protocol offset. */
616 /* Is there an `@' character? */
617 for (p = url; *p && *p != '/'; p++)
620 /* If not, return. */
623 /* Else find the username and password. */
624 for (p = q = col = url; *p && *p != '/'; p++)
626 if (*p == ':' && !*user)
628 *user = (char *)xmalloc (p - url + 1);
629 memcpy (*user, url, p - url);
630 (*user)[p - url] = '\0';
633 if (*p == '@') q = p;
635 /* Decide whether you have only the username or both. */
636 where = *user ? passwd : user;
637 *where = (char *)xmalloc (q - col + 1);
638 memcpy (*where, col, q - col);
639 (*where)[q - col] = '\0';
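/* Illustrative sketch (kept out of the build): user and password
   extraction on a made-up URL; both pointers stay NULL when the URL
   carries no user information. */
#if 0
  char *user, *passwd;
  parse_uname ("ftp://luzer:secret@ftp.example.com/pub/", &user, &passwd);
  /* user == "luzer", passwd == "secret", both malloc-ed. */
#endif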
643 /* If PATH ends with `;type=X', return the character X. */
645 process_ftp_type (char *path)
647 int len = strlen (path);
650 && !memcmp (path + len - 7, ";type=", 6))
652 path[len - 7] = '\0';
653 return path[len - 1];
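/* Illustrative sketch (kept out of the build): a ";type=X" suffix is
   chopped off the path and the type letter returned.  The path is made
   up. */
#if 0
  char path[] = "/pub/README;type=a";
  assert (process_ftp_type (path) == 'a');
  assert (!strcmp (path, "/pub/README"));
#endif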
659 /* Return the URL as a well-formed string, with a proper protocol, optional port
660 number, directory and optional user/password. If `hide' is non-zero (as it
661 is when we're calling this on a URL we plan to print, but not when calling it
662 to canonicalize a URL for use within the program), the password will be hidden.
663 The forbidden characters in the URL will be cleansed. */
665 str_url (const struct urlinfo *u, int hide)
667 char *res, *host, *user, *passwd, *proto_name, *dir, *file;
668 int i, l, ln, lu, lh, lp, lf, ld;
669 unsigned short proto_default_port;
671 /* Look for the protocol name. */
672 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
673 if (sup_protos[i].ind == u->proto)
675 if (i == ARRAY_SIZE (sup_protos))
677 proto_name = sup_protos[i].name;
678 proto_default_port = sup_protos[i].port;
679 host = encode_string (u->host);
680 dir = encode_string (u->dir);
681 file = encode_string (u->file);
682 user = passwd = NULL;
684 user = encode_string (u->user);
688 /* Don't output the password, or someone might see it over the user's
689 shoulder (or in saved wget output). Don't give away the number of
690 characters in the password, either, as we did in past versions of
691 this code, when we replaced the password characters with 'x's. */
692 passwd = xstrdup("<password>");
694 passwd = encode_string (u->passwd);
696 if (u->proto == URLFTP && *dir == '/')
698 char *tmp = (char *)xmalloc (strlen (dir) + 3);
699 /*sprintf (tmp, "%%2F%s", dir + 1);*/
703 strcpy (tmp + 3, dir + 1);
708 ln = strlen (proto_name);
709 lu = user ? strlen (user) : 0;
710 lp = passwd ? strlen (passwd) : 0;
714 res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
715 /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
716 (user ? user : ""), (passwd ? ":" : ""),
717 (passwd ? passwd : ""), (user ? "@" : ""),
718 host, u->port, dir, *dir ? "/" : "", file); */
720 memcpy (res, proto_name, ln);
724 memcpy (res + l, user, lu);
729 memcpy (res + l, passwd, lp);
734 memcpy (res + l, host, lh);
736 if (u->port != proto_default_port)
739 long_to_string (res + l, (long)u->port);
740 l += numdigit (u->port);
743 memcpy (res + l, dir, ld);
747 strcpy (res + l, file);
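/* Illustrative sketch (kept out of the build): the same urlinfo printed
   with and without password hiding, assuming u was filled in by
   parseurl() for the made-up URL in the comment below. */
#if 0
  /* u parsed from "http://user:secret@www.example.com:8000/dir/page.html" */
  char *shown = str_url (u, 1); /* "http://user:<password>@www.example.com:8000/dir/page.html" */
  char *clean = str_url (u, 0); /* "http://user:secret@www.example.com:8000/dir/page.html" */
#endif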
756 /* Check whether two URL-s are equivalent, i.e. pointing to the same
757 location. Uses parseurl to parse them, and compares the canonical forms.
760 Returns 1 if URL1 is equivalent to URL2, 0 otherwise. Also
761 returns 0 on error. */
763 url_equal (const char *url1, const char *url2)
765 struct urlinfo *u1, *u2;
770 err = parseurl (url1, u1, 0);
777 err = parseurl (url2, u2, 0);
783 res = !strcmp (u1->url, u2->url);
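/* Illustrative sketch (kept out of the build): equivalence is decided on
   the canonical forms, so spellings that converge during decoding and
   re-encoding compare equal.  The URLs are made up. */
#if 0
  assert (url_equal ("http://www.gnu.org/%7Ejoe/", "http://www.gnu.org/~joe/"));
#endif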
790 get_urls_file (const char *file)
792 struct file_memory *fm;
794 const char *text, *text_end;
797 fm = read_file (file);
800 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
803 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
806 text_end = fm->content + fm->length;
807 while (text < text_end)
809 const char *line_beg = text;
810 const char *line_end = memchr (text, '\n', text_end - text);
816 while (line_beg < line_end
817 && ISSPACE (*line_beg))
819 while (line_end > line_beg + 1
820 && ISSPACE (*(line_end - 1)))
822 if (line_end > line_beg)
824 urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
825 memset (entry, 0, sizeof (*entry));
827 entry->url = strdupdelim (line_beg, line_end);
839 /* Free the linked list of urlpos. */
841 free_urlpos (urlpos *l)
845 urlpos *next = l->next;
847 FREE_MAYBE (l->local_name);
853 /* Rotate FNAME opt.backups times */
855 rotate_backups(const char *fname)
857 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
858 char *from = (char *)alloca (maxlen);
859 char *to = (char *)alloca (maxlen);
863 if (stat (fname, &sb) == 0)
864 if (S_ISREG (sb.st_mode) == 0)
867 for (i = opt.backups; i > 1; i--)
869 sprintf (from, "%s.%d", fname, i - 1);
870 sprintf (to, "%s.%d", fname, i);
871 /* #### This will fail on machines without the rename() system call. */
876 sprintf (to, "%s.%d", fname, 1);
880 /* Create all the necessary directories for PATH (a file). Calls
881 mkdirhier() internally. */
883 mkalldirs (const char *path)
890 p = path + strlen (path);
891 for (; *p != '/' && p != path; p--);
892 /* Don't create if it's just a file. */
893 if ((p == path) && (*p != '/'))
895 t = strdupdelim (path, p);
896 /* Check whether the directory exists. */
897 if ((stat (t, &st) == 0))
899 if (S_ISDIR (st.st_mode))
906 /* If the dir exists as a file name, remove it first. This
907 is *only* for Wget to work with buggy old CERN http
908 servers. Here is the scenario: When Wget tries to
909 retrieve a directory without a slash, e.g.
910 http://foo/bar (bar being a directory), CERN server will
911 not redirect it to http://foo/bar/ -- it will generate a
912 directory listing containing links to bar/file1,
913 bar/file2, etc. Wget will lose because it saves this
914 HTML listing to a file `bar', so it cannot create the
915 directory. To work around this, if the file of the same
916 name exists, we just remove it and create the directory anyway. */
918 DEBUGP (("Removing %s because of directory danger!\n", t));
922 res = make_directory (t);
924 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
930 count_slashes (const char *s)
939 /* Return the path name of the URL-equivalent file name, with a
940 remote-like structure of directories. */
942 mkstruct (const struct urlinfo *u)
944 char *host, *dir, *file, *res, *dirpref;
947 assert (u->dir != NULL);
948 assert (u->host != NULL);
952 char *ptr = u->dir + (*u->dir == '/');
953 int slash_count = 1 + count_slashes (ptr);
954 int cut = MINVAL (opt.cut_dirs, slash_count);
955 for (; cut && *ptr; ptr++)
958 STRDUP_ALLOCA (dir, ptr);
961 dir = u->dir + (*u->dir == '/');
963 host = xstrdup (u->host);
964 /* Check for the true name (or at least a consistent name for saving
965 to directory) of HOST, reusing the hlist if possible. */
966 if (opt.add_hostdir && !opt.simple_check)
968 char *nhost = realhost (host);
972 /* Add dir_prefix and hostname (if required) to the beginning of dir. */
976 if (!DOTP (opt.dir_prefix))
978 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
979 + strlen (host) + 1);
980 sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
983 STRDUP_ALLOCA (dirpref, host);
985 else /* not add_hostdir */
987 if (!DOTP (opt.dir_prefix))
988 dirpref = opt.dir_prefix;
994 /* If there is a prefix, prepend it. */
997 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
998 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1001 dir = encode_string (dir);
1003 if (l && dir[l - 1] == '/')
1007 file = "index.html";
1011 /* Finally, construct the full name. */
1012 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1013 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
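/* Illustrative sketch (kept out of the build): the local name produced
   for a parsed URL, assuming default options (dir_prefix ".", no
   --cut-dirs) and opt.add_hostdir set.  The URL is made up. */
#if 0
  /* u parsed from "http://www.gnu.org/software/wget/" */
  char *local = mkstruct (u);   /* "www.gnu.org/software/wget/index.html" */
#endif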
1018 /* Create a unique filename, corresponding to a given URL. Calls
1019 mkstruct if necessary. Does *not* actually create any directories. */
1021 url_filename (const struct urlinfo *u)
1024 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
1028 file = mkstruct (u);
1034 file = xstrdup ("index.html");
1036 file = xstrdup (u->file);
1041 /* Check whether the prefix directory is something other than "."
1042 before prepending it. */
1043 if (!DOTP (opt.dir_prefix))
1045 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1046 + 1 + strlen (file) + 1);
1047 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1052 /* DOS-ish file systems don't like `%' signs in file names; we change it
1057 for (p = file; *p; p++)
1061 #endif /* WINDOWS */
1063 /* Check the cases in which the unique extensions are not used:
1064 1) Clobbering is turned off (-nc).
1065 2) Retrieval with regetting.
1066 3) Timestamping is used.
1067 4) Hierarchy is built.
1069 The exception is the case when file does exist and is a
1070 directory (actually support for bad httpd-s). */
1071 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1072 && !(file_exists_p (file) && !file_non_directory_p (file)))
1075 /* Find a unique name. */
1076 name = unique_name (file);
1081 /* Like strlen(), but allows the URL to end with '?'. */
1083 urlpath_length (const char *url)
1085 const char *q = strchr (url, '?');
1088 return strlen (url);
1091 /* Find the last occurrence of character C in the range [b, e), or
1092 NULL, if none are present. This is almost completely equivalent to
1093 { *e = '\0'; return strrchr(b, c); }, except that it doesn't change
1094 the contents of the string. */
1096 find_last_char (const char *b, const char *e, char c)
1104 /* Resolve the result of "linking" a base URI (BASE) to a
1105 link-specified URI (LINK).
1107 Either of the URIs may be absolute or relative, complete with the
1108 host name, or path only. This tries to behave "reasonably" in all
1109 foreseeable cases. It employs little specific knowledge about
1110 protocols or URL-specific stuff -- it just works on strings.
1112 The parameter LINKLENGTH is useful if LINK is not zero-terminated.
1113 See uri_merge for a gentler interface to this functionality.
1115 #### This function should handle `./' and `../' so that the evil
1116 path_simplify can go. */
1118 uri_merge_1 (const char *base, const char *link, int linklength, int no_proto)
1124 const char *end = base + urlpath_length (base);
1128 /* LINK is a relative URL: we need to replace everything
1129 after the last slash (possibly empty) with LINK.
1131 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1132 our result should be "whatever/foo/qux/xyzzy". */
1133 int need_explicit_slash = 0;
1135 const char *start_insert;
1136 const char *last_slash = find_last_char (base, end, '/');
1139 /* No slash found at all. Append LINK to what we have,
1140 but we'll need a slash as a separator.
1142 Example: if base == "foo" and link == "qux/xyzzy", then
1143 we cannot just append link to base, because we'd get
1144 "fooqux/xyzzy", whereas what we want is
1147 To make sure the / gets inserted, we set
1148 need_explicit_slash to 1. We also set start_insert
1149 to end + 1, so that the length calculations work out
1150 correctly for one more (slash) character. Accessing
1151 that character is fine, since it will be the
1152 delimiter, '\0' or '?'. */
1153 /* example: "foo?..." */
1154 /* ^ ('?' gets changed to '/') */
1155 start_insert = end + 1;
1156 need_explicit_slash = 1;
1158 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1160 /* example: http://host" */
1162 start_insert = end + 1;
1163 need_explicit_slash = 1;
1167 /* example: "whatever/foo/bar" */
1169 start_insert = last_slash + 1;
1172 span = start_insert - base;
1173 constr = (char *)xmalloc (span + linklength + 1);
1175 memcpy (constr, base, span);
1176 if (need_explicit_slash)
1177 constr[span - 1] = '/';
1179 memcpy (constr + span, link, linklength);
1180 constr[span + linklength] = '\0';
1182 else /* *link == `/' */
1184 /* LINK is an absolute path: we need to replace everything
1185 after (and including) the FIRST slash with LINK.
1187 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1188 "/qux/xyzzy", our result should be
1189 "http://host/qux/xyzzy". */
1192 const char *start_insert = NULL; /* for gcc to shut up. */
1193 const char *pos = base;
1194 int seen_slash_slash = 0;
1195 /* We're looking for the first slash, but want to ignore double slashes. */
1198 slash = memchr (pos, '/', end - pos);
1199 if (slash && !seen_slash_slash)
1200 if (*(slash + 1) == '/')
1203 seen_slash_slash = 1;
1207 /* At this point, SLASH is the location of the first / after
1208 "//", or the first slash altogether. START_INSERT is the
1209 pointer to the location where LINK will be inserted. When
1210 examining the last two examples, keep in mind that LINK begins with `/'. */
1213 if (!slash && !seen_slash_slash)
1214 /* example: "foo" */
1216 start_insert = base;
1217 else if (!slash && seen_slash_slash)
1218 /* example: "http://foo" */
1221 else if (slash && !seen_slash_slash)
1222 /* example: "foo/bar" */
1224 start_insert = base;
1225 else if (slash && seen_slash_slash)
1226 /* example: "http://something/" */
1228 start_insert = slash;
1230 span = start_insert - base;
1231 constr = (char *)xmalloc (span + linklength + 1);
1233 memcpy (constr, base, span);
1235 memcpy (constr + span, link, linklength);
1236 constr[span + linklength] = '\0';
1239 else /* !no_proto */
1241 constr = strdupdelim (link, link + linklength);
1246 /* Merge BASE with LINK and return the resulting URI. This is an
1247 interface to uri_merge_1 that assumes that LINK is a
1248 zero-terminated string. */
1250 uri_merge (const char *base, const char *link)
1252 return uri_merge_1 (base, link, strlen (link), !has_proto (link));
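/* Illustrative sketch (kept out of the build): typical merges following
   the relative/absolute cases spelled out in uri_merge_1().  The URLs
   are made up. */
#if 0
  uri_merge ("http://host/dir/page.html", "pic.gif");
      /* -> "http://host/dir/pic.gif" */
  uri_merge ("http://host/dir/page.html", "/top.html");
      /* -> "http://host/top.html" */
  uri_merge ("http://host/dir/page.html", "ftp://elsewhere/file");
      /* -> "ftp://elsewhere/file" (LINK already carries a scheme) */
#endif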
1255 /* Optimize URL by host, destructively replacing u->host with realhost
1256 (u->host). Do this regardless of opt.simple_check. */
1258 opt_url (struct urlinfo *u)
1260 /* Find the "true" host. */
1261 char *host = realhost (u->host);
1264 assert (u->dir != NULL); /* the URL must have been parsed */
1265 /* Refresh the printed representation. */
1267 u->url = str_url (u, 0);
1270 /* Returns proxy host address, in accordance with PROTO. */
1272 getproxy (uerr_t proto)
1274 if (proto == URLHTTP)
1275 return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1276 else if (proto == URLFTP)
1277 return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1279 else if (proto == URLHTTPS)
1280 return opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1281 #endif /* HAVE_SSL */
1286 /* Should a host be accessed through a proxy, according to no_proxy? */
1288 no_proxy_match (const char *host, const char **no_proxy)
1293 return !sufmatch (no_proxy, host);
1296 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1297 static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
1299 /* Change the links in an HTML document. Accepts a structure that
1300 defines the positions of all the links. */
1302 convert_links (const char *file, urlpos *l)
1304 struct file_memory *fm;
1307 downloaded_file_t downloaded_file_return;
1309 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1312 /* First we do a "dry run": go through the list L and see whether
1313 any URL needs to be converted in the first place. If not, just
1314 leave the file alone. */
1317 for (dry = l; dry; dry = dry->next)
1318 if (dry->convert != CO_NOCONVERT)
1322 logputs (LOG_VERBOSE, _("nothing to do.\n"));
1327 fm = read_file (file);
1330 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1331 file, strerror (errno));
1335 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1336 if (opt.backup_converted && downloaded_file_return)
1337 write_backup_file (file, downloaded_file_return);
1339 /* Before opening the file for writing, unlink the file. This is
1340 important if the data in FM is mmaped. In that case, truncating the
1341 file, which is what fopen() below does, would make us read all
1342 zeroes from the mmaped region. */
1343 if (unlink (file) < 0 && errno != ENOENT)
1345 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1346 file, strerror (errno));
1347 read_file_free (fm);
1350 /* Now open the file for writing. */
1351 fp = fopen (file, "wb");
1354 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1355 file, strerror (errno));
1356 read_file_free (fm);
1359 /* Here we loop through all the URLs in file, replacing those of
1360 them that are downloaded with relative references. */
1362 for (; l; l = l->next)
1364 char *url_start = fm->content + l->pos;
1366 if (l->pos >= fm->length)
1368 DEBUGP (("Something strange is going on. Please investigate."));
1371 /* If the URL is not to be converted, skip it. */
1372 if (l->convert == CO_NOCONVERT)
1374 DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
1378 /* Echo the file contents, up to the offending URL's opening
1379 quote, to the outfile. */
1380 fwrite (p, 1, url_start - p, fp);
1382 if (l->convert == CO_CONVERT_TO_RELATIVE)
1384 /* Convert absolute URL to relative. */
1385 char *newname = construct_relative (file, l->local_name);
1386 char *quoted_newname = html_quote_string (newname);
1387 replace_attr (&p, l->size, fp, quoted_newname);
1388 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1389 l->url, newname, l->pos, file));
1391 xfree (quoted_newname);
1393 else if (l->convert == CO_CONVERT_TO_COMPLETE)
1395 /* Convert the link to absolute URL. */
1396 char *newlink = l->url;
1397 char *quoted_newlink = html_quote_string (newlink);
1398 replace_attr (&p, l->size, fp, quoted_newlink);
1399 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1400 newlink, l->pos, file));
1401 xfree (quoted_newlink);
1404 /* Output the rest of the file. */
1405 if (p - fm->content < fm->length)
1406 fwrite (p, 1, fm->length - (p - fm->content), fp);
1408 read_file_free (fm);
1409 logputs (LOG_VERBOSE, _("done.\n"));
1412 /* Construct and return a malloced copy of the relative link from two
1413 pieces of information: local name S1 of the referring file and
1414 local name S2 of the referred file.
1416 So, if S1 is "jagor.srce.hr/index.html" and S2 is
1417 "jagor.srce.hr/images/news.gif", the function will return
1420 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1421 "fly.cc.fer.hr/images/fly.gif", the function will return
1422 "../images/fly.gif".
1424 Caveats: S1 should not begin with `/', unless S2 also begins with
1425 '/'. S1 should not contain things like ".." and such --
1426 construct_relative ("fly/ioccc/../index.html",
1427 "fly/images/fly.gif") will fail. (A workaround is to call
1428 something like path_simplify() on S1). */
1430 construct_relative (const char *s1, const char *s2)
1432 int i, cnt, sepdirs1;
1436 return xstrdup (s2);
1437 /* S1 should *not* be absolute, if S2 wasn't. */
1438 assert (*s1 != '/');
1440 /* Skip the directories common to both strings. */
1443 while (s1[i] && s2[i]
1448 if (s1[i] == '/' && s2[i] == '/')
1453 for (sepdirs1 = 0; s1[i]; i++)
1456 /* Now, construct the result as:
1457 - "../" repeated sepdirs1 times
1458 - all the directories of S2 that the two don't share. */
1459 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1460 for (i = 0; i < sepdirs1; i++)
1461 memcpy (res + 3 * i, "../", 3);
1462 strcpy (res + 3 * i, s2 + cnt);
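/* Illustrative sketch (kept out of the build): the two cases from the
   comment above construct_relative(). */
#if 0
  construct_relative ("jagor.srce.hr/index.html",
                      "jagor.srce.hr/images/news.gif");   /* "images/news.gif" */
  construct_relative ("fly.cc.fer.hr/ioccc/index.html",
                      "fly.cc.fer.hr/images/fly.gif");    /* "../images/fly.gif" */
#endif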
1466 /* Add URL to the head of the list L. */
1468 add_url (urlpos *l, const char *url, const char *file)
1472 t = (urlpos *)xmalloc (sizeof (urlpos));
1473 memset (t, 0, sizeof (*t));
1474 t->url = xstrdup (url);
1475 t->local_name = xstrdup (file);
1481 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1483 /* Rather than just writing over the original .html file with the
1484 converted version, save the former to *.orig. Note we only do
1485 this for files we've _successfully_ downloaded, so we don't
1486 clobber .orig files sitting around from previous invocations. */
1488 /* Construct the backup filename as the original name plus ".orig". */
1489 size_t filename_len = strlen(file);
1490 char* filename_plus_orig_suffix;
1491 boolean already_wrote_backup_file = FALSE;
1492 slist* converted_file_ptr;
1493 static slist* converted_files = NULL;
1495 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1497 /* Just write "orig" over "html". We need to do it this way
1498 because when we're checking to see if we've downloaded the
1499 file before (to see if we can skip downloading it), we don't
1500 know if it's a text/html file. Therefore we don't know yet
1501 at that stage that -E is going to cause us to tack on
1502 ".html", so we need to compare vs. the original URL plus
1503 ".orig", not the original URL plus ".html.orig". */
1504 filename_plus_orig_suffix = alloca (filename_len + 1);
1505 strcpy(filename_plus_orig_suffix, file);
1506 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1508 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1510 /* Append ".orig" to the name. */
1511 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1512 strcpy(filename_plus_orig_suffix, file);
1513 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1516 /* We can get called twice on the same URL thanks to the
1517 convert_all_links() call in main(). If we write the .orig file
1518 each time in such a case, it'll end up containing the first-pass
1519 conversion, not the original file. So, see if we've already been
1520 called on this file. */
1521 converted_file_ptr = converted_files;
1522 while (converted_file_ptr != NULL)
1523 if (strcmp(converted_file_ptr->string, file) == 0)
1525 already_wrote_backup_file = TRUE;
1529 converted_file_ptr = converted_file_ptr->next;
1531 if (!already_wrote_backup_file)
1533 /* Rename <file> to <file>.orig before the former gets written over. */
1534 if (rename(file, filename_plus_orig_suffix) != 0)
1535 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1536 file, filename_plus_orig_suffix, strerror (errno));
1538 /* Remember that we've already written a .orig backup for this file.
1539 Note that we never free this memory since we need it till the
1540 convert_all_links() call, which is one of the last things the
1541 program does before terminating. BTW, I'm not sure if it would be
1542 safe to just set 'converted_file_ptr->string' to 'file' below,
1543 rather than making a copy of the string... Another note is that I
1544 thought I could just add a field to the urlpos structure saying
1545 that we'd written a .orig file for this URL, but that didn't work,
1546 so I had to make this separate list.
1547 -- Dan Harkless <wget@harkless.org>
1549 This [adding a field to the urlpos structure] didn't work
1550 because convert_file() is called twice: once after all its
1551 sublinks have been retrieved in recursive_retrieve(), and
1552 once at the end of the day in convert_all_links(). The
1553 original linked list collected in recursive_retrieve() is
1554 lost after the first invocation of convert_links(), and
1555 convert_all_links() makes a new one (it calls get_urls_html()
1556 for each file it covers.) That's why your first approach didn't
1557 work. The way to make it work is perhaps to make this flag a
1558 field in the `urls_html' list.
1559 -- Hrvoje Niksic <hniksic@arsdigita.com>
1561 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1562 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1563 converted_file_ptr->next = converted_files;
1564 converted_files = converted_file_ptr;
1568 static int find_fragment PARAMS ((const char *, int, const char **,
1572 replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
1574 const char *p = *pp;
1576 int size = raw_size;
1577 char quote_char = '\"';
1578 const char *frag_beg, *frag_end;
1580 /* Structure of our string is:
1581 "...old-contents..."
1582 <--- l->size ---> (with quotes)
1585 <--- l->size --> (no quotes) */
1587 if (*p == '\"' || *p == '\'')
1592 size -= 2; /* disregard opening and closing quote */
1594 putc (quote_char, fp);
1595 fputs (new_str, fp);
1597 /* Look for fragment identifier, if any. */
1598 if (find_fragment (p, size, &frag_beg, &frag_end))
1599 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
1603 putc (quote_char, fp);
1607 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
1608 preceded by '&'. If the character is not found, return zero. If
1609 the character is found, return 1 and set BP and EP to point to the
1610 beginning and end of the region.
1612 This is used for finding the fragment identifiers in URLs. */
1615 find_fragment (const char *beg, int size, const char **bp, const char **ep)
1617 const char *end = beg + size;
1619 for (; beg < end; beg++)
1641 typedef struct _downloaded_file_list {
1643 downloaded_file_t download_type;
1644 struct _downloaded_file_list* next;
1645 } downloaded_file_list;
1647 static downloaded_file_list *downloaded_files;
1649 /* Remembers which files have been downloaded. In the standard case, should be
1650 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1651 download successfully (i.e. not for ones we have failures on or that we skip
1654 When we've downloaded a file and tacked on a ".html" extension due to -E,
1655 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1656 FILE_DOWNLOADED_NORMALLY.
1658 If you just want to check if a file has been previously added without adding
1659 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
1660 with local filenames, not remote URLs. */
1662 downloaded_file (downloaded_file_t mode, const char* file)
1664 boolean found_file = FALSE;
1665 downloaded_file_list* rover = downloaded_files;
1667 while (rover != NULL)
1668 if (strcmp(rover->file, file) == 0)
1674 rover = rover->next;
1677 return rover->download_type; /* file had already been downloaded */
1680 if (mode != CHECK_FOR_FILE)
1682 rover = xmalloc(sizeof(*rover));
1683 rover->file = xstrdup(file); /* use xstrdup() so we die on out-of-mem. */
1684 rover->download_type = mode;
1685 rover->next = downloaded_files;
1686 downloaded_files = rover;
1689 return FILE_NOT_ALREADY_DOWNLOADED;
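/* Illustrative sketch (kept out of the build): typical call pattern, per
   the comment above; the file name is made up. */
#if 0
  downloaded_file (FILE_DOWNLOADED_NORMALLY, "index.html");      /* record it */
  if (downloaded_file (CHECK_FOR_FILE, "index.html")
      != FILE_NOT_ALREADY_DOWNLOADED)
    /* seen before: the mode recorded at download time is returned */ ;
#endif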
1694 downloaded_files_free (void)
1696 downloaded_file_list* rover = downloaded_files;
1699 downloaded_file_list *next = rover->next;
1700 xfree (rover->file);
1706 /* Initialization of static stuff. */
1710 init_unsafe_char_table ();