sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with this program; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif
  29 #include <sys/types.h>
  30 #ifdef HAVE_UNISTD_H
  31 # include <unistd.h>
  32 #endif
  33 #include <errno.h>
  34 #include <assert.h>
  35
  36 #include "wget.h"
  37 #include "utils.h"
  38 #include "url.h"
  39 #include "host.h"
  40
  41 #ifndef errno
  42 extern int errno;
  43 #endif
  44
  45 /* Table of Unsafe chars.  This is intialized in
  46    init_unsafe_char_table.  */
  47
  48 static char unsafe_char_table[256];
  49
  50 #define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)])
  51
  52 /* If S contains unsafe characters, free it and replace it with a
  53    version that doesn't.  */
  54 #define URL_CLEANSE(s) do                       \
  55 {                                               \
  56   if (contains_unsafe (s))                      \
  57     {                                           \
  58       char *uc_tmp = encode_string (s);         \
  59       xfree (s);                                \
  60       (s) = uc_tmp;                             \
  61     }                                           \
  62 } while (0)
  63
  64 /* Is a directory "."?  */
  65 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
  66 /* Is a directory ".."?  */
  67 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  68
  69 #if 0
  70 static void path_simplify_with_kludge PARAMS ((char *));
  71 #endif
  72 static int urlpath_length PARAMS ((const char *));
  73
  74 /* NULL-terminated list of strings to be recognized as prototypes (URL
  75    schemes).  Note that recognized doesn't mean supported -- only HTTP,
  76    HTTPS and FTP are currently supported .
  77
  78    However, a string that does not match anything in the list will be
  79    considered a relative URL.  Thus it's important that this list has
  80    anything anyone could think of being legal.
  81
  82    There are wild things here.  :-) Take a look at
  83    <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more
  84    fun.  */
  85 static char *protostrings[] =
  86 {
  87   "cid:",
  88   "clsid:",
  89   "file:",
  90   "finger:",
  91   "ftp:",
  92   "gopher:",
  93   "hdl:",
  94   "http:",
  95   "https:",
  96   "ilu:",
  97   "ior:",
  98   "irc:",
  99   "java:",
 100   "javascript:",
 101   "lifn:",
 102   "mailto:",
 103   "mid:",
 104   "news:",
 105   "nntp:",
 106   "path:",
 107   "prospero:",
 108   "rlogin:",
 109   "service:",
 110   "shttp:",
 111   "snews:",
 112   "stanf:",
 113   "telnet:",
 114   "tn3270:",
 115   "wais:",
 116   "whois++:",
 117   NULL
 118 };
 119
 120 struct proto
 121 {
 122   char *name;
 123   uerr_t ind;
 124   unsigned short port;
 125 };
 126
 127 /* Similar to former, but for supported protocols: */
 128 static struct proto sup_protos[] =
 129 {
 130   { "http://", URLHTTP, DEFAULT_HTTP_PORT },
 131 #ifdef HAVE_SSL
 132   { "https://",URLHTTPS, DEFAULT_HTTPS_PORT},
 133 #endif
 134   { "ftp://", URLFTP, DEFAULT_FTP_PORT },
 135   /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
 136 };
 137
 138 static void parse_dir PARAMS ((const char *, char **, char **));
 139 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
 140 static char *construct PARAMS ((const char *, const char *, int , int));
 141 static char *construct_relative PARAMS ((const char *, const char *));
 142 static char process_ftp_type PARAMS ((char *));
 143
 144 \f
 145 /* Returns the number of characters to be skipped if the first thing
 146    in a URL is URL: (which is 0 or 4+).  The optional spaces after
 147    URL: are also skipped.  */
 148 int
 149 skip_url (const char *url)
 150 {
 151   int i;
 152
 153   if (TOUPPER (url[0]) == 'U'
 154       && TOUPPER (url[1]) == 'R'
 155       && TOUPPER (url[2]) == 'L'
 156       && url[3] == ':')
 157     {
 158       /* Skip blanks.  */
 159       for (i = 4; url[i] && ISSPACE (url[i]); i++);
 160       return i;
 161     }
 162   else
 163     return 0;
 164 }
 165
 166 /* Unsafe chars:
 167    - anything <= 32;
 168    - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
 169    - @ and :, for user/password encoding.
 170    - everything over 127 (but we don't bother with recording those.  */
 171 void
 172 init_unsafe_char_table (void)
 173 {
 174   int i;
 175   for (i = 0; i < 256; i++)
 176     if (i < 32 || i >= 127
 177         || i == ' '
 178         || i == '<'
 179         || i == '>'
 180         || i == '\"'
 181         || i == '#'
 182         || i == '%'
 183         || i == '{'
 184         || i == '}'
 185         || i == '|'
 186         || i == '\\'
 187         || i == '^'
 188         || i == '~'
 189         || i == '['
 190         || i == ']'
 191         || i == '`')
 192       unsafe_char_table[i] = 1;
 193 }
 194
 195 /* Returns 1 if the string contains unsafe characters, 0 otherwise.  */
 196 int
 197 contains_unsafe (const char *s)
 198 {
 199   for (; *s; s++)
 200     if (UNSAFE_CHAR (*s))
 201       return 1;
 202   return 0;
 203 }
 204
 205 /* Decodes the forms %xy in a URL to the character the hexadecimal
 206    code of which is xy.  xy are hexadecimal digits from
 207    [0123456789ABCDEF] (case-insensitive).  If x or y are not
 208    hex-digits or `%' precedes `\0', the sequence is inserted
 209    literally.  */
 210
 211 static void
 212 decode_string (char *s)
 213 {
 214   char *p = s;
 215
 216   for (; *s; s++, p++)
 217     {
 218       if (*s != '%')
 219         *p = *s;
 220       else
 221         {
 222           /* Do nothing if at the end of the string, or if the chars
 223              are not hex-digits.  */
 224           if (!*(s + 1) || !*(s + 2)
 225               || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
 226             {
 227               *p = *s;
 228               continue;
 229             }
 230           *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
 231           s += 2;
 232         }
 233     }
 234   *p = '\0';
 235 }
 236
 237 /* Encode the unsafe characters (as determined by URL_UNSAFE) in a
 238    given string, returning a malloc-ed %XX encoded string.  */
 239 char *
 240 encode_string (const char *s)
 241 {
 242   const char *b;
 243   char *p, *res;
 244   int i;
 245
 246   b = s;
 247   for (i = 0; *s; s++, i++)
 248     if (UNSAFE_CHAR (*s))
 249       i += 2; /* Two more characters (hex digits) */
 250   res = (char *)xmalloc (i + 1);
 251   s = b;
 252   for (p = res; *s; s++)
 253     if (UNSAFE_CHAR (*s))
 254       {
 255         const unsigned char c = *s;
 256         *p++ = '%';
 257         *p++ = HEXD2ASC (c >> 4);
 258         *p++ = HEXD2ASC (c & 0xf);
 259       }
 260     else
 261       *p++ = *s;
 262   *p = '\0';
 263   return res;
 264 }
 265 \f
 266 /* Returns the proto-type if URL's protocol is supported, or
 267    URLUNKNOWN if not.  */
 268 uerr_t
 269 urlproto (const char *url)
 270 {
 271   int i;
 272
 273   url += skip_url (url);
 274   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 275     if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
 276       return sup_protos[i].ind;
 277   for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
 278   if (url[i] == ':')
 279     {
 280       for (++i; url[i] && url[i] != '/'; i++)
 281         if (!ISDIGIT (url[i]))
 282           return URLBADPORT;
 283       if (url[i - 1] == ':')
 284         return URLFTP;
 285       else
 286         return URLHTTP;
 287     }
 288   else
 289     return URLHTTP;
 290 }
 291
 292 /* Skip the protocol part of the URL, e.g. `http://'.  If no protocol
 293    part is found, returns 0.  */
 294 int
 295 skip_proto (const char *url)
 296 {
 297   char **s;
 298   int l;
 299
 300   for (s = protostrings; *s; s++)
 301     if (!strncasecmp (*s, url, strlen (*s)))
 302       break;
 303   if (!*s)
 304     return 0;
 305   l = strlen (*s);
 306   /* HTTP and FTP protocols are expected to yield exact host names
 307      (i.e. the `//' part must be skipped, too).  */
 308   if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
 309     l += 2;
 310   return l;
 311 }
 312
 313 /* Returns 1 if the URL begins with a protocol (supported or
 314    unsupported), 0 otherwise.  */
 315 int
 316 has_proto (const char *url)
 317 {
 318   char **s;
 319
 320   url += skip_url (url);
 321   for (s = protostrings; *s; s++)
 322     if (strncasecmp (url, *s, strlen (*s)) == 0)
 323       return 1;
 324   return 0;
 325 }
 326
 327 /* Skip the username and password, if present here.  The function
 328    should be called *not* with the complete URL, but with the part
 329    right after the protocol.
 330
 331    If no username and password are found, return 0.  */
 332 int
 333 skip_uname (const char *url)
 334 {
 335   const char *p;
 336   const char *q = NULL;
 337   for (p = url ; *p && *p != '/'; p++)
 338     if (*p == '@') q = p;
 339   /* If a `@' was found before the first occurrence of `/', skip
 340      it.  */
 341   if (q != NULL)
 342     return q - url + 1;
 343   else
 344     return 0;
 345 }
 346 \f
 347 /* Allocate a new urlinfo structure, fill it with default values and
 348    return a pointer to it.  */
 349 struct urlinfo *
 350 newurl (void)
 351 {
 352   struct urlinfo *u;
 353
 354   u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
 355   memset (u, 0, sizeof (*u));
 356   u->proto = URLUNKNOWN;
 357   return u;
 358 }
 359
 360 /* Perform a "deep" free of the urlinfo structure.  The structure
 361    should have been created with newurl, but need not have been used.
 362    If free_pointer is non-0, free the pointer itself.  */
 363 void
 364 freeurl (struct urlinfo *u, int complete)
 365 {
 366   assert (u != NULL);
 367   FREE_MAYBE (u->url);
 368   FREE_MAYBE (u->host);
 369   FREE_MAYBE (u->path);
 370   FREE_MAYBE (u->file);
 371   FREE_MAYBE (u->dir);
 372   FREE_MAYBE (u->user);
 373   FREE_MAYBE (u->passwd);
 374   FREE_MAYBE (u->local);
 375   FREE_MAYBE (u->referer);
 376   if (u->proxy)
 377     freeurl (u->proxy, 1);
 378   if (complete)
 379     xfree (u);
 380   return;
 381 }
 382 \f
 383 /* Extract the given URL of the form
 384    (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
 385    1. hostname (terminated with `/' or `:')
 386    2. port number (terminated with `/'), or chosen for the protocol
 387    3. dirname (everything after hostname)
 388    Most errors are handled.  No allocation is done, you must supply
 389    pointers to allocated memory.
 390    ...and a host of other stuff :-)
 391
 392    - Recognizes hostname:dir/file for FTP and
 393      hostname (:portnum)?/dir/file for HTTP.
 394    - Parses the path to yield directory and file
 395    - Parses the URL to yield the username and passwd (if present)
 396    - Decodes the strings, in case they contain "forbidden" characters
 397    - Writes the result to struct urlinfo
 398
 399    If the argument STRICT is set, it recognizes only the canonical
 400    form.  */
 401 uerr_t
 402 parseurl (const char *url, struct urlinfo *u, int strict)
 403 {
 404   int i, l, abs_ftp;
 405   int recognizable;            /* Recognizable URL is the one where
 406                                   the protocol name was explicitly
 407                                   named, i.e. it wasn't deduced from
 408                                   the URL format.  */
 409   uerr_t type;
 410
 411   DEBUGP (("parseurl (\"%s\") -> ", url));
 412   url += skip_url (url);
 413   recognizable = has_proto (url);
 414   if (strict && !recognizable)
 415     return URLUNKNOWN;
 416   for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
 417     {
 418       l = strlen (sup_protos[i].name);
 419       if (!strncasecmp (sup_protos[i].name, url, l))
 420         break;
 421     }
 422   /* If protocol is recognizable, but unsupported, bail out, else
 423      suppose unknown.  */
 424   if (recognizable && i == ARRAY_SIZE (sup_protos))
 425     return URLUNKNOWN;
 426   else if (i == ARRAY_SIZE (sup_protos))
 427     type = URLUNKNOWN;
 428   else
 429     u->proto = type = sup_protos[i].ind;
 430
 431   if (type == URLUNKNOWN)
 432     l = 0;
 433   /* Allow a username and password to be specified (i.e. just skip
 434      them for now).  */
 435   if (recognizable)
 436     l += skip_uname (url + l);
 437   for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
 438   if (i == l)
 439     return URLBADHOST;
 440   /* Get the hostname.  */
 441   u->host = strdupdelim (url + l, url + i);
 442   DEBUGP (("host %s -> ", u->host));
 443
 444   /* Assume no port has been given.  */
 445   u->port = 0;
 446   if (url[i] == ':')
 447     {
 448       /* We have a colon delimiting the hostname.  It could mean that
 449          a port number is following it, or a directory.  */
 450       if (ISDIGIT (url[++i]))    /* A port number */
 451         {
 452           if (type == URLUNKNOWN)
 453             u->proto = type = URLHTTP;
 454           for (; url[i] && url[i] != '/'; i++)
 455             if (ISDIGIT (url[i]))
 456               u->port = 10 * u->port + (url[i] - '0');
 457             else
 458               return URLBADPORT;
 459           if (!u->port)
 460             return URLBADPORT;
 461           DEBUGP (("port %hu -> ", u->port));
 462         }
 463       else if (type == URLUNKNOWN) /* or a directory */
 464         u->proto = type = URLFTP;
 465       else                      /* or just a misformed port number */
 466         return URLBADPORT;
 467     }
 468   else if (type == URLUNKNOWN)
 469     u->proto = type = URLHTTP;
 470   if (!u->port)
 471     {
 472       int ind;
 473       for (ind = 0; ind < ARRAY_SIZE (sup_protos); ind++)
 474         if (sup_protos[ind].ind == type)
 475           break;
 476       if (ind == ARRAY_SIZE (sup_protos))
 477         return URLUNKNOWN;
 478       u->port = sup_protos[ind].port;
 479     }
 480   /* Some delimiter troubles...  */
 481   if (url[i] == '/' && url[i - 1] != ':')
 482     ++i;
 483   if (type == URLHTTP)
 484     while (url[i] && url[i] == '/')
 485       ++i;
 486   u->path = (char *)xmalloc (strlen (url + i) + 8);
 487   strcpy (u->path, url + i);
 488   if (type == URLFTP)
 489     {
 490       u->ftp_type = process_ftp_type (u->path);
 491       /* #### We don't handle type `d' correctly yet.  */
 492       if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
 493         u->ftp_type = 'I';
 494       DEBUGP (("ftp_type %c -> ", u->ftp_type));
 495     }
 496   DEBUGP (("opath %s -> ", u->path));
 497   /* Parse the username and password (if existing).  */
 498   parse_uname (url, &u->user, &u->passwd);
 499   /* Decode the strings, as per RFC 1738.  */
 500   decode_string (u->host);
 501   decode_string (u->path);
 502   if (u->user)
 503     decode_string (u->user);
 504   if (u->passwd)
 505     decode_string (u->passwd);
 506   /* Parse the directory.  */
 507   parse_dir (u->path, &u->dir, &u->file);
 508   DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
 509   /* Simplify the directory.  */
 510   path_simplify (u->dir);
 511   /* Remove the leading `/' in HTTP.  */
 512   if (type == URLHTTP && *u->dir == '/')
 513     strcpy (u->dir, u->dir + 1);
 514   DEBUGP (("ndir %s\n", u->dir));
 515   /* Strip trailing `/'.  */
 516   l = strlen (u->dir);
 517   if (l && u->dir[l - 1] == '/')
 518     u->dir[l - 1] = '\0';
 519   /* Re-create the path: */
 520   abs_ftp = (u->proto == URLFTP && *u->dir == '/');
 521   /*  sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
 522       abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
 523   strcpy (u->path, abs_ftp ? "%2F" : "/");
 524   strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
 525   strcat (u->path, *u->dir ? "/" : "");
 526   strcat (u->path, u->file);
 527   URL_CLEANSE (u->path);
 528   DEBUGP (("newpath: %s\n", u->path));
 529   /* Create the clean URL.  */
 530   u->url = str_url (u, 0);
 531   return URLOK;
 532 }
 533 \f
 534 /* Special versions of DOTP and DDOTP for parse_dir(). */
 535
 536 #define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
 537 #define PD_DDOTP(x) ((*(x) == '.') && (*(x) == '.')             \
 538                      && (!*((x) + 2) || *((x) + 2) == '?'))
 539
 540 /* Build the directory and filename components of the path.  Both
 541    components are *separately* malloc-ed strings!  It does not change
 542    the contents of path.
 543
 544    If the path ends with "." or "..", they are (correctly) counted as
 545    directories.  */
 546 static void
 547 parse_dir (const char *path, char **dir, char **file)
 548 {
 549   int i, l;
 550
 551   l = urlpath_length (path);
 552   for (i = l; i && path[i] != '/'; i--);
 553
 554   if (!i && *path != '/')   /* Just filename */
 555     {
 556       if (PD_DOTP (path) || PD_DDOTP (path))
 557         {
 558           *dir = strdupdelim (path, path + l);
 559           *file = xstrdup (path + l); /* normally empty, but could
 560                                          contain ?... */
 561         }
 562       else
 563         {
 564           *dir = xstrdup ("");     /* This is required because of FTP */
 565           *file = xstrdup (path);
 566         }
 567     }
 568   else if (!i)                 /* /filename */
 569     {
 570       if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
 571         {
 572           *dir = strdupdelim (path, path + l);
 573           *file = xstrdup (path + l); /* normally empty, but could
 574                                          contain ?... */
 575         }
 576       else
 577         {
 578           *dir = xstrdup ("/");
 579           *file = xstrdup (path + 1);
 580         }
 581     }
 582   else /* Nonempty directory with or without a filename */
 583     {
 584       if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
 585         {
 586           *dir = strdupdelim (path, path + l);
 587           *file = xstrdup (path + l); /* normally empty, but could
 588                                          contain ?... */
 589         }
 590       else
 591         {
 592           *dir = strdupdelim (path, path + i);
 593           *file = xstrdup (path + i + 1);
 594         }
 595     }
 596 }
 597
 598 /* Find the optional username and password within the URL, as per
 599    RFC1738.  The returned user and passwd char pointers are
 600    malloc-ed.  */
 601 static uerr_t
 602 parse_uname (const char *url, char **user, char **passwd)
 603 {
 604   int l;
 605   const char *p, *q, *col;
 606   char **where;
 607
 608   *user = NULL;
 609   *passwd = NULL;
 610   url += skip_url (url);
 611   /* Look for end of protocol string.  */
 612   l = skip_proto (url);
 613   if (!l)
 614     return URLUNKNOWN;
 615   /* Add protocol offset.  */
 616   url += l;
 617   /* Is there an `@' character?  */
 618   for (p = url; *p && *p != '/'; p++)
 619     if (*p == '@')
 620       break;
 621   /* If not, return.  */
 622   if (*p != '@')
 623     return URLOK;
 624   /* Else find the username and password.  */
 625   for (p = q = col = url; *p != '/'; p++)
 626     {
 627       if (*p == ':' && !*user)
 628         {
 629           *user = (char *)xmalloc (p - url + 1);
 630           memcpy (*user, url, p - url);
 631           (*user)[p - url] = '\0';
 632           col = p + 1;
 633         }
 634       if (*p == '@') q = p;
 635     }
 636   /* Decide whether you have only the username or both.  */
 637   where = *user ? passwd : user;
 638   *where = (char *)xmalloc (q - col + 1);
 639   memcpy (*where, col, q - col);
 640   (*where)[q - col] = '\0';
 641   return URLOK;
 642 }
 643
 644 /* If PATH ends with `;type=X', return the character X.  */
 645 static char
 646 process_ftp_type (char *path)
 647 {
 648   int len = strlen (path);
 649
 650   if (len >= 7
 651       && !memcmp (path + len - 7, ";type=", 6))
 652     {
 653       path[len - 7] = '\0';
 654       return path[len - 1];
 655     }
 656   else
 657     return '\0';
 658 }
 659 \f
 660 /* Return the URL as fine-formed string, with a proper protocol, optional port
 661    number, directory and optional user/password.  If `hide' is non-zero (as it
 662    is when we're calling this on a URL we plan to print, but not when calling it
 663    to canonicalize a URL for use within the program), password will be hidden.
 664    The forbidden characters in the URL will be cleansed.  */
 665 char *
 666 str_url (const struct urlinfo *u, int hide)
 667 {
 668   char *res, *host, *user, *passwd, *proto_name, *dir, *file;
 669   int i, l, ln, lu, lh, lp, lf, ld;
 670   unsigned short proto_default_port;
 671
 672   /* Look for the protocol name.  */
 673   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 674     if (sup_protos[i].ind == u->proto)
 675       break;
 676   if (i == ARRAY_SIZE (sup_protos))
 677     return NULL;
 678   proto_name = sup_protos[i].name;
 679   proto_default_port = sup_protos[i].port;
 680   host = CLEANDUP (u->host);
 681   dir = CLEANDUP (u->dir);
 682   file = CLEANDUP (u->file);
 683   user = passwd = NULL;
 684   if (u->user)
 685     user = CLEANDUP (u->user);
 686   if (u->passwd)
 687     {
 688       if (hide)
 689         /* Don't output the password, or someone might see it over the user's
 690            shoulder (or in saved wget output).  Don't give away the number of
 691            characters in the password, either, as we did in past versions of
 692            this code, when we replaced the password characters with 'x's. */
 693         passwd = xstrdup("<password>");
 694       else
 695         passwd = CLEANDUP (u->passwd);
 696     }
 697   if (u->proto == URLFTP && *dir == '/')
 698     {
 699       char *tmp = (char *)xmalloc (strlen (dir) + 3);
 700       /*sprintf (tmp, "%%2F%s", dir + 1);*/
 701       tmp[0] = '%';
 702       tmp[1] = '2';
 703       tmp[2] = 'F';
 704       strcpy (tmp + 3, dir + 1);
 705       xfree (dir);
 706       dir = tmp;
 707     }
 708
 709   ln = strlen (proto_name);
 710   lu = user ? strlen (user) : 0;
 711   lp = passwd ? strlen (passwd) : 0;
 712   lh = strlen (host);
 713   ld = strlen (dir);
 714   lf = strlen (file);
 715   res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
 716   /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
 717      (user ? user : ""), (passwd ? ":" : ""),
 718      (passwd ? passwd : ""), (user ? "@" : ""),
 719      host, u->port, dir, *dir ? "/" : "", file); */
 720   l = 0;
 721   memcpy (res, proto_name, ln);
 722   l += ln;
 723   if (user)
 724     {
 725       memcpy (res + l, user, lu);
 726       l += lu;
 727       if (passwd)
 728         {
 729           res[l++] = ':';
 730           memcpy (res + l, passwd, lp);
 731           l += lp;
 732         }
 733       res[l++] = '@';
 734     }
 735   memcpy (res + l, host, lh);
 736   l += lh;
 737   if (u->port != proto_default_port)
 738     {
 739       res[l++] = ':';
 740       long_to_string (res + l, (long)u->port);
 741       l += numdigit (u->port);
 742     }
 743   res[l++] = '/';
 744   memcpy (res + l, dir, ld);
 745   l += ld;
 746   if (*dir)
 747     res[l++] = '/';
 748   strcpy (res + l, file);
 749   xfree (host);
 750   xfree (dir);
 751   xfree (file);
 752   FREE_MAYBE (user);
 753   FREE_MAYBE (passwd);
 754   return res;
 755 }
 756
 757 /* Check whether two URL-s are equivalent, i.e. pointing to the same
 758    location.  Uses parseurl to parse them, and compares the canonical
 759    forms.
 760
 761    Returns 1 if the URL1 is equivalent to URL2, 0 otherwise.  Also
 762    return 0 on error.  */
 763 int
 764 url_equal (const char *url1, const char *url2)
 765 {
 766   struct urlinfo *u1, *u2;
 767   uerr_t err;
 768   int res;
 769
 770   u1 = newurl ();
 771   err = parseurl (url1, u1, 0);
 772   if (err != URLOK)
 773     {
 774       freeurl (u1, 1);
 775       return 0;
 776     }
 777   u2 = newurl ();
 778   err = parseurl (url2, u2, 0);
 779   if (err != URLOK)
 780     {
 781       freeurl (u2, 1);
 782       return 0;
 783     }
 784   res = !strcmp (u1->url, u2->url);
 785   freeurl (u1, 1);
 786   freeurl (u2, 1);
 787   return res;
 788 }
 789 \f
 790 urlpos *
 791 get_urls_file (const char *file)
 792 {
 793   struct file_memory *fm;
 794   urlpos *head, *tail;
 795   const char *text, *text_end;
 796
 797   /* Load the file.  */
 798   fm = read_file (file);
 799   if (!fm)
 800     {
 801       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 802       return NULL;
 803     }
 804   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
 805   head = tail = NULL;
 806   text = fm->content;
 807   text_end = fm->content + fm->length;
 808   while (text < text_end)
 809     {
 810       const char *line_beg = text;
 811       const char *line_end = memchr (text, '\n', text_end - text);
 812       if (!line_end)
 813         line_end = text_end;
 814       else
 815         ++line_end;
 816       text = line_end;
 817       while (line_beg < line_end
 818              && ISSPACE (*line_beg))
 819         ++line_beg;
 820       while (line_end > line_beg + 1
 821              && ISSPACE (*(line_end - 1)))
 822         --line_end;
 823       if (line_end > line_beg)
 824         {
 825           urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
 826           memset (entry, 0, sizeof (*entry));
 827           entry->next = NULL;
 828           entry->url = strdupdelim (line_beg, line_end);
 829           if (!head)
 830             head = entry;
 831           else
 832             tail->next = entry;
 833           tail = entry;
 834         }
 835     }
 836   read_file_free (fm);
 837   return head;
 838 }
 839 \f
 840 /* Free the linked list of urlpos.  */
 841 void
 842 free_urlpos (urlpos *l)
 843 {
 844   while (l)
 845     {
 846       urlpos *next = l->next;
 847       xfree (l->url);
 848       FREE_MAYBE (l->local_name);
 849       xfree (l);
 850       l = next;
 851     }
 852 }
 853
 854 /* Rotate FNAME opt.backups times */
 855 void
 856 rotate_backups(const char *fname)
 857 {
 858   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
 859   char *from = (char *)alloca (maxlen);
 860   char *to = (char *)alloca (maxlen);
 861   struct stat sb;
 862   int i;
 863
 864   if (stat (fname, &sb) == 0)
 865     if (S_ISREG (sb.st_mode) == 0)
 866       return;
 867
 868   for (i = opt.backups; i > 1; i--)
 869     {
 870       sprintf (from, "%s.%d", fname, i - 1);
 871       sprintf (to, "%s.%d", fname, i);
 872       /* #### This will fail on machines without the rename() system
 873          call.  */
 874       rename (from, to);
 875     }
 876
 877   sprintf (to, "%s.%d", fname, 1);
 878   rename(fname, to);
 879 }
 880
 881 /* Create all the necessary directories for PATH (a file).  Calls
 882    mkdirhier() internally.  */
 883 int
 884 mkalldirs (const char *path)
 885 {
 886   const char *p;
 887   char *t;
 888   struct stat st;
 889   int res;
 890
 891   p = path + strlen (path);
 892   for (; *p != '/' && p != path; p--);
 893   /* Don't create if it's just a file.  */
 894   if ((p == path) && (*p != '/'))
 895     return 0;
 896   t = strdupdelim (path, p);
 897   /* Check whether the directory exists.  */
 898   if ((stat (t, &st) == 0))
 899     {
 900       if (S_ISDIR (st.st_mode))
 901         {
 902           xfree (t);
 903           return 0;
 904         }
 905       else
 906         {
 907           /* If the dir exists as a file name, remove it first.  This
 908              is *only* for Wget to work with buggy old CERN http
 909              servers.  Here is the scenario: When Wget tries to
 910              retrieve a directory without a slash, e.g.
 911              http://foo/bar (bar being a directory), CERN server will
 912              not redirect it too http://foo/bar/ -- it will generate a
 913              directory listing containing links to bar/file1,
 914              bar/file2, etc.  Wget will lose because it saves this
 915              HTML listing to a file `bar', so it cannot create the
 916              directory.  To work around this, if the file of the same
 917              name exists, we just remove it and create the directory
 918              anyway.  */
 919           DEBUGP (("Removing %s because of directory danger!\n", t));
 920           unlink (t);
 921         }
 922     }
 923   res = make_directory (t);
 924   if (res != 0)
 925     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
 926   xfree (t);
 927   return res;
 928 }
 929
 930 static int
 931 count_slashes (const char *s)
 932 {
 933   int i = 0;
 934   while (*s)
 935     if (*s++ == '/')
 936       ++i;
 937   return i;
 938 }
 939
 940 /* Return the path name of the URL-equivalent file name, with a
 941    remote-like structure of directories.  */
 942 static char *
 943 mkstruct (const struct urlinfo *u)
 944 {
 945   char *host, *dir, *file, *res, *dirpref;
 946   int l;
 947
 948   assert (u->dir != NULL);
 949   assert (u->host != NULL);
 950
 951   if (opt.cut_dirs)
 952     {
 953       char *ptr = u->dir + (*u->dir == '/');
 954       int slash_count = 1 + count_slashes (ptr);
 955       int cut = MINVAL (opt.cut_dirs, slash_count);
 956       for (; cut && *ptr; ptr++)
 957         if (*ptr == '/')
 958           --cut;
 959       STRDUP_ALLOCA (dir, ptr);
 960     }
 961   else
 962     dir = u->dir + (*u->dir == '/');
 963
 964   host = xstrdup (u->host);
 965   /* Check for the true name (or at least a consistent name for saving
 966      to directory) of HOST, reusing the hlist if possible.  */
 967   if (opt.add_hostdir && !opt.simple_check)
 968     {
 969       char *nhost = realhost (host);
 970       xfree (host);
 971       host = nhost;
 972     }
 973   /* Add dir_prefix and hostname (if required) to the beginning of
 974      dir.  */
 975   if (opt.add_hostdir)
 976     {
 977       if (!DOTP (opt.dir_prefix))
 978         {
 979           dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
 980                                     + strlen (host) + 1);
 981           sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
 982         }
 983       else
 984         STRDUP_ALLOCA (dirpref, host);
 985     }
 986   else                         /* not add_hostdir */
 987     {
 988       if (!DOTP (opt.dir_prefix))
 989         dirpref = opt.dir_prefix;
 990       else
 991         dirpref = "";
 992     }
 993   xfree (host);
 994
 995   /* If there is a prefix, prepend it.  */
 996   if (*dirpref)
 997     {
 998       char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
 999       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1000       dir = newdir;
1001     }
1002   dir = xstrdup (dir);
1003   URL_CLEANSE (dir);
1004   l = strlen (dir);
1005   if (l && dir[l - 1] == '/')
1006     dir[l - 1] = '\0';
1007
1008   if (!*u->file)
1009     file = "index.html";
1010   else
1011     file = u->file;
1012
1013   /* Finally, construct the full name.  */
1014   res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1015   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1016   xfree (dir);
1017   return res;
1018 }
1019
1020 /* Create a unique filename, corresponding to a given URL.  Calls
1021    mkstruct if necessary.  Does *not* actually create any directories.  */
1022 char *
1023 url_filename (const struct urlinfo *u)
1024 {
1025   char *file, *name;
1026   int have_prefix = 0;          /* whether we must prepend opt.dir_prefix */
1027
1028   if (opt.dirstruct)
1029     {
1030       file = mkstruct (u);
1031       have_prefix = 1;
1032     }
1033   else
1034     {
1035       if (!*u->file)
1036         file = xstrdup ("index.html");
1037       else
1038         file = xstrdup (u->file);
1039     }
1040
1041   if (!have_prefix)
1042     {
1043       /* Check whether the prefix directory is something other than "."
1044          before prepending it.  */
1045       if (!DOTP (opt.dir_prefix))
1046         {
1047           char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1048                                          + 1 + strlen (file) + 1);
1049           sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1050           xfree (file);
1051           file = nfile;
1052         }
1053     }
1054   /* DOS-ish file systems don't like `%' signs in them; we change it
1055      to `@'.  */
1056 #ifdef WINDOWS
1057   {
1058     char *p = file;
1059     for (p = file; *p; p++)
1060       if (*p == '%')
1061         *p = '@';
1062   }
1063 #endif /* WINDOWS */
1064
1065   /* Check the cases in which the unique extensions are not used:
1066      1) Clobbering is turned off (-nc).
1067      2) Retrieval with regetting.
1068      3) Timestamping is used.
1069      4) Hierarchy is built.
1070
1071      The exception is the case when file does exist and is a
1072      directory (actually support for bad httpd-s).  */
1073   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1074       && !(file_exists_p (file) && !file_non_directory_p (file)))
1075     return file;
1076
1077   /* Find a unique name.  */
1078   name = unique_name (file);
1079   xfree (file);
1080   return name;
1081 }
1082
/* Return the length of the path portion of URL: the number of
   characters before the first '?', or the full string length when
   URL contains no '?'.  */
static int
urlpath_length (const char *url)
{
  const char *scan = url;

  while (*scan != '\0' && *scan != '?')
    ++scan;
  return scan - url;
}
1092
/* Find the last occurrence of character C in the half-open range
   [B, E) and return a pointer to it, or NULL if C does not occur
   there.  This is equivalent to { *e = '\0'; return strrchr (b, c); },
   except that the contents of the string are not changed.

   The pointer is decremented before it is dereferenced, so *E (which
   lies outside the range) is never examined, while *B (which lies
   inside it) is.  An empty range (E <= B) yields NULL.  */
static const char *
find_last_char (const char *b, const char *e, char c)
{
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
1105
/* Build a new URL string from the absolute URL and the reference SUB
   (SUBSIZE characters long), returning it in freshly allocated
   storage.  The function works purely on strings and knows next to
   nothing about individual protocols.

   NO_PROTO zero:     SUB carries its own protocol; the result is
                      simply a copy of SUB.
   NO_PROTO non-zero: SUB is merged into URL.  Only the part of URL
                      before the first '?' takes part in the merge.

     - SUB not starting with '/' (relative path) replaces whatever
       follows the last slash of URL:
         "whatever/foo/bar" + "qux/xyzzy" -> "whatever/foo/qux/xyzzy"
       If URL has no slash, or its last slash is the second one of a
       "//" pair, SUB is appended after an explicit '/':
         "foo"         + "qux/xyzzy" -> "foo/qux/xyzzy"
         "http://host" + "qux"       -> "http://host/qux"

     - SUB starting with '/' (absolute path) replaces everything from
       the first slash after a leading "//" onward:
         "http://host/whatever/foo/bar" + "/qux/xyzzy"
           -> "http://host/qux/xyzzy"
         "http://foo" + "/bar" -> "http://foo/bar"
       If the first slash of URL is not doubled (or URL has no slash
       at all), the result is just SUB.  */
static char *
construct (const char *url, const char *sub, int subsize, int no_proto)
{
  const char *path_end;         /* terminating '\0' or '?' of URL */
  const char *cut;              /* URL is kept up to (not including) CUT */
  int slash_wanted = 0;         /* overwrite last kept char with '/' */
  int keep;
  char *result;

  if (!no_proto)
    return strdupdelim (sub, sub + subsize);

  path_end = url + urlpath_length (url);

  if (*sub == '/')
    {
      /* Absolute path.  Locate the first slash; if it opens a "//"
         pair, skip the pair and search once more for the slash that
         ends the host part.  */
      const char *hit = memchr (url, '/', path_end - url);
      int past_double = 0;

      if (hit && hit[1] == '/')
        {
          past_double = 1;
          hit = memchr (hit + 2, '/', path_end - (hit + 2));
        }

      if (!past_double)
        /* "foo" or "foo/bar": nothing of URL survives.  */
        cut = url;
      else if (hit)
        /* "http://something/...": keep up to the host's slash; SUB
           supplies its own leading slash.  */
        cut = hit;
      else
        /* "http://foo": keep the whole thing.  */
        cut = path_end;
    }
  else
    {
      /* Relative path.  */
      const char *tail_slash = find_last_char (url, path_end, '/');

      if (tail_slash && (tail_slash == url || tail_slash[-1] != '/'))
        /* "whatever/foo/bar": keep through the last slash.  */
        cut = tail_slash + 1;
      else
        {
          /* No slash at all ("foo"), or the last one belongs to "//"
             ("http://host").  Keep one character beyond the path --
             that character is the '\0' or '?' delimiter, so reading
             it is safe -- and turn it into the separating '/'.  */
          cut = path_end + 1;
          slash_wanted = 1;
        }
    }

  /* Assemble: kept prefix of URL, optional separator, then SUB.  */
  keep = cut - url;
  result = (char *)xmalloc (keep + subsize + 1);
  if (keep)
    memcpy (result, url, keep);
  if (slash_wanted)
    result[keep - 1] = '/';
  if (subsize)
    memcpy (result + keep, sub, subsize);
  result[keep + subsize] = '\0';
  return result;
}
1238
/* Convenience front end to construct: merge NEW_URL into BASE_URL.
   The length of NEW_URL and the question of whether it lacks a
   protocol (as reported by has_proto) are worked out here, so the
   caller need not supply them.  Returns whatever construct returns.  */
char *
url_concat (const char *base_url, const char *new_url)
{
  int new_len = strlen (new_url);
  int lacks_proto = !has_proto (new_url);

  return construct (base_url, new_url, new_len, lacks_proto);
}
1245 \f
1246 /* Optimize URL by host, destructively replacing u->host with realhost
1247    (u->host).  Do this regardless of opt.simple_check.  */
1248 void
1249 opt_url (struct urlinfo *u)
1250 {
1251   /* Find the "true" host.  */
1252   char *host = realhost (u->host);
1253   xfree (u->host);
1254   u->host = host;
1255   assert (u->dir != NULL);      /* the URL must have been parsed */
1256   /* Refresh the printed representation.  */
1257   xfree (u->url);
1258   u->url = str_url (u, 0);
1259 }
1260
/* The kludge below is fortunately unnecessary: parse_dir now does the
   (almost) right thing, so a query can never end up as part of the
   directory.  It is kept, compiled out, for reference only.  */
#if 0
/* Run path_simplify on PATH while shielding the query string (the
   part from the first '?' on) from path_simplify's "optimizations".
   PATH is modified in place.  */
void
path_simplify_with_kludge (char *path)
{
  char *qmark = strchr (path, '?');
  char *tail;

  if (!qmark)
    {
      path_simplify (path);
      return;
    }

  /* path_simplify works destructively anyway, so we may write too:
     cut the query off for the duration of the call.  */
  *qmark = '\0';
  path_simplify (path);

  /* Restore the '?' and, if the path part shrank, slide the query
     down so that it follows the simplified path directly.  */
  tail = path + strlen (path);
  *qmark = '?';
  if (tail != qmark)
    memmove (tail, qmark, strlen (qmark) + 1);
}
#endif
1286 \f
1287 /* Returns proxy host address, in accordance with PROTO.  */
1288 char *
1289 getproxy (uerr_t proto)
1290 {
1291   if (proto == URLHTTP)
1292     return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1293   else if (proto == URLFTP)
1294     return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1295 #ifdef HAVE_SSL
1296   else if (proto == URLHTTPS)
1297     return opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1298 #endif /* HAVE_SSL */
1299   else
1300     return NULL;
1301 }
1302
/* Decide whether HOST should be accessed through the proxy, given the
   NO_PROXY list.  Returns 1 when NO_PROXY is NULL or when sufmatch
   reports no match for HOST in it, and 0 otherwise.  */
int
no_proxy_match (const char *host, const char **no_proxy)
{
  return no_proxy == NULL || !sufmatch (no_proxy, host);
}
1312 \f
1313 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1314 static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
1315
1316 /* Change the links in an HTML document.  Accepts a structure that
1317    defines the positions of all the links.  */
1318 void
1319 convert_links (const char *file, urlpos *l)
1320 {
1321   struct file_memory *fm;
1322   FILE               *fp;
1323   const char         *p;
1324   downloaded_file_t  downloaded_file_return;
1325
1326   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1327
1328   {
1329     /* First we do a "dry run": go through the list L and see whether
1330        any URL needs to be converted in the first place.  If not, just
1331        leave the file alone.  */
1332     int count = 0;
1333     urlpos *dry = l;
1334     for (dry = l; dry; dry = dry->next)
1335       if (dry->convert != CO_NOCONVERT)
1336         ++count;
1337     if (!count)
1338       {
1339         logputs (LOG_VERBOSE, _("nothing to do.\n"));
1340         return;
1341       }
1342   }
1343
1344   fm = read_file (file);
1345   if (!fm)
1346     {
1347       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1348                  file, strerror (errno));
1349       return;
1350     }
1351
1352   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1353   if (opt.backup_converted && downloaded_file_return)
1354     write_backup_file (file, downloaded_file_return);
1355
1356   /* Before opening the file for writing, unlink the file.  This is
1357      important if the data in FM is mmaped.  In such case, nulling the
1358      file, which is what fopen() below does, would make us read all
1359      zeroes from the mmaped region.  */
1360   if (unlink (file) < 0 && errno != ENOENT)
1361     {
1362       logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1363                  file, strerror (errno));
1364       read_file_free (fm);
1365       return;
1366     }
1367   /* Now open the file for writing.  */
1368   fp = fopen (file, "wb");
1369   if (!fp)
1370     {
1371       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1372                  file, strerror (errno));
1373       read_file_free (fm);
1374       return;
1375     }
1376   /* Here we loop through all the URLs in file, replacing those of
1377      them that are downloaded with relative references.  */
1378   p = fm->content;
1379   for (; l; l = l->next)
1380     {
1381       char *url_start = fm->content + l->pos;
1382
1383       if (l->pos >= fm->length)
1384         {
1385           DEBUGP (("Something strange is going on.  Please investigate."));
1386           break;
1387         }
1388       /* If the URL is not to be converted, skip it.  */
1389       if (l->convert == CO_NOCONVERT)
1390         {
1391           DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
1392           continue;
1393         }
1394
1395       /* Echo the file contents, up to the offending URL's opening
1396          quote, to the outfile.  */
1397       fwrite (p, 1, url_start - p, fp);
1398       p = url_start;
1399       if (l->convert == CO_CONVERT_TO_RELATIVE)
1400         {
1401           /* Convert absolute URL to relative. */
1402           char *newname = construct_relative (file, l->local_name);
1403           char *quoted_newname = html_quote_string (newname);
1404           replace_attr (&p, l->size, fp, quoted_newname);
1405           DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1406                    l->url, newname, l->pos, file));
1407           xfree (newname);
1408           xfree (quoted_newname);
1409         }
1410       else if (l->convert == CO_CONVERT_TO_COMPLETE)
1411         {
1412           /* Convert the link to absolute URL. */
1413           char *newlink = l->url;
1414           char *quoted_newlink = html_quote_string (newlink);
1415           replace_attr (&p, l->size, fp, quoted_newlink);
1416           DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1417                    newlink, l->pos, file));
1418           xfree (quoted_newlink);
1419         }
1420     }
1421   /* Output the rest of the file. */
1422   if (p - fm->content < fm->length)
1423     fwrite (p, 1, fm->length - (p - fm->content), fp);
1424   fclose (fp);
1425   read_file_free (fm);
1426   logputs (LOG_VERBOSE, _("done.\n"));
1427 }
1428
/* Return, in freshly allocated storage, the relative link that leads
   from the local file S1 (the referring document) to the local file
   S2 (the referred one).

   Examples:
     S1 "jagor.srce.hr/index.html", S2 "jagor.srce.hr/images/news.gif"
       -> "images/news.gif"
     S1 "fly.cc.fer.hr/ioccc/index.html", S2 "fly.cc.fer.hr/images/fly.gif"
       -> "../images/fly.gif"

   If S2 begins with '/', a plain copy of S2 is returned.  Otherwise
   S1 must not begin with '/' either (this is asserted).  S1 should
   not contain components such as "..":
   construct_relative ("fly/ioccc/../index.html", "fly/images/fly.gif")
   will fail.  (A workaround is to run something like path_simplify()
   on S1 first.)  */
static char *
construct_relative (const char *s1, const char *s2)
{
  int pos, common, ups, k;
  const char *rest;
  char *rel, *out;

  if (*s2 == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');

  /* Step over the leading directories the two names share.  COMMON
     ends up just past the last shared '/'.  */
  pos = common = 0;
  for (;;)
    {
      while (s1[pos] && s1[pos] == s2[pos] && s1[pos] != '/')
        ++pos;
      if (s1[pos] != '/' || s2[pos] != '/')
        break;
      common = ++pos;
    }

  /* Each remaining slash in S1 is one directory level to climb.  */
  ups = 0;
  for (rest = s1 + pos; *rest; rest++)
    if (*rest == '/')
      ++ups;

  /* Result: "../" repeated UPS times, then the unshared part of S2.  */
  rel = (char *)xmalloc (3 * ups + strlen (s2 + common) + 1);
  out = rel;
  for (k = 0; k < ups; k++, out += 3)
    memcpy (out, "../", 3);
  strcpy (out, s2 + common);
  return rel;
}
1482 \f
1483 /* Add URL to the head of the list L.  */
1484 urlpos *
1485 add_url (urlpos *l, const char *url, const char *file)
1486 {
1487   urlpos *t;
1488
1489   t = (urlpos *)xmalloc (sizeof (urlpos));
1490   memset (t, 0, sizeof (*t));
1491   t->url = xstrdup (url);
1492   t->local_name = xstrdup (file);
1493   t->next = l;
1494   return t;
1495 }
1496
1497 static void
1498 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1499 {
1500   /* Rather than just writing over the original .html file with the
1501      converted version, save the former to *.orig.  Note we only do
1502      this for files we've _successfully_ downloaded, so we don't
1503      clobber .orig files sitting around from previous invocations. */
1504
1505   /* Construct the backup filename as the original name plus ".orig". */
1506   size_t         filename_len = strlen(file);
1507   char*          filename_plus_orig_suffix;
1508   boolean        already_wrote_backup_file = FALSE;
1509   slist*         converted_file_ptr;
1510   static slist*  converted_files = NULL;
1511
1512   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1513     {
1514       /* Just write "orig" over "html".  We need to do it this way
1515          because when we're checking to see if we've downloaded the
1516          file before (to see if we can skip downloading it), we don't
1517          know if it's a text/html file.  Therefore we don't know yet
1518          at that stage that -E is going to cause us to tack on
1519          ".html", so we need to compare vs. the original URL plus
1520          ".orig", not the original URL plus ".html.orig". */
1521       filename_plus_orig_suffix = alloca (filename_len + 1);
1522       strcpy(filename_plus_orig_suffix, file);
1523       strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1524     }
1525   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1526     {
1527       /* Append ".orig" to the name. */
1528       filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1529       strcpy(filename_plus_orig_suffix, file);
1530       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1531     }
1532
1533   /* We can get called twice on the same URL thanks to the
1534      convert_all_links() call in main().  If we write the .orig file
1535      each time in such a case, it'll end up containing the first-pass
1536      conversion, not the original file.  So, see if we've already been
1537      called on this file. */
1538   converted_file_ptr = converted_files;
1539   while (converted_file_ptr != NULL)
1540     if (strcmp(converted_file_ptr->string, file) == 0)
1541       {
1542         already_wrote_backup_file = TRUE;
1543         break;
1544       }
1545     else
1546       converted_file_ptr = converted_file_ptr->next;
1547
1548   if (!already_wrote_backup_file)
1549     {
1550       /* Rename <file> to <file>.orig before former gets written over. */
1551       if (rename(file, filename_plus_orig_suffix) != 0)
1552         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1553                    file, filename_plus_orig_suffix, strerror (errno));
1554
1555       /* Remember that we've already written a .orig backup for this file.
1556          Note that we never free this memory since we need it till the
1557          convert_all_links() call, which is one of the last things the
1558          program does before terminating.  BTW, I'm not sure if it would be
1559          safe to just set 'converted_file_ptr->string' to 'file' below,
1560          rather than making a copy of the string...  Another note is that I
1561          thought I could just add a field to the urlpos structure saying
1562          that we'd written a .orig file for this URL, but that didn't work,
1563          so I had to make this separate list.
1564          -- Dan Harkless <wget@harkless.org>
1565
1566          This [adding a field to the urlpos structure] didn't work
1567          because convert_file() is called twice: once after all its
1568          sublinks have been retrieved in recursive_retrieve(), and
1569          once at the end of the day in convert_all_links().  The
1570          original linked list collected in recursive_retrieve() is
1571          lost after the first invocation of convert_links(), and
1572          convert_all_links() makes a new one (it calls get_urls_html()
1573          for each file it covers.)  That's why your first approach didn't
1574          work.  The way to make it work is perhaps to make this flag a
1575          field in the `urls_html' list.
1576          -- Hrvoje Niksic <hniksic@arsdigita.com>
1577       */
1578       converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1579       converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
1580       converted_file_ptr->next = converted_files;
1581       converted_files = converted_file_ptr;
1582     }
1583 }
1584
1585 static int find_fragment PARAMS ((const char *, int, const char **,
1586                                   const char **));
1587
/* Write a replacement for the attribute value that starts at *PP and
   spans RAW_SIZE characters, then advance *PP past the old value.

   The old value is either quoted ("...old-contents..." or
   '...old-contents...', with RAW_SIZE counting both quotes) or bare
   (...old-contents...).  The output is always quoted: with the
   original quote character when there was one, with '"' otherwise.
   Between the quotes go NEW_STR and, if the old value contained a
   fragment identifier (as located by find_fragment), that fragment
   copied from the old value.  */
static void
replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
{
  const char *cursor = *pp;
  const char *frag_start, *frag_stop;
  int quoted = (*cursor == '\"' || *cursor == '\'');
  char delim = quoted ? *cursor : '\"';
  /* Length of the contents proper: the opening and closing quotes
     are not part of it.  */
  int inner = quoted ? raw_size - 2 : raw_size;

  if (quoted)
    ++cursor;

  putc (delim, fp);
  fputs (new_str, fp);
  /* Carry the old fragment identifier over, if there is one.  */
  if (find_fragment (cursor, inner, &frag_start, &frag_stop))
    fwrite (frag_start, 1, frag_stop - frag_start, fp);
  putc (delim, fp);

  /* Step over the old contents and, when present, the closing quote.  */
  *pp = cursor + inner + (quoted ? 1 : 0);
}
1623
/* Look through [BEG, BEG+SIZE) for the first '#' that is not
   immediately preceded by '&'.  When one is found, store its address
   in *BP and the end of the range in *EP and return 1.  Otherwise
   return 0 and leave *BP and *EP alone.

   This is used for finding the fragment identifiers in URLs.  */

static int
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *limit = beg + size;
  const char *cur;
  int after_amp = 0;            /* was the previous character '&'? */

  for (cur = beg; cur < limit; cur++)
    {
      if (*cur == '#' && !after_amp)
        {
          *bp = cur;
          *ep = limit;
          return 1;
        }
      after_amp = (*cur == '&');
    }
  return 0;
}
1657
/* Node of the singly linked list in which downloaded_file keeps its
   records, one node per local file name.  */
typedef struct _downloaded_file_list {
  /* Local file name; a private copy made with xstrdup and released
     by downloaded_files_free.  */
  char*                          file;
  /* The mode that downloaded_file was called with when this file was
     recorded.  */
  downloaded_file_t              download_type;
  /* Next record, or NULL at the end of the list.  */
  struct _downloaded_file_list*  next;
} downloaded_file_list;

/* Head of the list of recorded files; new records are pushed onto
   the front.  */
static downloaded_file_list *downloaded_files;
1665
1666 /* Remembers which files have been downloaded.  In the standard case, should be
1667    called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1668    download successfully (i.e. not for ones we have failures on or that we skip
1669    due to -N).
1670
1671    When we've downloaded a file and tacked on a ".html" extension due to -E,
1672    call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1673    FILE_DOWNLOADED_NORMALLY.
1674
1675    If you just want to check if a file has been previously added without adding
1676    it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
1677    with local filenames, not remote URLs. */
1678 downloaded_file_t
1679 downloaded_file (downloaded_file_t  mode, const char*  file)
1680 {
1681   boolean                       found_file = FALSE;
1682   downloaded_file_list*         rover = downloaded_files;
1683
1684   while (rover != NULL)
1685     if (strcmp(rover->file, file) == 0)
1686       {
1687         found_file = TRUE;
1688         break;
1689       }
1690     else
1691       rover = rover->next;
1692
1693   if (found_file)
1694     return rover->download_type;  /* file had already been downloaded */
1695   else
1696     {
1697       if (mode != CHECK_FOR_FILE)
1698         {
1699           rover = xmalloc(sizeof(*rover));
1700           rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1701           rover->download_type = mode;
1702           rover->next = downloaded_files;
1703           downloaded_files = rover;
1704         }
1705
1706       return FILE_NOT_ALREADY_DOWNLOADED;
1707     }
1708 }
1709
1710 void
1711 downloaded_files_free (void)
1712 {
1713   downloaded_file_list*         rover = downloaded_files;
1714   while (rover)
1715     {
1716       downloaded_file_list *next = rover->next;
1717       xfree (rover->file);
1718       xfree (rover);
1719       rover = next;
1720     }
1721 }
1722 \f
/* Initialization of static stuff.  Currently this only calls
   init_unsafe_char_table, the routine that initializes the table of
   unsafe characters consulted through UNSAFE_CHAR.  */
void
url_init (void)
{
  init_unsafe_char_table ();
}