2 Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
30 #include <sys/types.h>
/* Default port definitions.  These are used when a URL does not name
   a port explicitly (see the sup_protos table below). */
#define DEFAULT_HTTP_PORT 80
#define DEFAULT_FTP_PORT 21
#define DEFAULT_HTTPS_PORT 443
/* Table of unsafe chars.  This is initialized in
   init_unsafe_char_table. */
static char unsafe_char_table[256];

/* Nonzero iff character C must be %XX-escaped in a URL.  C is cast to
   unsigned char so that negative char values index the table safely. */
#define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)])
/* If S contains unsafe characters, free it and replace it with a
   version that doesn't (uc_tmp holds the malloc-ed, %XX-encoded copy
   produced by encode_string). */
#define URL_CLEANSE(s) do \
  if (contains_unsafe (s)) \
    char *uc_tmp = encode_string (s); \
/* Is a directory "."? */
#define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
/* Is a directory ".."? */
#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))

/* Forward declarations of file-local helpers defined further below. */
static void path_simplify_with_kludge PARAMS ((char *));
static int urlpath_length PARAMS ((const char *));
/* NULL-terminated list of strings to be recognized as prototypes (URL
   schemes).  Note that recognized doesn't mean supported -- only HTTP,
   HTTPS and FTP are currently supported.

   However, a string that does not match anything in the list will be
   considered a relative URL.  Thus it's important that this list has
   anything anyone could think of being legal.

   There are wild things here.  :-) Take a look at
   <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more. */
static char *protostrings[] =
/* Similar to former, but for supported protocols.  Each entry pairs a
   scheme prefix with its protocol id and default port; the table is
   searched linearly by prefix or by id throughout this file. */
static struct proto sup_protos[] =
  { "http://", URLHTTP, DEFAULT_HTTP_PORT },
  { "https://",URLHTTPS, DEFAULT_HTTPS_PORT},
  { "ftp://", URLFTP, DEFAULT_FTP_PORT },
  /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
/* Forward declarations for the URL-parsing helpers defined below. */
static void parse_dir PARAMS ((const char *, char **, char **));
static uerr_t parse_uname PARAMS ((const char *, char **, char **));
static char *construct PARAMS ((const char *, const char *, int , int));
static char *construct_relative PARAMS ((const char *, const char *));
static char process_ftp_type PARAMS ((char *));
/* Returns the number of characters to be skipped if the first thing
   in a URL is URL: (which is 0 or 4+).  The optional spaces after
   URL: are also skipped. */
skip_url (const char *url)
  /* Case-insensitive match of the "URL:" prefix. */
  if (TOUPPER (url[0]) == 'U'
      && TOUPPER (url[1]) == 'R'
      && TOUPPER (url[2]) == 'L'
    /* Skip any whitespace that follows "URL:". */
    for (i = 4; url[i] && ISSPACE (url[i]); i++);
   - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
   - @ and :, for user/password encoding.
   - everything over 127 (but we don't bother with recording those.) */
init_unsafe_char_table (void)
  /* Mark control characters and everything at or above DEL as unsafe. */
  for (i = 0; i < 256; i++)
    if (i < 32 || i >= 127
      unsafe_char_table[i] = 1;
/* Returns 1 if the string contains unsafe characters, 0 otherwise.
   Uses the UNSAFE_CHAR lookup table. */
contains_unsafe (const char *s)
  if (UNSAFE_CHAR (*s))
/* Decodes the forms %xy in a URL to the character the hexadecimal
   code of which is xy.  xy are hexadecimal digits from
   [0123456789ABCDEF] (case-insensitive).  If x or y are not
   hex-digits or `%' precedes `\0', the sequence is kept as-is.
   Works destructively, in place. */
decode_string (char *s)
      /* Do nothing if at the end of the string, or if the chars
         are not hex-digits. */
      if (!*(s + 1) || !*(s + 2)
          || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
      /* Combine the two hex digits into one byte. */
      *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
/* Encode the unsafe characters (as determined by URL_UNSAFE) in a
   given string, returning a malloc-ed %XX encoded string.  A first
   pass computes the required length, a second pass writes the
   escapes. */
encode_string (const char *s)
  /* Pass 1: measure.  Each unsafe char expands to three chars (%XX). */
  for (i = 0; *s; s++, i++)
    if (UNSAFE_CHAR (*s))
      i += 2; /* Two more characters (hex digits) */
  res = (char *)xmalloc (i + 1);
  /* Pass 2: copy, escaping unsafe bytes. */
  for (p = res; *s; s++)
    if (UNSAFE_CHAR (*s))
        const unsigned char c = *s;
        *p++ = HEXD2ASC (c >> 4);
        *p++ = HEXD2ASC (c & 0xf);
/* Returns the proto-type if URL's protocol is supported, or
   URLUNKNOWN if not. */
urlproto (const char *url)
  url += skip_url (url);
  /* Try each supported scheme prefix first. */
  for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
    if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
      return sup_protos[i].ind;
  /* No known prefix: scan to the first `:' or `/'. */
  for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
  /* Heuristic: if what follows the colon is all digits up to a `/',
     it looks like host:port. */
  for (++i; url[i] && url[i] != '/'; i++)
    if (!ISDIGIT (url[i]))
  if (url[i - 1] == ':')
/* Skip the protocol part of the URL, e.g. `http://'.  If no protocol
   part is found, returns 0. */
skip_proto (const char *url)
  /* Linear search through the recognized scheme strings. */
  for (s = protostrings; *s; s++)
    if (!strncasecmp (*s, url, strlen (*s)))
  /* HTTP and FTP protocols are expected to yield exact host names
     (i.e. the `//' part must be skipped, too). */
  if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
/* Returns 1 if the URL begins with a protocol (supported or
   unsupported), 0 otherwise. */
has_proto (const char *url)
  url += skip_url (url);
  /* Any match in protostrings counts, even unsupported schemes. */
  for (s = protostrings; *s; s++)
    if (strncasecmp (url, *s, strlen (*s)) == 0)
/* Skip the username and password, if present here.  The function
   should be called *not* with the complete URL, but with the part
   right after the protocol.

   If no username and password are found, return 0. */
skip_uname (const char *url)
  /* Scan up to the first `/' (end of the authority part). */
  for (p = url; *p && *p != '/'; p++)
  /* If a `@' was found before the first occurrence of `/', skip
     everything up to and including it. */
/* Allocate a new urlinfo structure, fill it with default values and
   return a pointer to it.  All fields start zeroed; the protocol is
   explicitly marked unknown. */
u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
memset (u, 0, sizeof (*u));
u->proto = URLUNKNOWN;
/* Perform a "deep" free of the urlinfo structure.  The structure
   should have been created with newurl, but need not have been used.
   If free_pointer is non-0, free the pointer itself. */
freeurl (struct urlinfo *u, int complete)
  /* FREE_MAYBE tolerates NULL members, so an unused structure is
     safe to free. */
  FREE_MAYBE (u->host);
  FREE_MAYBE (u->path);
  FREE_MAYBE (u->file);
  FREE_MAYBE (u->user);
  FREE_MAYBE (u->passwd);
  FREE_MAYBE (u->local);
  FREE_MAYBE (u->referer);
  /* Recursively free the associated proxy urlinfo, if any. */
  freeurl (u->proxy, 1);
/* Extract the given URL of the form
   (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
   1. hostname (terminated with `/' or `:')
   2. port number (terminated with `/'), or chosen for the protocol
   3. dirname (everything after hostname)
   Most errors are handled.  No allocation is done, you must supply
   pointers to allocated memory.
   ...and a host of other stuff :-)

   - Recognizes hostname:dir/file for FTP and
     hostname (:portnum)?/dir/file for HTTP.
   - Parses the path to yield directory and file
   - Parses the URL to yield the username and passwd (if present)
   - Decodes the strings, in case they contain "forbidden" characters
   - Writes the result to struct urlinfo

   If the argument STRICT is set, it recognizes only the canonical
   form with an explicit scheme. */
parseurl (const char *url, struct urlinfo *u, int strict)
  int recognizable;            /* Recognizable URL is the one where
                                  the protocol name was explicitly
                                  named, i.e. it wasn't deduced from
                                  the URL's form. */
  DEBUGP (("parseurl (\"%s\") -> ", url));
  url += skip_url (url);
  recognizable = has_proto (url);
  if (strict && !recognizable)
  /* Look for a supported scheme prefix; L is its length on a match. */
  for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
      l = strlen (sup_protos[i].name);
      if (!strncasecmp (sup_protos[i].name, url, l))
  /* If protocol is recognizable, but unsupported, bail out, else
     assume one of the supported ones (guessed further below). */
  if (recognizable && i == ARRAY_SIZE (sup_protos))
  else if (i == ARRAY_SIZE (sup_protos))
    u->proto = type = sup_protos[i].ind;
  if (type == URLUNKNOWN)
  /* Allow a username and password to be specified (i.e. just skip
     them for now). */
    l += skip_uname (url + l);
  /* The hostname ends at the first `:' or `/'. */
  for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
  /* Get the hostname. */
  u->host = strdupdelim (url + l, url + i);
  DEBUGP (("host %s -> ", u->host));
  /* Assume no port has been given. */
  /* We have a colon delimiting the hostname.  It could mean that
     a port number is following it, or a directory. */
  if (ISDIGIT (url[++i]))    /* A port number */
      if (type == URLUNKNOWN)
        u->proto = type = URLHTTP;
      for (; url[i] && url[i] != '/'; i++)
        if (ISDIGIT (url[i]))
          u->port = 10 * u->port + (url[i] - '0');
      DEBUGP (("port %hu -> ", u->port));
  else if (type == URLUNKNOWN)  /* or a directory */
    u->proto = type = URLFTP;
  else                          /* or just a misformed port number */
  else if (type == URLUNKNOWN)
    u->proto = type = URLHTTP;
  /* Find the default port for the detected protocol. */
  for (ind = 0; ind < ARRAY_SIZE (sup_protos); ind++)
    if (sup_protos[ind].ind == type)
  if (ind == ARRAY_SIZE (sup_protos))
  u->port = sup_protos[ind].port;
  /* Some delimiter troubles... */
  if (url[i] == '/' && url[i - 1] != ':')
  /* Collapse the leading run of slashes before the path. */
  while (url[i] && url[i] == '/')
  u->path = (char *)xmalloc (strlen (url + i) + 8);
  strcpy (u->path, url + i);
  /* Strip and record a trailing `;type=X' FTP modifier, if present. */
  u->ftp_type = process_ftp_type (u->path);
  /* #### We don't handle type `d' correctly yet. */
  if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
  DEBUGP (("opath %s -> ", u->path));
  /* Parse the username and password (if existing). */
  parse_uname (url, &u->user, &u->passwd);
  /* Decode the strings, as per RFC 1738. */
  decode_string (u->host);
  decode_string (u->path);
  decode_string (u->user);
  decode_string (u->passwd);
  /* Parse the directory. */
  parse_dir (u->path, &u->dir, &u->file);
  DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
  /* Simplify the directory. */
  path_simplify (u->dir);
  /* Remove the leading `/' in HTTP. */
  if (type == URLHTTP && *u->dir == '/')
    strcpy (u->dir, u->dir + 1);
  DEBUGP (("ndir %s\n", u->dir));
  /* Strip trailing `/'. */
  if (l && u->dir[l - 1] == '/')
    u->dir[l - 1] = '\0';
  /* Re-create the path: an absolute FTP directory keeps its leading
     slash encoded as %2F so it survives later processing. */
  abs_ftp = (u->proto == URLFTP && *u->dir == '/');
  /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
     abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
  strcpy (u->path, abs_ftp ? "%2F" : "/");
  strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
  strcat (u->path, *u->dir ? "/" : "");
  strcat (u->path, u->file);
  URL_CLEANSE (u->path);
  DEBUGP (("newpath: %s\n", u->path));
  /* Create the clean URL. */
  u->url = str_url (u, 0);
/* Special versions of DOTP and DDOTP for parse_dir().  Unlike
   DOTP/DDOTP, a path component may be terminated either by `\0' or by
   `?' (the start of a query string). */
#define PD_DOTP(x) ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
/* BUGFIX: the second conjunct must test *((x) + 1), not re-test *(x);
   the original re-test made any component starting with `.' whose
   third character was `\0' or `?' (e.g. ".a") look like "..". */
#define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') \
                     && (!*((x) + 2) || *((x) + 2) == '?'))
/* Build the directory and filename components of the path.  Both
   components are *separately* malloc-ed strings!  It does not change
   the contents of path.

   If the path ends with "." or "..", they are (correctly) counted as
   directories. */
parse_dir (const char *path, char **dir, char **file)
  /* Only the part before any `?' counts as the path proper. */
  l = urlpath_length (path);
  /* Find the last `/' within that span. */
  for (i = l; i && path[i] != '/'; i--);
  if (!i && *path != '/')   /* Just filename */
      if (PD_DOTP (path) || PD_DDOTP (path))
          *dir = strdupdelim (path, path + l);
          *file = xstrdup (path + l); /* normally empty, but could
                                         contain a query */
          *dir = xstrdup ("");     /* This is required because of FTP */
          *file = xstrdup (path);
  else if (!i)                 /* /filename */
      if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
          *dir = strdupdelim (path, path + l);
          *file = xstrdup (path + l); /* normally empty, but could
                                         contain a query */
          *dir = xstrdup ("/");
          *file = xstrdup (path + 1);
  else /* Nonempty directory with or without a filename */
      if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
          *dir = strdupdelim (path, path + l);
          *file = xstrdup (path + l); /* normally empty, but could
                                         contain a query */
          *dir = strdupdelim (path, path + i);
          *file = xstrdup (path + i + 1);
/* Find the optional username and password within the URL, as per
   RFC1738.  The returned user and passwd char pointers are
   malloc-ed. */
parse_uname (const char *url, char **user, char **passwd)
  url += skip_url (url);
  /* Look for end of protocol string. */
  l = skip_proto (url);
  /* Add protocol offset. */
  /* Is there an `@' character? */
  for (p = url; *p && *p != '/'; p++)
  /* If not, return. */
  /* Else find the username and password.  COL marks where the
     password (or the whole userinfo) starts. */
  for (p = col = url; *p != '@'; p++)
      if (*p == ':' && !*user)
          /* Everything before the colon is the username. */
          *user = (char *)xmalloc (p - url + 1);
          memcpy (*user, url, p - url);
          (*user)[p - url] = '\0';
  /* Decide whether you have only the username or both. */
  where = *user ? passwd : user;
  *where = (char *)xmalloc (p - col + 1);
  memcpy (*where, col, p - col);
  (*where)[p - col] = '\0';
/* If PATH ends with `;type=X', return the character X, truncating the
   suffix off PATH in place. */
process_ftp_type (char *path)
  int len = strlen (path);
      && !memcmp (path + len - 7, ";type=", 6))
      /* Cut the suffix; the type character is still readable past the
         new terminator within the original buffer. */
      path[len - 7] = '\0';
      return path[len - 1];
/* Return the URL as fine-formed string, with a proper protocol, optional port
   number, directory and optional user/password.  If `hide' is non-zero (as it
   is when we're calling this on a URL we plan to print, but not when calling it
   to canonicalize a URL for use within the program), password will be hidden.
   The forbidden characters in the URL will be cleansed. */
str_url (const struct urlinfo *u, int hide)
  char *res, *host, *user, *passwd, *proto_name, *dir, *file;
  int i, l, ln, lu, lh, lp, lf, ld;
  unsigned short proto_default_port;
  /* Look for the protocol name. */
  for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
    if (sup_protos[i].ind == u->proto)
  if (i == ARRAY_SIZE (sup_protos))
  proto_name = sup_protos[i].name;
  proto_default_port = sup_protos[i].port;
  /* CLEANDUP escapes unsafe characters into fresh copies. */
  host = CLEANDUP (u->host);
  dir = CLEANDUP (u->dir);
  file = CLEANDUP (u->file);
  user = passwd = NULL;
  user = CLEANDUP (u->user);
  /* Don't output the password, or someone might see it over the user's
     shoulder (or in saved wget output).  Don't give away the number of
     characters in the password, either, as we did in past versions of
     this code, when we replaced the password characters with 'x's. */
  passwd = xstrdup("<password>");
  passwd = CLEANDUP (u->passwd);
  /* An absolute FTP directory gets its leading `/' re-encoded as %2F. */
  if (u->proto == URLFTP && *dir == '/')
      char *tmp = (char *)xmalloc (strlen (dir) + 3);
      /*sprintf (tmp, "%%2F%s", dir + 1);*/
      strcpy (tmp + 3, dir + 1);
  /* Compute the component lengths, then assemble piecewise. */
  ln = strlen (proto_name);
  lu = user ? strlen (user) : 0;
  lp = passwd ? strlen (passwd) : 0;
  res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
  /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
     (user ? user : ""), (passwd ? ":" : ""),
     (passwd ? passwd : ""), (user ? "@" : ""),
     host, u->port, dir, *dir ? "/" : "", file); */
  memcpy (res, proto_name, ln);
  memcpy (res + l, user, lu);
  memcpy (res + l, passwd, lp);
  memcpy (res + l, host, lh);
  /* Only print the port when it differs from the protocol default. */
  if (u->port != proto_default_port)
      long_to_string (res + l, (long)u->port);
      l += numdigit (u->port);
  memcpy (res + l, dir, ld);
  strcpy (res + l, file);
/* Check whether two URL-s are equivalent, i.e. pointing to the same
   location.  Uses parseurl to parse them, and compares the canonical
   forms.

   Returns 1 if the URL1 is equivalent to URL2, 0 otherwise.  Also
   return 0 on error. */
url_equal (const char *url1, const char *url2)
  struct urlinfo *u1, *u2;
  err = parseurl (url1, u1, 0);
  err = parseurl (url2, u2, 0);
  /* Compare the canonicalized string forms produced by parseurl. */
  res = !strcmp (u1->url, u2->url);
/* Build a linked list of urlpos entries from FILE, one URL per line.
   Leading and trailing whitespace on each line is stripped; empty
   lines produce no entry. */
get_urls_file (const char *file)
  struct file_memory *fm;
  const char *text, *text_end;
  fm = read_file (file);
      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
  DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
  text_end = fm->content + fm->length;
  while (text < text_end)
      const char *line_beg = text;
      const char *line_end = memchr (text, '\n', text_end - text);
      /* Trim leading whitespace... */
      while (line_beg < line_end
             && ISSPACE (*line_beg))
      /* ...and trailing whitespace. */
      while (line_end > line_beg + 1
             && ISSPACE (*(line_end - 1)))
      if (line_end > line_beg)
          urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
          memset (entry, 0, sizeof (*entry));
          entry->url = strdupdelim (line_beg, line_end);
/* Free the linked list of urlpos.  NEXT is saved before each node is
   released. */
free_urlpos (urlpos *l)
  urlpos *next = l->next;
  FREE_MAYBE (l->local_name);
/* Rotate FNAME opt.backups times: FNAME.(i-1) is renamed to FNAME.i
   for i = opt.backups down to 2, then FNAME itself becomes FNAME.1. */
rotate_backups(const char *fname)
  /* Room for "FNAME.<digits>\0". */
  int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
  char *from = (char *)alloca (maxlen);
  char *to = (char *)alloca (maxlen);
  /* Only rotate regular files. */
  if (stat (fname, &sb) == 0)
    if (S_ISREG (sb.st_mode) == 0)
  for (i = opt.backups; i > 1; i--)
      sprintf (from, "%s.%d", fname, i - 1);
      sprintf (to, "%s.%d", fname, i);
      /* #### This will fail on machines without the rename() system
         call. */
  sprintf (to, "%s.%d", fname, 1);
/* Create all the necessary directories for PATH (a file).  Calls
   mkdirhier() internally. */
mkalldirs (const char *path)
  /* Find the last `/', i.e. the end of the directory portion. */
  p = path + strlen (path);
  for (; *p != '/' && p != path; p--);
  /* Don't create if it's just a file. */
  if ((p == path) && (*p != '/'))
  t = strdupdelim (path, p);
  /* Check whether the directory exists. */
  if ((stat (t, &st) == 0))
      if (S_ISDIR (st.st_mode))
      /* If the dir exists as a file name, remove it first.  This
         is *only* for Wget to work with buggy old CERN http
         servers.  Here is the scenario: When Wget tries to
         retrieve a directory without a slash, e.g.
         http://foo/bar (bar being a directory), CERN server will
         not redirect it to http://foo/bar/ -- it will generate a
         directory listing containing links to bar/file1,
         bar/file2, etc.  Wget will lose because it saves this
         HTML listing to a file `bar', so it cannot create the
         directory.  To work around this, if the file of the same
         name exists, we just remove it and create the directory. */
      DEBUGP (("Removing %s because of directory danger!\n", t));
  res = make_directory (t);
    logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
/* Count the occurrences of `/' in S (used by mkstruct when cutting
   leading directory components). */
count_slashes (const char *s)
/* Return the path name of the URL-equivalent file name, with a
   remote-like structure of directories.  The result is malloc-ed. */
mkstruct (const struct urlinfo *u)
  char *host, *dir, *file, *res, *dirpref;
  assert (u->dir != NULL);
  assert (u->host != NULL);
  /* Honor --cut-dirs: skip up to opt.cut_dirs leading directory
     components of u->dir. */
  char *ptr = u->dir + (*u->dir == '/');
  int slash_count = 1 + count_slashes (ptr);
  int cut = MINVAL (opt.cut_dirs, slash_count);
  for (; cut && *ptr; ptr++)
  STRDUP_ALLOCA (dir, ptr);
  dir = u->dir + (*u->dir == '/');
  host = xstrdup (u->host);
  /* Check for the true name (or at least a consistent name for saving
     to directory) of HOST, reusing the hlist if possible. */
  if (opt.add_hostdir && !opt.simple_check)
      char *nhost = realhost (host);
  /* Add dir_prefix and hostname (if required) to the beginning of
     dir. */
  if (!DOTP (opt.dir_prefix))
      dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
                                + strlen (host) + 1);
      sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
    STRDUP_ALLOCA (dirpref, host);
  else /* not add_hostdir */
      if (!DOTP (opt.dir_prefix))
        dirpref = opt.dir_prefix;
  /* If there is a prefix, prepend it. */
      char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
      sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
  dir = xstrdup (dir);
  /* Drop a trailing slash from the directory part. */
  if (l && dir[l - 1] == '/')
  /* A URL with no file component gets the conventional index name. */
  file = "index.html";
  /* Finally, construct the full name. */
  res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
  sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
/* Create a unique filename, corresponding to a given URL.  Calls
   mkstruct if necessary.  Does *not* actually create any directories. */
url_filename (const struct urlinfo *u)
  int have_prefix = 0;  /* whether we must prepend opt.dir_prefix */
  file = mkstruct (u);
  /* Without -x, derive the name from the URL's file part, falling
     back to "index.html" when it is empty. */
  file = xstrdup ("index.html");
  file = xstrdup (u->file);
  /* Check whether the prefix directory is something other than "."
     before prepending it. */
  if (!DOTP (opt.dir_prefix))
      char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
                                     + 1 + strlen (file) + 1);
      sprintf (nfile, "%s/%s", opt.dir_prefix, file);
  /* DOS-ish file systems don't like `%' signs in them; we change it
     to an underscore. */
  for (p = file; *p; p++)
#endif /* WINDOWS */
  /* Check the cases in which the unique extensions are not used:
     1) Clobbering is turned off (-nc).
     2) Retrieval with regetting.
     3) Timestamping is used.
     4) Hierarchy is built.

     The exception is the case when file does exist and is a
     directory (actually support for bad httpd-s). */
  if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
      && !(file_exists_p (file) && !file_non_directory_p (file)))
  /* Find a unique name. */
  name = unique_name (file);
/* Like strlen(), but the path is considered terminated by `?' as well
   as by `\0'. */
urlpath_length (const char *url)
  const char *q = strchr (url, '?');
  return strlen (url);
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is almost completely equivalent to
   { *e = '\0'; return strrchr(b, c); }, except that it doesn't change
   the contents of the string. */
find_last_char (const char *b, const char *e, char c)
/* Construct a URL by concatenating an absolute URL and a path, which
   may or may not be absolute.  This tries to behave "reasonably" in
   all foreseeable cases.  It employs little specific knowledge about
   protocols or URL-specific stuff -- it just works on strings. */
construct (const char *url, const char *sub, int subsize, int no_proto)
  /* END bounds the path part of URL (a trailing query is excluded). */
  const char *end = url + urlpath_length (url);
      /* SUB is a relative URL: we need to replace everything
         after last slash (possibly empty) with SUB.

         So, if URL is "whatever/foo/bar", and SUB is "qux/xyzzy",
         our result should be "whatever/foo/qux/xyzzy". */
      int need_explicit_slash = 0;
      const char *start_insert;
      const char *last_slash = find_last_char (url, end, '/'); /* the last slash. */
          /* No slash found at all.  Append SUB to what we have,
             but we'll need a slash as a separator.

             Example: if url == "foo" and sub == "qux/xyzzy", then
             we cannot just append sub to url, because we'd get
             "fooqux/xyzzy", whereas what we want is
             "foo/qux/xyzzy".

             To make sure the / gets inserted, we set
             need_explicit_slash to 1.  We also set start_insert
             to end + 1, so that the length calculations work out
             correctly for one more (slash) character.  Accessing
             that character is fine, since it will be the
             delimiter, '\0' or '?'. */
          /* example: "foo?..." */
          /*               ^    ('?' gets changed to '/') */
          start_insert = end + 1;
          need_explicit_slash = 1;
      else if (last_slash && last_slash != url && *(last_slash - 1) == '/')
          /* example: http://host" */
          start_insert = end + 1;
          need_explicit_slash = 1;
          /* example: "whatever/foo/bar" */
          start_insert = last_slash + 1;
      span = start_insert - url;
      constr = (char *)xmalloc (span + subsize + 1);
      memcpy (constr, url, span);
      if (need_explicit_slash)
        constr[span - 1] = '/';
      memcpy (constr + span, sub, subsize);
      constr[span + subsize] = '\0';
  else /* *sub == `/' */
      /* SUB is an absolute path: we need to replace everything
         after (and including) the FIRST slash with SUB.

         So, if URL is "http://host/whatever/foo/bar", and SUB is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy". */
      const char *start_insert = NULL; /* for gcc to shut up. */
      const char *pos = url;
      int seen_slash_slash = 0;
      /* We're looking for the first slash, but want to ignore
         the double slash of the scheme separator. */
      slash = memchr (pos, '/', end - pos);
      if (slash && !seen_slash_slash)
        if (*(slash + 1) == '/')
            seen_slash_slash = 1;
      /* At this point, SLASH is the location of the first / after
         "//", or the first slash altogether.  START_INSERT is the
         pointer to the location where SUB will be inserted.  When
         examining the last two examples, keep in mind that SUB
         begins with '/'. */
      if (!slash && !seen_slash_slash)
        /* example: "foo" */
      else if (!slash && seen_slash_slash)
        /* example: "http://foo" */
      else if (slash && !seen_slash_slash)
        /* example: "foo/bar" */
      else if (slash && seen_slash_slash)
        /* example: "http://something/" */
        start_insert = slash;
      span = start_insert - url;
      constr = (char *)xmalloc (span + subsize + 1);
      memcpy (constr, url, span);
      memcpy (constr + span, sub, subsize);
      constr[span + subsize] = '\0';
  else /* !no_proto */
      /* SUB is itself a full URL: just duplicate it. */
      constr = strdupdelim (sub, sub + subsize);
/* Like the function above, but with a saner caller interface. */
url_concat (const char *base_url, const char *new_url)
  return construct (base_url, new_url, strlen (new_url), !has_proto (new_url));
/* Optimize URL by host, destructively replacing u->host with realhost
   (u->host).  Do this regardless of opt.simple_check. */
opt_url (struct urlinfo *u)
  /* Find the "true" host. */
  char *host = realhost (u->host);
  assert (u->dir != NULL);  /* the URL must have been parsed */
  /* Refresh the printed representation. */
  u->url = str_url (u, 0);
/* This beautiful kludge is fortunately not needed, as I've made
   parse_dir do the (almost) right thing, so that a query can never
   become a part of directory. */
/* Call path_simplify, but make sure that the part after the
   question-mark, if any, is not destroyed by path_simplify's
   in-place rewriting. */
path_simplify_with_kludge (char *path)
  char *query = strchr (path, '?');
  /* path_simplify also works destructively, so we also have the
     license to write. */
  path_simplify (path);
  /* Move the preserved query back to just after the simplified path. */
  char *newend = path + strlen (path);
  if (newend != query)
    memmove (newend, query, strlen (query) + 1);
/* Returns proxy host address, in accordance with PROTO.  The
   command-line option takes precedence over the environment
   variable of the same name. */
getproxy (uerr_t proto)
  if (proto == URLHTTP)
    return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
  else if (proto == URLFTP)
    return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
  else if (proto == URLHTTPS)
    return opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
#endif /* HAVE_SSL */
/* Should a host be accessed through proxy, concerning no_proxy?
   Returns 0 when HOST matches a no_proxy suffix. */
no_proxy_match (const char *host, const char **no_proxy)
  return !sufmatch (no_proxy, host);

/* Helpers for convert_links, defined below. */
static void write_backup_file PARAMS ((const char *, downloaded_file_t));
static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
/* Change the links in an HTML document.  Accepts a structure that
   defines the positions of all the links. */
convert_links (const char *file, urlpos *l)
  struct file_memory *fm;
  downloaded_file_t downloaded_file_return;
  logprintf (LOG_VERBOSE, _("Converting %s... "), file);
  /* First we do a "dry run": go through the list L and see whether
     any URL needs to be converted in the first place.  If not, just
     leave the file alone. */
  for (dry = l; dry; dry = dry->next)
    if (dry->convert != CO_NOCONVERT)
      logputs (LOG_VERBOSE, _("nothing to do.\n"));
  fm = read_file (file);
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
  downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
  if (opt.backup_converted && downloaded_file_return)
    write_backup_file (file, downloaded_file_return);
  /* Before opening the file for writing, unlink the file.  This is
     important if the data in FM is mmaped.  In such case, nulling the
     file, which is what fopen() below does, would make us read all
     zeroes from the mmaped region. */
  if (unlink (file) < 0 && errno != ENOENT)
      logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
                 file, strerror (errno));
      read_file_free (fm);
  /* Now open the file for writing. */
  fp = fopen (file, "wb");
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
      read_file_free (fm);
  /* Here we loop through all the URLs in file, replacing those of
     them that are downloaded with relative references. */
  for (; l; l = l->next)
      char *url_start = fm->content + l->pos;
      /* Sanity check: the recorded position must lie within the file. */
      if (l->pos >= fm->length)
          DEBUGP (("Something strange is going on. Please investigate."));
      /* If the URL is not to be converted, skip it. */
      if (l->convert == CO_NOCONVERT)
          DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
      /* Echo the file contents, up to the offending URL's opening
         quote, to the outfile. */
      fwrite (p, 1, url_start - p, fp);
      if (l->convert == CO_CONVERT_TO_RELATIVE)
          /* Convert absolute URL to relative. */
          char *newname = construct_relative (file, l->local_name);
          char *quoted_newname = html_quote_string (newname);
          replace_attr (&p, l->size, fp, quoted_newname);
          DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
                   l->url, newname, l->pos, file));
          xfree (quoted_newname);
      else if (l->convert == CO_CONVERT_TO_COMPLETE)
          /* Convert the link to absolute URL. */
          char *newlink = l->url;
          char *quoted_newlink = html_quote_string (newlink);
          replace_attr (&p, l->size, fp, quoted_newlink);
          DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
                   newlink, l->pos, file));
          xfree (quoted_newlink);
  /* Output the rest of the file. */
  if (p - fm->content < fm->length)
    fwrite (p, 1, fm->length - (p - fm->content), fp);
  read_file_free (fm);
  logputs (LOG_VERBOSE, _("done.\n"));
/* Construct and return a malloced copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.

   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   "images/news.gif".

   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1). */
construct_relative (const char *s1, const char *s2)
  int i, cnt, sepdirs1;
  return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't. */
  assert (*s1 != '/');
  /* Skip the directories common to both strings. */
  while (s1[i] && s2[i]
  if (s1[i] == '/' && s2[i] == '/')
  /* SEPDIRS1 counts the directories of S1 below the common prefix;
     each needs one "../" in the result. */
  for (sepdirs1 = 0; s1[i]; i++)
  /* Now, construct the file as of:
     - ../ repeated sepdirs1 time
     - all the non-mutual directories of S2. */
  res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
  for (i = 0; i < sepdirs1; i++)
    memcpy (res + 3 * i, "../", 3);
  strcpy (res + 3 * i, s2 + cnt);
/* Add URL to the head of the list L, associating it with the local
   FILE name.  The new node is zero-initialized before the copies of
   URL and FILE are stored. */
add_url (urlpos *l, const char *url, const char *file)
  t = (urlpos *)xmalloc (sizeof (urlpos));
  memset (t, 0, sizeof (*t));
  t->url = xstrdup (url);
  t->local_name = xstrdup (file);
/* Back up FILE as FILE.orig (or, when -E appended ".html", with the
   trailing "html" overwritten by "orig") before link conversion
   rewrites it.  A static list of already-backed-up files prevents a
   second pass from clobbering the true original.
   NOTE(review): this extracted chunk is missing lines (braces and
   loop bodies); do not re-flow it blind.  */
1502 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1504 /* Rather than just writing over the original .html file with the
1505 converted version, save the former to *.orig. Note we only do
1506 this for files we've _successfully_ downloaded, so we don't
1507 clobber .orig files sitting around from previous invocations. */
1509 /* Construct the backup filename as the original name plus ".orig". */
1510 size_t filename_len = strlen(file);
1511 char* filename_plus_orig_suffix;
1512 boolean already_wrote_backup_file = FALSE;
1513 slist* converted_file_ptr;
/* Persists across calls; intentionally never freed (see comment
   near the end of this function).  */
1514 static slist* converted_files = NULL;
1516 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1518 /* Just write "orig" over "html". We need to do it this way
1519 because when we're checking to see if we've downloaded the
1520 file before (to see if we can skip downloading it), we don't
1521 know if it's a text/html file. Therefore we don't know yet
1522 at that stage that -E is going to cause us to tack on
1523 ".html", so we need to compare vs. the original URL plus
1524 ".orig", not the original URL plus ".html.orig". */
/* NOTE(review): the "- 4" below assumes FILE ends in "html"
   (presumably guaranteed because -E just added that extension); a
   shorter name would write before the buffer -- confirm caller.  */
1525 filename_plus_orig_suffix = alloca (filename_len + 1);
1526 strcpy(filename_plus_orig_suffix, file);
1527 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1529 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1531 /* Append ".orig" to the name. */
/* sizeof(".orig") includes the terminating NUL.  */
1532 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1533 strcpy(filename_plus_orig_suffix, file);
1534 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1537 /* We can get called twice on the same URL thanks to the
1538 convert_all_links() call in main(). If we write the .orig file
1539 each time in such a case, it'll end up containing the first-pass
1540 conversion, not the original file. So, see if we've already been
1541 called on this file. */
/* Linear scan of the already-backed-up list.  */
1542 converted_file_ptr = converted_files;
1543 while (converted_file_ptr != NULL)
1544 if (strcmp(converted_file_ptr->string, file) == 0)
1546 already_wrote_backup_file = TRUE;
1550 converted_file_ptr = converted_file_ptr->next;
1552 if (!already_wrote_backup_file)
1554 /* Rename <file> to <file>.orig before former gets written over. */
/* A failed backup is logged but deliberately not fatal.  */
1555 if (rename(file, filename_plus_orig_suffix) != 0)
1556 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1557 file, filename_plus_orig_suffix, strerror (errno));
1559 /* Remember that we've already written a .orig backup for this file.
1560 Note that we never free this memory since we need it till the
1561 convert_all_links() call, which is one of the last things the
1562 program does before terminating. BTW, I'm not sure if it would be
1563 safe to just set 'converted_file_ptr->string' to 'file' below,
1564 rather than making a copy of the string... Another note is that I
1565 thought I could just add a field to the urlpos structure saying
1566 that we'd written a .orig file for this URL, but that didn't work,
1567 so I had to make this separate list.
1568 -- Dan Harkless <wget@harkless.org>
1570 This [adding a field to the urlpos structure] didn't work
1571 because convert_file() is called twice: once after all its
1572 sublinks have been retrieved in recursive_retrieve(), and
1573 once at the end of the day in convert_all_links(). The
1574 original linked list collected in recursive_retrieve() is
1575 lost after the first invocation of convert_links(), and
1576 convert_all_links() makes a new one (it calls get_urls_html()
1577 for each file it covers.) That's why your first approach didn't
1578 work. The way to make it work is perhaps to make this flag a
1579 field in the `urls_html' list.
1580 -- Hrvoje Niksic <hniksic@arsdigita.com>
/* Push FILE onto the head of the static list.  */
1582 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1583 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1584 converted_file_ptr->next = converted_files;
1585 converted_files = converted_file_ptr;
/* NOTE(review): this forward declaration is cut mid-line by the
   extraction; its continuation is missing.  */
1589 static int find_fragment PARAMS ((const char *, int, const char **,
/* Write NEW_STR to FP in place of the RAW_SIZE-byte attribute value
   at *PP, preserving the original quoting style and any trailing
   "#fragment" of the old value; *PP is advanced past the consumed
   input.  NOTE(review): the lines that set quote_char from *p,
   advance p past the opening quote, and step *pp at the end are
   missing from this extraction.  */
1593 replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
1595 const char *p = *pp;
1597 int size = raw_size;
/* Default to double quote when the value was unquoted.  */
1598 char quote_char = '\"';
1599 const char *frag_beg, *frag_end;
1601 /* Structure of our string is:
1602 "...old-contents..."
1603 <--- l->size ---> (with quotes)
1606 <--- l->size --> (no quotes) */
1608 if (*p == '\"' || *p == '\'')
1613 size -= 2; /* disregard opening and closing quote */
/* Emit the replacement, quoted like the original.  */
1615 putc (quote_char, fp);
1616 fputs (new_str, fp);
1618 /* Look for fragment identifier, if any. */
1619 if (find_fragment (p, size, &frag_beg, &frag_end))
/* Carry the old "#fragment" over onto the new link.  */
1620 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
1624 putc (quote_char, fp);
1628 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
1629 preceded by '&'. If the character is not found, return zero. If
1630 the character is found, return 1 and set BP and EP to point to the
1631 beginning and end of the region.
1633 This is used for finding the fragment identifiers in URLs. */
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
   directly preceded by '&'.  If found, return 1 and set *BP and *EP
   to the beginning and end of the fragment region (the '#' itself
   through BEG+SIZE); otherwise return 0 and leave *BP/*EP untouched.

   The '&' guard keeps a '#' inside an SGML numeric entity such as
   "&#38;" from being mistaken for a fragment separator.  */
static int
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *end = beg + size;
  int saw_amp = 0;		/* was the previous char '&'? */

  for (; beg < end; beg++)
    {
      switch (*beg)
	{
	case '&':
	  saw_amp = 1;
	  break;
	case '#':
	  if (!saw_amp)
	    {
	      *bp = beg;
	      *ep = end;
	      return 1;
	    }
	  /* fall through: a '&'-prefixed '#' is ordinary data */
	default:
	  saw_amp = 0;
	}
    }
  return 0;
}
/* Node of the singly linked list remembering which local files have
   been downloaded, and how (see downloaded_file() below).
   NOTE(review): the member holding the filename -- referenced later
   as rover->file -- is missing from this extraction.  */
1662 typedef struct _downloaded_file_list {
/* How the file was downloaded (normally, with ".html" added, ...).  */
1664 downloaded_file_t download_type;
1665 struct _downloaded_file_list* next;
1666 } downloaded_file_list;
/* Head of the list; file-scope so it survives between calls.  */
1668 static downloaded_file_list *downloaded_files;
1670 /* Remembers which files have been downloaded. In the standard case, should be
1671 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1672 download successfully (i.e. not for ones we have failures on or that we skip).
1675 When we've downloaded a file and tacked on a ".html" extension due to -E,
1676 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1677 FILE_DOWNLOADED_NORMALLY.
1679 If you just want to check if a file has been previously added without adding
1680 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
1681 with local filenames, not remote URLs. */
/* Look FILE up in the downloaded-files list; if MODE is not
   CHECK_FOR_FILE, record it with that mode when absent.  Returns the
   recorded download type, or FILE_NOT_ALREADY_DOWNLOADED.
   NOTE(review): braces and the loop-exit lines are missing from this
   extraction; the early return at 1698 presumably fires only when the
   scan found FILE -- confirm against the full source.  */
1683 downloaded_file (downloaded_file_t mode, const char* file)
1685 boolean found_file = FALSE;
1686 downloaded_file_list* rover = downloaded_files;
/* Linear scan for FILE.  */
1688 while (rover != NULL)
1689 if (strcmp(rover->file, file) == 0)
1695 rover = rover->next;
1698 return rover->download_type; /* file had already been downloaded */
/* Not found: record it unless the caller only wanted a lookup.  */
1701 if (mode != CHECK_FOR_FILE)
1703 rover = xmalloc(sizeof(*rover));
1704 rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1705 rover->download_type = mode;
/* Push onto the head of the static list.  */
1706 rover->next = downloaded_files;
1707 downloaded_files = rover;
1710 return FILE_NOT_ALREADY_DOWNLOADED;
/* Release every node of the downloaded-files list.
   NOTE(review): the loop tail (freeing the node itself and advancing
   rover) is missing from this extraction.  */
1715 downloaded_files_free (void)
1717 downloaded_file_list* rover = downloaded_files;
/* Save the successor before the node's contents are freed.  */
1720 downloaded_file_list *next = rover->next;
1721 xfree (rover->file);
1727 /* Initialization of static stuff. */
1731 init_unsafe_char_table ();