sjero.net Git - wget/blob - src/url.c

   1 /* URL handling.
   2    Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
   3
   4 This file is part of Wget.
   5
   6 This program is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or (at
   9 your option) any later version.
  10
  11 This program is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with this program; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif
  29 #include <ctype.h>
  30 #include <sys/types.h>
  31 #ifdef HAVE_UNISTD_H
  32 # include <unistd.h>
  33 #endif
  34 #include <errno.h>
  35 #include <assert.h>
  36
  37 #include "wget.h"
  38 #include "utils.h"
  39 #include "url.h"
  40 #include "host.h"
  41
  42 #ifndef errno
  43 extern int errno;
  44 #endif
  45
  46 /* Table of Unsafe chars.  This is intialized in
  47    init_unsafe_char_table.  */
  48
  49 static char unsafe_char_table[256];
  50
  51 #define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)])
  52
  53 /* If S contains unsafe characters, free it and replace it with a
  54    version that doesn't.  */
  55 #define URL_CLEANSE(s) do                       \
  56 {                                               \
  57   if (contains_unsafe (s))                      \
  58     {                                           \
  59       char *uc_tmp = encode_string (s);         \
  60       xfree (s);                                \
  61       (s) = uc_tmp;                             \
  62     }                                           \
  63 } while (0)
  64
  65 /* Is a directory "."?  */
  66 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
  67 /* Is a directory ".."?  */
  68 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
  69
  70 #if 0
  71 static void path_simplify_with_kludge PARAMS ((char *));
  72 #endif
  73 static int urlpath_length PARAMS ((const char *));
  74
  75 /* NULL-terminated list of strings to be recognized as prototypes (URL
  76    schemes).  Note that recognized doesn't mean supported -- only HTTP,
  77    HTTPS and FTP are currently supported .
  78
  79    However, a string that does not match anything in the list will be
  80    considered a relative URL.  Thus it's important that this list has
  81    anything anyone could think of being legal.
  82
  83    There are wild things here.  :-) Take a look at
  84    <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> for more
  85    fun.  */
  86 static char *protostrings[] =
  87 {
  88   "cid:",
  89   "clsid:",
  90   "file:",
  91   "finger:",
  92   "ftp:",
  93   "gopher:",
  94   "hdl:",
  95   "http:",
  96   "https:",
  97   "ilu:",
  98   "ior:",
  99   "irc:",
 100   "java:",
 101   "javascript:",
 102   "lifn:",
 103   "mailto:",
 104   "mid:",
 105   "news:",
 106   "nntp:",
 107   "path:",
 108   "prospero:",
 109   "rlogin:",
 110   "service:",
 111   "shttp:",
 112   "snews:",
 113   "stanf:",
 114   "telnet:",
 115   "tn3270:",
 116   "wais:",
 117   "whois++:",
 118   NULL
 119 };
 120
 121 struct proto
 122 {
 123   char *name;
 124   uerr_t ind;
 125   unsigned short port;
 126 };
 127
 128 /* Similar to former, but for supported protocols: */
 129 static struct proto sup_protos[] =
 130 {
 131   { "http://", URLHTTP, DEFAULT_HTTP_PORT },
 132 #ifdef HAVE_SSL
 133   { "https://",URLHTTPS, DEFAULT_HTTPS_PORT},
 134 #endif
 135   { "ftp://", URLFTP, DEFAULT_FTP_PORT },
 136   /*{ "file://", URLFILE, DEFAULT_FTP_PORT },*/
 137 };
 138
 139 static void parse_dir PARAMS ((const char *, char **, char **));
 140 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
 141 static char *construct PARAMS ((const char *, const char *, int , int));
 142 static char *construct_relative PARAMS ((const char *, const char *));
 143 static char process_ftp_type PARAMS ((char *));
 144
 145 \f
 146 /* Returns the number of characters to be skipped if the first thing
 147    in a URL is URL: (which is 0 or 4+).  The optional spaces after
 148    URL: are also skipped.  */
 149 int
 150 skip_url (const char *url)
 151 {
 152   int i;
 153
 154   if (TOUPPER (url[0]) == 'U'
 155       && TOUPPER (url[1]) == 'R'
 156       && TOUPPER (url[2]) == 'L'
 157       && url[3] == ':')
 158     {
 159       /* Skip blanks.  */
 160       for (i = 4; url[i] && ISSPACE (url[i]); i++);
 161       return i;
 162     }
 163   else
 164     return 0;
 165 }
 166
 167 /* Unsafe chars:
 168    - anything <= 32;
 169    - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
 170    - @ and :, for user/password encoding.
 171    - everything over 127 (but we don't bother with recording those.  */
 172 void
 173 init_unsafe_char_table (void)
 174 {
 175   int i;
 176   for (i = 0; i < 256; i++)
 177     if (i < 32 || i >= 127
 178         || i == ' '
 179         || i == '<'
 180         || i == '>'
 181         || i == '\"'
 182         || i == '#'
 183         || i == '%'
 184         || i == '{'
 185         || i == '}'
 186         || i == '|'
 187         || i == '\\'
 188         || i == '^'
 189         || i == '~'
 190         || i == '['
 191         || i == ']'
 192         || i == '`')
 193       unsafe_char_table[i] = 1;
 194 }
 195
 196 /* Returns 1 if the string contains unsafe characters, 0 otherwise.  */
 197 int
 198 contains_unsafe (const char *s)
 199 {
 200   for (; *s; s++)
 201     if (UNSAFE_CHAR (*s))
 202       return 1;
 203   return 0;
 204 }
 205
 206 /* Decodes the forms %xy in a URL to the character the hexadecimal
 207    code of which is xy.  xy are hexadecimal digits from
 208    [0123456789ABCDEF] (case-insensitive).  If x or y are not
 209    hex-digits or `%' precedes `\0', the sequence is inserted
 210    literally.  */
 211
 212 static void
 213 decode_string (char *s)
 214 {
 215   char *p = s;
 216
 217   for (; *s; s++, p++)
 218     {
 219       if (*s != '%')
 220         *p = *s;
 221       else
 222         {
 223           /* Do nothing if at the end of the string, or if the chars
 224              are not hex-digits.  */
 225           if (!*(s + 1) || !*(s + 2)
 226               || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
 227             {
 228               *p = *s;
 229               continue;
 230             }
 231           *p = (ASC2HEXD (*(s + 1)) << 4) + ASC2HEXD (*(s + 2));
 232           s += 2;
 233         }
 234     }
 235   *p = '\0';
 236 }
 237
 238 /* Encode the unsafe characters (as determined by URL_UNSAFE) in a
 239    given string, returning a malloc-ed %XX encoded string.  */
 240 char *
 241 encode_string (const char *s)
 242 {
 243   const char *b;
 244   char *p, *res;
 245   int i;
 246
 247   b = s;
 248   for (i = 0; *s; s++, i++)
 249     if (UNSAFE_CHAR (*s))
 250       i += 2; /* Two more characters (hex digits) */
 251   res = (char *)xmalloc (i + 1);
 252   s = b;
 253   for (p = res; *s; s++)
 254     if (UNSAFE_CHAR (*s))
 255       {
 256         const unsigned char c = *s;
 257         *p++ = '%';
 258         *p++ = HEXD2ASC (c >> 4);
 259         *p++ = HEXD2ASC (c & 0xf);
 260       }
 261     else
 262       *p++ = *s;
 263   *p = '\0';
 264   return res;
 265 }
 266 \f
 267 /* Returns the proto-type if URL's protocol is supported, or
 268    URLUNKNOWN if not.  */
 269 uerr_t
 270 urlproto (const char *url)
 271 {
 272   int i;
 273
 274   url += skip_url (url);
 275   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 276     if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
 277       return sup_protos[i].ind;
 278   for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
 279   if (url[i] == ':')
 280     {
 281       for (++i; url[i] && url[i] != '/'; i++)
 282         if (!ISDIGIT (url[i]))
 283           return URLBADPORT;
 284       if (url[i - 1] == ':')
 285         return URLFTP;
 286       else
 287         return URLHTTP;
 288     }
 289   else
 290     return URLHTTP;
 291 }
 292
 293 /* Skip the protocol part of the URL, e.g. `http://'.  If no protocol
 294    part is found, returns 0.  */
 295 int
 296 skip_proto (const char *url)
 297 {
 298   char **s;
 299   int l;
 300
 301   for (s = protostrings; *s; s++)
 302     if (!strncasecmp (*s, url, strlen (*s)))
 303       break;
 304   if (!*s)
 305     return 0;
 306   l = strlen (*s);
 307   /* HTTP and FTP protocols are expected to yield exact host names
 308      (i.e. the `//' part must be skipped, too).  */
 309   if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
 310     l += 2;
 311   return l;
 312 }
 313
 314 /* Returns 1 if the URL begins with a protocol (supported or
 315    unsupported), 0 otherwise.  */
 316 int
 317 has_proto (const char *url)
 318 {
 319   char **s;
 320
 321   url += skip_url (url);
 322   for (s = protostrings; *s; s++)
 323     if (strncasecmp (url, *s, strlen (*s)) == 0)
 324       return 1;
 325   return 0;
 326 }
 327
 328 /* Skip the username and password, if present here.  The function
 329    should be called *not* with the complete URL, but with the part
 330    right after the protocol.
 331
 332    If no username and password are found, return 0.  */
 333 int
 334 skip_uname (const char *url)
 335 {
 336   const char *p;
 337   const char *q = NULL;
 338   for (p = url ; *p && *p != '/'; p++)
 339     if (*p == '@') q = p;
 340   /* If a `@' was found before the first occurrence of `/', skip
 341      it.  */
 342   if (q != NULL)
 343     return q - url + 1;
 344   else
 345     return 0;
 346 }
 347 \f
 348 /* Allocate a new urlinfo structure, fill it with default values and
 349    return a pointer to it.  */
 350 struct urlinfo *
 351 newurl (void)
 352 {
 353   struct urlinfo *u;
 354
 355   u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
 356   memset (u, 0, sizeof (*u));
 357   u->proto = URLUNKNOWN;
 358   return u;
 359 }
 360
 361 /* Perform a "deep" free of the urlinfo structure.  The structure
 362    should have been created with newurl, but need not have been used.
 363    If free_pointer is non-0, free the pointer itself.  */
 364 void
 365 freeurl (struct urlinfo *u, int complete)
 366 {
 367   assert (u != NULL);
 368   FREE_MAYBE (u->url);
 369   FREE_MAYBE (u->host);
 370   FREE_MAYBE (u->path);
 371   FREE_MAYBE (u->file);
 372   FREE_MAYBE (u->dir);
 373   FREE_MAYBE (u->user);
 374   FREE_MAYBE (u->passwd);
 375   FREE_MAYBE (u->local);
 376   FREE_MAYBE (u->referer);
 377   if (u->proxy)
 378     freeurl (u->proxy, 1);
 379   if (complete)
 380     xfree (u);
 381   return;
 382 }
 383 \f
 384 /* Extract the given URL of the form
 385    (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
 386    1. hostname (terminated with `/' or `:')
 387    2. port number (terminated with `/'), or chosen for the protocol
 388    3. dirname (everything after hostname)
 389    Most errors are handled.  No allocation is done, you must supply
 390    pointers to allocated memory.
 391    ...and a host of other stuff :-)
 392
 393    - Recognizes hostname:dir/file for FTP and
 394      hostname (:portnum)?/dir/file for HTTP.
 395    - Parses the path to yield directory and file
 396    - Parses the URL to yield the username and passwd (if present)
 397    - Decodes the strings, in case they contain "forbidden" characters
 398    - Writes the result to struct urlinfo
 399
 400    If the argument STRICT is set, it recognizes only the canonical
 401    form.  */
 402 uerr_t
 403 parseurl (const char *url, struct urlinfo *u, int strict)
 404 {
 405   int i, l, abs_ftp;
 406   int recognizable;            /* Recognizable URL is the one where
 407                                   the protocol name was explicitly
 408                                   named, i.e. it wasn't deduced from
 409                                   the URL format.  */
 410   uerr_t type;
 411
 412   DEBUGP (("parseurl (\"%s\") -> ", url));
 413   url += skip_url (url);
 414   recognizable = has_proto (url);
 415   if (strict && !recognizable)
 416     return URLUNKNOWN;
 417   for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
 418     {
 419       l = strlen (sup_protos[i].name);
 420       if (!strncasecmp (sup_protos[i].name, url, l))
 421         break;
 422     }
 423   /* If protocol is recognizable, but unsupported, bail out, else
 424      suppose unknown.  */
 425   if (recognizable && i == ARRAY_SIZE (sup_protos))
 426     return URLUNKNOWN;
 427   else if (i == ARRAY_SIZE (sup_protos))
 428     type = URLUNKNOWN;
 429   else
 430     u->proto = type = sup_protos[i].ind;
 431
 432   if (type == URLUNKNOWN)
 433     l = 0;
 434   /* Allow a username and password to be specified (i.e. just skip
 435      them for now).  */
 436   if (recognizable)
 437     l += skip_uname (url + l);
 438   for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
 439   if (i == l)
 440     return URLBADHOST;
 441   /* Get the hostname.  */
 442   u->host = strdupdelim (url + l, url + i);
 443   DEBUGP (("host %s -> ", u->host));
 444
 445   /* Assume no port has been given.  */
 446   u->port = 0;
 447   if (url[i] == ':')
 448     {
 449       /* We have a colon delimiting the hostname.  It could mean that
 450          a port number is following it, or a directory.  */
 451       if (ISDIGIT (url[++i]))    /* A port number */
 452         {
 453           if (type == URLUNKNOWN)
 454             u->proto = type = URLHTTP;
 455           for (; url[i] && url[i] != '/'; i++)
 456             if (ISDIGIT (url[i]))
 457               u->port = 10 * u->port + (url[i] - '0');
 458             else
 459               return URLBADPORT;
 460           if (!u->port)
 461             return URLBADPORT;
 462           DEBUGP (("port %hu -> ", u->port));
 463         }
 464       else if (type == URLUNKNOWN) /* or a directory */
 465         u->proto = type = URLFTP;
 466       else                      /* or just a misformed port number */
 467         return URLBADPORT;
 468     }
 469   else if (type == URLUNKNOWN)
 470     u->proto = type = URLHTTP;
 471   if (!u->port)
 472     {
 473       int ind;
 474       for (ind = 0; ind < ARRAY_SIZE (sup_protos); ind++)
 475         if (sup_protos[ind].ind == type)
 476           break;
 477       if (ind == ARRAY_SIZE (sup_protos))
 478         return URLUNKNOWN;
 479       u->port = sup_protos[ind].port;
 480     }
 481   /* Some delimiter troubles...  */
 482   if (url[i] == '/' && url[i - 1] != ':')
 483     ++i;
 484   if (type == URLHTTP)
 485     while (url[i] && url[i] == '/')
 486       ++i;
 487   u->path = (char *)xmalloc (strlen (url + i) + 8);
 488   strcpy (u->path, url + i);
 489   if (type == URLFTP)
 490     {
 491       u->ftp_type = process_ftp_type (u->path);
 492       /* #### We don't handle type `d' correctly yet.  */
 493       if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
 494         u->ftp_type = 'I';
 495       DEBUGP (("ftp_type %c -> ", u->ftp_type));
 496     }
 497   DEBUGP (("opath %s -> ", u->path));
 498   /* Parse the username and password (if existing).  */
 499   parse_uname (url, &u->user, &u->passwd);
 500   /* Decode the strings, as per RFC 1738.  */
 501   decode_string (u->host);
 502   decode_string (u->path);
 503   if (u->user)
 504     decode_string (u->user);
 505   if (u->passwd)
 506     decode_string (u->passwd);
 507   /* Parse the directory.  */
 508   parse_dir (u->path, &u->dir, &u->file);
 509   DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
 510   /* Simplify the directory.  */
 511   path_simplify (u->dir);
 512   /* Remove the leading `/' in HTTP.  */
 513   if (type == URLHTTP && *u->dir == '/')
 514     strcpy (u->dir, u->dir + 1);
 515   DEBUGP (("ndir %s\n", u->dir));
 516   /* Strip trailing `/'.  */
 517   l = strlen (u->dir);
 518   if (l && u->dir[l - 1] == '/')
 519     u->dir[l - 1] = '\0';
 520   /* Re-create the path: */
 521   abs_ftp = (u->proto == URLFTP && *u->dir == '/');
 522   /*  sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
 523       abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
 524   strcpy (u->path, abs_ftp ? "%2F" : "/");
 525   strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
 526   strcat (u->path, *u->dir ? "/" : "");
 527   strcat (u->path, u->file);
 528   URL_CLEANSE (u->path);
 529   DEBUGP (("newpath: %s\n", u->path));
 530   /* Create the clean URL.  */
 531   u->url = str_url (u, 0);
 532   return URLOK;
 533 }
 534 \f
 535 /* Special versions of DOTP and DDOTP for parse_dir(). */
 536
 537 #define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
 538 #define PD_DDOTP(x) ((*(x) == '.') && (*(x) == '.')             \
 539                      && (!*((x) + 2) || *((x) + 2) == '?'))
 540
 541 /* Build the directory and filename components of the path.  Both
 542    components are *separately* malloc-ed strings!  It does not change
 543    the contents of path.
 544
 545    If the path ends with "." or "..", they are (correctly) counted as
 546    directories.  */
 547 static void
 548 parse_dir (const char *path, char **dir, char **file)
 549 {
 550   int i, l;
 551
 552   l = urlpath_length (path);
 553   for (i = l; i && path[i] != '/'; i--);
 554
 555   if (!i && *path != '/')   /* Just filename */
 556     {
 557       if (PD_DOTP (path) || PD_DDOTP (path))
 558         {
 559           *dir = strdupdelim (path, path + l);
 560           *file = xstrdup (path + l); /* normally empty, but could
 561                                          contain ?... */
 562         }
 563       else
 564         {
 565           *dir = xstrdup ("");     /* This is required because of FTP */
 566           *file = xstrdup (path);
 567         }
 568     }
 569   else if (!i)                 /* /filename */
 570     {
 571       if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
 572         {
 573           *dir = strdupdelim (path, path + l);
 574           *file = xstrdup (path + l); /* normally empty, but could
 575                                          contain ?... */
 576         }
 577       else
 578         {
 579           *dir = xstrdup ("/");
 580           *file = xstrdup (path + 1);
 581         }
 582     }
 583   else /* Nonempty directory with or without a filename */
 584     {
 585       if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
 586         {
 587           *dir = strdupdelim (path, path + l);
 588           *file = xstrdup (path + l); /* normally empty, but could
 589                                          contain ?... */
 590         }
 591       else
 592         {
 593           *dir = strdupdelim (path, path + i);
 594           *file = xstrdup (path + i + 1);
 595         }
 596     }
 597 }
 598
 599 /* Find the optional username and password within the URL, as per
 600    RFC1738.  The returned user and passwd char pointers are
 601    malloc-ed.  */
 602 static uerr_t
 603 parse_uname (const char *url, char **user, char **passwd)
 604 {
 605   int l;
 606   const char *p, *q, *col;
 607   char **where;
 608
 609   *user = NULL;
 610   *passwd = NULL;
 611   url += skip_url (url);
 612   /* Look for end of protocol string.  */
 613   l = skip_proto (url);
 614   if (!l)
 615     return URLUNKNOWN;
 616   /* Add protocol offset.  */
 617   url += l;
 618   /* Is there an `@' character?  */
 619   for (p = url; *p && *p != '/'; p++)
 620     if (*p == '@')
 621       break;
 622   /* If not, return.  */
 623   if (*p != '@')
 624     return URLOK;
 625   /* Else find the username and password.  */
 626   for (p = q = col = url; *p != '/'; p++)
 627     {
 628       if (*p == ':' && !*user)
 629         {
 630           *user = (char *)xmalloc (p - url + 1);
 631           memcpy (*user, url, p - url);
 632           (*user)[p - url] = '\0';
 633           col = p + 1;
 634         }
 635       if (*p == '@') q = p;
 636     }
 637   /* Decide whether you have only the username or both.  */
 638   where = *user ? passwd : user;
 639   *where = (char *)xmalloc (q - col + 1);
 640   memcpy (*where, col, q - col);
 641   (*where)[q - col] = '\0';
 642   return URLOK;
 643 }
 644
 645 /* If PATH ends with `;type=X', return the character X.  */
 646 static char
 647 process_ftp_type (char *path)
 648 {
 649   int len = strlen (path);
 650
 651   if (len >= 7
 652       && !memcmp (path + len - 7, ";type=", 6))
 653     {
 654       path[len - 7] = '\0';
 655       return path[len - 1];
 656     }
 657   else
 658     return '\0';
 659 }
 660 \f
 661 /* Return the URL as fine-formed string, with a proper protocol, optional port
 662    number, directory and optional user/password.  If `hide' is non-zero (as it
 663    is when we're calling this on a URL we plan to print, but not when calling it
 664    to canonicalize a URL for use within the program), password will be hidden.
 665    The forbidden characters in the URL will be cleansed.  */
 666 char *
 667 str_url (const struct urlinfo *u, int hide)
 668 {
 669   char *res, *host, *user, *passwd, *proto_name, *dir, *file;
 670   int i, l, ln, lu, lh, lp, lf, ld;
 671   unsigned short proto_default_port;
 672
 673   /* Look for the protocol name.  */
 674   for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
 675     if (sup_protos[i].ind == u->proto)
 676       break;
 677   if (i == ARRAY_SIZE (sup_protos))
 678     return NULL;
 679   proto_name = sup_protos[i].name;
 680   proto_default_port = sup_protos[i].port;
 681   host = CLEANDUP (u->host);
 682   dir = CLEANDUP (u->dir);
 683   file = CLEANDUP (u->file);
 684   user = passwd = NULL;
 685   if (u->user)
 686     user = CLEANDUP (u->user);
 687   if (u->passwd)
 688     {
 689       if (hide)
 690         /* Don't output the password, or someone might see it over the user's
 691            shoulder (or in saved wget output).  Don't give away the number of
 692            characters in the password, either, as we did in past versions of
 693            this code, when we replaced the password characters with 'x's. */
 694         passwd = xstrdup("<password>");
 695       else
 696         passwd = CLEANDUP (u->passwd);
 697     }
 698   if (u->proto == URLFTP && *dir == '/')
 699     {
 700       char *tmp = (char *)xmalloc (strlen (dir) + 3);
 701       /*sprintf (tmp, "%%2F%s", dir + 1);*/
 702       tmp[0] = '%';
 703       tmp[1] = '2';
 704       tmp[2] = 'F';
 705       strcpy (tmp + 3, dir + 1);
 706       xfree (dir);
 707       dir = tmp;
 708     }
 709
 710   ln = strlen (proto_name);
 711   lu = user ? strlen (user) : 0;
 712   lp = passwd ? strlen (passwd) : 0;
 713   lh = strlen (host);
 714   ld = strlen (dir);
 715   lf = strlen (file);
 716   res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
 717   /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
 718      (user ? user : ""), (passwd ? ":" : ""),
 719      (passwd ? passwd : ""), (user ? "@" : ""),
 720      host, u->port, dir, *dir ? "/" : "", file); */
 721   l = 0;
 722   memcpy (res, proto_name, ln);
 723   l += ln;
 724   if (user)
 725     {
 726       memcpy (res + l, user, lu);
 727       l += lu;
 728       if (passwd)
 729         {
 730           res[l++] = ':';
 731           memcpy (res + l, passwd, lp);
 732           l += lp;
 733         }
 734       res[l++] = '@';
 735     }
 736   memcpy (res + l, host, lh);
 737   l += lh;
 738   if (u->port != proto_default_port)
 739     {
 740       res[l++] = ':';
 741       long_to_string (res + l, (long)u->port);
 742       l += numdigit (u->port);
 743     }
 744   res[l++] = '/';
 745   memcpy (res + l, dir, ld);
 746   l += ld;
 747   if (*dir)
 748     res[l++] = '/';
 749   strcpy (res + l, file);
 750   xfree (host);
 751   xfree (dir);
 752   xfree (file);
 753   FREE_MAYBE (user);
 754   FREE_MAYBE (passwd);
 755   return res;
 756 }
 757
 758 /* Check whether two URL-s are equivalent, i.e. pointing to the same
 759    location.  Uses parseurl to parse them, and compares the canonical
 760    forms.
 761
 762    Returns 1 if the URL1 is equivalent to URL2, 0 otherwise.  Also
 763    return 0 on error.  */
 764 int
 765 url_equal (const char *url1, const char *url2)
 766 {
 767   struct urlinfo *u1, *u2;
 768   uerr_t err;
 769   int res;
 770
 771   u1 = newurl ();
 772   err = parseurl (url1, u1, 0);
 773   if (err != URLOK)
 774     {
 775       freeurl (u1, 1);
 776       return 0;
 777     }
 778   u2 = newurl ();
 779   err = parseurl (url2, u2, 0);
 780   if (err != URLOK)
 781     {
 782       freeurl (u2, 1);
 783       return 0;
 784     }
 785   res = !strcmp (u1->url, u2->url);
 786   freeurl (u1, 1);
 787   freeurl (u2, 1);
 788   return res;
 789 }
 790 \f
 791 urlpos *
 792 get_urls_file (const char *file)
 793 {
 794   struct file_memory *fm;
 795   urlpos *head, *tail;
 796   const char *text, *text_end;
 797
 798   /* Load the file.  */
 799   fm = read_file (file);
 800   if (!fm)
 801     {
 802       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
 803       return NULL;
 804     }
 805   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
 806   head = tail = NULL;
 807   text = fm->content;
 808   text_end = fm->content + fm->length;
 809   while (text < text_end)
 810     {
 811       const char *line_beg = text;
 812       const char *line_end = memchr (text, '\n', text_end - text);
 813       if (!line_end)
 814         line_end = text_end;
 815       else
 816         ++line_end;
 817       text = line_end;
 818       while (line_beg < line_end
 819              && ISSPACE (*line_beg))
 820         ++line_beg;
 821       while (line_end > line_beg + 1
 822              && ISSPACE (*(line_end - 1)))
 823         --line_end;
 824       if (line_end > line_beg)
 825         {
 826           urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
 827           memset (entry, 0, sizeof (*entry));
 828           entry->next = NULL;
 829           entry->url = strdupdelim (line_beg, line_end);
 830           if (!head)
 831             head = entry;
 832           else
 833             tail->next = entry;
 834           tail = entry;
 835         }
 836     }
 837   read_file_free (fm);
 838   return head;
 839 }
 840 \f
 841 /* Free the linked list of urlpos.  */
 842 void
 843 free_urlpos (urlpos *l)
 844 {
 845   while (l)
 846     {
 847       urlpos *next = l->next;
 848       xfree (l->url);
 849       FREE_MAYBE (l->local_name);
 850       xfree (l);
 851       l = next;
 852     }
 853 }
 854
 855 /* Rotate FNAME opt.backups times */
 856 void
 857 rotate_backups(const char *fname)
 858 {
 859   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
 860   char *from = (char *)alloca (maxlen);
 861   char *to = (char *)alloca (maxlen);
 862   struct stat sb;
 863   int i;
 864
 865   if (stat (fname, &sb) == 0)
 866     if (S_ISREG (sb.st_mode) == 0)
 867       return;
 868
 869   for (i = opt.backups; i > 1; i--)
 870     {
 871       sprintf (from, "%s.%d", fname, i - 1);
 872       sprintf (to, "%s.%d", fname, i);
 873       /* #### This will fail on machines without the rename() system
 874          call.  */
 875       rename (from, to);
 876     }
 877
 878   sprintf (to, "%s.%d", fname, 1);
 879   rename(fname, to);
 880 }
 881
 882 /* Create all the necessary directories for PATH (a file).  Calls
 883    mkdirhier() internally.  */
 884 int
 885 mkalldirs (const char *path)
 886 {
 887   const char *p;
 888   char *t;
 889   struct stat st;
 890   int res;
 891
 892   p = path + strlen (path);
 893   for (; *p != '/' && p != path; p--);
 894   /* Don't create if it's just a file.  */
 895   if ((p == path) && (*p != '/'))
 896     return 0;
 897   t = strdupdelim (path, p);
 898   /* Check whether the directory exists.  */
 899   if ((stat (t, &st) == 0))
 900     {
 901       if (S_ISDIR (st.st_mode))
 902         {
 903           xfree (t);
 904           return 0;
 905         }
 906       else
 907         {
 908           /* If the dir exists as a file name, remove it first.  This
 909              is *only* for Wget to work with buggy old CERN http
 910              servers.  Here is the scenario: When Wget tries to
 911              retrieve a directory without a slash, e.g.
 912              http://foo/bar (bar being a directory), CERN server will
 913              not redirect it too http://foo/bar/ -- it will generate a
 914              directory listing containing links to bar/file1,
 915              bar/file2, etc.  Wget will lose because it saves this
 916              HTML listing to a file `bar', so it cannot create the
 917              directory.  To work around this, if the file of the same
 918              name exists, we just remove it and create the directory
 919              anyway.  */
 920           DEBUGP (("Removing %s because of directory danger!\n", t));
 921           unlink (t);
 922         }
 923     }
 924   res = make_directory (t);
 925   if (res != 0)
 926     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
 927   xfree (t);
 928   return res;
 929 }
 930
 931 static int
 932 count_slashes (const char *s)
 933 {
 934   int i = 0;
 935   while (*s)
 936     if (*s++ == '/')
 937       ++i;
 938   return i;
 939 }
 940
 941 /* Return the path name of the URL-equivalent file name, with a
 942    remote-like structure of directories.  */
 943 static char *
 944 mkstruct (const struct urlinfo *u)
 945 {
 946   char *host, *dir, *file, *res, *dirpref;
 947   int l;
 948
 949   assert (u->dir != NULL);
 950   assert (u->host != NULL);
 951
 952   if (opt.cut_dirs)
 953     {
 954       char *ptr = u->dir + (*u->dir == '/');
 955       int slash_count = 1 + count_slashes (ptr);
 956       int cut = MINVAL (opt.cut_dirs, slash_count);
 957       for (; cut && *ptr; ptr++)
 958         if (*ptr == '/')
 959           --cut;
 960       STRDUP_ALLOCA (dir, ptr);
 961     }
 962   else
 963     dir = u->dir + (*u->dir == '/');
 964
 965   host = xstrdup (u->host);
 966   /* Check for the true name (or at least a consistent name for saving
 967      to directory) of HOST, reusing the hlist if possible.  */
 968   if (opt.add_hostdir && !opt.simple_check)
 969     {
 970       char *nhost = realhost (host);
 971       xfree (host);
 972       host = nhost;
 973     }
 974   /* Add dir_prefix and hostname (if required) to the beginning of
 975      dir.  */
 976   if (opt.add_hostdir)
 977     {
 978       if (!DOTP (opt.dir_prefix))
 979         {
 980           dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
 981                                     + strlen (host) + 1);
 982           sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
 983         }
 984       else
 985         STRDUP_ALLOCA (dirpref, host);
 986     }
 987   else                         /* not add_hostdir */
 988     {
 989       if (!DOTP (opt.dir_prefix))
 990         dirpref = opt.dir_prefix;
 991       else
 992         dirpref = "";
 993     }
 994   xfree (host);
 995
 996   /* If there is a prefix, prepend it.  */
 997   if (*dirpref)
 998     {
 999       char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1000       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1001       dir = newdir;
1002     }
1003   dir = xstrdup (dir);
1004   URL_CLEANSE (dir);
1005   l = strlen (dir);
1006   if (l && dir[l - 1] == '/')
1007     dir[l - 1] = '\0';
1008
1009   if (!*u->file)
1010     file = "index.html";
1011   else
1012     file = u->file;
1013
1014   /* Finally, construct the full name.  */
1015   res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1016   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1017   xfree (dir);
1018   return res;
1019 }
1020
1021 /* Create a unique filename, corresponding to a given URL.  Calls
1022    mkstruct if necessary.  Does *not* actually create any directories.  */
1023 char *
1024 url_filename (const struct urlinfo *u)
1025 {
1026   char *file, *name;
1027   int have_prefix = 0;          /* whether we must prepend opt.dir_prefix */
1028
1029   if (opt.dirstruct)
1030     {
1031       file = mkstruct (u);
1032       have_prefix = 1;
1033     }
1034   else
1035     {
1036       if (!*u->file)
1037         file = xstrdup ("index.html");
1038       else
1039         file = xstrdup (u->file);
1040     }
1041
1042   if (!have_prefix)
1043     {
1044       /* Check whether the prefix directory is something other than "."
1045          before prepending it.  */
1046       if (!DOTP (opt.dir_prefix))
1047         {
1048           char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1049                                          + 1 + strlen (file) + 1);
1050           sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1051           xfree (file);
1052           file = nfile;
1053         }
1054     }
1055   /* DOS-ish file systems don't like `%' signs in them; we change it
1056      to `@'.  */
1057 #ifdef WINDOWS
1058   {
1059     char *p = file;
1060     for (p = file; *p; p++)
1061       if (*p == '%')
1062         *p = '@';
1063   }
1064 #endif /* WINDOWS */
1065
1066   /* Check the cases in which the unique extensions are not used:
1067      1) Clobbering is turned off (-nc).
1068      2) Retrieval with regetting.
1069      3) Timestamping is used.
1070      4) Hierarchy is built.
1071
1072      The exception is the case when file does exist and is a
1073      directory (actually support for bad httpd-s).  */
1074   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1075       && !(file_exists_p (file) && !file_non_directory_p (file)))
1076     return file;
1077
1078   /* Find a unique name.  */
1079   name = unique_name (file);
1080   xfree (file);
1081   return name;
1082 }
1083
1084 /* Like strlen(), but allow the URL to be ended with '?'.  */
1085 static int
1086 urlpath_length (const char *url)
1087 {
1088   const char *q = strchr (url, '?');
1089   if (q)
1090     return q - url;
1091   return strlen (url);
1092 }
1093
1094 /* Find the last occurrence of character C in the range [b, e), or
1095    NULL, if none are present.  This is almost completely equivalent to
1096    { *e = '\0'; return strrchr(b); }, except that it doesn't change
1097    the contents of the string.  */
1098 static const char *
1099 find_last_char (const char *b, const char *e, char c)
1100 {
1101   for (; e > b; e--)
1102     if (*e == c)
1103       return e;
1104   return NULL;
1105 }
1106
1107 /* Construct a URL by concatenating an absolute URL and a path, which
1108    may or may not be absolute.  This tries to behave "reasonably" in
1109    all foreseeable cases.  It employs little specific knowledge about
1110    protocols or URL-specific stuff -- it just works on strings.  */
1111 static char *
1112 construct (const char *url, const char *sub, int subsize, int no_proto)
1113 {
1114   char *constr;
1115
1116   if (no_proto)
1117     {
1118       const char *end = url + urlpath_length (url);
1119
1120       if (*sub != '/')
1121         {
1122           /* SUB is a relative URL: we need to replace everything
1123              after last slash (possibly empty) with SUB.
1124
1125              So, if URL is "whatever/foo/bar", and SUB is "qux/xyzzy",
1126              our result should be "whatever/foo/qux/xyzzy".  */
1127           int need_explicit_slash = 0;
1128           int span;
1129           const char *start_insert;
1130           const char *last_slash = find_last_char (url, end, '/'); /* the last slash. */
1131           if (!last_slash)
1132             {
1133               /* No slash found at all.  Append SUB to what we have,
1134                  but we'll need a slash as a separator.
1135
1136                  Example: if url == "foo" and sub == "qux/xyzzy", then
1137                  we cannot just append sub to url, because we'd get
1138                  "fooqux/xyzzy", whereas what we want is
1139                  "foo/qux/xyzzy".
1140
1141                  To make sure the / gets inserted, we set
1142                  need_explicit_slash to 1.  We also set start_insert
1143                  to end + 1, so that the length calculations work out
1144                  correctly for one more (slash) character.  Accessing
1145                  that character is fine, since it will be the
1146                  delimiter, '\0' or '?'.  */
1147               /* example: "foo?..." */
1148               /*               ^    ('?' gets changed to '/') */
1149               start_insert = end + 1;
1150               need_explicit_slash = 1;
1151             }
1152           else if (last_slash && last_slash != url && *(last_slash - 1) == '/')
1153             {
1154               /* example: http://host"  */
1155               /*                      ^ */
1156               start_insert = end + 1;
1157               need_explicit_slash = 1;
1158             }
1159           else
1160             {
1161               /* example: "whatever/foo/bar" */
1162               /*                        ^    */
1163               start_insert = last_slash + 1;
1164             }
1165
1166           span = start_insert - url;
1167           constr = (char *)xmalloc (span + subsize + 1);
1168           if (span)
1169             memcpy (constr, url, span);
1170           if (need_explicit_slash)
1171             constr[span - 1] = '/';
1172           if (subsize)
1173             memcpy (constr + span, sub, subsize);
1174           constr[span + subsize] = '\0';
1175         }
1176       else /* *sub == `/' */
1177         {
1178           /* SUB is an absolute path: we need to replace everything
1179              after (and including) the FIRST slash with SUB.
1180
1181              So, if URL is "http://host/whatever/foo/bar", and SUB is
1182              "/qux/xyzzy", our result should be
1183              "http://host/qux/xyzzy".  */
1184           int span;
1185           const char *slash;
1186           const char *start_insert = NULL; /* for gcc to shut up. */
1187           const char *pos = url;
1188           int seen_slash_slash = 0;
1189           /* We're looking for the first slash, but want to ignore
1190              double slash. */
1191         again:
1192           slash = memchr (pos, '/', end - pos);
1193           if (slash && !seen_slash_slash)
1194             if (*(slash + 1) == '/')
1195               {
1196                 pos = slash + 2;
1197                 seen_slash_slash = 1;
1198                 goto again;
1199               }
1200
1201           /* At this point, SLASH is the location of the first / after
1202              "//", or the first slash altogether.  START_INSERT is the
1203              pointer to the location where SUB will be inserted.  When
1204              examining the last two examples, keep in mind that SUB
1205              begins with '/'. */
1206
1207           if (!slash && !seen_slash_slash)
1208             /* example: "foo" */
1209             /*           ^    */
1210             start_insert = url;
1211           else if (!slash && seen_slash_slash)
1212             /* example: "http://foo" */
1213             /*                     ^ */
1214             start_insert = end;
1215           else if (slash && !seen_slash_slash)
1216             /* example: "foo/bar" */
1217             /*           ^        */
1218             start_insert = url;
1219           else if (slash && seen_slash_slash)
1220             /* example: "http://something/" */
1221             /*                           ^  */
1222             start_insert = slash;
1223
1224           span = start_insert - url;
1225           constr = (char *)xmalloc (span + subsize + 1);
1226           if (span)
1227             memcpy (constr, url, span);
1228           if (subsize)
1229             memcpy (constr + span, sub, subsize);
1230           constr[span + subsize] = '\0';
1231         }
1232     }
1233   else /* !no_proto */
1234     {
1235       constr = strdupdelim (sub, sub + subsize);
1236     }
1237   return constr;
1238 }
1239
1240 /* Like the function above, but with a saner caller interface. */
1241 char *
1242 url_concat (const char *base_url, const char *new_url)
1243 {
1244   return construct (base_url, new_url, strlen (new_url), !has_proto (new_url));
1245 }
1246 \f
1247 /* Optimize URL by host, destructively replacing u->host with realhost
1248    (u->host).  Do this regardless of opt.simple_check.  */
1249 void
1250 opt_url (struct urlinfo *u)
1251 {
1252   /* Find the "true" host.  */
1253   char *host = realhost (u->host);
1254   xfree (u->host);
1255   u->host = host;
1256   assert (u->dir != NULL);      /* the URL must have been parsed */
1257   /* Refresh the printed representation.  */
1258   xfree (u->url);
1259   u->url = str_url (u, 0);
1260 }
1261
1262 /* This beautiful kludge is fortunately not needed, as I've made
1263    parse_dir do the (almost) right thing, so that a query can never
1264    become a part of directory.  */
1265 #if 0
1266 /* Call path_simplify, but make sure that the part after the
1267    question-mark, if any, is not destroyed by path_simplify's
1268    "optimizations".  */
1269 void
1270 path_simplify_with_kludge (char *path)
1271 {
1272   char *query = strchr (path, '?');
1273   if (query)
1274     /* path_simplify also works destructively, so we also have the
1275        license to write. */
1276     *query = '\0';
1277   path_simplify (path);
1278   if (query)
1279     {
1280       char *newend = path + strlen (path);
1281       *query = '?';
1282       if (newend != query)
1283         memmove (newend, query, strlen (query) + 1);
1284     }
1285 }
1286 #endif
1287 \f
1288 /* Returns proxy host address, in accordance with PROTO.  */
1289 char *
1290 getproxy (uerr_t proto)
1291 {
1292   if (proto == URLHTTP)
1293     return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1294   else if (proto == URLFTP)
1295     return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1296 #ifdef HAVE_SSL
1297   else if (proto == URLHTTPS)
1298     return opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1299 #endif /* HAVE_SSL */
1300   else
1301     return NULL;
1302 }
1303
1304 /* Should a host be accessed through proxy, concerning no_proxy?  */
1305 int
1306 no_proxy_match (const char *host, const char **no_proxy)
1307 {
1308   if (!no_proxy)
1309     return 1;
1310   else
1311     return !sufmatch (no_proxy, host);
1312 }
1313 \f
1314 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1315 static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
1316
1317 /* Change the links in an HTML document.  Accepts a structure that
1318    defines the positions of all the links.  */
1319 void
1320 convert_links (const char *file, urlpos *l)
1321 {
1322   struct file_memory *fm;
1323   FILE               *fp;
1324   const char         *p;
1325   downloaded_file_t  downloaded_file_return;
1326
1327   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1328
1329   {
1330     /* First we do a "dry run": go through the list L and see whether
1331        any URL needs to be converted in the first place.  If not, just
1332        leave the file alone.  */
1333     int count = 0;
1334     urlpos *dry = l;
1335     for (dry = l; dry; dry = dry->next)
1336       if (dry->convert != CO_NOCONVERT)
1337         ++count;
1338     if (!count)
1339       {
1340         logputs (LOG_VERBOSE, _("nothing to do.\n"));
1341         return;
1342       }
1343   }
1344
1345   fm = read_file (file);
1346   if (!fm)
1347     {
1348       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1349                  file, strerror (errno));
1350       return;
1351     }
1352
1353   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1354   if (opt.backup_converted && downloaded_file_return)
1355     write_backup_file (file, downloaded_file_return);
1356
1357   /* Before opening the file for writing, unlink the file.  This is
1358      important if the data in FM is mmaped.  In such case, nulling the
1359      file, which is what fopen() below does, would make us read all
1360      zeroes from the mmaped region.  */
1361   if (unlink (file) < 0 && errno != ENOENT)
1362     {
1363       logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1364                  file, strerror (errno));
1365       read_file_free (fm);
1366       return;
1367     }
1368   /* Now open the file for writing.  */
1369   fp = fopen (file, "wb");
1370   if (!fp)
1371     {
1372       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1373                  file, strerror (errno));
1374       read_file_free (fm);
1375       return;
1376     }
1377   /* Here we loop through all the URLs in file, replacing those of
1378      them that are downloaded with relative references.  */
1379   p = fm->content;
1380   for (; l; l = l->next)
1381     {
1382       char *url_start = fm->content + l->pos;
1383
1384       if (l->pos >= fm->length)
1385         {
1386           DEBUGP (("Something strange is going on.  Please investigate."));
1387           break;
1388         }
1389       /* If the URL is not to be converted, skip it.  */
1390       if (l->convert == CO_NOCONVERT)
1391         {
1392           DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
1393           continue;
1394         }
1395
1396       /* Echo the file contents, up to the offending URL's opening
1397          quote, to the outfile.  */
1398       fwrite (p, 1, url_start - p, fp);
1399       p = url_start;
1400       if (l->convert == CO_CONVERT_TO_RELATIVE)
1401         {
1402           /* Convert absolute URL to relative. */
1403           char *newname = construct_relative (file, l->local_name);
1404           char *quoted_newname = html_quote_string (newname);
1405           replace_attr (&p, l->size, fp, quoted_newname);
1406           DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1407                    l->url, newname, l->pos, file));
1408           xfree (newname);
1409           xfree (quoted_newname);
1410         }
1411       else if (l->convert == CO_CONVERT_TO_COMPLETE)
1412         {
1413           /* Convert the link to absolute URL. */
1414           char *newlink = l->url;
1415           char *quoted_newlink = html_quote_string (newlink);
1416           replace_attr (&p, l->size, fp, quoted_newlink);
1417           DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1418                    newlink, l->pos, file));
1419           xfree (quoted_newlink);
1420         }
1421     }
1422   /* Output the rest of the file. */
1423   if (p - fm->content < fm->length)
1424     fwrite (p, 1, fm->length - (p - fm->content), fp);
1425   fclose (fp);
1426   read_file_free (fm);
1427   logputs (LOG_VERBOSE, _("done.\n"));
1428 }
1429
1430 /* Construct and return a malloced copy of the relative link from two
1431    pieces of information: local name S1 of the referring file and
1432    local name S2 of the referred file.
1433
1434    So, if S1 is "jagor.srce.hr/index.html" and S2 is
1435    "jagor.srce.hr/images/news.gif", the function will return
1436    "images/news.gif".
1437
1438    Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1439    "fly.cc.fer.hr/images/fly.gif", the function will return
1440    "../images/fly.gif".
1441
1442    Caveats: S1 should not begin with `/', unless S2 also begins with
1443    '/'.  S1 should not contain things like ".." and such --
1444    construct_relative ("fly/ioccc/../index.html",
1445    "fly/images/fly.gif") will fail.  (A workaround is to call
1446    something like path_simplify() on S1).  */
1447 static char *
1448 construct_relative (const char *s1, const char *s2)
1449 {
1450   int i, cnt, sepdirs1;
1451   char *res;
1452
1453   if (*s2 == '/')
1454     return xstrdup (s2);
1455   /* S1 should *not* be absolute, if S2 wasn't.  */
1456   assert (*s1 != '/');
1457   i = cnt = 0;
1458   /* Skip the directories common to both strings.  */
1459   while (1)
1460     {
1461       while (s1[i] && s2[i]
1462              && (s1[i] == s2[i])
1463              && (s1[i] != '/')
1464              && (s2[i] != '/'))
1465         ++i;
1466       if (s1[i] == '/' && s2[i] == '/')
1467         cnt = ++i;
1468       else
1469         break;
1470     }
1471   for (sepdirs1 = 0; s1[i]; i++)
1472     if (s1[i] == '/')
1473       ++sepdirs1;
1474   /* Now, construct the file as of:
1475      - ../ repeated sepdirs1 time
1476      - all the non-mutual directories of S2.  */
1477   res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1478   for (i = 0; i < sepdirs1; i++)
1479     memcpy (res + 3 * i, "../", 3);
1480   strcpy (res + 3 * i, s2 + cnt);
1481   return res;
1482 }
1483 \f
1484 /* Add URL to the head of the list L.  */
1485 urlpos *
1486 add_url (urlpos *l, const char *url, const char *file)
1487 {
1488   urlpos *t;
1489
1490   t = (urlpos *)xmalloc (sizeof (urlpos));
1491   memset (t, 0, sizeof (*t));
1492   t->url = xstrdup (url);
1493   t->local_name = xstrdup (file);
1494   t->next = l;
1495   return t;
1496 }
1497
1498 static void
1499 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1500 {
1501   /* Rather than just writing over the original .html file with the
1502      converted version, save the former to *.orig.  Note we only do
1503      this for files we've _successfully_ downloaded, so we don't
1504      clobber .orig files sitting around from previous invocations. */
1505
1506   /* Construct the backup filename as the original name plus ".orig". */
1507   size_t         filename_len = strlen(file);
1508   char*          filename_plus_orig_suffix;
1509   boolean        already_wrote_backup_file = FALSE;
1510   slist*         converted_file_ptr;
1511   static slist*  converted_files = NULL;
1512
1513   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1514     {
1515       /* Just write "orig" over "html".  We need to do it this way
1516          because when we're checking to see if we've downloaded the
1517          file before (to see if we can skip downloading it), we don't
1518          know if it's a text/html file.  Therefore we don't know yet
1519          at that stage that -E is going to cause us to tack on
1520          ".html", so we need to compare vs. the original URL plus
1521          ".orig", not the original URL plus ".html.orig". */
1522       filename_plus_orig_suffix = alloca (filename_len + 1);
1523       strcpy(filename_plus_orig_suffix, file);
1524       strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1525     }
1526   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1527     {
1528       /* Append ".orig" to the name. */
1529       filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1530       strcpy(filename_plus_orig_suffix, file);
1531       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1532     }
1533
1534   /* We can get called twice on the same URL thanks to the
1535      convert_all_links() call in main().  If we write the .orig file
1536      each time in such a case, it'll end up containing the first-pass
1537      conversion, not the original file.  So, see if we've already been
1538      called on this file. */
1539   converted_file_ptr = converted_files;
1540   while (converted_file_ptr != NULL)
1541     if (strcmp(converted_file_ptr->string, file) == 0)
1542       {
1543         already_wrote_backup_file = TRUE;
1544         break;
1545       }
1546     else
1547       converted_file_ptr = converted_file_ptr->next;
1548
1549   if (!already_wrote_backup_file)
1550     {
1551       /* Rename <file> to <file>.orig before former gets written over. */
1552       if (rename(file, filename_plus_orig_suffix) != 0)
1553         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1554                    file, filename_plus_orig_suffix, strerror (errno));
1555
1556       /* Remember that we've already written a .orig backup for this file.
1557          Note that we never free this memory since we need it till the
1558          convert_all_links() call, which is one of the last things the
1559          program does before terminating.  BTW, I'm not sure if it would be
1560          safe to just set 'converted_file_ptr->string' to 'file' below,
1561          rather than making a copy of the string...  Another note is that I
1562          thought I could just add a field to the urlpos structure saying
1563          that we'd written a .orig file for this URL, but that didn't work,
1564          so I had to make this separate list.
1565          -- Dan Harkless <wget@harkless.org>
1566
1567          This [adding a field to the urlpos structure] didn't work
1568          because convert_file() is called twice: once after all its
1569          sublinks have been retrieved in recursive_retrieve(), and
1570          once at the end of the day in convert_all_links().  The
1571          original linked list collected in recursive_retrieve() is
1572          lost after the first invocation of convert_links(), and
1573          convert_all_links() makes a new one (it calls get_urls_html()
1574          for each file it covers.)  That's why your first approach didn't
1575          work.  The way to make it work is perhaps to make this flag a
1576          field in the `urls_html' list.
1577          -- Hrvoje Niksic <hniksic@arsdigita.com>
1578       */
1579       converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1580       converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
1581       converted_file_ptr->next = converted_files;
1582       converted_files = converted_file_ptr;
1583     }
1584 }
1585
1586 static int find_fragment PARAMS ((const char *, int, const char **,
1587                                   const char **));
1588
1589 static void
1590 replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
1591 {
1592   const char *p = *pp;
1593   int quote_flag = 0;
1594   int size = raw_size;
1595   char quote_char = '\"';
1596   const char *frag_beg, *frag_end;
1597
1598   /* Structure of our string is:
1599        "...old-contents..."
1600        <---  l->size   --->  (with quotes)
1601      OR:
1602        ...old-contents...
1603        <---  l->size  -->    (no quotes)   */
1604
1605   if (*p == '\"' || *p == '\'')
1606     {
1607       quote_char = *p;
1608       quote_flag = 1;
1609       ++p;
1610       size -= 2;                /* disregard opening and closing quote */
1611     }
1612   putc (quote_char, fp);
1613   fputs (new_str, fp);
1614
1615   /* Look for fragment identifier, if any. */
1616   if (find_fragment (p, size, &frag_beg, &frag_end))
1617     fwrite (frag_beg, 1, frag_end - frag_beg, fp);
1618   p += size;
1619   if (quote_flag)
1620     ++p;
1621   putc (quote_char, fp);
1622   *pp = p;
1623 }
1624
1625 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
1626    preceded by '&'.  If the character is not found, return zero.  If
1627    the character is found, return 1 and set BP and EP to point to the
1628    beginning and end of the region.
1629
1630    This is used for finding the fragment indentifiers in URLs.  */
1631
1632 static int
1633 find_fragment (const char *beg, int size, const char **bp, const char **ep)
1634 {
1635   const char *end = beg + size;
1636   int saw_amp = 0;
1637   for (; beg < end; beg++)
1638     {
1639       switch (*beg)
1640         {
1641         case '&':
1642           saw_amp = 1;
1643           break;
1644         case '#':
1645           if (!saw_amp)
1646             {
1647               *bp = beg;
1648               *ep = end;
1649               return 1;
1650             }
1651           /* fallthrough */
1652         default:
1653           saw_amp = 0;
1654         }
1655     }
1656   return 0;
1657 }
1658
1659 typedef struct _downloaded_file_list {
1660   char*                          file;
1661   downloaded_file_t              download_type;
1662   struct _downloaded_file_list*  next;
1663 } downloaded_file_list;
1664
1665 static downloaded_file_list *downloaded_files;
1666
1667 /* Remembers which files have been downloaded.  In the standard case, should be
1668    called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1669    download successfully (i.e. not for ones we have failures on or that we skip
1670    due to -N).
1671
1672    When we've downloaded a file and tacked on a ".html" extension due to -E,
1673    call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1674    FILE_DOWNLOADED_NORMALLY.
1675
1676    If you just want to check if a file has been previously added without adding
1677    it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
1678    with local filenames, not remote URLs. */
1679 downloaded_file_t
1680 downloaded_file (downloaded_file_t  mode, const char*  file)
1681 {
1682   boolean                       found_file = FALSE;
1683   downloaded_file_list*         rover = downloaded_files;
1684
1685   while (rover != NULL)
1686     if (strcmp(rover->file, file) == 0)
1687       {
1688         found_file = TRUE;
1689         break;
1690       }
1691     else
1692       rover = rover->next;
1693
1694   if (found_file)
1695     return rover->download_type;  /* file had already been downloaded */
1696   else
1697     {
1698       if (mode != CHECK_FOR_FILE)
1699         {
1700           rover = xmalloc(sizeof(*rover));
1701           rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1702           rover->download_type = mode;
1703           rover->next = downloaded_files;
1704           downloaded_files = rover;
1705         }
1706
1707       return FILE_NOT_ALREADY_DOWNLOADED;
1708     }
1709 }
1710
1711 void
1712 downloaded_files_free (void)
1713 {
1714   downloaded_file_list*         rover = downloaded_files;
1715   while (rover)
1716     {
1717       downloaded_file_list *next = rover->next;
1718       xfree (rover->file);
1719       xfree (rover);
1720       rover = next;
1721     }
1722 }
1723 \f
1724 /* Initialization of static stuff. */
1725 void
1726 url_init (void)
1727 {
1728   init_unsafe_char_table ();
1729 }