2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of Wget.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
29 #include <sys/types.h>
45 /* Table of Unsafe chars. This is initialized in
46 init_unsafe_char_table. */
48 static char unsafe_char_table[256];
50 #define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)])
52 /* rfc1738 reserved chars. This is too short to warrant a table. We
53 don't use this yet; preservation of reserved chars will be
54 implemented when I integrate the new `reencode_string'
56 #define RESERVED_CHAR(c) ( (c) == ';' || (c) == '/' || (c) == '?' \
57 || (c) == '@' || (c) == '=' || (c) == '&' \
61 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
63 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
65 static int urlpath_length PARAMS ((const char *));
67 /* A NULL-terminated list of strings to be recognized as protocol
68 types (URL schemes). Note that recognized doesn't mean supported
69 -- only HTTP, HTTPS and FTP are currently supported.
71 However, a string that does not match anything in the list will be
72 considered a relative URL. Thus it's important that this list
73 contains anything anyone could think of as being legal.
75 #### This is probably broken. Wget should use other means to
76 distinguish between absolute and relative URIs in HTML links.
78 Take a look at <http://www.w3.org/pub/WWW/Addressing/schemes.html>
80 static char *protostrings[] =
122 /* Similar to former, but for supported protocols: */
123 static struct proto sup_protos[] =
125 { "http://", URLHTTP, DEFAULT_HTTP_PORT },
127 { "https://",URLHTTPS, DEFAULT_HTTPS_PORT},
129 { "ftp://", URLFTP, DEFAULT_FTP_PORT }
132 static void parse_dir PARAMS ((const char *, char **, char **));
133 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
134 static char *construct PARAMS ((const char *, const char *, int , int));
135 static char *construct_relative PARAMS ((const char *, const char *));
136 static char process_ftp_type PARAMS ((char *));
141 - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
142 - @ and :, for user/password encoding.
143 - everything over 127 (but we don't bother with recording those). */
145 init_unsafe_char_table (void)
148 for (i = 0; i < 256; i++)
149 if (i < 32 || i >= 127
165 unsafe_char_table[i] = 1;
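/* Illustration (not part of the original file): UNSAFE_CHAR is a plain
   table lookup, so it is cheap to call once per character.  The
   expected results below assume that the elided middle of
   init_unsafe_char_table also marks the rfc1738 characters listed in
   the comment above, in addition to the control range and 128-255.  */
#if 0  /* sketch only; never compiled */
static void
example_unsafe_char (void)
{
  init_unsafe_char_table ();
  assert (UNSAFE_CHAR ('<'));     /* in the rfc1738 set quoted above */
  assert (UNSAFE_CHAR ('\n'));    /* control character: i < 32 */
  assert (UNSAFE_CHAR ('\xe9'));  /* everything over 127 */
  assert (!UNSAFE_CHAR ('a'));    /* ordinary characters are left alone */
}
#endif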
168 /* Decodes the forms %xy in a URL to the character the hexadecimal
169 code of which is xy. xy are hexadecimal digits from
170 [0123456789ABCDEF] (case-insensitive). If x or y are not
171 hex-digits or `%' precedes `\0', the sequence is inserted
175 decode_string (char *s)
185 /* Do nothing if at the end of the string, or if the chars
186 are not hex-digits. */
187 if (!*(s + 1) || !*(s + 2)
188 || !(ISXDIGIT (*(s + 1)) && ISXDIGIT (*(s + 2))))
193 *p = (XCHAR_TO_XDIGIT (*(s + 1)) << 4) + XCHAR_TO_XDIGIT (*(s + 2));
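/* Illustration (not part of the original file): decode_string rewrites
   its argument in place, so it must be called on a writable buffer.  */
#if 0  /* sketch only; never compiled */
static void
example_decode_string (void)
{
  char buf[] = "/pub/%7Ehniksic/hello%20world";
  decode_string (buf);
  /* buf now reads "/pub/~hniksic/hello world".  A malformed escape
     such as "%4" at the end of the string would be left literally.  */
}
#endif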
200 /* Like encode_string, but return S if there are no unsafe chars. */
203 encode_string_maybe (const char *s)
210 for (p1 = s; *p1; p1++)
211 if (UNSAFE_CHAR (*p1))
212 addition += 2; /* Two more characters (hex digits) */
217 newlen = (p1 - s) + addition;
218 newstr = (char *)xmalloc (newlen + 1);
224 if (UNSAFE_CHAR (*p1))
226 const unsigned char c = *p1++;
228 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
229 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
235 assert (p2 - newstr == newlen);
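/* Illustration (not part of the original file): encode_string_maybe
   avoids a copy in the common case of an already-clean string; only
   the encoded result is malloc-ed.  The expectations assume '<' and
   '>' are in the unsafe table, per the rfc1738 list above, and that
   the hex digits are emitted in upper case (%XX).  */
#if 0  /* sketch only; never compiled */
static void
example_encode_string_maybe (void)
{
  const char *clean = "http://www.gnu.org/software/wget/";
  const char *dirty = "index<old>.html";
  char *a = encode_string_maybe (clean);
  char *b = encode_string_maybe (dirty);
  assert (a == clean);                          /* same pointer returned */
  assert (!strcmp (b, "index%3Cold%3E.html"));  /* '<' -> %3C, '>' -> %3E */
  xfree (b);
}
#endif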
240 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
241 given string, returning a malloc-ed %XX encoded string. */
244 encode_string (const char *s)
246 char *encoded = encode_string_maybe (s);
253 /* Encode unsafe characters in PTR to %xx. If such encoding is done,
254 the old value of PTR is freed and PTR is made to point to the newly
255 allocated storage. */
257 #define ENCODE(ptr) do { \
258 char *e_new = encode_string_maybe (ptr); \
266 /* Returns the protocol type if URL's protocol is supported, or
267 URLUNKNOWN if not. */
269 urlproto (const char *url)
273 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
274 if (!strncasecmp (url, sup_protos[i].name, strlen (sup_protos[i].name)))
275 return sup_protos[i].ind;
276 for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++);
279 for (++i; url[i] && url[i] != '/'; i++)
280 if (!ISDIGIT (url[i]))
282 if (url[i - 1] == ':')
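/* Illustration (not part of the original file): expected return values.
   The prefixed forms follow directly from the sup_protos loop above;
   the bare host:port heuristic depends on the elided tail of this
   function, so it is stated as an expectation only.  */
#if 0  /* sketch only; never compiled */
static void
example_urlproto (void)
{
  assert (urlproto ("http://www.gnu.org/") == URLHTTP);
  assert (urlproto ("ftp://ftp.gnu.org/pub/gnu/") == URLFTP);
  /* "www.gnu.org:8080/dir/" should be treated as HTTP (numeric port),
     while "ftp.gnu.org:/pub/" (colon right before the slash) as FTP.  */
}
#endif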
291 /* Skip the protocol part of the URL, e.g. `http://'. If no protocol
292 part is found, returns 0. */
294 skip_proto (const char *url)
299 for (s = protostrings; *s; s++)
300 if (!strncasecmp (*s, url, strlen (*s)))
305 /* HTTP and FTP protocols are expected to yield exact host names
306 (i.e. the `//' part must be skipped, too). */
307 if (!strcmp (*s, "http:") || !strcmp (*s, "ftp:"))
312 /* Returns 1 if the URL begins with a protocol (supported or
313 unsupported), 0 otherwise. */
315 has_proto (const char *url)
319 for (s = protostrings; *s; s++)
320 if (strncasecmp (url, *s, strlen (*s)) == 0)
325 /* Skip the username and password, if present here. The function
326 should be called *not* with the complete URL, but with the part
327 right after the protocol.
329 If no username and password are found, return 0. */
331 skip_uname (const char *url)
334 const char *q = NULL;
335 for (p = url ; *p && *p != '/'; p++)
336 if (*p == '@') q = p;
337 /* If a `@' was found before the first occurrence of `/', skip
345 /* Allocate a new urlinfo structure, fill it with default values and
346 return a pointer to it. */
352 u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
353 memset (u, 0, sizeof (*u));
354 u->proto = URLUNKNOWN;
358 /* Perform a "deep" free of the urlinfo structure. The structure
359 should have been created with newurl, but need not have been used.
360 If COMPLETE is non-0, free the pointer itself. */
362 freeurl (struct urlinfo *u, int complete)
366 FREE_MAYBE (u->host);
367 FREE_MAYBE (u->path);
368 FREE_MAYBE (u->file);
370 FREE_MAYBE (u->user);
371 FREE_MAYBE (u->passwd);
372 FREE_MAYBE (u->local);
373 FREE_MAYBE (u->referer);
375 freeurl (u->proxy, 1);
381 /* Extract the given URL of the form
382 (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
383 1. hostname (terminated with `/' or `:')
384 2. port number (terminated with `/'), or chosen for the protocol
385 3. dirname (everything after hostname)
386 Most errors are handled. No allocation is done, you must supply
387 pointers to allocated memory.
388 ...and a host of other stuff :-)
390 - Recognizes hostname:dir/file for FTP and
391 hostname (:portnum)?/dir/file for HTTP.
392 - Parses the path to yield directory and file
393 - Parses the URL to yield the username and passwd (if present)
394 - Decodes the strings, in case they contain "forbidden" characters
395 - Writes the result to struct urlinfo
397 If the argument STRICT is set, it recognizes only the canonical form. */
400 parseurl (const char *url, struct urlinfo *u, int strict)
403 int recognizable; /* Recognizable URL is the one where
404 the protocol name was explicitly
405 named, i.e. it wasn't deduced from
409 DEBUGP (("parseurl (\"%s\") -> ", url));
410 recognizable = has_proto (url);
411 if (strict && !recognizable)
413 for (i = 0, l = 0; i < ARRAY_SIZE (sup_protos); i++)
415 l = strlen (sup_protos[i].name);
416 if (!strncasecmp (sup_protos[i].name, url, l))
419 /* If protocol is recognizable, but unsupported, bail out, else
421 if (recognizable && i == ARRAY_SIZE (sup_protos))
423 else if (i == ARRAY_SIZE (sup_protos))
426 u->proto = type = sup_protos[i].ind;
428 if (type == URLUNKNOWN)
430 /* Allow a username and password to be specified (i.e. just skip
433 l += skip_uname (url + l);
434 for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
437 /* Get the hostname. */
438 u->host = strdupdelim (url + l, url + i);
439 DEBUGP (("host %s -> ", u->host));
441 /* Assume no port has been given. */
445 /* We have a colon delimiting the hostname. It could mean that
446 a port number is following it, or a directory. */
447 if (ISDIGIT (url[++i])) /* A port number */
449 if (type == URLUNKNOWN)
450 u->proto = type = URLHTTP;
451 for (; url[i] && url[i] != '/'; i++)
452 if (ISDIGIT (url[i]))
453 u->port = 10 * u->port + (url[i] - '0');
458 DEBUGP (("port %hu -> ", u->port));
460 else if (type == URLUNKNOWN) /* or a directory */
461 u->proto = type = URLFTP;
462 else /* or just a misformed port number */
465 else if (type == URLUNKNOWN)
466 u->proto = type = URLHTTP;
470 for (ind = 0; ind < ARRAY_SIZE (sup_protos); ind++)
471 if (sup_protos[ind].ind == type)
473 if (ind == ARRAY_SIZE (sup_protos))
475 u->port = sup_protos[ind].port;
477 /* Some delimiter troubles... */
478 if (url[i] == '/' && url[i - 1] != ':')
481 while (url[i] && url[i] == '/')
483 u->path = (char *)xmalloc (strlen (url + i) + 8);
484 strcpy (u->path, url + i);
487 u->ftp_type = process_ftp_type (u->path);
488 /* #### We don't handle type `d' correctly yet. */
489 if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
491 DEBUGP (("ftp_type %c -> ", u->ftp_type));
493 DEBUGP (("opath %s -> ", u->path));
494 /* Parse the username and password (if existing). */
495 parse_uname (url, &u->user, &u->passwd);
496 /* Decode the strings, as per RFC 1738. */
497 decode_string (u->host);
498 decode_string (u->path);
500 decode_string (u->user);
502 decode_string (u->passwd);
503 /* Parse the directory. */
504 parse_dir (u->path, &u->dir, &u->file);
505 DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
506 /* Simplify the directory. */
507 path_simplify (u->dir);
508 /* Remove the leading `/' in HTTP. */
509 if (type == URLHTTP && *u->dir == '/')
510 memmove (u->dir, u->dir + 1, strlen (u->dir)); /* regions overlap; strcpy would be undefined */
511 DEBUGP (("ndir %s\n", u->dir));
512 /* Strip trailing `/'. */
514 if (l > 1 && u->dir[l - 1] == '/')
515 u->dir[l - 1] = '\0';
516 /* Re-create the path: */
517 abs_ftp = (u->proto == URLFTP && *u->dir == '/');
518 /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
519 abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
520 strcpy (u->path, abs_ftp ? "%2F" : "/");
521 strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
522 strcat (u->path, *u->dir ? "/" : "");
523 strcat (u->path, u->file);
525 DEBUGP (("newpath: %s\n", u->path));
526 /* Create the clean URL. */
527 u->url = str_url (u, 0);
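/* Illustration (not part of the original file): what a successful parse
   of an HTTP URL with an explicit port is expected to leave in the
   urlinfo structure, given the steps above (the leading '/' is
   stripped from DIR for HTTP, and FILE is split off by parse_dir).  */
#if 0  /* sketch only; never compiled */
static void
example_parseurl (void)
{
  struct urlinfo *u = newurl ();
  parseurl ("http://www.gnu.org:8080/software/wget/index.html", u, 0);
  /* Assuming the parse succeeds, the expected contents are:
       u->proto == URLHTTP          u->host == "www.gnu.org"
       u->port  == 8080             u->dir  == "software/wget"
       u->file  == "index.html"     u->path == "/software/wget/index.html"
     and u->url holds the cleaned-up printable form.  */
  freeurl (u, 1);
}
#endif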
531 /* Special versions of DOTP and DDOTP for parse_dir(). They work like
532 DOTP and DDOTP, but they also recognize `?' as end-of-string
533 delimiter. This is needed for correct handling of query
536 #define PD_DOTP(x) ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
537 #define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') \
538 && (!*((x) + 2) || *((x) + 2) == '?'))
540 /* Build the directory and filename components of the path. Both
541 components are *separately* malloc-ed strings! It does not change
542 the contents of path.
544 If the path ends with "." or "..", they are (correctly) counted as
547 parse_dir (const char *path, char **dir, char **file)
551 l = urlpath_length (path);
552 for (i = l; i && path[i] != '/'; i--);
554 if (!i && *path != '/') /* Just filename */
556 if (PD_DOTP (path) || PD_DDOTP (path))
558 *dir = strdupdelim (path, path + l);
559 *file = xstrdup (path + l); /* normally empty, but could
564 *dir = xstrdup (""); /* This is required because of FTP */
565 *file = xstrdup (path);
568 else if (!i) /* /filename */
570 if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
572 *dir = strdupdelim (path, path + l);
573 *file = xstrdup (path + l); /* normally empty, but could
578 *dir = xstrdup ("/");
579 *file = xstrdup (path + 1);
582 else /* Nonempty directory with or without a filename */
584 if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
586 *dir = strdupdelim (path, path + l);
587 *file = xstrdup (path + l); /* normally empty, but could
592 *dir = strdupdelim (path, path + i);
593 *file = xstrdup (path + i + 1);
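/* Illustration (not part of the original file): how the three branches
   above split a few representative paths.  DIR and FILE are separately
   malloc-ed; frees are omitted for brevity.  */
#if 0  /* sketch only; never compiled */
static void
example_parse_dir (void)
{
  char *dir, *file;
  parse_dir ("/software/wget/index.html", &dir, &file);
  /* dir == "/software/wget", file == "index.html" */
  parse_dir ("/index.html", &dir, &file);
  /* dir == "/", file == "index.html" */
  parse_dir ("index.html", &dir, &file);
  /* dir == "", file == "index.html" */
  parse_dir ("/software/..", &dir, &file);
  /* ".." counts as a directory: dir == "/software/..", file == "" */
}
#endif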
598 /* Find the optional username and password within the URL, as per
599 RFC1738. The returned user and passwd char pointers are
602 parse_uname (const char *url, char **user, char **passwd)
605 const char *p, *q, *col;
611 /* Look for the end of the protocol string. */
612 l = skip_proto (url);
615 /* Add protocol offset. */
617 /* Is there an `@' character? */
618 for (p = url; *p && *p != '/'; p++)
621 /* If not, return. */
624 /* Else find the username and password. */
625 for (p = q = col = url; *p && *p != '/'; p++)
627 if (*p == ':' && !*user)
629 *user = (char *)xmalloc (p - url + 1);
630 memcpy (*user, url, p - url);
631 (*user)[p - url] = '\0';
634 if (*p == '@') q = p;
636 /* Decide whether you have only the username or both. */
637 where = *user ? passwd : user;
638 *where = (char *)xmalloc (q - col + 1);
639 memcpy (*where, col, q - col);
640 (*where)[q - col] = '\0';
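/* Illustration (not part of the original file): expected results for
   the two usual forms.  USER and PASSWD are only filled in when an `@'
   appears before the first '/' of the path.  */
#if 0  /* sketch only; never compiled */
static void
example_parse_uname (void)
{
  char *user, *passwd;
  parse_uname ("ftp://hniksic:secret@ftp.gnu.org/pub/", &user, &passwd);
  /* user == "hniksic", passwd == "secret" */
  parse_uname ("ftp://hniksic@ftp.gnu.org/pub/", &user, &passwd);
  /* user == "hniksic"; passwd is expected to stay NULL, assuming the
     elided prologue initializes both pointers to NULL.  */
}
#endif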
644 /* If PATH ends with `;type=X', return the character X. */
646 process_ftp_type (char *path)
648 int len = strlen (path);
651 && !memcmp (path + len - 7, ";type=", 6))
653 path[len - 7] = '\0';
654 return path[len - 1];
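/* Illustration (not part of the original file): the ";type=X" suffix is
   both reported and stripped, so the caller is left with a clean path.  */
#if 0  /* sketch only; never compiled */
static void
example_process_ftp_type (void)
{
  char path[] = "/pub/gnu/wget.tar.gz;type=i";
  char type = process_ftp_type (path);
  /* type == 'i' (binary transfer requested); path has been truncated
     in place to "/pub/gnu/wget.tar.gz".  */
}
#endif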
660 /* Return the URL as a well-formed string, with a proper protocol, optional port
661 number, directory and optional user/password. If `hide' is non-zero (as it
662 is when we're calling this on a URL we plan to print, but not when calling it
663 to canonicalize a URL for use within the program), password will be hidden.
664 The forbidden characters in the URL will be cleansed. */
666 str_url (const struct urlinfo *u, int hide)
668 char *res, *host, *user, *passwd, *proto_name, *dir, *file;
669 int i, l, ln, lu, lh, lp, lf, ld;
670 unsigned short proto_default_port;
672 /* Look for the protocol name. */
673 for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
674 if (sup_protos[i].ind == u->proto)
676 if (i == ARRAY_SIZE (sup_protos))
678 proto_name = sup_protos[i].name;
679 proto_default_port = sup_protos[i].port;
680 host = encode_string (u->host);
681 dir = encode_string (u->dir);
682 file = encode_string (u->file);
683 user = passwd = NULL;
685 user = encode_string (u->user);
689 /* Don't output the password, or someone might see it over the user's
690 shoulder (or in saved wget output). Don't give away the number of
691 characters in the password, either, as we did in past versions of
692 this code, when we replaced the password characters with 'x's. */
693 passwd = xstrdup("<password>");
695 passwd = encode_string (u->passwd);
697 if (u->proto == URLFTP && *dir == '/')
699 char *tmp = (char *)xmalloc (strlen (dir) + 3);
700 /*sprintf (tmp, "%%2F%s", dir + 1);*/
704 strcpy (tmp + 3, dir + 1);
709 ln = strlen (proto_name);
710 lu = user ? strlen (user) : 0;
711 lp = passwd ? strlen (passwd) : 0;
715 res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
716 /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", proto_name,
717 (user ? user : ""), (passwd ? ":" : ""),
718 (passwd ? passwd : ""), (user ? "@" : ""),
719 host, u->port, dir, *dir ? "/" : "", file); */
721 memcpy (res, proto_name, ln);
725 memcpy (res + l, user, lu);
730 memcpy (res + l, passwd, lp);
735 memcpy (res + l, host, lh);
737 if (u->port != proto_default_port)
740 long_to_string (res + l, (long)u->port);
741 l += numdigit (u->port);
744 memcpy (res + l, dir, ld);
748 strcpy (res + l, file);
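/* Illustration (not part of the original file): round-tripping a parsed
   URL through str_url.  The default port is omitted, and with HIDE
   non-zero the password is printed as "<password>".  */
#if 0  /* sketch only; never compiled */
static void
example_str_url (void)
{
  struct urlinfo *u = newurl ();
  parseurl ("ftp://hniksic:secret@ftp.gnu.org/pub/gnu/", u, 0);
  /* Assuming the parse succeeds: */
  {
    char *shown = str_url (u, 1);   /* for printing: hide the password */
    char *clean = str_url (u, 0);   /* canonical form: password intact */
    /* shown is expected to read
         "ftp://hniksic:<password>@ftp.gnu.org/pub/gnu/"
       while clean keeps "secret"; the default FTP port is omitted.  */
    xfree (shown);
    xfree (clean);
  }
  freeurl (u, 1);
}
#endif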
757 /* Check whether two URL-s are equivalent, i.e. pointing to the same
758 location. Uses parseurl to parse them, and compares the canonical
761 Returns 1 if the URL1 is equivalent to URL2, 0 otherwise. Also
762 return 0 on error. */
764 url_equal (const char *url1, const char *url2)
766 struct urlinfo *u1, *u2;
771 err = parseurl (url1, u1, 0);
778 err = parseurl (url2, u2, 0);
784 res = !strcmp (u1->url, u2->url);
791 get_urls_file (const char *file)
793 struct file_memory *fm;
795 const char *text, *text_end;
798 fm = read_file (file);
801 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
804 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
807 text_end = fm->content + fm->length;
808 while (text < text_end)
810 const char *line_beg = text;
811 const char *line_end = memchr (text, '\n', text_end - text);
817 while (line_beg < line_end
818 && ISSPACE (*line_beg))
820 while (line_end > line_beg + 1
821 && ISSPACE (*(line_end - 1)))
823 if (line_end > line_beg)
825 urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
826 memset (entry, 0, sizeof (*entry));
828 entry->url = strdupdelim (line_beg, line_end);
840 /* Free the linked list of urlpos. */
842 free_urlpos (urlpos *l)
846 urlpos *next = l->next;
848 FREE_MAYBE (l->local_name);
854 /* Rotate FNAME opt.backups times */
856 rotate_backups(const char *fname)
858 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
859 char *from = (char *)alloca (maxlen);
860 char *to = (char *)alloca (maxlen);
864 if (stat (fname, &sb) == 0)
865 if (S_ISREG (sb.st_mode) == 0)
868 for (i = opt.backups; i > 1; i--)
870 sprintf (from, "%s.%d", fname, i - 1);
871 sprintf (to, "%s.%d", fname, i);
872 /* #### This will fail on machines without the rename() system
877 sprintf (to, "%s.%d", fname, 1);
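/* Illustration (not part of the original file): with opt.backups == 3,
   one rotation of "wget.log" performs, in order,
       wget.log.2 -> wget.log.3
       wget.log.1 -> wget.log.2
       wget.log   -> wget.log.1
   so the most recent backup always carries the ".1" suffix.  */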
881 /* Create all the necessary directories for PATH (a file). Calls
882 make_directory() internally. */
884 mkalldirs (const char *path)
891 p = path + strlen (path);
892 for (; *p != '/' && p != path; p--);
893 /* Don't create if it's just a file. */
894 if ((p == path) && (*p != '/'))
896 t = strdupdelim (path, p);
897 /* Check whether the directory exists. */
898 if ((stat (t, &st) == 0))
900 if (S_ISDIR (st.st_mode))
907 /* If the dir exists as a file name, remove it first. This
908 is *only* for Wget to work with buggy old CERN http
909 servers. Here is the scenario: When Wget tries to
910 retrieve a directory without a slash, e.g.
911 http://foo/bar (bar being a directory), CERN server will
912 not redirect it to http://foo/bar/ -- it will generate a
913 directory listing containing links to bar/file1,
914 bar/file2, etc. Wget will lose because it saves this
915 HTML listing to a file `bar', so it cannot create the
916 directory. To work around this, if the file of the same
917 name exists, we just remove it and create the directory
919 DEBUGP (("Removing %s because of directory danger!\n", t));
923 res = make_directory (t);
925 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
931 count_slashes (const char *s)
940 /* Return the path name of the URL-equivalent file name, with a
941 remote-like structure of directories. */
943 mkstruct (const struct urlinfo *u)
945 char *host, *dir, *file, *res, *dirpref;
948 assert (u->dir != NULL);
949 assert (u->host != NULL);
953 char *ptr = u->dir + (*u->dir == '/');
954 int slash_count = 1 + count_slashes (ptr);
955 int cut = MINVAL (opt.cut_dirs, slash_count);
956 for (; cut && *ptr; ptr++)
959 STRDUP_ALLOCA (dir, ptr);
962 dir = u->dir + (*u->dir == '/');
964 host = xstrdup (u->host);
965 /* Check for the true name (or at least a consistent name for saving
966 to directory) of HOST, reusing the hlist if possible. */
967 if (opt.add_hostdir && !opt.simple_check)
969 char *nhost = realhost (host);
973 /* Add dir_prefix and hostname (if required) to the beginning of
977 if (!DOTP (opt.dir_prefix))
979 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
980 + strlen (host) + 1);
981 sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
984 STRDUP_ALLOCA (dirpref, host);
986 else /* not add_hostdir */
988 if (!DOTP (opt.dir_prefix))
989 dirpref = opt.dir_prefix;
995 /* If there is a prefix, prepend it. */
998 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
999 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
1002 dir = encode_string (dir);
1004 if (l && dir[l - 1] == '/')
1008 file = "index.html";
1012 /* Finally, construct the full name. */
1013 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
1014 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
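/* Illustration (not part of the original file): expected local names
   for http://www.gnu.org/software/wget/ under a default configuration
   (opt.dir_prefix == ".", host directories enabled, opt.cut_dirs == 0):
       www.gnu.org/software/wget/index.html
   With --cut-dirs=1 the first remote directory is dropped:
       www.gnu.org/wget/index.html
   and with host directories disabled (-nH):
       software/wget/index.html
   The empty file name at the end of the URL becomes "index.html", as
   the code above shows.  */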
1019 /* Create a unique filename, corresponding to a given URL. Calls
1020 mkstruct if necessary. Does *not* actually create any directories. */
1022 url_filename (const struct urlinfo *u)
1025 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
1029 file = mkstruct (u);
1035 file = xstrdup ("index.html");
1037 file = xstrdup (u->file);
1042 /* Check whether the prefix directory is something other than "."
1043 before prepending it. */
1044 if (!DOTP (opt.dir_prefix))
1046 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1047 + 1 + strlen (file) + 1);
1048 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1053 /* DOS-ish file systems don't like `%' signs in them; we change it
1058 for (p = file; *p; p++)
1062 #endif /* WINDOWS */
1064 /* Check the cases in which the unique extensions are not used:
1065 1) Clobbering is turned off (-nc).
1066 2) Retrieval with regetting.
1067 3) Timestamping is used.
1068 4) Hierarchy is built.
1070 The exception is the case when file does exist and is a
1071 directory (actually support for bad httpd-s). */
1072 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1073 && !(file_exists_p (file) && !file_non_directory_p (file)))
1076 /* Find a unique name. */
1077 name = unique_name (file);
1082 /* Like strlen(), but allow the URL to be ended with '?'. */
1084 urlpath_length (const char *url)
1086 const char *q = strchr (url, '?');
1089 return strlen (url);
1092 /* Find the last occurrence of character C in the range [b, e), or
1093 NULL, if none are present. This is almost completely equivalent to
1094 { *e = '\0'; return strrchr(b, c); }, except that it doesn't change
1095 the contents of the string. */
1097 find_last_char (const char *b, const char *e, char c)
1105 /* Resolve the result of "linking" a base URI (BASE) to a
1106 link-specified URI (LINK).
1108 Either of the URIs may be absolute or relative, complete with the
1109 host name, or path only. This tries to behave "reasonably" in all
1110 foreseeable cases. It employs little specific knowledge about
1111 protocols or URL-specific stuff -- it just works on strings.
1113 The parameter LINKLENGTH is useful if LINK is not zero-terminated.
1114 See uri_merge for a gentler interface to this functionality.
1116 #### This function should handle `./' and `../' so that the evil
1117 path_simplify can go. */
1119 uri_merge_1 (const char *base, const char *link, int linklength, int no_proto)
1125 const char *end = base + urlpath_length (base);
1129 /* LINK is a relative URL: we need to replace everything
1130 after last slash (possibly empty) with LINK.
1132 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1133 our result should be "whatever/foo/qux/xyzzy". */
1134 int need_explicit_slash = 0;
1136 const char *start_insert;
1137 const char *last_slash = find_last_char (base, end, '/');
1140 /* No slash found at all. Append LINK to what we have,
1141 but we'll need a slash as a separator.
1143 Example: if base == "foo" and link == "qux/xyzzy", then
1144 we cannot just append link to base, because we'd get
1145 "fooqux/xyzzy", whereas what we want is
1148 To make sure the / gets inserted, we set
1149 need_explicit_slash to 1. We also set start_insert
1150 to end + 1, so that the length calculations work out
1151 correctly for one more (slash) character. Accessing
1152 that character is fine, since it will be the
1153 delimiter, '\0' or '?'. */
1154 /* example: "foo?..." */
1155 /* ^ ('?' gets changed to '/') */
1156 start_insert = end + 1;
1157 need_explicit_slash = 1;
1159 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1161 /* example: http://host" */
1163 start_insert = end + 1;
1164 need_explicit_slash = 1;
1168 /* example: "whatever/foo/bar" */
1170 start_insert = last_slash + 1;
1173 span = start_insert - base;
1174 constr = (char *)xmalloc (span + linklength + 1);
1176 memcpy (constr, base, span);
1177 if (need_explicit_slash)
1178 constr[span - 1] = '/';
1180 memcpy (constr + span, link, linklength);
1181 constr[span + linklength] = '\0';
1183 else /* *link == `/' */
1185 /* LINK is an absolute path: we need to replace everything
1186 after (and including) the FIRST slash with LINK.
1188 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1189 "/qux/xyzzy", our result should be
1190 "http://host/qux/xyzzy". */
1193 const char *start_insert = NULL; /* for gcc to shut up. */
1194 const char *pos = base;
1195 int seen_slash_slash = 0;
1196 /* We're looking for the first slash, but want to ignore
1199 slash = memchr (pos, '/', end - pos);
1200 if (slash && !seen_slash_slash)
1201 if (*(slash + 1) == '/')
1204 seen_slash_slash = 1;
1208 /* At this point, SLASH is the location of the first / after
1209 "//", or the first slash altogether. START_INSERT is the
1210 pointer to the location where LINK will be inserted. When
1211 examining the last two examples, keep in mind that LINK
1214 if (!slash && !seen_slash_slash)
1215 /* example: "foo" */
1217 start_insert = base;
1218 else if (!slash && seen_slash_slash)
1219 /* example: "http://foo" */
1222 else if (slash && !seen_slash_slash)
1223 /* example: "foo/bar" */
1225 start_insert = base;
1226 else if (slash && seen_slash_slash)
1227 /* example: "http://something/" */
1229 start_insert = slash;
1231 span = start_insert - base;
1232 constr = (char *)xmalloc (span + linklength + 1);
1234 memcpy (constr, base, span);
1236 memcpy (constr + span, link, linklength);
1237 constr[span + linklength] = '\0';
1240 else /* !no_proto */
1242 constr = strdupdelim (link, link + linklength);
1247 /* Merge BASE with LINK and return the resulting URI. This is an
1248 interface to uri_merge_1 that assumes that LINK is a
1249 zero-terminated string. */
1251 uri_merge (const char *base, const char *link)
1253 return uri_merge_1 (base, link, strlen (link), !has_proto (link));
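/* Illustration (not part of the original file): typical merges,
   matching the relative and absolute cases described in uri_merge_1.  */
#if 0  /* sketch only; never compiled */
static void
example_uri_merge (void)
{
  char *a, *b, *c;
  a = uri_merge ("http://www.gnu.org/software/wget/index.html",
                 "manual.html");
  /* a == "http://www.gnu.org/software/wget/manual.html" */
  b = uri_merge ("http://www.gnu.org/software/wget/index.html",
                 "/graphics/logo.gif");
  /* b == "http://www.gnu.org/graphics/logo.gif" */
  c = uri_merge ("http://www.gnu.org/", "ftp://ftp.gnu.org/pub/");
  /* LINK already carries a protocol, so a plain copy is returned:
     c == "ftp://ftp.gnu.org/pub/" */
  xfree (a); xfree (b); xfree (c);
}
#endif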
1256 /* Optimize URL by host, destructively replacing u->host with realhost
1257 (u->host). Do this regardless of opt.simple_check. */
1259 opt_url (struct urlinfo *u)
1261 /* Find the "true" host. */
1262 char *host = realhost (u->host);
1265 assert (u->dir != NULL); /* the URL must have been parsed */
1266 /* Refresh the printed representation. */
1268 u->url = str_url (u, 0);
1271 /* Returns proxy host address, in accordance with PROTO. */
1273 getproxy (uerr_t proto)
1275 if (proto == URLHTTP)
1276 return opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1277 else if (proto == URLFTP)
1278 return opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1280 else if (proto == URLHTTPS)
1281 return opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1282 #endif /* HAVE_SSL */
1287 /* Should a host be accessed through a proxy, according to no_proxy? */
1289 no_proxy_match (const char *host, const char **no_proxy)
1294 return !sufmatch (no_proxy, host);
1297 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1298 static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
1300 /* Change the links in an HTML document. Accepts a structure that
1301 defines the positions of all the links. */
1303 convert_links (const char *file, urlpos *l)
1305 struct file_memory *fm;
1308 downloaded_file_t downloaded_file_return;
1310 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1313 /* First we do a "dry run": go through the list L and see whether
1314 any URL needs to be converted in the first place. If not, just
1315 leave the file alone. */
1318 for (dry = l; dry; dry = dry->next)
1319 if (dry->convert != CO_NOCONVERT)
1323 logputs (LOG_VERBOSE, _("nothing to do.\n"));
1328 fm = read_file (file);
1331 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1332 file, strerror (errno));
1336 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1337 if (opt.backup_converted && downloaded_file_return)
1338 write_backup_file (file, downloaded_file_return);
1340 /* Before opening the file for writing, unlink the file. This is
1341 important if the data in FM is mmaped. In such case, nulling the
1342 file, which is what fopen() below does, would make us read all
1343 zeroes from the mmaped region. */
1344 if (unlink (file) < 0 && errno != ENOENT)
1346 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1347 file, strerror (errno));
1348 read_file_free (fm);
1351 /* Now open the file for writing. */
1352 fp = fopen (file, "wb");
1355 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1356 file, strerror (errno));
1357 read_file_free (fm);
1360 /* Here we loop through all the URLs in file, replacing those of
1361 them that are downloaded with relative references. */
1363 for (; l; l = l->next)
1365 char *url_start = fm->content + l->pos;
1367 if (l->pos >= fm->length)
1369 DEBUGP (("Something strange is going on. Please investigate."));
1372 /* If the URL is not to be converted, skip it. */
1373 if (l->convert == CO_NOCONVERT)
1375 DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
1379 /* Echo the file contents, up to the offending URL's opening
1380 quote, to the outfile. */
1381 fwrite (p, 1, url_start - p, fp);
1383 if (l->convert == CO_CONVERT_TO_RELATIVE)
1385 /* Convert absolute URL to relative. */
1386 char *newname = construct_relative (file, l->local_name);
1387 char *quoted_newname = html_quote_string (newname);
1388 replace_attr (&p, l->size, fp, quoted_newname);
1389 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1390 l->url, newname, l->pos, file));
1392 xfree (quoted_newname);
1394 else if (l->convert == CO_CONVERT_TO_COMPLETE)
1396 /* Convert the link to absolute URL. */
1397 char *newlink = l->url;
1398 char *quoted_newlink = html_quote_string (newlink);
1399 replace_attr (&p, l->size, fp, quoted_newlink);
1400 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1401 newlink, l->pos, file));
1402 xfree (quoted_newlink);
1405 /* Output the rest of the file. */
1406 if (p - fm->content < fm->length)
1407 fwrite (p, 1, fm->length - (p - fm->content), fp);
1409 read_file_free (fm);
1410 logputs (LOG_VERBOSE, _("done.\n"));
1413 /* Construct and return a malloced copy of the relative link from two
1414 pieces of information: local name S1 of the referring file and
1415 local name S2 of the referred file.
1417 So, if S1 is "jagor.srce.hr/index.html" and S2 is
1418 "jagor.srce.hr/images/news.gif", the function will return
1421 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1422 "fly.cc.fer.hr/images/fly.gif", the function will return
1423 "../images/fly.gif".
1425 Caveats: S1 should not begin with `/', unless S2 also begins with
1426 '/'. S1 should not contain things like ".." and such --
1427 construct_relative ("fly/ioccc/../index.html",
1428 "fly/images/fly.gif") will fail. (A workaround is to call
1429 something like path_simplify() on S1). */
1431 construct_relative (const char *s1, const char *s2)
1433 int i, cnt, sepdirs1;
1437 return xstrdup (s2);
1438 /* S1 should *not* be absolute, if S2 wasn't. */
1439 assert (*s1 != '/');
1441 /* Skip the directories common to both strings. */
1444 while (s1[i] && s2[i]
1449 if (s1[i] == '/' && s2[i] == '/')
1454 for (sepdirs1 = 0; s1[i]; i++)
1457 /* Now, construct the file as of:
1458 - ../ repeated sepdirs1 times
1459 - all the non-mutual directories of S2. */
1460 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1461 for (i = 0; i < sepdirs1; i++)
1462 memcpy (res + 3 * i, "../", 3);
1463 strcpy (res + 3 * i, s2 + cnt);
1467 /* Add URL to the head of the list L. */
1469 add_url (urlpos *l, const char *url, const char *file)
1473 t = (urlpos *)xmalloc (sizeof (urlpos));
1474 memset (t, 0, sizeof (*t));
1475 t->url = xstrdup (url);
1476 t->local_name = xstrdup (file);
1482 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1484 /* Rather than just writing over the original .html file with the
1485 converted version, save the former to *.orig. Note we only do
1486 this for files we've _successfully_ downloaded, so we don't
1487 clobber .orig files sitting around from previous invocations. */
1489 /* Construct the backup filename as the original name plus ".orig". */
1490 size_t filename_len = strlen(file);
1491 char* filename_plus_orig_suffix;
1492 boolean already_wrote_backup_file = FALSE;
1493 slist* converted_file_ptr;
1494 static slist* converted_files = NULL;
1496 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1498 /* Just write "orig" over "html". We need to do it this way
1499 because when we're checking to see if we've downloaded the
1500 file before (to see if we can skip downloading it), we don't
1501 know if it's a text/html file. Therefore we don't know yet
1502 at that stage that -E is going to cause us to tack on
1503 ".html", so we need to compare vs. the original URL plus
1504 ".orig", not the original URL plus ".html.orig". */
1505 filename_plus_orig_suffix = alloca (filename_len + 1);
1506 strcpy(filename_plus_orig_suffix, file);
1507 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1509 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1511 /* Append ".orig" to the name. */
1512 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1513 strcpy(filename_plus_orig_suffix, file);
1514 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1517 /* We can get called twice on the same URL thanks to the
1518 convert_all_links() call in main(). If we write the .orig file
1519 each time in such a case, it'll end up containing the first-pass
1520 conversion, not the original file. So, see if we've already been
1521 called on this file. */
1522 converted_file_ptr = converted_files;
1523 while (converted_file_ptr != NULL)
1524 if (strcmp(converted_file_ptr->string, file) == 0)
1526 already_wrote_backup_file = TRUE;
1530 converted_file_ptr = converted_file_ptr->next;
1532 if (!already_wrote_backup_file)
1534 /* Rename <file> to <file>.orig before former gets written over. */
1535 if (rename(file, filename_plus_orig_suffix) != 0)
1536 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1537 file, filename_plus_orig_suffix, strerror (errno));
1539 /* Remember that we've already written a .orig backup for this file.
1540 Note that we never free this memory since we need it till the
1541 convert_all_links() call, which is one of the last things the
1542 program does before terminating. BTW, I'm not sure if it would be
1543 safe to just set 'converted_file_ptr->string' to 'file' below,
1544 rather than making a copy of the string... Another note is that I
1545 thought I could just add a field to the urlpos structure saying
1546 that we'd written a .orig file for this URL, but that didn't work,
1547 so I had to make this separate list.
1548 -- Dan Harkless <wget@harkless.org>
1550 This [adding a field to the urlpos structure] didn't work
1551 because convert_file() is called twice: once after all its
1552 sublinks have been retrieved in recursive_retrieve(), and
1553 once at the end of the day in convert_all_links(). The
1554 original linked list collected in recursive_retrieve() is
1555 lost after the first invocation of convert_links(), and
1556 convert_all_links() makes a new one (it calls get_urls_html()
1557 for each file it covers.) That's why your first approach didn't
1558 work. The way to make it work is perhaps to make this flag a
1559 field in the `urls_html' list.
1560 -- Hrvoje Niksic <hniksic@arsdigita.com>
1562 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1563 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1564 converted_file_ptr->next = converted_files;
1565 converted_files = converted_file_ptr;
1569 static int find_fragment PARAMS ((const char *, int, const char **,
1573 replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
1575 const char *p = *pp;
1577 int size = raw_size;
1578 char quote_char = '\"';
1579 const char *frag_beg, *frag_end;
1581 /* Structure of our string is:
1582 "...old-contents..."
1583 <--- l->size ---> (with quotes)
1586 <--- l->size --> (no quotes) */
1588 if (*p == '\"' || *p == '\'')
1593 size -= 2; /* disregard opening and closing quote */
1595 putc (quote_char, fp);
1596 fputs (new_str, fp);
1598 /* Look for fragment identifier, if any. */
1599 if (find_fragment (p, size, &frag_beg, &frag_end))
1600 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
1604 putc (quote_char, fp);
1608 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
1609 preceded by '&'. If the character is not found, return zero. If
1610 the character is found, return 1 and set BP and EP to point to the
1611 beginning and end of the region.
1613 This is used for finding the fragment identifiers in URLs. */
1616 find_fragment (const char *beg, int size, const char **bp, const char **ep)
1618 const char *end = beg + size;
1620 for (; beg < end; beg++)
1642 typedef struct _downloaded_file_list {
1644 downloaded_file_t download_type;
1645 struct _downloaded_file_list* next;
1646 } downloaded_file_list;
1648 static downloaded_file_list *downloaded_files;
1650 /* Remembers which files have been downloaded. In the standard case, should be
1651 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1652 download successfully (i.e. not for ones we have failures on or that we skip
1655 When we've downloaded a file and tacked on a ".html" extension due to -E,
1656 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1657 FILE_DOWNLOADED_NORMALLY.
1659 If you just want to check if a file has been previously added without adding
1660 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
1661 with local filenames, not remote URLs. */
1663 downloaded_file (downloaded_file_t mode, const char* file)
1665 boolean found_file = FALSE;
1666 downloaded_file_list* rover = downloaded_files;
1668 while (rover != NULL)
1669 if (strcmp(rover->file, file) == 0)
1675 rover = rover->next;
1678 return rover->download_type; /* file had already been downloaded */
1681 if (mode != CHECK_FOR_FILE)
1683 rover = xmalloc(sizeof(*rover));
1684 rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1685 rover->download_type = mode;
1686 rover->next = downloaded_files;
1687 downloaded_files = rover;
1690 return FILE_NOT_ALREADY_DOWNLOADED;
1695 downloaded_files_free (void)
1697 downloaded_file_list* rover = downloaded_files;
1700 downloaded_file_list *next = rover->next;
1701 xfree (rover->file);
1707 /* Initialization of static stuff. */
1711 init_unsafe_char_table ();