2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
29 #include <sys/types.h>
/* Is X exactly the one-character string "."?  */
#define DOTP(x) ((x)[0] == '.' && (x)[1] == '\0')
/* Is X exactly the two-character string ".."?  */
#define DDOTP(x) ((x)[0] == '.' && (x)[1] == '.' && (x)[2] == '\0')
50 static int urlpath_length PARAMS ((const char *));
54 enum url_scheme scheme;
59 /* Supported schemes: */
60 static struct scheme_data supported_schemes[] =
62 { SCHEME_HTTP, "http://", DEFAULT_HTTP_PORT },
64 { SCHEME_HTTPS, "https://", DEFAULT_HTTPS_PORT },
66 { SCHEME_FTP, "ftp://", DEFAULT_FTP_PORT }
69 static void parse_dir PARAMS ((const char *, char **, char **));
70 static uerr_t parse_uname PARAMS ((const char *, char **, char **));
71 static char *construct_relative PARAMS ((const char *, const char *));
72 static char process_ftp_type PARAMS ((char *));
75 /* Support for encoding and decoding of URL strings. We determine
76 whether a character is unsafe through static table lookup. This
77 code assumes ASCII character set and 8-bit chars. */
84 #define R urlchr_reserved
85 #define U urlchr_unsafe
88 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
90 /* rfc1738 reserved chars. We don't use this yet; preservation of
91 reserved chars will be implemented when I integrate the new
92 `reencode_string' function. */
94 #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
98 - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
99 - '@' and ':'; needed for encoding URL username and password.
100 - anything >= 127. */
102 #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
104 const static unsigned char urlchr_table[256] =
106 U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
107 U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
108 U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
109 U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
110 U, 0, U, U, 0, U, R, 0, /* SP ! " # $ % & ' */
111 0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
112 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
113 0, 0, U, R, U, R, U, R, /* 8 9 : ; < = > ? */
114 RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
115 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
116 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
117 0, 0, 0, U, U, U, U, 0, /* X Y Z [ \ ] ^ _ */
118 U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
119 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
120 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
121 0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
123 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
124 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
125 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
126 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
128 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
129 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
130 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
131 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
134 /* Decodes the forms %xy in a URL to the character the hexadecimal
135 code of which is xy. xy are hexadecimal digits from
136 [0123456789ABCDEF] (case-insensitive). If x or y are not
137 hex-digits or `%' precedes `\0', the sequence is inserted
141 decode_string (char *s)
143 char *t = s; /* t - tortoise */
144 char *h = s; /* h - hare */
155 /* Do nothing if '%' is not followed by two hex digits. */
156 if (!*(h + 1) || !*(h + 2)
157 || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
159 *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
166 /* Like encode_string, but return S if there are no unsafe chars. */
169 encode_string_maybe (const char *s)
176 for (p1 = s; *p1; p1++)
177 if (UNSAFE_CHAR (*p1))
178 addition += 2; /* Two more characters (hex digits) */
183 newlen = (p1 - s) + addition;
184 newstr = (char *)xmalloc (newlen + 1);
190 if (UNSAFE_CHAR (*p1))
192 const unsigned char c = *p1++;
194 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
195 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
201 assert (p2 - newstr == newlen);
/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
   given string, returning a malloc-ed %XX encoded string.  Unlike
   encode_string_maybe, the caller always owns the returned storage.  */
char *
encode_string (const char *s)
{
  char *encoded = encode_string_maybe (s);

  if (encoded == s)
    /* Nothing needed encoding; duplicate so the result is always
       freshly allocated.  */
    return xstrdup (s);
  return encoded;
}
219 /* Encode unsafe characters in PTR to %xx. If such encoding is done,
220 the old value of PTR is freed and PTR is made to point to the newly
221 allocated storage. */
223 #define ENCODE(ptr) do { \
224 char *e_new = encode_string_maybe (ptr); \
232 /* Returns the scheme type if the scheme is supported, or
233 SCHEME_INVALID if not. */
235 url_scheme (const char *url)
239 for (i = 0; i < ARRAY_SIZE (supported_schemes); i++)
240 if (!strncasecmp (url, supported_schemes[i].leading_string,
241 strlen (supported_schemes[i].leading_string)))
242 return supported_schemes[i].scheme;
243 return SCHEME_INVALID;
246 /* Return the number of characters needed to skip the scheme part of
247 the URL, e.g. `http://'. If no scheme is found, returns 0. */
249 url_skip_scheme (const char *url)
253 /* Skip the scheme name. We allow `-' and `+' because of `whois++',
255 while (ISALNUM (*p) || *p == '-' || *p == '+')
262 /* Skip "//" if found. */
263 if (*p == '/' && *(p + 1) == '/')
269 /* Returns 1 if the URL begins with a scheme (supported or
270 unsupported), 0 otherwise. */
272 url_has_scheme (const char *url)
275 while (ISALNUM (*p) || *p == '-' || *p == '+')
280 /* Skip the username and password, if present here. The function
281 should be called *not* with the complete URL, but with the part
282 right after the scheme.
284 If no username and password are found, return 0. */
286 url_skip_uname (const char *url)
289 const char *q = NULL;
290 for (p = url ; *p && *p != '/'; p++)
291 if (*p == '@') q = p;
292 /* If a `@' was found before the first occurrence of `/', skip
300 /* Allocate a new urlinfo structure, fill it with default values and
301 return a pointer to it. */
307 u = (struct urlinfo *)xmalloc (sizeof (struct urlinfo));
308 memset (u, 0, sizeof (*u));
309 u->scheme = SCHEME_INVALID;
313 /* Perform a "deep" free of the urlinfo structure. The structure
314 should have been created with newurl, but need not have been used.
315 If free_pointer is non-0, free the pointer itself. */
317 freeurl (struct urlinfo *u, int complete)
321 FREE_MAYBE (u->host);
322 FREE_MAYBE (u->path);
323 FREE_MAYBE (u->file);
325 FREE_MAYBE (u->user);
326 FREE_MAYBE (u->passwd);
327 FREE_MAYBE (u->local);
328 FREE_MAYBE (u->referer);
330 freeurl (u->proxy, 1);
336 enum url_parse_error {
337 PE_UNRECOGNIZED_SCHEME, PE_BAD_PORT
340 /* Extract the given URL of the form
341 (http:|ftp:)// (user (:password)?@)?hostname (:port)? (/path)?
342 1. hostname (terminated with `/' or `:')
343 2. port number (terminated with `/'), or chosen for the scheme
344 3. dirname (everything after hostname)
345 Most errors are handled. No allocation is done, you must supply
346 pointers to allocated memory.
347 ...and a host of other stuff :-)
349 - Recognizes hostname:dir/file for FTP and
350 hostname (:portnum)?/dir/file for HTTP.
351 - Parses the path to yield directory and file
352 - Parses the URL to yield the username and passwd (if present)
353 - Decodes the strings, in case they contain "forbidden" characters
354 - Writes the result to struct urlinfo
356 If the argument STRICT is set, it recognizes only the canonical
359 parseurl (const char *url, struct urlinfo *u, int strict)
362 int recognizable; /* Recognizable URL is the one where
363 the scheme was explicitly named,
364 i.e. it wasn't deduced from the URL
368 DEBUGP (("parseurl (\"%s\") -> ", url));
369 recognizable = url_has_scheme (url);
370 if (strict && !recognizable)
372 for (i = 0, l = 0; i < ARRAY_SIZE (supported_schemes); i++)
374 l = strlen (supported_schemes[i].leading_string);
375 if (!strncasecmp (supported_schemes[i].leading_string, url, l))
378 /* If scheme is recognizable, but unsupported, bail out, else
380 if (recognizable && i == ARRAY_SIZE (supported_schemes))
382 else if (i == ARRAY_SIZE (supported_schemes))
385 u->scheme = type = supported_schemes[i].scheme;
387 if (type == URLUNKNOWN)
389 /* Allow a username and password to be specified (i.e. just skip
392 l += url_skip_uname (url + l);
393 for (i = l; url[i] && url[i] != ':' && url[i] != '/'; i++);
396 /* Get the hostname. */
397 u->host = strdupdelim (url + l, url + i);
398 DEBUGP (("host %s -> ", u->host));
400 /* Assume no port has been given. */
404 /* We have a colon delimiting the hostname. It could mean that
405 a port number is following it, or a directory. */
406 if (ISDIGIT (url[++i])) /* A port number */
408 if (type == URLUNKNOWN)
411 u->scheme = SCHEME_HTTP;
413 for (; url[i] && url[i] != '/'; i++)
414 if (ISDIGIT (url[i]))
415 u->port = 10 * u->port + (url[i] - '0');
420 DEBUGP (("port %hu -> ", u->port));
422 else if (type == URLUNKNOWN) /* or a directory */
425 u->scheme = SCHEME_FTP;
427 else /* or just a misformed port number */
430 else if (type == URLUNKNOWN)
433 u->scheme = SCHEME_HTTP;
438 for (ind = 0; ind < ARRAY_SIZE (supported_schemes); ind++)
439 if (supported_schemes[ind].scheme == u->scheme)
441 if (ind == ARRAY_SIZE (supported_schemes))
443 u->port = supported_schemes[ind].default_port;
445 /* Some delimiter troubles... */
446 if (url[i] == '/' && url[i - 1] != ':')
449 while (url[i] && url[i] == '/')
451 u->path = (char *)xmalloc (strlen (url + i) + 8);
452 strcpy (u->path, url + i);
455 u->ftp_type = process_ftp_type (u->path);
456 /* #### We don't handle type `d' correctly yet. */
457 if (!u->ftp_type || TOUPPER (u->ftp_type) == 'D')
459 DEBUGP (("ftp_type %c -> ", u->ftp_type));
461 DEBUGP (("opath %s -> ", u->path));
462 /* Parse the username and password (if existing). */
463 parse_uname (url, &u->user, &u->passwd);
464 /* Decode the strings, as per RFC 1738. */
465 decode_string (u->host);
466 decode_string (u->path);
468 decode_string (u->user);
470 decode_string (u->passwd);
471 /* Parse the directory. */
472 parse_dir (u->path, &u->dir, &u->file);
473 DEBUGP (("dir %s -> file %s -> ", u->dir, u->file));
474 /* Simplify the directory. */
475 path_simplify (u->dir);
476 /* Remove the leading `/' in HTTP. */
477 if (type == URLHTTP && *u->dir == '/')
478 strcpy (u->dir, u->dir + 1);
479 DEBUGP (("ndir %s\n", u->dir));
480 /* Strip trailing `/'. */
482 if (l > 1 && u->dir[l - 1] == '/')
483 u->dir[l - 1] = '\0';
484 /* Re-create the path: */
485 abs_ftp = (u->scheme == SCHEME_FTP && *u->dir == '/');
486 /* sprintf (u->path, "%s%s%s%s", abs_ftp ? "%2F": "/",
487 abs_ftp ? (u->dir + 1) : u->dir, *u->dir ? "/" : "", u->file); */
488 strcpy (u->path, abs_ftp ? "%2F" : "/");
489 strcat (u->path, abs_ftp ? (u->dir + 1) : u->dir);
490 strcat (u->path, *u->dir ? "/" : "");
491 strcat (u->path, u->file);
493 DEBUGP (("newpath: %s\n", u->path));
494 /* Create the clean URL. */
495 u->url = str_url (u, 0);
/* Special versions of DOTP and DDOTP for parse_dir().  They work like
   DOTP and DDOTP, but they also recognize `?' as end-of-string
   delimiter.  This is needed for correct handling of query strings
   appended to "." and ".." path components.  */
#define PD_DOTP(x)  ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))
/* BUGFIX: the second test must examine *((x) + 1), not *(x) again.
   The original re-tested the first character, so any component
   beginning with '.' (e.g. ".a") wrongly matched "..", and for the
   one-character string "." the macro read one byte past the
   terminating '\0'.  */
#define PD_DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') \
		     && (!*((x) + 2) || *((x) + 2) == '?'))
508 /* Build the directory and filename components of the path. Both
509 components are *separately* malloc-ed strings! It does not change
510 the contents of path.
512 If the path ends with "." or "..", they are (correctly) counted as
515 parse_dir (const char *path, char **dir, char **file)
519 l = urlpath_length (path);
520 for (i = l; i && path[i] != '/'; i--);
522 if (!i && *path != '/') /* Just filename */
524 if (PD_DOTP (path) || PD_DDOTP (path))
526 *dir = strdupdelim (path, path + l);
527 *file = xstrdup (path + l); /* normally empty, but could
532 *dir = xstrdup (""); /* This is required because of FTP */
533 *file = xstrdup (path);
536 else if (!i) /* /filename */
538 if (PD_DOTP (path + 1) || PD_DDOTP (path + 1))
540 *dir = strdupdelim (path, path + l);
541 *file = xstrdup (path + l); /* normally empty, but could
546 *dir = xstrdup ("/");
547 *file = xstrdup (path + 1);
550 else /* Nonempty directory with or without a filename */
552 if (PD_DOTP (path + i + 1) || PD_DDOTP (path + i + 1))
554 *dir = strdupdelim (path, path + l);
555 *file = xstrdup (path + l); /* normally empty, but could
560 *dir = strdupdelim (path, path + i);
561 *file = xstrdup (path + i + 1);
566 /* Find the optional username and password within the URL, as per
567 RFC1738. The returned user and passwd char pointers are
570 parse_uname (const char *url, char **user, char **passwd)
573 const char *p, *q, *col;
579 /* Look for the end of the scheme identifier. */
580 l = url_skip_scheme (url);
584 /* Is there an `@' character? */
585 for (p = url; *p && *p != '/'; p++)
588 /* If not, return. */
591 /* Else find the username and password. */
592 for (p = q = col = url; *p && *p != '/'; p++)
594 if (*p == ':' && !*user)
596 *user = (char *)xmalloc (p - url + 1);
597 memcpy (*user, url, p - url);
598 (*user)[p - url] = '\0';
601 if (*p == '@') q = p;
603 /* Decide whether you have only the username or both. */
604 where = *user ? passwd : user;
605 *where = (char *)xmalloc (q - col + 1);
606 memcpy (*where, col, q - col);
607 (*where)[q - col] = '\0';
/* If PATH ends with `;type=X', truncate that suffix in place and
   return the type character X; otherwise leave PATH untouched and
   return '\0'.  */
static char
process_ftp_type (char *path)
{
  int len = strlen (path);

  /* ";type=X" is seven characters; anything shorter cannot carry the
     suffix.  */
  if (len < 7 || memcmp (path + len - 7, ";type=", 6) != 0)
    return '\0';

  /* Cut the suffix off and hand back the type letter that followed
     ";type=".  */
  path[len - 7] = '\0';
  return path[len - 1];
}
627 /* Recreate the URL string from the data in urlinfo. This can be used
628 to create a "canonical" representation of the URL. If `hide' is
629 non-zero (as it is when we're calling this on a URL we plan to
630 print, but not when calling it to canonicalize a URL for use within
631 the program), password will be hidden. The forbidden characters in
632 the URL will be cleansed. */
634 str_url (const struct urlinfo *u, int hide)
636 char *res, *host, *user, *passwd, *scheme_name, *dir, *file;
637 int i, l, ln, lu, lh, lp, lf, ld;
638 unsigned short default_port;
640 /* Look for the scheme. */
641 for (i = 0; i < ARRAY_SIZE (supported_schemes); i++)
642 if (supported_schemes[i].scheme == u->scheme)
644 if (i == ARRAY_SIZE (supported_schemes))
646 scheme_name = supported_schemes[i].leading_string;
647 default_port = supported_schemes[i].default_port;
648 host = encode_string (u->host);
649 dir = encode_string (u->dir);
650 file = encode_string (u->file);
651 user = passwd = NULL;
653 user = encode_string (u->user);
657 /* Don't output the password, or someone might see it over the user's
658 shoulder (or in saved wget output). Don't give away the number of
659 characters in the password, either, as we did in past versions of
660 this code, when we replaced the password characters with 'x's. */
661 passwd = xstrdup("<password>");
663 passwd = encode_string (u->passwd);
665 if (u->scheme == SCHEME_FTP && *dir == '/')
667 char *tmp = (char *)xmalloc (strlen (dir) + 3);
668 /*sprintf (tmp, "%%2F%s", dir + 1);*/
672 strcpy (tmp + 3, dir + 1);
677 ln = strlen (scheme_name);
678 lu = user ? strlen (user) : 0;
679 lp = passwd ? strlen (passwd) : 0;
683 res = (char *)xmalloc (ln + lu + lp + lh + ld + lf + 20); /* safe sex */
684 /* sprintf (res, "%s%s%s%s%s%s:%d/%s%s%s", scheme_name,
685 (user ? user : ""), (passwd ? ":" : ""),
686 (passwd ? passwd : ""), (user ? "@" : ""),
687 host, u->port, dir, *dir ? "/" : "", file); */
689 memcpy (res, scheme_name, ln);
693 memcpy (res + l, user, lu);
698 memcpy (res + l, passwd, lp);
703 memcpy (res + l, host, lh);
705 if (u->port != default_port)
708 long_to_string (res + l, (long)u->port);
709 l += numdigit (u->port);
712 memcpy (res + l, dir, ld);
716 strcpy (res + l, file);
725 /* Check whether two URL-s are equivalent, i.e. pointing to the same
726 location. Uses parseurl to parse them, and compares the canonical
729 Returns 1 if the URL1 is equivalent to URL2, 0 otherwise. Also
730 return 0 on error. */
731 /* Do not compile unused code. */
734 url_equal (const char *url1, const char *url2)
736 struct urlinfo *u1, *u2;
741 err = parseurl (url1, u1, 0);
748 err = parseurl (url2, u2, 0);
755 res = !strcmp (u1->url, u2->url);
763 get_urls_file (const char *file)
765 struct file_memory *fm;
767 const char *text, *text_end;
770 fm = read_file (file);
773 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
776 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
779 text_end = fm->content + fm->length;
780 while (text < text_end)
782 const char *line_beg = text;
783 const char *line_end = memchr (text, '\n', text_end - text);
789 while (line_beg < line_end
790 && ISSPACE (*line_beg))
792 while (line_end > line_beg + 1
793 && ISSPACE (*(line_end - 1)))
795 if (line_end > line_beg)
797 urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
798 memset (entry, 0, sizeof (*entry));
800 entry->url = strdupdelim (line_beg, line_end);
812 /* Free the linked list of urlpos. */
814 free_urlpos (urlpos *l)
818 urlpos *next = l->next;
820 FREE_MAYBE (l->local_name);
826 /* Rotate FNAME opt.backups times */
828 rotate_backups(const char *fname)
830 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
831 char *from = (char *)alloca (maxlen);
832 char *to = (char *)alloca (maxlen);
836 if (stat (fname, &sb) == 0)
837 if (S_ISREG (sb.st_mode) == 0)
840 for (i = opt.backups; i > 1; i--)
842 sprintf (from, "%s.%d", fname, i - 1);
843 sprintf (to, "%s.%d", fname, i);
844 /* #### This will fail on machines without the rename() system
849 sprintf (to, "%s.%d", fname, 1);
853 /* Create all the necessary directories for PATH (a file). Calls
854 mkdirhier() internally. */
856 mkalldirs (const char *path)
863 p = path + strlen (path);
864 for (; *p != '/' && p != path; p--);
865 /* Don't create if it's just a file. */
866 if ((p == path) && (*p != '/'))
868 t = strdupdelim (path, p);
869 /* Check whether the directory exists. */
870 if ((stat (t, &st) == 0))
872 if (S_ISDIR (st.st_mode))
879 /* If the dir exists as a file name, remove it first. This
880 is *only* for Wget to work with buggy old CERN http
881 servers. Here is the scenario: When Wget tries to
882 retrieve a directory without a slash, e.g.
883 http://foo/bar (bar being a directory), CERN server will
884 not redirect it too http://foo/bar/ -- it will generate a
885 directory listing containing links to bar/file1,
886 bar/file2, etc. Wget will lose because it saves this
887 HTML listing to a file `bar', so it cannot create the
888 directory. To work around this, if the file of the same
889 name exists, we just remove it and create the directory
891 DEBUGP (("Removing %s because of directory danger!\n", t));
895 res = make_directory (t);
897 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
/* Return the number of '/' characters in S.  */
static int
count_slashes (const char *s)
{
  int count = 0;

  for (; *s; s++)
    if (*s == '/')
      ++count;
  return count;
}
912 /* Return the path name of the URL-equivalent file name, with a
913 remote-like structure of directories. */
915 mkstruct (const struct urlinfo *u)
917 char *host, *dir, *file, *res, *dirpref;
920 assert (u->dir != NULL);
921 assert (u->host != NULL);
925 char *ptr = u->dir + (*u->dir == '/');
926 int slash_count = 1 + count_slashes (ptr);
927 int cut = MINVAL (opt.cut_dirs, slash_count);
928 for (; cut && *ptr; ptr++)
931 STRDUP_ALLOCA (dir, ptr);
934 dir = u->dir + (*u->dir == '/');
936 host = xstrdup (u->host);
937 /* Check for the true name (or at least a consistent name for saving
938 to directory) of HOST, reusing the hlist if possible. */
939 if (opt.add_hostdir && !opt.simple_check)
941 char *nhost = realhost (host);
945 /* Add dir_prefix and hostname (if required) to the beginning of
949 if (!DOTP (opt.dir_prefix))
951 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
952 + strlen (host) + 1);
953 sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
956 STRDUP_ALLOCA (dirpref, host);
958 else /* not add_hostdir */
960 if (!DOTP (opt.dir_prefix))
961 dirpref = opt.dir_prefix;
967 /* If there is a prefix, prepend it. */
970 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
971 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
974 dir = encode_string (dir);
976 if (l && dir[l - 1] == '/')
984 /* Finally, construct the full name. */
985 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
986 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
/* Return a malloced copy of S in which every '/' is replaced by
   "%2F".  Used so that slashes inside a query string cannot create
   unwanted subdirectories in the local file name.  */
static char *
file_name_protect_query_string (const char *s)
{
  const char *from;
  char *dest, *to;
  int destlen = 0;

  /* First pass: measure the output.  Each '/' expands to the three
     characters "%2F"; everything else is copied verbatim.  */
  for (from = s; *from; from++)
    destlen += (*from == '/') ? 3 : 1;

  dest = (char *)xmalloc (destlen + 1);

  /* Second pass: copy, expanding slashes.  */
  for (from = s, to = dest; *from; from++)
    {
      if (*from == '/')
	{
	  *to++ = '%';
	  *to++ = '2';
	  *to++ = 'F';
	}
      else
	*to++ = *from;
    }
  *to = '\0';
  assert (to - dest == destlen);
  return dest;
}
1023 /* Create a unique filename, corresponding to a given URL. Calls
1024 mkstruct if necessary. Does *not* actually create any directories. */
1026 url_filename (const struct urlinfo *u)
1029 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
1033 file = mkstruct (u);
1039 file = xstrdup ("index.html");
1042 /* If the URL came with a query string, u->file will contain
1043 a question mark followed by query string contents. These
1044 contents can contain '/' which would make us create
1045 unwanted directories. These slashes must be protected
1047 if (!strchr (u->file, '/'))
1048 file = xstrdup (u->file);
1051 /*assert (strchr (u->file, '?') != NULL);*/
1052 file = file_name_protect_query_string (u->file);
1059 /* Check whether the prefix directory is something other than "."
1060 before prepending it. */
1061 if (!DOTP (opt.dir_prefix))
1063 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1064 + 1 + strlen (file) + 1);
1065 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1070 /* DOS-ish file systems don't like `%' signs in them; we change it
1075 for (p = file; *p; p++)
1079 #endif /* WINDOWS */
1081 /* Check the cases in which the unique extensions are not used:
1082 1) Clobbering is turned off (-nc).
1083 2) Retrieval with regetting.
1084 3) Timestamping is used.
1085 4) Hierarchy is built.
1087 The exception is the case when file does exist and is a
1088 directory (actually support for bad httpd-s). */
1089 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1090 && !(file_exists_p (file) && !file_non_directory_p (file)))
1093 /* Find a unique name. */
1094 name = unique_name (file);
/* Like strlen(), but treat '?' (the start of a query string) as an
   end-of-path delimiter.  */
static int
urlpath_length (const char *url)
{
  const char *query = strchr (url, '?');

  if (query != NULL)
    return query - url;
  return strlen (url);
}
1109 /* Find the last occurrence of character C in the range [b, e), or
1110 NULL, if none are present. This is almost completely equivalent to
1111 { *e = '\0'; return strrchr(b); }, except that it doesn't change
1112 the contents of the string. */
1114 find_last_char (const char *b, const char *e, char c)
1122 /* Resolve the result of "linking" a base URI (BASE) to a
1123 link-specified URI (LINK).
1125 Either of the URIs may be absolute or relative, complete with the
1126 host name, or path only. This tries to behave "reasonably" in all
1127 foreseeable cases. It employs little specific knowledge about
1128 schemes or URL-specific stuff -- it just works on strings.
1130 The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1131 See uri_merge for a gentler interface to this functionality.
1133 #### This function should handle `./' and `../' so that the evil
1134 path_simplify can go. */
1136 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
1142 const char *end = base + urlpath_length (base);
1146 /* LINK is a relative URL: we need to replace everything
1147 after last slash (possibly empty) with LINK.
1149 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1150 our result should be "whatever/foo/qux/xyzzy". */
1151 int need_explicit_slash = 0;
1153 const char *start_insert;
1154 const char *last_slash = find_last_char (base, end, '/');
1157 /* No slash found at all. Append LINK to what we have,
1158 but we'll need a slash as a separator.
1160 Example: if base == "foo" and link == "qux/xyzzy", then
1161 we cannot just append link to base, because we'd get
1162 "fooqux/xyzzy", whereas what we want is
1165 To make sure the / gets inserted, we set
1166 need_explicit_slash to 1. We also set start_insert
1167 to end + 1, so that the length calculations work out
1168 correctly for one more (slash) character. Accessing
1169 that character is fine, since it will be the
1170 delimiter, '\0' or '?'. */
1171 /* example: "foo?..." */
1172 /* ^ ('?' gets changed to '/') */
1173 start_insert = end + 1;
1174 need_explicit_slash = 1;
1176 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1178 /* example: http://host" */
1180 start_insert = end + 1;
1181 need_explicit_slash = 1;
1185 /* example: "whatever/foo/bar" */
1187 start_insert = last_slash + 1;
1190 span = start_insert - base;
1191 constr = (char *)xmalloc (span + linklength + 1);
1193 memcpy (constr, base, span);
1194 if (need_explicit_slash)
1195 constr[span - 1] = '/';
1197 memcpy (constr + span, link, linklength);
1198 constr[span + linklength] = '\0';
1200 else /* *link == `/' */
1202 /* LINK is an absolute path: we need to replace everything
1203 after (and including) the FIRST slash with LINK.
1205 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1206 "/qux/xyzzy", our result should be
1207 "http://host/qux/xyzzy". */
1210 const char *start_insert = NULL; /* for gcc to shut up. */
1211 const char *pos = base;
1212 int seen_slash_slash = 0;
1213 /* We're looking for the first slash, but want to ignore
1216 slash = memchr (pos, '/', end - pos);
1217 if (slash && !seen_slash_slash)
1218 if (*(slash + 1) == '/')
1221 seen_slash_slash = 1;
1225 /* At this point, SLASH is the location of the first / after
1226 "//", or the first slash altogether. START_INSERT is the
1227 pointer to the location where LINK will be inserted. When
1228 examining the last two examples, keep in mind that LINK
1231 if (!slash && !seen_slash_slash)
1232 /* example: "foo" */
1234 start_insert = base;
1235 else if (!slash && seen_slash_slash)
1236 /* example: "http://foo" */
1239 else if (slash && !seen_slash_slash)
1240 /* example: "foo/bar" */
1242 start_insert = base;
1243 else if (slash && seen_slash_slash)
1244 /* example: "http://something/" */
1246 start_insert = slash;
1248 span = start_insert - base;
1249 constr = (char *)xmalloc (span + linklength + 1);
1251 memcpy (constr, base, span);
1253 memcpy (constr + span, link, linklength);
1254 constr[span + linklength] = '\0';
1257 else /* !no_scheme */
1259 constr = strdupdelim (link, link + linklength);
/* Merge BASE with LINK and return the resulting URI.  Convenience
   wrapper around uri_merge_1 for zero-terminated LINK strings.  */
char *
uri_merge (const char *base, const char *link)
{
  int link_len = strlen (link);
  int relative = !url_has_scheme (link);

  return uri_merge_1 (base, link, link_len, relative);
}
1273 /* Optimize URL by host, destructively replacing u->host with realhost
1274 (u->host). Do this regardless of opt.simple_check. */
1276 opt_url (struct urlinfo *u)
1278 /* Find the "true" host. */
1279 char *host = realhost (u->host);
1282 assert (u->dir != NULL); /* the URL must have been parsed */
1283 /* Refresh the printed representation. */
1285 u->url = str_url (u, 0);
1288 /* Returns proxy host address, in accordance with SCHEME. */
1290 getproxy (enum url_scheme scheme)
1297 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1301 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1305 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1307 case SCHEME_INVALID:
1310 if (!proxy || !*proxy)
1315 /* Should a host be accessed through proxy, concerning no_proxy? */
1317 no_proxy_match (const char *host, const char **no_proxy)
1322 return !sufmatch (no_proxy, host);
1325 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1326 static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
1328 /* Change the links in an HTML document. Accepts a structure that
1329 defines the positions of all the links. */
1331 convert_links (const char *file, urlpos *l)
1333 struct file_memory *fm;
1336 downloaded_file_t downloaded_file_return;
1338 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1341 /* First we do a "dry run": go through the list L and see whether
1342 any URL needs to be converted in the first place. If not, just
1343 leave the file alone. */
1346 for (dry = l; dry; dry = dry->next)
1347 if (dry->convert != CO_NOCONVERT)
1351 logputs (LOG_VERBOSE, _("nothing to do.\n"));
1356 fm = read_file (file);
1359 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1360 file, strerror (errno));
1364 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1365 if (opt.backup_converted && downloaded_file_return)
1366 write_backup_file (file, downloaded_file_return);
1368 /* Before opening the file for writing, unlink the file. This is
1369 important if the data in FM is mmaped. In such case, nulling the
1370 file, which is what fopen() below does, would make us read all
1371 zeroes from the mmaped region. */
1372 if (unlink (file) < 0 && errno != ENOENT)
1374 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1375 file, strerror (errno));
1376 read_file_free (fm);
1379 /* Now open the file for writing. */
1380 fp = fopen (file, "wb");
1383 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1384 file, strerror (errno));
1385 read_file_free (fm);
1388 /* Here we loop through all the URLs in file, replacing those of
1389 them that are downloaded with relative references. */
1391 for (; l; l = l->next)
1393 char *url_start = fm->content + l->pos;
1395 if (l->pos >= fm->length)
1397 DEBUGP (("Something strange is going on. Please investigate."));
1400 /* If the URL is not to be converted, skip it. */
1401 if (l->convert == CO_NOCONVERT)
1403 DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
1407 /* Echo the file contents, up to the offending URL's opening
1408 quote, to the outfile. */
1409 fwrite (p, 1, url_start - p, fp);
1411 if (l->convert == CO_CONVERT_TO_RELATIVE)
1413 /* Convert absolute URL to relative. */
1414 char *newname = construct_relative (file, l->local_name);
1415 char *quoted_newname = html_quote_string (newname);
1416 replace_attr (&p, l->size, fp, quoted_newname);
1417 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1418 l->url, newname, l->pos, file));
1420 xfree (quoted_newname);
1422 else if (l->convert == CO_CONVERT_TO_COMPLETE)
1424 /* Convert the link to absolute URL. */
1425 char *newlink = l->url;
1426 char *quoted_newlink = html_quote_string (newlink);
1427 replace_attr (&p, l->size, fp, quoted_newlink);
1428 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1429 newlink, l->pos, file));
1430 xfree (quoted_newlink);
1433 /* Output the rest of the file. */
1434 if (p - fm->content < fm->length)
1435 fwrite (p, 1, fm->length - (p - fm->content), fp);
1437 read_file_free (fm);
1438 logputs (LOG_VERBOSE, _("done.\n"));
1441 /* Construct and return a malloced copy of the relative link from two
1442 pieces of information: local name S1 of the referring file and
1443 local name S2 of the referred file.
1445 So, if S1 is "jagor.srce.hr/index.html" and S2 is
1446 "jagor.srce.hr/images/news.gif", the function will return
1449 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1450 "fly.cc.fer.hr/images/fly.gif", the function will return
1451 "../images/fly.gif".
1453 Caveats: S1 should not begin with `/', unless S2 also begins with
1454 '/'. S1 should not contain things like ".." and such --
1455 construct_relative ("fly/ioccc/../index.html",
1456 "fly/images/fly.gif") will fail. (A workaround is to call
1457 something like path_simplify() on S1). */
/* NOTE(review): this excerpt elides several original lines (braces,
   loop bodies, the final return) -- comments below annotate only the
   statements that are visible.  */
1459 construct_relative (const char *s1, const char *s2)
/* i: scan index over both names; cnt: offset in S2 just past the last
   common '/' (start of the non-shared tail); sepdirs1: number of
   directory levels left in S1 below the common prefix.  */
1461 int i, cnt, sepdirs1;
/* An absolute S2 needs no relativization; return a fresh copy.  */
1465 return xstrdup (s2);
1466 /* S1 should *not* be absolute, if S2 wasn't. */
1467 assert (*s1 != '/');
1469 /* Skip the directories common to both strings. */
/* Advance while both names agree and neither has hit a separator.  */
1472 while (s1[i] && s2[i]
/* Matching '/' in both names: the common directory prefix extends at
   least this far.  */
1477 if (s1[i] == '/' && s2[i] == '/')
/* Count the directory separators remaining in S1 past the common
   prefix; each one requires a leading "../" in the result.  */
1482 for (sepdirs1 = 0; s1[i]; i++)
1485 /* Now, construct the file as of:
1486 - ../ repeated sepdirs1 time
1487 - all the non-mutual directories of S2. */
/* Size: sepdirs1 copies of "../" (3 bytes each), the S2 tail, and the
   terminating NUL.  */
1488 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1489 for (i = 0; i < sepdirs1; i++)
1490 memcpy (res + 3 * i, "../", 3);
/* strcpy supplies the trailing NUL after the "../" run.  */
1491 strcpy (res + 3 * i, s2 + cnt);
1495 /* Add URL to the head of the list L. */
/* Allocates a new urlpos node owning heap copies of URL and FILE.
   NOTE(review): the lines linking the node to L and returning it are
   elided from this excerpt -- presumably the new node becomes the list
   head; confirm against the full source.  */
1497 add_url (urlpos *l, const char *url, const char *file)
1501 t = (urlpos *)xmalloc (sizeof (urlpos));
/* Zero every field so members not set below start out NULL/0.  */
1502 memset (t, 0, sizeof (*t));
/* The node owns its own copies of both strings.  */
1503 t->url = xstrdup (url);
1504 t->local_name = xstrdup (file);
/* Back up FILE to FILE.orig (or, with -E, replace a ".html" suffix
   with ".orig") before link conversion rewrites it, but only once per
   file per program run.  */
1510 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1512 /* Rather than just writing over the original .html file with the
1513 converted version, save the former to *.orig. Note we only do
1514 this for files we've _successfully_ downloaded, so we don't
1515 clobber .orig files sitting around from previous invocations. */
1517 /* Construct the backup filename as the original name plus ".orig". */
1518 size_t filename_len = strlen(file);
1519 char* filename_plus_orig_suffix;
1520 boolean already_wrote_backup_file = FALSE;
1521 slist* converted_file_ptr;
/* Static: the set of already-backed-up files persists across calls
   for the lifetime of the process.  */
1522 static slist* converted_files = NULL;
1524 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1526 /* Just write "orig" over "html". We need to do it this way
1527 because when we're checking to see if we've downloaded the
1528 file before (to see if we can skip downloading it), we don't
1529 know if it's a text/html file. Therefore we don't know yet
1530 at that stage that -E is going to cause us to tack on
1531 ".html", so we need to compare vs. the original URL plus
1532 ".orig", not the original URL plus ".html.orig". */
/* NOTE(review): this branch assumes FILE ends in ".html"; the "orig"
   overwrite at filename_len - 4 and the len + 1 buffer are only
   correct under that assumption ("orig" is one byte shorter than
   "html." minus the dot bookkeeping) -- confirm the -E code path
   guarantees the suffix.  */
1533 filename_plus_orig_suffix = alloca (filename_len + 1);
1534 strcpy(filename_plus_orig_suffix, file);
/* Overwrite the final "html" with "orig", keeping the dot.  */
1535 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1537 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1539 /* Append ".orig" to the name. */
/* sizeof(".orig") == 6 covers the suffix plus the NUL.  */
1540 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1541 strcpy(filename_plus_orig_suffix, file);
1542 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1545 /* We can get called twice on the same URL thanks to the
1546 convert_all_links() call in main(). If we write the .orig file
1547 each time in such a case, it'll end up containing the first-pass
1548 conversion, not the original file. So, see if we've already been
1549 called on this file. */
1550 converted_file_ptr = converted_files;
1551 while (converted_file_ptr != NULL)
1552 if (strcmp(converted_file_ptr->string, file) == 0)
1554 already_wrote_backup_file = TRUE;
1558 converted_file_ptr = converted_file_ptr->next;
1560 if (!already_wrote_backup_file)
1562 /* Rename <file> to <file>.orig before former gets written over. */
/* Failure is reported but non-fatal: conversion proceeds without a
   backup.  */
1563 if (rename(file, filename_plus_orig_suffix) != 0)
1564 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1565 file, filename_plus_orig_suffix, strerror (errno));
1567 /* Remember that we've already written a .orig backup for this file.
1568 Note that we never free this memory since we need it till the
1569 convert_all_links() call, which is one of the last things the
1570 program does before terminating. BTW, I'm not sure if it would be
1571 safe to just set 'converted_file_ptr->string' to 'file' below,
1572 rather than making a copy of the string... Another note is that I
1573 thought I could just add a field to the urlpos structure saying
1574 that we'd written a .orig file for this URL, but that didn't work,
1575 so I had to make this separate list.
1576 -- Dan Harkless <wget@harkless.org>
1578 This [adding a field to the urlpos structure] didn't work
1579 because convert_file() is called twice: once after all its
1580 sublinks have been retrieved in recursive_retrieve(), and
1581 once at the end of the day in convert_all_links(). The
1582 original linked list collected in recursive_retrieve() is
1583 lost after the first invocation of convert_links(), and
1584 convert_all_links() makes a new one (it calls get_urls_html()
1585 for each file it covers.) That's why your first approach didn't
1586 work. The way to make it work is perhaps to make this flag a
1587 field in the `urls_html' list.
1588 -- Hrvoje Niksic <hniksic@arsdigita.com>
/* Prepend FILE to the remembered set (intentionally never freed; see
   the note above).  */
1590 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1591 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1592 converted_file_ptr->next = converted_files;
1593 converted_files = converted_file_ptr;
1597 static int find_fragment PARAMS ((const char *, int, const char **,
/* Write NEW_STR to FP in place of the RAW_SIZE-byte attribute value
   at *PP, preserving the original quoting style and any trailing
   "#fragment" found in the old value.  NOTE(review): the lines that
   advance *PP past the consumed input are elided from this excerpt --
   presumably *PP ends up RAW_SIZE bytes further on; confirm against
   the full source.  */
1601 replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
1603 const char *p = *pp;
1605 int size = raw_size;
/* Default when the old value is unquoted; overwritten below if a
   quote is present.  */
1606 char quote_char = '\"';
1607 const char *frag_beg, *frag_end;
1609 /* Structure of our string is:
1610 "...old-contents..."
1611 <--- l->size ---> (with quotes)
1614 <--- l->size --> (no quotes) */
1616 if (*p == '\"' || *p == '\'')
1621 size -= 2; /* disregard opening and closing quote */
/* Emit the replacement value, re-quoted.  */
1623 putc (quote_char, fp);
1624 fputs (new_str, fp);
1626 /* Look for fragment identifier, if any. */
/* Carry the old value's "#fragment" over onto the new URL.  */
1627 if (find_fragment (p, size, &frag_beg, &frag_end))
1628 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
1632 putc (quote_char, fp);
1636 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
1637 preceded by '&'. If the character is not found, return zero. If
1638 the character is found, return 1 and set BP and EP to point to the
1639 beginning and end of the region.
1641 This is used for finding the fragment identifiers in URLs. */
/* NOTE(review): the scan body (the '#'/'&' tests and the *bp/*ep
   assignments) is elided from this excerpt; only the loop header is
   visible below.  */
1644 find_fragment (const char *beg, int size, const char **bp, const char **ep)
/* One-past-the-end of the region to scan.  */
1646 const char *end = beg + size;
1648 for (; beg < end; beg++)
/* Singly-linked list node recording one downloaded local file and how
   it was downloaded.  NOTE(review): the string member (accessed as
   rover->file in downloaded_file() below) sits on a line elided from
   this excerpt.  */
1670 typedef struct _downloaded_file_list {
/* How the file was downloaded (normally, with -E rename, etc.).  */
1672 downloaded_file_t download_type;
1673 struct _downloaded_file_list* next;
1674 } downloaded_file_list;
/* Head of the process-wide list of files downloaded so far.  */
1676 static downloaded_file_list *downloaded_files;
1678 /* Remembers which files have been downloaded. In the standard case, should be
1679 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1680 download successfully (i.e. not for ones we have failures on or that we skip
1683 When we've downloaded a file and tacked on a ".html" extension due to -E,
1684 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1685 FILE_DOWNLOADED_NORMALLY.
1687 If you just want to check if a file has been previously added without adding
1688 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
1689 with local filenames, not remote URLs. */
/* NOTE(review): several lines (braces, the found_file break) are
   elided from this excerpt; comments annotate only visible code.  */
1691 downloaded_file (downloaded_file_t mode, const char* file)
1693 boolean found_file = FALSE;
/* Linear scan of the remembered files for a name match.  */
1694 downloaded_file_list* rover = downloaded_files;
1696 while (rover != NULL)
1697 if (strcmp(rover->file, file) == 0)
1703 rover = rover->next;
1706 return rover->download_type; /* file had already been downloaded */
/* Not previously recorded: add it unless we were only asked to
   check.  */
1709 if (mode != CHECK_FOR_FILE)
1711 rover = xmalloc(sizeof(*rover));
1712 rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1713 rover->download_type = mode;
/* Prepend to the global list.  */
1714 rover->next = downloaded_files;
1715 downloaded_files = rover;
1718 return FILE_NOT_ALREADY_DOWNLOADED;
1723 downloaded_files_free (void)
1725 downloaded_file_list* rover = downloaded_files;
1728 downloaded_file_list *next = rover->next;
1729 xfree (rover->file);