sjero.net Git - wget/blob - src/convert.c

   1 /* Conversion of links to local files.
   2    Copyright (C) 2003-2005 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9  (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software Foundation, Inc.,
  18 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #include <string.h>
  35 #ifdef HAVE_UNISTD_H
  36 # include <unistd.h>
  37 #endif /* HAVE_UNISTD_H */
  38 #include <errno.h>
  39 #include <assert.h>
  40
  41 #include "wget.h"
  42 #include "convert.h"
  43 #include "url.h"
  44 #include "recur.h"
  45 #include "utils.h"
  46 #include "hash.h"
  47 #include "ptimer.h"
  48
  49 static struct hash_table *dl_file_url_map;
  50 struct hash_table *dl_url_file_map;
  51
  52 /* Set of HTML files downloaded in this Wget run, used for link
  53    conversion after Wget is done.  */
  54 struct hash_table *downloaded_html_set;
  55
  56 static struct hash_table *nonexisting_urls_hash;
  57
  58 static void convert_links (const char *, struct urlpos *);
  59
  60 /* This function is called when the retrieval is done to convert the
  61    links that have been downloaded.  It has to be called at the end of
  62    the retrieval, because only then does Wget know conclusively which
  63    URLs have been downloaded, and which not, so it can tell which
  64    direction to convert to.
  65
  66    The "direction" means that the URLs to the files that have been
  67    downloaded get converted to the relative URL which will point to
  68    that file.  And the other URLs get converted to the remote URL on
  69    the server.
  70
  71    All the downloaded HTMLs are kept in downloaded_html_files, and
  72    downloaded URLs in urls_downloaded.  All the information is
  73    extracted from these two lists.  */
  74
  75 void
  76 convert_all_links (void)
  77 {
  78   int i;
  79   double secs;
  80   int file_count = 0;
  81
  82   struct ptimer *timer = ptimer_new ();
  83
  84   int cnt;
  85   char **file_array;
  86
  87   cnt = 0;
  88   if (downloaded_html_set)
  89     cnt = hash_table_count (downloaded_html_set);
  90   if (cnt == 0)
  91     return;
  92   file_array = alloca_array (char *, cnt);
  93   string_set_to_array (downloaded_html_set, file_array);
  94
  95   for (i = 0; i < cnt; i++)
  96     {
  97       struct urlpos *urls, *cur_url;
  98       char *url;
  99       char *file = file_array[i];
 100
 101       /* Determine the URL of the HTML file.  get_urls_html will need
 102          it.  */
 103       url = hash_table_get (dl_file_url_map, file);
 104       if (!url)
 105         {
 106           DEBUGP (("Apparently %s has been removed.\n", file));
 107           continue;
 108         }
 109
 110       DEBUGP (("Scanning %s (from %s)\n", file, url));
 111
 112       /* Parse the HTML file...  */
 113       urls = get_urls_html (file, url, NULL);
 114
 115       /* We don't respect meta_disallow_follow here because, even if
 116          the file is not followed, we might still want to convert the
 117          links that have been followed from other files.  */
 118
 119       for (cur_url = urls; cur_url; cur_url = cur_url->next)
 120         {
 121           char *local_name;
 122           struct url *u = cur_url->url;
 123
 124           if (cur_url->link_base_p)
 125             {
 126               /* Base references have been resolved by our parser, so
 127                  we turn the base URL into an empty string.  (Perhaps
 128                  we should remove the tag entirely?)  */
 129               cur_url->convert = CO_NULLIFY_BASE;
 130               continue;
 131             }
 132
 133           /* We decide the direction of conversion according to whether
 134              a URL was downloaded.  Downloaded URLs will be converted
 135              ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
 136           local_name = hash_table_get (dl_url_file_map, u->url);
 137
 138           /* Decide on the conversion type.  */
 139           if (local_name)
 140             {
 141               /* We've downloaded this URL.  Convert it to relative
 142                  form.  We do this even if the URL already is in
 143                  relative form, because our directory structure may
 144                  not be identical to that on the server (think `-nd',
 145                  `--cut-dirs', etc.)  */
 146               cur_url->convert = CO_CONVERT_TO_RELATIVE;
 147               cur_url->local_name = xstrdup (local_name);
 148               DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
 149             }
 150           else
 151             {
 152               /* We haven't downloaded this URL.  If it's not already
 153                  complete (including a full host name), convert it to
 154                  that form, so it can be reached while browsing this
 155                  HTML locally.  */
 156               if (!cur_url->link_complete_p)
 157                 cur_url->convert = CO_CONVERT_TO_COMPLETE;
 158               cur_url->local_name = NULL;
 159               DEBUGP (("will convert url %s to complete\n", u->url));
 160             }
 161         }
 162
 163       /* Convert the links in the file.  */
 164       convert_links (file, urls);
 165       ++file_count;
 166
 167       /* Free the data.  */
 168       free_urlpos (urls);
 169     }
 170
 171   secs = ptimer_measure (timer);
 172   ptimer_destroy (timer);
 173   logprintf (LOG_VERBOSE, _("Converted %d files in %s seconds.\n"),
 174              file_count, print_decimal (secs));
 175 }
 176
 177 static void write_backup_file (const char *, downloaded_file_t);
 178 static const char *replace_attr (const char *, int, FILE *, const char *);
 179 static const char *replace_attr_refresh_hack (const char *, int, FILE *,
 180                                               const char *, int);
 181 static char *local_quote_string (const char *);
 182 static char *construct_relative (const char *, const char *);
 183
 184 /* Change the links in one HTML file.  LINKS is a list of links in the
 185    document, along with their positions and the desired direction of
 186    the conversion.  */
 187 static void
 188 convert_links (const char *file, struct urlpos *links)
 189 {
 190   struct file_memory *fm;
 191   FILE *fp;
 192   const char *p;
 193   downloaded_file_t downloaded_file_return;
 194
 195   struct urlpos *link;
 196   int to_url_count = 0, to_file_count = 0;
 197
 198   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
 199
 200   {
 201     /* First we do a "dry run": go through the list L and see whether
 202        any URL needs to be converted in the first place.  If not, just
 203        leave the file alone.  */
 204     int dry_count = 0;
 205     struct urlpos *dry;
 206     for (dry = links; dry; dry = dry->next)
 207       if (dry->convert != CO_NOCONVERT)
 208         ++dry_count;
 209     if (!dry_count)
 210       {
 211         logputs (LOG_VERBOSE, _("nothing to do.\n"));
 212         return;
 213       }
 214   }
 215
 216   fm = read_file (file);
 217   if (!fm)
 218     {
 219       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
 220                  file, strerror (errno));
 221       return;
 222     }
 223
 224   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
 225   if (opt.backup_converted && downloaded_file_return)
 226     write_backup_file (file, downloaded_file_return);
 227
 228   /* Before opening the file for writing, unlink the file.  This is
 229      important if the data in FM is mmaped.  In such case, nulling the
 230      file, which is what fopen() below does, would make us read all
 231      zeroes from the mmaped region.  */
 232   if (unlink (file) < 0 && errno != ENOENT)
 233     {
 234       logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
 235                  file, strerror (errno));
 236       read_file_free (fm);
 237       return;
 238     }
 239   /* Now open the file for writing.  */
 240   fp = fopen (file, "wb");
 241   if (!fp)
 242     {
 243       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
 244                  file, strerror (errno));
 245       read_file_free (fm);
 246       return;
 247     }
 248
 249   /* Here we loop through all the URLs in file, replacing those of
 250      them that are downloaded with relative references.  */
 251   p = fm->content;
 252   for (link = links; link; link = link->next)
 253     {
 254       char *url_start = fm->content + link->pos;
 255
 256       if (link->pos >= fm->length)
 257         {
 258           DEBUGP (("Something strange is going on.  Please investigate."));
 259           break;
 260         }
 261       /* If the URL is not to be converted, skip it.  */
 262       if (link->convert == CO_NOCONVERT)
 263         {
 264           DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
 265           continue;
 266         }
 267
 268       /* Echo the file contents, up to the offending URL's opening
 269          quote, to the outfile.  */
 270       fwrite (p, 1, url_start - p, fp);
 271       p = url_start;
 272
 273       switch (link->convert)
 274         {
 275         case CO_CONVERT_TO_RELATIVE:
 276           /* Convert absolute URL to relative. */
 277           {
 278             char *newname = construct_relative (file, link->local_name);
 279             char *quoted_newname = local_quote_string (newname);
 280
 281             if (!link->link_refresh_p)
 282               p = replace_attr (p, link->size, fp, quoted_newname);
 283             else
 284               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
 285                                              link->refresh_timeout);
 286
 287             DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
 288                      link->url->url, newname, link->pos, file));
 289             xfree (newname);
 290             xfree (quoted_newname);
 291             ++to_file_count;
 292             break;
 293           }
 294         case CO_CONVERT_TO_COMPLETE:
 295           /* Convert the link to absolute URL. */
 296           {
 297             char *newlink = link->url->url;
 298             char *quoted_newlink = html_quote_string (newlink);
 299
 300             if (!link->link_refresh_p)
 301               p = replace_attr (p, link->size, fp, quoted_newlink);
 302             else
 303               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
 304                                              link->refresh_timeout);
 305
 306             DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
 307                      newlink, link->pos, file));
 308             xfree (quoted_newlink);
 309             ++to_url_count;
 310             break;
 311           }
 312         case CO_NULLIFY_BASE:
 313           /* Change the base href to "". */
 314           p = replace_attr (p, link->size, fp, "");
 315           break;
 316         case CO_NOCONVERT:
 317           abort ();
 318           break;
 319         }
 320     }
 321
 322   /* Output the rest of the file. */
 323   if (p - fm->content < fm->length)
 324     fwrite (p, 1, fm->length - (p - fm->content), fp);
 325   fclose (fp);
 326   read_file_free (fm);
 327
 328   logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
 329 }
 330
 331 /* Construct and return a link that points from BASEFILE to LINKFILE.
 332    Both files should be local file names, BASEFILE of the referrering
 333    file, and LINKFILE of the referred file.
 334
 335    Examples:
 336
 337    cr("foo", "bar")         -> "bar"
 338    cr("A/foo", "A/bar")     -> "bar"
 339    cr("A/foo", "A/B/bar")   -> "B/bar"
 340    cr("A/X/foo", "A/Y/bar") -> "../Y/bar"
 341    cr("X/", "Y/bar")        -> "../Y/bar" (trailing slash does matter in BASE)
 342
 343    Both files should be absolute or relative, otherwise strange
 344    results might ensue.  The function makes no special efforts to
 345    handle "." and ".." in links, so make sure they're not there
 346    (e.g. using path_simplify).  */
 347
 348 static char *
 349 construct_relative (const char *basefile, const char *linkfile)
 350 {
 351   char *link;
 352   int basedirs;
 353   const char *b, *l;
 354   int i, start;
 355
 356   /* First, skip the initial directory components common to both
 357      files.  */
 358   start = 0;
 359   for (b = basefile, l = linkfile; *b == *l && *b != '\0'; ++b, ++l)
 360     {
 361       if (*b == '/')
 362         start = (b - basefile) + 1;
 363     }
 364   basefile += start;
 365   linkfile += start;
 366
 367   /* With common directories out of the way, the situation we have is
 368      as follows:
 369          b - b1/b2/[...]/bfile
 370          l - l1/l2/[...]/lfile
 371
 372      The link we're constructing needs to be:
 373        lnk - ../../l1/l2/[...]/lfile
 374
 375      Where the number of ".."'s equals the number of bN directory
 376      components in B.  */
 377
 378   /* Count the directory components in B. */
 379   basedirs = 0;
 380   for (b = basefile; *b; b++)
 381     {
 382       if (*b == '/')
 383         ++basedirs;
 384     }
 385
 386   /* Construct LINK as explained above. */
 387   link = xmalloc (3 * basedirs + strlen (linkfile) + 1);
 388   for (i = 0; i < basedirs; i++)
 389     memcpy (link + 3 * i, "../", 3);
 390   strcpy (link + 3 * i, linkfile);
 391   return link;
 392 }
 393
 394 /* Used by write_backup_file to remember which files have been
 395    written. */
 396 static struct hash_table *converted_files;
 397
 398 static void
 399 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
 400 {
 401   /* Rather than just writing over the original .html file with the
 402      converted version, save the former to *.orig.  Note we only do
 403      this for files we've _successfully_ downloaded, so we don't
 404      clobber .orig files sitting around from previous invocations. */
 405
 406   /* Construct the backup filename as the original name plus ".orig". */
 407   size_t         filename_len = strlen (file);
 408   char*          filename_plus_orig_suffix;
 409
 410   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
 411     {
 412       /* Just write "orig" over "html".  We need to do it this way
 413          because when we're checking to see if we've downloaded the
 414          file before (to see if we can skip downloading it), we don't
 415          know if it's a text/html file.  Therefore we don't know yet
 416          at that stage that -E is going to cause us to tack on
 417          ".html", so we need to compare vs. the original URL plus
 418          ".orig", not the original URL plus ".html.orig". */
 419       filename_plus_orig_suffix = alloca (filename_len + 1);
 420       strcpy (filename_plus_orig_suffix, file);
 421       strcpy ((filename_plus_orig_suffix + filename_len) - 4, "orig");
 422     }
 423   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
 424     {
 425       /* Append ".orig" to the name. */
 426       filename_plus_orig_suffix = alloca (filename_len + sizeof (".orig"));
 427       strcpy (filename_plus_orig_suffix, file);
 428       strcpy (filename_plus_orig_suffix + filename_len, ".orig");
 429     }
 430
 431   if (!converted_files)
 432     converted_files = make_string_hash_table (0);
 433
 434   /* We can get called twice on the same URL thanks to the
 435      convert_all_links() call in main().  If we write the .orig file
 436      each time in such a case, it'll end up containing the first-pass
 437      conversion, not the original file.  So, see if we've already been
 438      called on this file. */
 439   if (!string_set_contains (converted_files, file))
 440     {
 441       /* Rename <file> to <file>.orig before former gets written over. */
 442       if (rename (file, filename_plus_orig_suffix) != 0)
 443         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
 444                    file, filename_plus_orig_suffix, strerror (errno));
 445
 446       /* Remember that we've already written a .orig backup for this file.
 447          Note that we never free this memory since we need it till the
 448          convert_all_links() call, which is one of the last things the
 449          program does before terminating.  BTW, I'm not sure if it would be
 450          safe to just set 'converted_file_ptr->string' to 'file' below,
 451          rather than making a copy of the string...  Another note is that I
 452          thought I could just add a field to the urlpos structure saying
 453          that we'd written a .orig file for this URL, but that didn't work,
 454          so I had to make this separate list.
 455          -- Dan Harkless <wget@harkless.org>
 456
 457          This [adding a field to the urlpos structure] didn't work
 458          because convert_file() is called from convert_all_links at
 459          the end of the retrieval with a freshly built new urlpos
 460          list.
 461          -- Hrvoje Niksic <hniksic@xemacs.org>
 462       */
 463       string_set_add (converted_files, file);
 464     }
 465 }
 466
 467 static bool find_fragment (const char *, int, const char **, const char **);
 468
 469 /* Replace an attribute's original text with NEW_TEXT. */
 470
 471 static const char *
 472 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
 473 {
 474   bool quote_flag = false;
 475   char quote_char = '\"';       /* use "..." for quoting, unless the
 476                                    original value is quoted, in which
 477                                    case reuse its quoting char. */
 478   const char *frag_beg, *frag_end;
 479
 480   /* Structure of our string is:
 481        "...old-contents..."
 482        <---    size    --->  (with quotes)
 483      OR:
 484        ...old-contents...
 485        <---    size   -->    (no quotes)   */
 486
 487   if (*p == '\"' || *p == '\'')
 488     {
 489       quote_char = *p;
 490       quote_flag = true;
 491       ++p;
 492       size -= 2;                /* disregard opening and closing quote */
 493     }
 494   putc (quote_char, fp);
 495   fputs (new_text, fp);
 496
 497   /* Look for fragment identifier, if any. */
 498   if (find_fragment (p, size, &frag_beg, &frag_end))
 499     fwrite (frag_beg, 1, frag_end - frag_beg, fp);
 500   p += size;
 501   if (quote_flag)
 502     ++p;
 503   putc (quote_char, fp);
 504
 505   return p;
 506 }
 507
 508 /* The same as REPLACE_ATTR, but used when replacing
 509    <meta http-equiv=refresh content="new_text"> because we need to
 510    append "timeout_value; URL=" before the next_text.  */
 511
 512 static const char *
 513 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
 514                            const char *new_text, int timeout)
 515 {
 516   /* "0; URL=..." */
 517   char *new_with_timeout = (char *)alloca (numdigit (timeout)
 518                                            + 6 /* "; URL=" */
 519                                            + strlen (new_text)
 520                                            + 1);
 521   sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
 522
 523   return replace_attr (p, size, fp, new_with_timeout);
 524 }
 525
 526 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
 527    preceded by '&'.  If the character is not found, return zero.  If
 528    the character is found, return true and set BP and EP to point to
 529    the beginning and end of the region.
 530
 531    This is used for finding the fragment indentifiers in URLs.  */
 532
 533 static bool
 534 find_fragment (const char *beg, int size, const char **bp, const char **ep)
 535 {
 536   const char *end = beg + size;
 537   bool saw_amp = false;
 538   for (; beg < end; beg++)
 539     {
 540       switch (*beg)
 541         {
 542         case '&':
 543           saw_amp = true;
 544           break;
 545         case '#':
 546           if (!saw_amp)
 547             {
 548               *bp = beg;
 549               *ep = end;
 550               return true;
 551             }
 552           /* fallthrough */
 553         default:
 554           saw_amp = false;
 555         }
 556     }
 557   return false;
 558 }
 559
 560 /* Quote FILE for use as local reference to an HTML file.
 561
 562    We quote ? as %3F to avoid passing part of the file name as the
 563    parameter when browsing the converted file through HTTP.  However,
 564    it is safe to do this only when `--html-extension' is turned on.
 565    This is because converting "index.html?foo=bar" to
 566    "index.html%3Ffoo=bar" would break local browsing, as the latter
 567    isn't even recognized as an HTML file!  However, converting
 568    "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
 569    safe for both local and HTTP-served browsing.
 570
 571    We always quote "#" as "%23" and "%" as "%25" because those
 572    characters have special meanings in URLs.  */
 573
 574 static char *
 575 local_quote_string (const char *file)
 576 {
 577   const char *from;
 578   char *newname, *to;
 579
 580   char *any = strpbrk (file, "?#%");
 581   if (!any)
 582     return html_quote_string (file);
 583
 584   /* Allocate space assuming the worst-case scenario, each character
 585      having to be quoted.  */
 586   to = newname = (char *)alloca (3 * strlen (file) + 1);
 587   for (from = file; *from; from++)
 588     switch (*from)
 589       {
 590       case '%':
 591         *to++ = '%';
 592         *to++ = '2';
 593         *to++ = '5';
 594         break;
 595       case '#':
 596         *to++ = '%';
 597         *to++ = '2';
 598         *to++ = '3';
 599         break;
 600       case '?':
 601         if (opt.html_extension)
 602           {
 603             *to++ = '%';
 604             *to++ = '3';
 605             *to++ = 'F';
 606             break;
 607           }
 608         /* fallthrough */
 609       default:
 610         *to++ = *from;
 611       }
 612   *to = '\0';
 613
 614   return html_quote_string (newname);
 615 }
 616 \f
 617 /* Book-keeping code for dl_file_url_map, dl_url_file_map,
 618    downloaded_html_list, and downloaded_html_set.  Other code calls
 619    these functions to let us know that a file has been downloaded.  */
 620
 621 #define ENSURE_TABLES_EXIST do {                        \
 622   if (!dl_file_url_map)                                 \
 623     dl_file_url_map = make_string_hash_table (0);       \
 624   if (!dl_url_file_map)                                 \
 625     dl_url_file_map = make_string_hash_table (0);       \
 626 } while (0)
 627
 628 /* Return true if S1 and S2 are the same, except for "/index.html".
 629    The three cases in which it returns one are (substitute any
 630    substring for "foo"):
 631
 632    m("foo/index.html", "foo/")  ==> 1
 633    m("foo/", "foo/index.html")  ==> 1
 634    m("foo", "foo/index.html")   ==> 1
 635    m("foo", "foo/"              ==> 1
 636    m("foo", "foo")              ==> 1  */
 637
 638 static bool
 639 match_except_index (const char *s1, const char *s2)
 640 {
 641   int i;
 642   const char *lng;
 643
 644   /* Skip common substring. */
 645   for (i = 0; *s1 && *s2 && *s1 == *s2; s1++, s2++, i++)
 646     ;
 647   if (i == 0)
 648     /* Strings differ at the very beginning -- bail out.  We need to
 649        check this explicitly to avoid `lng - 1' reading outside the
 650        array.  */
 651     return false;
 652
 653   if (!*s1 && !*s2)
 654     /* Both strings hit EOF -- strings are equal. */
 655     return true;
 656   else if (*s1 && *s2)
 657     /* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux". */
 658     return false;
 659   else if (*s1)
 660     /* S1 is the longer one. */
 661     lng = s1;
 662   else
 663     /* S2 is the longer one. */
 664     lng = s2;
 665
 666   /* foo            */            /* foo/           */
 667   /* foo/index.html */  /* or */  /* foo/index.html */
 668   /*    ^           */            /*     ^          */
 669
 670   if (*lng != '/')
 671     /* The right-hand case. */
 672     --lng;
 673
 674   if (*lng == '/' && *(lng + 1) == '\0')
 675     /* foo  */
 676     /* foo/ */
 677     return true;
 678
 679   return 0 == strcmp (lng, "/index.html");
 680 }
 681
 682 static int
 683 dissociate_urls_from_file_mapper (void *key, void *value, void *arg)
 684 {
 685   char *mapping_url = (char *)key;
 686   char *mapping_file = (char *)value;
 687   char *file = (char *)arg;
 688
 689   if (0 == strcmp (mapping_file, file))
 690     {
 691       hash_table_remove (dl_url_file_map, mapping_url);
 692       xfree (mapping_url);
 693       xfree (mapping_file);
 694     }
 695
 696   /* Continue mapping. */
 697   return 0;
 698 }
 699
 700 /* Remove all associations from various URLs to FILE from dl_url_file_map. */
 701
 702 static void
 703 dissociate_urls_from_file (const char *file)
 704 {
 705   /* Can't use hash_table_iter_* because the table mutates while mapping.  */
 706   hash_table_for_each (dl_url_file_map, dissociate_urls_from_file_mapper,
 707                        (char *) file);
 708 }
 709
 710 /* Register that URL has been successfully downloaded to FILE.  This
 711    is used by the link conversion code to convert references to URLs
 712    to references to local files.  It is also being used to check if a
 713    URL has already been downloaded.  */
 714
 715 void
 716 register_download (const char *url, const char *file)
 717 {
 718   char *old_file, *old_url;
 719
 720   ENSURE_TABLES_EXIST;
 721
 722   /* With some forms of retrieval, it is possible, although not likely
 723      or particularly desirable.  If both are downloaded, the second
 724      download will override the first one.  When that happens,
 725      dissociate the old file name from the URL.  */
 726
 727   if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
 728     {
 729       if (0 == strcmp (url, old_url))
 730         /* We have somehow managed to download the same URL twice.
 731            Nothing to do.  */
 732         return;
 733
 734       if (match_except_index (url, old_url)
 735           && !hash_table_contains (dl_url_file_map, url))
 736         /* The two URLs differ only in the "index.html" ending.  For
 737            example, one is "http://www.server.com/", and the other is
 738            "http://www.server.com/index.html".  Don't remove the old
 739            one, just add the new one as a non-canonical entry.  */
 740         goto url_only;
 741
 742       hash_table_remove (dl_file_url_map, file);
 743       xfree (old_file);
 744       xfree (old_url);
 745
 746       /* Remove all the URLs that point to this file.  Yes, there can
 747          be more than one such URL, because we store redirections as
 748          multiple entries in dl_url_file_map.  For example, if URL1
 749          redirects to URL2 which gets downloaded to FILE, we map both
 750          URL1 and URL2 to FILE in dl_url_file_map.  (dl_file_url_map
 751          only points to URL2.)  When another URL gets loaded to FILE,
 752          we want both URL1 and URL2 dissociated from it.
 753
 754          This is a relatively expensive operation because it performs
 755          a linear search of the whole hash table, but it should be
 756          called very rarely, only when two URLs resolve to the same
 757          file name, *and* the "<file>.1" extensions are turned off.
 758          In other words, almost never.  */
 759       dissociate_urls_from_file (file);
 760     }
 761
 762   hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
 763
 764  url_only:
 765   /* A URL->FILE mapping is not possible without a FILE->URL mapping.
 766      If the latter were present, it should have been removed by the
 767      above `if'.  So we could write:
 768
 769          assert (!hash_table_contains (dl_url_file_map, url));
 770
 771      The above is correct when running in recursive mode where the
 772      same URL always resolves to the same file.  But if you do
 773      something like:
 774
 775          wget URL URL
 776
 777      then the first URL will resolve to "FILE", and the other to
 778      "FILE.1".  In that case, FILE.1 will not be found in
 779      dl_file_url_map, but URL will still point to FILE in
 780      dl_url_file_map.  */
 781   if (hash_table_get_pair (dl_url_file_map, url, &old_url, &old_file))
 782     {
 783       hash_table_remove (dl_url_file_map, url);
 784       xfree (old_url);
 785       xfree (old_file);
 786     }
 787
 788   hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
 789 }
 790
 791 /* Register that FROM has been redirected to TO.  This assumes that TO
 792    is successfully downloaded and already registered using
 793    register_download() above.  */
 794
 795 void
 796 register_redirection (const char *from, const char *to)
 797 {
 798   char *file;
 799
 800   ENSURE_TABLES_EXIST;
 801
 802   file = hash_table_get (dl_url_file_map, to);
 803   assert (file != NULL);
 804   if (!hash_table_contains (dl_url_file_map, from))
 805     hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
 806 }
 807
 808 /* Register that the file has been deleted. */
 809
 810 void
 811 register_delete_file (const char *file)
 812 {
 813   char *old_url, *old_file;
 814
 815   ENSURE_TABLES_EXIST;
 816
 817   if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
 818     return;
 819
 820   hash_table_remove (dl_file_url_map, file);
 821   xfree (old_file);
 822   xfree (old_url);
 823   dissociate_urls_from_file (file);
 824 }
 825
 826 /* Register that FILE is an HTML file that has been downloaded. */
 827
 828 void
 829 register_html (const char *url, const char *file)
 830 {
 831   if (!downloaded_html_set)
 832     downloaded_html_set = make_string_hash_table (0);
 833   string_set_add (downloaded_html_set, file);
 834 }
 835
 836 static void downloaded_files_free (void);
 837 static void nonexisting_urls_free (void);
 838
 839 /* Cleanup the data structures associated with this file.  */
 840
 841 void
 842 convert_cleanup (void)
 843 {
 844   if (dl_file_url_map)
 845     {
 846       free_keys_and_values (dl_file_url_map);
 847       hash_table_destroy (dl_file_url_map);
 848       dl_file_url_map = NULL;
 849     }
 850   if (dl_url_file_map)
 851     {
 852       free_keys_and_values (dl_url_file_map);
 853       hash_table_destroy (dl_url_file_map);
 854       dl_url_file_map = NULL;
 855     }
 856   if (downloaded_html_set)
 857     string_set_free (downloaded_html_set);
 858   downloaded_files_free ();
 859   nonexisting_urls_free ();
 860   if (converted_files)
 861     string_set_free (converted_files);
 862 }
 863 \f
/* Book-keeping code for downloaded files that enables extension
   hacks.  */

/* This table should really be merged with dl_file_url_map and
   downloaded_html_set.  This was originally a list, but I changed
   it to a hash table because it was actually taking a lot of time to
   find things in it.

   Keys are local file names; lookups read each value as a pointer to
   a downloaded_file_t.  The table is created on demand by
   downloaded_file and released by downloaded_files_free.  */

static struct hash_table *downloaded_files_hash;
 873
 874 /* We're storing "modes" of type downloaded_file_t in the hash table.
 875    However, our hash tables only accept pointers for keys and values.
 876    So when we need a pointer, we use the address of a
 877    downloaded_file_t variable of static storage.  */
 878
 879 static downloaded_file_t *
 880 downloaded_mode_to_ptr (downloaded_file_t mode)
 881 {
 882   static downloaded_file_t
 883     v1 = FILE_NOT_ALREADY_DOWNLOADED,
 884     v2 = FILE_DOWNLOADED_NORMALLY,
 885     v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
 886     v4 = CHECK_FOR_FILE;
 887
 888   switch (mode)
 889     {
 890     case FILE_NOT_ALREADY_DOWNLOADED:
 891       return &v1;
 892     case FILE_DOWNLOADED_NORMALLY:
 893       return &v2;
 894     case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
 895       return &v3;
 896     case CHECK_FOR_FILE:
 897       return &v4;
 898     }
 899   return NULL;
 900 }
 901
 902 /* Remembers which files have been downloaded.  In the standard case,
 903    should be called with mode == FILE_DOWNLOADED_NORMALLY for each
 904    file we actually download successfully (i.e. not for ones we have
 905    failures on or that we skip due to -N).
 906
 907    When we've downloaded a file and tacked on a ".html" extension due
 908    to -E, call this function with
 909    FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
 910    FILE_DOWNLOADED_NORMALLY.
 911
 912    If you just want to check if a file has been previously added
 913    without adding it, call with mode == CHECK_FOR_FILE.  Please be
 914    sure to call this function with local filenames, not remote
 915    URLs.  */
 916
 917 downloaded_file_t
 918 downloaded_file (downloaded_file_t mode, const char *file)
 919 {
 920   downloaded_file_t *ptr;
 921
 922   if (mode == CHECK_FOR_FILE)
 923     {
 924       if (!downloaded_files_hash)
 925         return FILE_NOT_ALREADY_DOWNLOADED;
 926       ptr = hash_table_get (downloaded_files_hash, file);
 927       if (!ptr)
 928         return FILE_NOT_ALREADY_DOWNLOADED;
 929       return *ptr;
 930     }
 931
 932   if (!downloaded_files_hash)
 933     downloaded_files_hash = make_string_hash_table (0);
 934
 935   ptr = hash_table_get (downloaded_files_hash, file);
 936   if (ptr)
 937     return *ptr;
 938
 939   ptr = downloaded_mode_to_ptr (mode);
 940   hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
 941
 942   return FILE_NOT_ALREADY_DOWNLOADED;
 943 }
 944
 945 static void
 946 downloaded_files_free (void)
 947 {
 948   if (downloaded_files_hash)
 949     {
 950       hash_table_iterator iter;
 951       for (hash_table_iterate (downloaded_files_hash, &iter);
 952            hash_table_iter_next (&iter);
 953            )
 954         xfree (iter.key);
 955       hash_table_destroy (downloaded_files_hash);
 956       downloaded_files_hash = NULL;
 957     }
 958 }
 959 \f
 960 /* Remembers broken links.  */
 961
 962 struct broken_urls_list
 963 {
 964   char *url;
 965   struct broken_urls_list *next;
 966 };
 967
 968 static bool
 969 in_list (const struct broken_urls_list *list, const char *url)
 970 {
 971   const struct broken_urls_list *ptr;
 972
 973   for (ptr = list; ptr; ptr = ptr->next)
 974     {
 975       /* TODO: strcasecmp may not be appropriate to compare URLs */
 976       if (strcasecmp (url, ptr->url) == 0) return true;
 977     }
 978
 979   return false;
 980 }
 981
 982 void
 983 nonexisting_url (const char *url, const char *referrer)
 984 {
 985   struct broken_urls_list *list;
 986
 987   if (!nonexisting_urls_hash)
 988     nonexisting_urls_hash = make_string_hash_table (0);
 989
 990   list = hash_table_get (nonexisting_urls_hash, url);
 991   if (!list)
 992     {
 993       list = (struct broken_urls_list *) xnew0 (struct broken_urls_list);
 994       list->url = referrer ? xstrdup (referrer) : NULL;
 995       hash_table_put (nonexisting_urls_hash, xstrdup (url), list);
 996     }
 997   else if (list && !in_list (list, referrer))
 998     {
 999       /* Append referrer at the end of the list */
1000       struct broken_urls_list *newnode;
1001
1002       while (list->next) list = list->next;
1003
1004       newnode = xnew0 (struct broken_urls_list);
1005       newnode->url = xstrdup (referrer);
1006       list->next = newnode;
1007     }
1008 }
1009
1010 static void
1011 nonexisting_urls_free (void)
1012 {
1013   if (nonexisting_urls_hash)
1014     {
1015       hash_table_iterator iter;
1016       for (hash_table_iterate (nonexisting_urls_hash, &iter);
1017            hash_table_iter_next (&iter);
1018            )
1019         {
1020           xfree (iter.key);
1021           xfree (iter.value);
1022         }
1023       hash_table_destroy (nonexisting_urls_hash);
1024       nonexisting_urls_hash = NULL;
1025     }
1026 }
1027
1028 void
1029 print_broken_links (void)
1030 {
1031   hash_table_iterator iter;
1032   int num_elems;
1033
1034   if (!nonexisting_urls_hash)
1035     {
1036       logprintf (LOG_NOTQUIET, _("Found no broken links.\n\n"));
1037       return;
1038     }
1039
1040   num_elems = hash_table_count (nonexisting_urls_hash);
1041   assert (num_elems > 0);
1042
1043   if (num_elems > 1)
1044     {
1045       logprintf (LOG_NOTQUIET, _("Found %d broken links.\n\n"),
1046                  num_elems);
1047     }
1048   else
1049     {
1050       logprintf (LOG_NOTQUIET, _("Found 1 broken link.\n\n"));
1051     }
1052
1053   for (hash_table_iterate (nonexisting_urls_hash, &iter);
1054        hash_table_iter_next (&iter);
1055        )
1056     {
1057       struct broken_urls_list *list;
1058
1059       logprintf (LOG_NOTQUIET, _("%s referred by:\n"), (const char *)iter.key);
1060
1061       for (list = (struct broken_urls_list *) iter.value;
1062            list;
1063            list = list->next)
1064         {
1065           logprintf (LOG_NOTQUIET, _("    %s\n"), list->url);
1066         }
1067     }
1068   logputs (LOG_NOTQUIET, "\n");
1069 }
1070
1071 \f
/* Return the entity text that replaces C in quoted HTML, or NULL when
   C is copied through unchanged.  */

static const char *
html_entity_for (char c)
{
  switch (c)
    {
    case '&':
      return "&amp;";
    case '<':
      return "&lt;";
    case '>':
      return "&gt;";
    case '\"':
      return "&quot;";
    case ' ':
      return "&#32;";
    default:
      return NULL;
    }
}

/* The function returns the pointer to the malloc-ed quoted version of
   string s.  It will recognize and quote numeric and special graphic
   entities, as per RFC1866:

   `&' -> `&amp;'
   `<' -> `&lt;'
   `>' -> `&gt;'
   `"' -> `&quot;'
   SP  -> `&#32;'

   No other entities are recognized or replaced.

   The result is obtained from xmalloc and is to be freed by the
   caller.  The size is accumulated in a size_t so that a very long
   input cannot overflow a signed counter.  */
char *
html_quote_string (const char *s)
{
  const char *in;
  char *res, *out;
  size_t size = 1;              /* room for the terminating NUL */

  /* Pass through the string, and count the new size.  */
  for (in = s; *in; in++)
    {
      const char *entity = html_entity_for (*in);
      size += entity ? strlen (entity) : 1;
    }

  res = xmalloc (size);

  /* Second pass: emit either the entity or the character itself.  */
  out = res;
  for (in = s; *in; in++)
    {
      const char *entity = html_entity_for (*in);
      if (entity)
        {
          size_t len = strlen (entity);
          memcpy (out, entity, len);
          out += len;
        }
      else
        *out++ = *in;
    }
  *out = '\0';
  return res;
}