sjero.net Git - wget/blob - src/convert.c

   1 /* Conversion of links to local files.
   2    Copyright (C) 2003-2005 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9  (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software Foundation, Inc.,
  18 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #include <string.h>
  35 #ifdef HAVE_UNISTD_H
  36 # include <unistd.h>
  37 #endif /* HAVE_UNISTD_H */
  38 #include <errno.h>
  39 #include <assert.h>
  40
  41 #include "wget.h"
  42 #include "convert.h"
  43 #include "url.h"
  44 #include "recur.h"
  45 #include "utils.h"
  46 #include "hash.h"
  47 #include "ptimer.h"
  48 #include "res.h"
  49
/* Maps the name of each downloaded local file to the URL it was
   retrieved from.  Entries are added by register_download.  */
static struct hash_table *dl_file_url_map;
/* Maps each downloaded URL to the name of the local file it was saved
   to.  Entries are added by register_download and
   register_redirection.  */
struct hash_table *dl_url_file_map;

/* Set of HTML files downloaded in this Wget run, used for link
   conversion after Wget is done.  */
struct hash_table *downloaded_html_set;

/* NOTE(review): not referenced in the visible part of this file;
   presumably released by nonexisting_urls_free -- confirm.  */
static struct hash_table *nonexisting_urls_hash;
  58
  59 static void convert_links (const char *, struct urlpos *);
  60
  61 /* This function is called when the retrieval is done to convert the
  62    links that have been downloaded.  It has to be called at the end of
  63    the retrieval, because only then does Wget know conclusively which
  64    URLs have been downloaded, and which not, so it can tell which
  65    direction to convert to.
  66
  67    The "direction" means that the URLs to the files that have been
  68    downloaded get converted to the relative URL which will point to
  69    that file.  And the other URLs get converted to the remote URL on
  70    the server.
  71
  72    All the downloaded HTMLs are kept in downloaded_html_files, and
  73    downloaded URLs in urls_downloaded.  All the information is
  74    extracted from these two lists.  */
  75
  76 void
  77 convert_all_links (void)
  78 {
  79   int i;
  80   double secs;
  81   int file_count = 0;
  82
  83   struct ptimer *timer = ptimer_new ();
  84
  85   int cnt;
  86   char **file_array;
  87
  88   cnt = 0;
  89   if (downloaded_html_set)
  90     cnt = hash_table_count (downloaded_html_set);
  91   if (cnt == 0)
  92     return;
  93   file_array = alloca_array (char *, cnt);
  94   string_set_to_array (downloaded_html_set, file_array);
  95
  96   for (i = 0; i < cnt; i++)
  97     {
  98       struct urlpos *urls, *cur_url;
  99       char *url;
 100       char *file = file_array[i];
 101
 102       /* Determine the URL of the HTML file.  get_urls_html will need
 103          it.  */
 104       url = hash_table_get (dl_file_url_map, file);
 105       if (!url)
 106         {
 107           DEBUGP (("Apparently %s has been removed.\n", file));
 108           continue;
 109         }
 110
 111       DEBUGP (("Scanning %s (from %s)\n", file, url));
 112
 113       /* Parse the HTML file...  */
 114       urls = get_urls_html (file, url, NULL);
 115
 116       /* We don't respect meta_disallow_follow here because, even if
 117          the file is not followed, we might still want to convert the
 118          links that have been followed from other files.  */
 119
 120       for (cur_url = urls; cur_url; cur_url = cur_url->next)
 121         {
 122           char *local_name;
 123           struct url *u = cur_url->url;
 124
 125           if (cur_url->link_base_p)
 126             {
 127               /* Base references have been resolved by our parser, so
 128                  we turn the base URL into an empty string.  (Perhaps
 129                  we should remove the tag entirely?)  */
 130               cur_url->convert = CO_NULLIFY_BASE;
 131               continue;
 132             }
 133
 134           /* We decide the direction of conversion according to whether
 135              a URL was downloaded.  Downloaded URLs will be converted
 136              ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
 137           local_name = hash_table_get (dl_url_file_map, u->url);
 138
 139           /* Decide on the conversion type.  */
 140           if (local_name)
 141             {
 142               /* We've downloaded this URL.  Convert it to relative
 143                  form.  We do this even if the URL already is in
 144                  relative form, because our directory structure may
 145                  not be identical to that on the server (think `-nd',
 146                  `--cut-dirs', etc.)  */
 147               cur_url->convert = CO_CONVERT_TO_RELATIVE;
 148               cur_url->local_name = xstrdup (local_name);
 149               DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
 150             }
 151           else
 152             {
 153               /* We haven't downloaded this URL.  If it's not already
 154                  complete (including a full host name), convert it to
 155                  that form, so it can be reached while browsing this
 156                  HTML locally.  */
 157               if (!cur_url->link_complete_p)
 158                 cur_url->convert = CO_CONVERT_TO_COMPLETE;
 159               cur_url->local_name = NULL;
 160               DEBUGP (("will convert url %s to complete\n", u->url));
 161             }
 162         }
 163
 164       /* Convert the links in the file.  */
 165       convert_links (file, urls);
 166       ++file_count;
 167
 168       /* Free the data.  */
 169       free_urlpos (urls);
 170     }
 171
 172   secs = ptimer_measure (timer);
 173   ptimer_destroy (timer);
 174   logprintf (LOG_VERBOSE, _("Converted %d files in %s seconds.\n"),
 175              file_count, print_decimal (secs));
 176 }
 177
 178 static void write_backup_file (const char *, downloaded_file_t);
 179 static const char *replace_attr (const char *, int, FILE *, const char *);
 180 static const char *replace_attr_refresh_hack (const char *, int, FILE *,
 181                                               const char *, int);
 182 static char *local_quote_string (const char *);
 183 static char *construct_relative (const char *, const char *);
 184
 185 /* Change the links in one HTML file.  LINKS is a list of links in the
 186    document, along with their positions and the desired direction of
 187    the conversion.  */
 188 static void
 189 convert_links (const char *file, struct urlpos *links)
 190 {
 191   struct file_memory *fm;
 192   FILE *fp;
 193   const char *p;
 194   downloaded_file_t downloaded_file_return;
 195
 196   struct urlpos *link;
 197   int to_url_count = 0, to_file_count = 0;
 198
 199   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
 200
 201   {
 202     /* First we do a "dry run": go through the list L and see whether
 203        any URL needs to be converted in the first place.  If not, just
 204        leave the file alone.  */
 205     int dry_count = 0;
 206     struct urlpos *dry;
 207     for (dry = links; dry; dry = dry->next)
 208       if (dry->convert != CO_NOCONVERT)
 209         ++dry_count;
 210     if (!dry_count)
 211       {
 212         logputs (LOG_VERBOSE, _("nothing to do.\n"));
 213         return;
 214       }
 215   }
 216
 217   fm = read_file (file);
 218   if (!fm)
 219     {
 220       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
 221                  file, strerror (errno));
 222       return;
 223     }
 224
 225   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
 226   if (opt.backup_converted && downloaded_file_return)
 227     write_backup_file (file, downloaded_file_return);
 228
 229   /* Before opening the file for writing, unlink the file.  This is
 230      important if the data in FM is mmaped.  In such case, nulling the
 231      file, which is what fopen() below does, would make us read all
 232      zeroes from the mmaped region.  */
 233   if (unlink (file) < 0 && errno != ENOENT)
 234     {
 235       logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
 236                  file, strerror (errno));
 237       read_file_free (fm);
 238       return;
 239     }
 240   /* Now open the file for writing.  */
 241   fp = fopen (file, "wb");
 242   if (!fp)
 243     {
 244       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
 245                  file, strerror (errno));
 246       read_file_free (fm);
 247       return;
 248     }
 249
 250   /* Here we loop through all the URLs in file, replacing those of
 251      them that are downloaded with relative references.  */
 252   p = fm->content;
 253   for (link = links; link; link = link->next)
 254     {
 255       char *url_start = fm->content + link->pos;
 256
 257       if (link->pos >= fm->length)
 258         {
 259           DEBUGP (("Something strange is going on.  Please investigate."));
 260           break;
 261         }
 262       /* If the URL is not to be converted, skip it.  */
 263       if (link->convert == CO_NOCONVERT)
 264         {
 265           DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
 266           continue;
 267         }
 268
 269       /* Echo the file contents, up to the offending URL's opening
 270          quote, to the outfile.  */
 271       fwrite (p, 1, url_start - p, fp);
 272       p = url_start;
 273
 274       switch (link->convert)
 275         {
 276         case CO_CONVERT_TO_RELATIVE:
 277           /* Convert absolute URL to relative. */
 278           {
 279             char *newname = construct_relative (file, link->local_name);
 280             char *quoted_newname = local_quote_string (newname);
 281
 282             if (!link->link_refresh_p)
 283               p = replace_attr (p, link->size, fp, quoted_newname);
 284             else
 285               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
 286                                              link->refresh_timeout);
 287
 288             DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
 289                      link->url->url, newname, link->pos, file));
 290             xfree (newname);
 291             xfree (quoted_newname);
 292             ++to_file_count;
 293             break;
 294           }
 295         case CO_CONVERT_TO_COMPLETE:
 296           /* Convert the link to absolute URL. */
 297           {
 298             char *newlink = link->url->url;
 299             char *quoted_newlink = html_quote_string (newlink);
 300
 301             if (!link->link_refresh_p)
 302               p = replace_attr (p, link->size, fp, quoted_newlink);
 303             else
 304               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
 305                                              link->refresh_timeout);
 306
 307             DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
 308                      newlink, link->pos, file));
 309             xfree (quoted_newlink);
 310             ++to_url_count;
 311             break;
 312           }
 313         case CO_NULLIFY_BASE:
 314           /* Change the base href to "". */
 315           p = replace_attr (p, link->size, fp, "");
 316           break;
 317         case CO_NOCONVERT:
 318           abort ();
 319           break;
 320         }
 321     }
 322
 323   /* Output the rest of the file. */
 324   if (p - fm->content < fm->length)
 325     fwrite (p, 1, fm->length - (p - fm->content), fp);
 326   fclose (fp);
 327   read_file_free (fm);
 328
 329   logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
 330 }
 331
 332 /* Construct and return a link that points from BASEFILE to LINKFILE.
 333    Both files should be local file names, BASEFILE of the referrering
 334    file, and LINKFILE of the referred file.
 335
 336    Examples:
 337
 338    cr("foo", "bar")         -> "bar"
 339    cr("A/foo", "A/bar")     -> "bar"
 340    cr("A/foo", "A/B/bar")   -> "B/bar"
 341    cr("A/X/foo", "A/Y/bar") -> "../Y/bar"
 342    cr("X/", "Y/bar")        -> "../Y/bar" (trailing slash does matter in BASE)
 343
 344    Both files should be absolute or relative, otherwise strange
 345    results might ensue.  The function makes no special efforts to
 346    handle "." and ".." in links, so make sure they're not there
 347    (e.g. using path_simplify).  */
 348
 349 static char *
 350 construct_relative (const char *basefile, const char *linkfile)
 351 {
 352   char *link;
 353   int basedirs;
 354   const char *b, *l;
 355   int i, start;
 356
 357   /* First, skip the initial directory components common to both
 358      files.  */
 359   start = 0;
 360   for (b = basefile, l = linkfile; *b == *l && *b != '\0'; ++b, ++l)
 361     {
 362       if (*b == '/')
 363         start = (b - basefile) + 1;
 364     }
 365   basefile += start;
 366   linkfile += start;
 367
 368   /* With common directories out of the way, the situation we have is
 369      as follows:
 370          b - b1/b2/[...]/bfile
 371          l - l1/l2/[...]/lfile
 372
 373      The link we're constructing needs to be:
 374        lnk - ../../l1/l2/[...]/lfile
 375
 376      Where the number of ".."'s equals the number of bN directory
 377      components in B.  */
 378
 379   /* Count the directory components in B. */
 380   basedirs = 0;
 381   for (b = basefile; *b; b++)
 382     {
 383       if (*b == '/')
 384         ++basedirs;
 385     }
 386
 387   /* Construct LINK as explained above. */
 388   link = xmalloc (3 * basedirs + strlen (linkfile) + 1);
 389   for (i = 0; i < basedirs; i++)
 390     memcpy (link + 3 * i, "../", 3);
 391   strcpy (link + 3 * i, linkfile);
 392   return link;
 393 }
 394
 395 /* Used by write_backup_file to remember which files have been
 396    written. */
 397 static struct hash_table *converted_files;
 398
 399 static void
 400 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
 401 {
 402   /* Rather than just writing over the original .html file with the
 403      converted version, save the former to *.orig.  Note we only do
 404      this for files we've _successfully_ downloaded, so we don't
 405      clobber .orig files sitting around from previous invocations. */
 406
 407   /* Construct the backup filename as the original name plus ".orig". */
 408   size_t         filename_len = strlen (file);
 409   char*          filename_plus_orig_suffix;
 410
 411   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
 412     {
 413       /* Just write "orig" over "html".  We need to do it this way
 414          because when we're checking to see if we've downloaded the
 415          file before (to see if we can skip downloading it), we don't
 416          know if it's a text/html file.  Therefore we don't know yet
 417          at that stage that -E is going to cause us to tack on
 418          ".html", so we need to compare vs. the original URL plus
 419          ".orig", not the original URL plus ".html.orig". */
 420       filename_plus_orig_suffix = alloca (filename_len + 1);
 421       strcpy (filename_plus_orig_suffix, file);
 422       strcpy ((filename_plus_orig_suffix + filename_len) - 4, "orig");
 423     }
 424   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
 425     {
 426       /* Append ".orig" to the name. */
 427       filename_plus_orig_suffix = alloca (filename_len + sizeof (".orig"));
 428       strcpy (filename_plus_orig_suffix, file);
 429       strcpy (filename_plus_orig_suffix + filename_len, ".orig");
 430     }
 431
 432   if (!converted_files)
 433     converted_files = make_string_hash_table (0);
 434
 435   /* We can get called twice on the same URL thanks to the
 436      convert_all_links() call in main().  If we write the .orig file
 437      each time in such a case, it'll end up containing the first-pass
 438      conversion, not the original file.  So, see if we've already been
 439      called on this file. */
 440   if (!string_set_contains (converted_files, file))
 441     {
 442       /* Rename <file> to <file>.orig before former gets written over. */
 443       if (rename (file, filename_plus_orig_suffix) != 0)
 444         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
 445                    file, filename_plus_orig_suffix, strerror (errno));
 446
 447       /* Remember that we've already written a .orig backup for this file.
 448          Note that we never free this memory since we need it till the
 449          convert_all_links() call, which is one of the last things the
 450          program does before terminating.  BTW, I'm not sure if it would be
 451          safe to just set 'converted_file_ptr->string' to 'file' below,
 452          rather than making a copy of the string...  Another note is that I
 453          thought I could just add a field to the urlpos structure saying
 454          that we'd written a .orig file for this URL, but that didn't work,
 455          so I had to make this separate list.
 456          -- Dan Harkless <wget@harkless.org>
 457
 458          This [adding a field to the urlpos structure] didn't work
 459          because convert_file() is called from convert_all_links at
 460          the end of the retrieval with a freshly built new urlpos
 461          list.
 462          -- Hrvoje Niksic <hniksic@xemacs.org>
 463       */
 464       string_set_add (converted_files, file);
 465     }
 466 }
 467
 468 static bool find_fragment (const char *, int, const char **, const char **);
 469
 470 /* Replace an attribute's original text with NEW_TEXT. */
 471
 472 static const char *
 473 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
 474 {
 475   bool quote_flag = false;
 476   char quote_char = '\"';       /* use "..." for quoting, unless the
 477                                    original value is quoted, in which
 478                                    case reuse its quoting char. */
 479   const char *frag_beg, *frag_end;
 480
 481   /* Structure of our string is:
 482        "...old-contents..."
 483        <---    size    --->  (with quotes)
 484      OR:
 485        ...old-contents...
 486        <---    size   -->    (no quotes)   */
 487
 488   if (*p == '\"' || *p == '\'')
 489     {
 490       quote_char = *p;
 491       quote_flag = true;
 492       ++p;
 493       size -= 2;                /* disregard opening and closing quote */
 494     }
 495   putc (quote_char, fp);
 496   fputs (new_text, fp);
 497
 498   /* Look for fragment identifier, if any. */
 499   if (find_fragment (p, size, &frag_beg, &frag_end))
 500     fwrite (frag_beg, 1, frag_end - frag_beg, fp);
 501   p += size;
 502   if (quote_flag)
 503     ++p;
 504   putc (quote_char, fp);
 505
 506   return p;
 507 }
 508
 509 /* The same as REPLACE_ATTR, but used when replacing
 510    <meta http-equiv=refresh content="new_text"> because we need to
 511    append "timeout_value; URL=" before the next_text.  */
 512
 513 static const char *
 514 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
 515                            const char *new_text, int timeout)
 516 {
 517   /* "0; URL=..." */
 518   char *new_with_timeout = (char *)alloca (numdigit (timeout)
 519                                            + 6 /* "; URL=" */
 520                                            + strlen (new_text)
 521                                            + 1);
 522   sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
 523
 524   return replace_attr (p, size, fp, new_with_timeout);
 525 }
 526
 527 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
 528    preceded by '&'.  If the character is not found, return zero.  If
 529    the character is found, return true and set BP and EP to point to
 530    the beginning and end of the region.
 531
 532    This is used for finding the fragment indentifiers in URLs.  */
 533
 534 static bool
 535 find_fragment (const char *beg, int size, const char **bp, const char **ep)
 536 {
 537   const char *end = beg + size;
 538   bool saw_amp = false;
 539   for (; beg < end; beg++)
 540     {
 541       switch (*beg)
 542         {
 543         case '&':
 544           saw_amp = true;
 545           break;
 546         case '#':
 547           if (!saw_amp)
 548             {
 549               *bp = beg;
 550               *ep = end;
 551               return true;
 552             }
 553           /* fallthrough */
 554         default:
 555           saw_amp = false;
 556         }
 557     }
 558   return false;
 559 }
 560
 561 /* Quote FILE for use as local reference to an HTML file.
 562
 563    We quote ? as %3F to avoid passing part of the file name as the
 564    parameter when browsing the converted file through HTTP.  However,
 565    it is safe to do this only when `--html-extension' is turned on.
 566    This is because converting "index.html?foo=bar" to
 567    "index.html%3Ffoo=bar" would break local browsing, as the latter
 568    isn't even recognized as an HTML file!  However, converting
 569    "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
 570    safe for both local and HTTP-served browsing.
 571
 572    We always quote "#" as "%23" and "%" as "%25" because those
 573    characters have special meanings in URLs.  */
 574
 575 static char *
 576 local_quote_string (const char *file)
 577 {
 578   const char *from;
 579   char *newname, *to;
 580
 581   char *any = strpbrk (file, "?#%");
 582   if (!any)
 583     return html_quote_string (file);
 584
 585   /* Allocate space assuming the worst-case scenario, each character
 586      having to be quoted.  */
 587   to = newname = (char *)alloca (3 * strlen (file) + 1);
 588   for (from = file; *from; from++)
 589     switch (*from)
 590       {
 591       case '%':
 592         *to++ = '%';
 593         *to++ = '2';
 594         *to++ = '5';
 595         break;
 596       case '#':
 597         *to++ = '%';
 598         *to++ = '2';
 599         *to++ = '3';
 600         break;
 601       case '?':
 602         if (opt.html_extension)
 603           {
 604             *to++ = '%';
 605             *to++ = '3';
 606             *to++ = 'F';
 607             break;
 608           }
 609         /* fallthrough */
 610       default:
 611         *to++ = *from;
 612       }
 613   *to = '\0';
 614
 615   return html_quote_string (newname);
 616 }
 617 \f
 618 /* Book-keeping code for dl_file_url_map, dl_url_file_map,
 619    downloaded_html_list, and downloaded_html_set.  Other code calls
 620    these functions to let us know that a file has been downloaded.  */
 621
/* Create dl_file_url_map and dl_url_file_map on first use.  Each
   table is created with make_string_hash_table only if its pointer
   is still null, so the macro can be invoked repeatedly.  The
   do/while (0) wrapper lets it be used as a single statement.  */
#define ENSURE_TABLES_EXIST do {                        \
  if (!dl_file_url_map)                                 \
    dl_file_url_map = make_string_hash_table (0);       \
  if (!dl_url_file_map)                                 \
    dl_url_file_map = make_string_hash_table (0);       \
} while (0)
 628
 629 /* Return true if S1 and S2 are the same, except for "/index.html".
 630    The three cases in which it returns one are (substitute any
 631    substring for "foo"):
 632
 633    m("foo/index.html", "foo/")  ==> 1
 634    m("foo/", "foo/index.html")  ==> 1
 635    m("foo", "foo/index.html")   ==> 1
 636    m("foo", "foo/"              ==> 1
 637    m("foo", "foo")              ==> 1  */
 638
 639 static bool
 640 match_except_index (const char *s1, const char *s2)
 641 {
 642   int i;
 643   const char *lng;
 644
 645   /* Skip common substring. */
 646   for (i = 0; *s1 && *s2 && *s1 == *s2; s1++, s2++, i++)
 647     ;
 648   if (i == 0)
 649     /* Strings differ at the very beginning -- bail out.  We need to
 650        check this explicitly to avoid `lng - 1' reading outside the
 651        array.  */
 652     return false;
 653
 654   if (!*s1 && !*s2)
 655     /* Both strings hit EOF -- strings are equal. */
 656     return true;
 657   else if (*s1 && *s2)
 658     /* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux". */
 659     return false;
 660   else if (*s1)
 661     /* S1 is the longer one. */
 662     lng = s1;
 663   else
 664     /* S2 is the longer one. */
 665     lng = s2;
 666
 667   /* foo            */            /* foo/           */
 668   /* foo/index.html */  /* or */  /* foo/index.html */
 669   /*    ^           */            /*     ^          */
 670
 671   if (*lng != '/')
 672     /* The right-hand case. */
 673     --lng;
 674
 675   if (*lng == '/' && *(lng + 1) == '\0')
 676     /* foo  */
 677     /* foo/ */
 678     return true;
 679
 680   return 0 == strcmp (lng, "/index.html");
 681 }
 682
 683 static int
 684 dissociate_urls_from_file_mapper (void *key, void *value, void *arg)
 685 {
 686   char *mapping_url = (char *)key;
 687   char *mapping_file = (char *)value;
 688   char *file = (char *)arg;
 689
 690   if (0 == strcmp (mapping_file, file))
 691     {
 692       hash_table_remove (dl_url_file_map, mapping_url);
 693       xfree (mapping_url);
 694       xfree (mapping_file);
 695     }
 696
 697   /* Continue mapping. */
 698   return 0;
 699 }
 700
 701 /* Remove all associations from various URLs to FILE from dl_url_file_map. */
 702
 703 static void
 704 dissociate_urls_from_file (const char *file)
 705 {
 706   /* Can't use hash_table_iter_* because the table mutates while mapping.  */
 707   hash_table_for_each (dl_url_file_map, dissociate_urls_from_file_mapper,
 708                        (char *) file);
 709 }
 710
 711 /* Register that URL has been successfully downloaded to FILE.  This
 712    is used by the link conversion code to convert references to URLs
 713    to references to local files.  It is also being used to check if a
 714    URL has already been downloaded.  */
 715
 716 void
 717 register_download (const char *url, const char *file)
 718 {
 719   char *old_file, *old_url;
 720
 721   ENSURE_TABLES_EXIST;
 722
 723   /* With some forms of retrieval, it is possible, although not likely
 724      or particularly desirable.  If both are downloaded, the second
 725      download will override the first one.  When that happens,
 726      dissociate the old file name from the URL.  */
 727
 728   if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
 729     {
 730       if (0 == strcmp (url, old_url))
 731         /* We have somehow managed to download the same URL twice.
 732            Nothing to do.  */
 733         return;
 734
 735       if (match_except_index (url, old_url)
 736           && !hash_table_contains (dl_url_file_map, url))
 737         /* The two URLs differ only in the "index.html" ending.  For
 738            example, one is "http://www.server.com/", and the other is
 739            "http://www.server.com/index.html".  Don't remove the old
 740            one, just add the new one as a non-canonical entry.  */
 741         goto url_only;
 742
 743       hash_table_remove (dl_file_url_map, file);
 744       xfree (old_file);
 745       xfree (old_url);
 746
 747       /* Remove all the URLs that point to this file.  Yes, there can
 748          be more than one such URL, because we store redirections as
 749          multiple entries in dl_url_file_map.  For example, if URL1
 750          redirects to URL2 which gets downloaded to FILE, we map both
 751          URL1 and URL2 to FILE in dl_url_file_map.  (dl_file_url_map
 752          only points to URL2.)  When another URL gets loaded to FILE,
 753          we want both URL1 and URL2 dissociated from it.
 754
 755          This is a relatively expensive operation because it performs
 756          a linear search of the whole hash table, but it should be
 757          called very rarely, only when two URLs resolve to the same
 758          file name, *and* the "<file>.1" extensions are turned off.
 759          In other words, almost never.  */
 760       dissociate_urls_from_file (file);
 761     }
 762
 763   hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
 764
 765  url_only:
 766   /* A URL->FILE mapping is not possible without a FILE->URL mapping.
 767      If the latter were present, it should have been removed by the
 768      above `if'.  So we could write:
 769
 770          assert (!hash_table_contains (dl_url_file_map, url));
 771
 772      The above is correct when running in recursive mode where the
 773      same URL always resolves to the same file.  But if you do
 774      something like:
 775
 776          wget URL URL
 777
 778      then the first URL will resolve to "FILE", and the other to
 779      "FILE.1".  In that case, FILE.1 will not be found in
 780      dl_file_url_map, but URL will still point to FILE in
 781      dl_url_file_map.  */
 782   if (hash_table_get_pair (dl_url_file_map, url, &old_url, &old_file))
 783     {
 784       hash_table_remove (dl_url_file_map, url);
 785       xfree (old_url);
 786       xfree (old_file);
 787     }
 788
 789   hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
 790 }
 791
 792 /* Register that FROM has been redirected to TO.  This assumes that TO
 793    is successfully downloaded and already registered using
 794    register_download() above.  */
 795
 796 void
 797 register_redirection (const char *from, const char *to)
 798 {
 799   char *file;
 800
 801   ENSURE_TABLES_EXIST;
 802
 803   file = hash_table_get (dl_url_file_map, to);
 804   assert (file != NULL);
 805   if (!hash_table_contains (dl_url_file_map, from))
 806     hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
 807 }
 808
 809 /* Register that the file has been deleted. */
 810
 811 void
 812 register_delete_file (const char *file)
 813 {
 814   char *old_url, *old_file;
 815
 816   ENSURE_TABLES_EXIST;
 817
 818   if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
 819     return;
 820
 821   hash_table_remove (dl_file_url_map, file);
 822   xfree (old_file);
 823   xfree (old_url);
 824   dissociate_urls_from_file (file);
 825 }
 826
 827 /* Register that FILE is an HTML file that has been downloaded. */
 828
 829 void
 830 register_html (const char *url, const char *file)
 831 {
 832   if (!downloaded_html_set)
 833     downloaded_html_set = make_string_hash_table (0);
 834   string_set_add (downloaded_html_set, file);
 835 }
 836
 837 static void downloaded_files_free (void);
 838 static void nonexisting_urls_free (void);
 839
 840 /* Cleanup the data structures associated with this file.  */
 841
 842 void
 843 convert_cleanup (void)
 844 {
 845   if (dl_file_url_map)
 846     {
 847       free_keys_and_values (dl_file_url_map);
 848       hash_table_destroy (dl_file_url_map);
 849       dl_file_url_map = NULL;
 850     }
 851   if (dl_url_file_map)
 852     {
 853       free_keys_and_values (dl_url_file_map);
 854       hash_table_destroy (dl_url_file_map);
 855       dl_url_file_map = NULL;
 856     }
 857   if (downloaded_html_set)
 858     string_set_free (downloaded_html_set);
 859   downloaded_files_free ();
 860   nonexisting_urls_free ();
 861   if (converted_files)
 862     string_set_free (converted_files);
 863 }
 864 \f
/* Book-keeping code for downloaded files that enables extension
   hacks.  */

/* This table should really be merged with dl_file_url_map and
   downloaded_html_set.  This was originally a list, but I changed
   it to a hash table because it was actually taking a lot of time to
   find things in it.

   Keys are heap-allocated copies of local file names; values are
   pointers to downloaded_file_t objects (see downloaded_file).  The
   table is created lazily, so the pointer is NULL until the first
   file is recorded.  */

static struct hash_table *downloaded_files_hash;
 874
 875 /* We're storing "modes" of type downloaded_file_t in the hash table.
 876    However, our hash tables only accept pointers for keys and values.
 877    So when we need a pointer, we use the address of a
 878    downloaded_file_t variable of static storage.  */
 879
 880 static downloaded_file_t *
 881 downloaded_mode_to_ptr (downloaded_file_t mode)
 882 {
 883   static downloaded_file_t
 884     v1 = FILE_NOT_ALREADY_DOWNLOADED,
 885     v2 = FILE_DOWNLOADED_NORMALLY,
 886     v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
 887     v4 = CHECK_FOR_FILE;
 888
 889   switch (mode)
 890     {
 891     case FILE_NOT_ALREADY_DOWNLOADED:
 892       return &v1;
 893     case FILE_DOWNLOADED_NORMALLY:
 894       return &v2;
 895     case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
 896       return &v3;
 897     case CHECK_FOR_FILE:
 898       return &v4;
 899     }
 900   return NULL;
 901 }
 902
 903 /* Remembers which files have been downloaded.  In the standard case,
 904    should be called with mode == FILE_DOWNLOADED_NORMALLY for each
 905    file we actually download successfully (i.e. not for ones we have
 906    failures on or that we skip due to -N).
 907
 908    When we've downloaded a file and tacked on a ".html" extension due
 909    to -E, call this function with
 910    FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
 911    FILE_DOWNLOADED_NORMALLY.
 912
 913    If you just want to check if a file has been previously added
 914    without adding it, call with mode == CHECK_FOR_FILE.  Please be
 915    sure to call this function with local filenames, not remote
 916    URLs.  */
 917
 918 downloaded_file_t
 919 downloaded_file (downloaded_file_t mode, const char *file)
 920 {
 921   downloaded_file_t *ptr;
 922
 923   if (mode == CHECK_FOR_FILE)
 924     {
 925       if (!downloaded_files_hash)
 926         return FILE_NOT_ALREADY_DOWNLOADED;
 927       ptr = hash_table_get (downloaded_files_hash, file);
 928       if (!ptr)
 929         return FILE_NOT_ALREADY_DOWNLOADED;
 930       return *ptr;
 931     }
 932
 933   if (!downloaded_files_hash)
 934     downloaded_files_hash = make_string_hash_table (0);
 935
 936   ptr = hash_table_get (downloaded_files_hash, file);
 937   if (ptr)
 938     return *ptr;
 939
 940   ptr = downloaded_mode_to_ptr (mode);
 941   hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
 942
 943   return FILE_NOT_ALREADY_DOWNLOADED;
 944 }
 945
 946 static void
 947 downloaded_files_free (void)
 948 {
 949   if (downloaded_files_hash)
 950     {
 951       hash_table_iterator iter;
 952       for (hash_table_iterate (downloaded_files_hash, &iter);
 953            hash_table_iter_next (&iter);
 954            )
 955         xfree (iter.key);
 956       hash_table_destroy (downloaded_files_hash);
 957       downloaded_files_hash = NULL;
 958     }
 959 }
 960 \f
/* Remembers broken links.  */

/* One node of the singly linked list of referrers kept for each
   broken URL in nonexisting_urls_hash.  */
struct broken_urls_list
{
  char *url;                      /* referrer URL; NULL when nonexisting_url
                                     was given no referrer */
  struct broken_urls_list *next;  /* next node, or NULL at the end */
};
 968
 969 static bool
 970 in_list (const struct broken_urls_list *list, const char *url)
 971 {
 972   const struct broken_urls_list *ptr;
 973
 974   for (ptr = list; ptr; ptr = ptr->next)
 975     {
 976       /* str[case]cmp is inadequate for URL comparison */
 977       if (are_urls_equal (url, ptr->url) == 0) return true;
 978     }
 979
 980   return false;
 981 }
 982
 983 void
 984 nonexisting_url (const char *url, const char *referrer)
 985 {
 986   struct broken_urls_list *list;
 987
 988   /* Ignore robots.txt URLs */
 989   if (is_robots_txt_url (url))
 990     return;
 991
 992   if (!nonexisting_urls_hash)
 993     nonexisting_urls_hash = make_string_hash_table (0);
 994
 995   list = hash_table_get (nonexisting_urls_hash, url);
 996   if (!list)
 997     {
 998       list = (struct broken_urls_list *) xnew0 (struct broken_urls_list);
 999       list->url = referrer ? xstrdup (referrer) : NULL;
1000       hash_table_put (nonexisting_urls_hash, xstrdup (url), list);
1001     }
1002   else if (list && !in_list (list, referrer))
1003     {
1004       /* Append referrer at the end of the list */
1005       struct broken_urls_list *newnode;
1006
1007       while (list->next) list = list->next;
1008
1009       newnode = xnew0 (struct broken_urls_list);
1010       newnode->url = xstrdup (referrer);
1011       list->next = newnode;
1012     }
1013 }
1014
1015 static void
1016 nonexisting_urls_free (void)
1017 {
1018   if (nonexisting_urls_hash)
1019     {
1020       hash_table_iterator iter;
1021       for (hash_table_iterate (nonexisting_urls_hash, &iter);
1022            hash_table_iter_next (&iter);
1023            )
1024         {
1025           xfree (iter.key);
1026           xfree (iter.value);
1027         }
1028       hash_table_destroy (nonexisting_urls_hash);
1029       nonexisting_urls_hash = NULL;
1030     }
1031 }
1032
1033 void
1034 print_broken_links (void)
1035 {
1036   hash_table_iterator iter;
1037   int num_elems;
1038
1039   if (!nonexisting_urls_hash)
1040     {
1041       logprintf (LOG_NOTQUIET, _("Found no broken links.\n\n"));
1042       return;
1043     }
1044
1045   num_elems = hash_table_count (nonexisting_urls_hash);
1046   assert (num_elems > 0);
1047
1048   if (num_elems > 1)
1049     {
1050       logprintf (LOG_NOTQUIET, _("Found %d broken links.\n\n"),
1051                  num_elems);
1052     }
1053   else
1054     {
1055       logprintf (LOG_NOTQUIET, _("Found 1 broken link.\n\n"));
1056     }
1057
1058   for (hash_table_iterate (nonexisting_urls_hash, &iter);
1059        hash_table_iter_next (&iter);
1060        )
1061     {
1062       struct broken_urls_list *list;
1063
1064       logprintf (LOG_NOTQUIET, _("%s referred by:\n"), (const char *)iter.key);
1065
1066       for (list = (struct broken_urls_list *) iter.value;
1067            list;
1068            list = list->next)
1069         {
1070           logprintf (LOG_NOTQUIET, _("    %s\n"), list->url);
1071         }
1072     }
1073   logputs (LOG_NOTQUIET, "\n");
1074 }
1075
1076 \f
/* Return a freshly allocated copy of S in which the characters that
   are special in HTML are replaced by entities, as per RFC1866:

   `&' -> `&amp;'
   `<' -> `&lt;'
   `>' -> `&gt;'
   `"' -> `&quot;'
   SP  -> `&#32;'

   Every other character is copied unchanged.  The result is obtained
   from xmalloc and is NUL-terminated; the caller owns it.  */
char *
html_quote_string (const char *s)
{
  const char *src;
  char *out, *res;
  int len = 0;

  /* First pass: work out how long the quoted string will be.  */
  for (src = s; *src; src++)
    {
      switch (*src)
        {
        case '&':
          len += 5;             /* `&amp;' */
          break;
        case '<': case '>':
          len += 4;             /* `&lt;' and `&gt;' */
          break;
        case '\"':
          len += 6;             /* `&quot;' */
          break;
        case ' ':
          len += 5;             /* `&#32;' */
          break;
        default:
          len += 1;
          break;
        }
    }

  res = xmalloc (len + 1);

  /* Second pass: emit either the entity or the character itself.  */
  out = res;
  for (src = s; *src; src++)
    {
      const char *entity = NULL;

      if (*src == '&')
        entity = "&amp;";
      else if (*src == '<')
        entity = "&lt;";
      else if (*src == '>')
        entity = "&gt;";
      else if (*src == '\"')
        entity = "&quot;";
      else if (*src == ' ')
        entity = "&#32;";

      if (entity)
        {
          size_t entity_len = strlen (entity);
          memcpy (out, entity, entity_len);
          out += entity_len;
        }
      else
        *out++ = *src;
    }
  *out = '\0';
  return res;
}
1148
1149 /*
1150  * vim: et ts=2 sw=2
1151  */
1152