sjero.net Git - wget/blob - src/convert.c

   1 /* Conversion of links to local files.
   2    Copyright (C) 2003-2006 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3 of the License, or
   9  (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  18
  19 In addition, as a special exception, the Free Software Foundation
  20 gives permission to link the code of its release of Wget with the
  21 OpenSSL project's "OpenSSL" library (or with modified versions of it
  22 that use the same license as the "OpenSSL" library), and distribute
  23 the linked executables.  You must obey the GNU General Public License
  24 in all respects for all of the code used other than "OpenSSL".  If you
  25 modify this file, you may extend this exception to your version of the
  26 file, but you are not obligated to do so.  If you do not wish to do
  27 so, delete this exception statement from your version.  */
  28
  29 #include <config.h>
  30
  31 #include <stdio.h>
  32 #include <stdlib.h>
  33 #include <string.h>
  34 #ifdef HAVE_UNISTD_H
  35 # include <unistd.h>
  36 #endif /* HAVE_UNISTD_H */
  37 #include <errno.h>
  38 #include <assert.h>
  39
  40 #include "wget.h"
  41 #include "convert.h"
  42 #include "url.h"
  43 #include "recur.h"
  44 #include "utils.h"
  45 #include "hash.h"
  46 #include "ptimer.h"
  47 #include "res.h"
  48
  49 static struct hash_table *dl_file_url_map;
  50 struct hash_table *dl_url_file_map;
  51
  52 /* Set of HTML files downloaded in this Wget run, used for link
  53    conversion after Wget is done.  */
  54 struct hash_table *downloaded_html_set;
  55
  56 static void convert_links (const char *, struct urlpos *);
  57
  58 /* This function is called when the retrieval is done to convert the
  59    links that have been downloaded.  It has to be called at the end of
  60    the retrieval, because only then does Wget know conclusively which
  61    URLs have been downloaded, and which not, so it can tell which
  62    direction to convert to.
  63
  64    The "direction" means that the URLs to the files that have been
  65    downloaded get converted to the relative URL which will point to
  66    that file.  And the other URLs get converted to the remote URL on
  67    the server.
  68
  69    All the downloaded HTMLs are kept in downloaded_html_files, and
  70    downloaded URLs in urls_downloaded.  All the information is
  71    extracted from these two lists.  */
  72
  73 void
  74 convert_all_links (void)
  75 {
  76   int i;
  77   double secs;
  78   int file_count = 0;
  79
  80   struct ptimer *timer = ptimer_new ();
  81
  82   int cnt;
  83   char **file_array;
  84
  85   cnt = 0;
  86   if (downloaded_html_set)
  87     cnt = hash_table_count (downloaded_html_set);
  88   if (cnt == 0)
  89     return;
  90   file_array = alloca_array (char *, cnt);
  91   string_set_to_array (downloaded_html_set, file_array);
  92
  93   for (i = 0; i < cnt; i++)
  94     {
  95       struct urlpos *urls, *cur_url;
  96       char *url;
  97       char *file = file_array[i];
  98
  99       /* Determine the URL of the HTML file.  get_urls_html will need
 100          it.  */
 101       url = hash_table_get (dl_file_url_map, file);
 102       if (!url)
 103         {
 104           DEBUGP (("Apparently %s has been removed.\n", file));
 105           continue;
 106         }
 107
 108       DEBUGP (("Scanning %s (from %s)\n", file, url));
 109
 110       /* Parse the HTML file...  */
 111       urls = get_urls_html (file, url, NULL);
 112
 113       /* We don't respect meta_disallow_follow here because, even if
 114          the file is not followed, we might still want to convert the
 115          links that have been followed from other files.  */
 116
 117       for (cur_url = urls; cur_url; cur_url = cur_url->next)
 118         {
 119           char *local_name;
 120           struct url *u = cur_url->url;
 121
 122           if (cur_url->link_base_p)
 123             {
 124               /* Base references have been resolved by our parser, so
 125                  we turn the base URL into an empty string.  (Perhaps
 126                  we should remove the tag entirely?)  */
 127               cur_url->convert = CO_NULLIFY_BASE;
 128               continue;
 129             }
 130
 131           /* We decide the direction of conversion according to whether
 132              a URL was downloaded.  Downloaded URLs will be converted
 133              ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
 134           local_name = hash_table_get (dl_url_file_map, u->url);
 135
 136           /* Decide on the conversion type.  */
 137           if (local_name)
 138             {
 139               /* We've downloaded this URL.  Convert it to relative
 140                  form.  We do this even if the URL already is in
 141                  relative form, because our directory structure may
 142                  not be identical to that on the server (think `-nd',
 143                  `--cut-dirs', etc.)  */
 144               cur_url->convert = CO_CONVERT_TO_RELATIVE;
 145               cur_url->local_name = xstrdup (local_name);
 146               DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
 147             }
 148           else
 149             {
 150               /* We haven't downloaded this URL.  If it's not already
 151                  complete (including a full host name), convert it to
 152                  that form, so it can be reached while browsing this
 153                  HTML locally.  */
 154               if (!cur_url->link_complete_p)
 155                 cur_url->convert = CO_CONVERT_TO_COMPLETE;
 156               cur_url->local_name = NULL;
 157               DEBUGP (("will convert url %s to complete\n", u->url));
 158             }
 159         }
 160
 161       /* Convert the links in the file.  */
 162       convert_links (file, urls);
 163       ++file_count;
 164
 165       /* Free the data.  */
 166       free_urlpos (urls);
 167     }
 168
 169   secs = ptimer_measure (timer);
 170   ptimer_destroy (timer);
 171   logprintf (LOG_VERBOSE, _("Converted %d files in %s seconds.\n"),
 172              file_count, print_decimal (secs));
 173 }
 174
 175 static void write_backup_file (const char *, downloaded_file_t);
 176 static const char *replace_attr (const char *, int, FILE *, const char *);
 177 static const char *replace_attr_refresh_hack (const char *, int, FILE *,
 178                                               const char *, int);
 179 static char *local_quote_string (const char *);
 180 static char *construct_relative (const char *, const char *);
 181
 182 /* Change the links in one HTML file.  LINKS is a list of links in the
 183    document, along with their positions and the desired direction of
 184    the conversion.  */
 185 static void
 186 convert_links (const char *file, struct urlpos *links)
 187 {
 188   struct file_memory *fm;
 189   FILE *fp;
 190   const char *p;
 191   downloaded_file_t downloaded_file_return;
 192
 193   struct urlpos *link;
 194   int to_url_count = 0, to_file_count = 0;
 195
 196   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
 197
 198   {
 199     /* First we do a "dry run": go through the list L and see whether
 200        any URL needs to be converted in the first place.  If not, just
 201        leave the file alone.  */
 202     int dry_count = 0;
 203     struct urlpos *dry;
 204     for (dry = links; dry; dry = dry->next)
 205       if (dry->convert != CO_NOCONVERT)
 206         ++dry_count;
 207     if (!dry_count)
 208       {
 209         logputs (LOG_VERBOSE, _("nothing to do.\n"));
 210         return;
 211       }
 212   }
 213
 214   fm = read_file (file);
 215   if (!fm)
 216     {
 217       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
 218                  file, strerror (errno));
 219       return;
 220     }
 221
 222   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
 223   if (opt.backup_converted && downloaded_file_return)
 224     write_backup_file (file, downloaded_file_return);
 225
 226   /* Before opening the file for writing, unlink the file.  This is
 227      important if the data in FM is mmaped.  In such case, nulling the
 228      file, which is what fopen() below does, would make us read all
 229      zeroes from the mmaped region.  */
 230   if (unlink (file) < 0 && errno != ENOENT)
 231     {
 232       logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
 233                  file, strerror (errno));
 234       read_file_free (fm);
 235       return;
 236     }
 237   /* Now open the file for writing.  */
 238   fp = fopen (file, "wb");
 239   if (!fp)
 240     {
 241       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
 242                  file, strerror (errno));
 243       read_file_free (fm);
 244       return;
 245     }
 246
 247   /* Here we loop through all the URLs in file, replacing those of
 248      them that are downloaded with relative references.  */
 249   p = fm->content;
 250   for (link = links; link; link = link->next)
 251     {
 252       char *url_start = fm->content + link->pos;
 253
 254       if (link->pos >= fm->length)
 255         {
 256           DEBUGP (("Something strange is going on.  Please investigate."));
 257           break;
 258         }
 259       /* If the URL is not to be converted, skip it.  */
 260       if (link->convert == CO_NOCONVERT)
 261         {
 262           DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
 263           continue;
 264         }
 265
 266       /* Echo the file contents, up to the offending URL's opening
 267          quote, to the outfile.  */
 268       fwrite (p, 1, url_start - p, fp);
 269       p = url_start;
 270
 271       switch (link->convert)
 272         {
 273         case CO_CONVERT_TO_RELATIVE:
 274           /* Convert absolute URL to relative. */
 275           {
 276             char *newname = construct_relative (file, link->local_name);
 277             char *quoted_newname = local_quote_string (newname);
 278
 279             if (!link->link_refresh_p)
 280               p = replace_attr (p, link->size, fp, quoted_newname);
 281             else
 282               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
 283                                              link->refresh_timeout);
 284
 285             DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
 286                      link->url->url, newname, link->pos, file));
 287             xfree (newname);
 288             xfree (quoted_newname);
 289             ++to_file_count;
 290             break;
 291           }
 292         case CO_CONVERT_TO_COMPLETE:
 293           /* Convert the link to absolute URL. */
 294           {
 295             char *newlink = link->url->url;
 296             char *quoted_newlink = html_quote_string (newlink);
 297
 298             if (!link->link_refresh_p)
 299               p = replace_attr (p, link->size, fp, quoted_newlink);
 300             else
 301               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
 302                                              link->refresh_timeout);
 303
 304             DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
 305                      newlink, link->pos, file));
 306             xfree (quoted_newlink);
 307             ++to_url_count;
 308             break;
 309           }
 310         case CO_NULLIFY_BASE:
 311           /* Change the base href to "". */
 312           p = replace_attr (p, link->size, fp, "");
 313           break;
 314         case CO_NOCONVERT:
 315           abort ();
 316           break;
 317         }
 318     }
 319
 320   /* Output the rest of the file. */
 321   if (p - fm->content < fm->length)
 322     fwrite (p, 1, fm->length - (p - fm->content), fp);
 323   fclose (fp);
 324   read_file_free (fm);
 325
 326   logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
 327 }
 328
 329 /* Construct and return a link that points from BASEFILE to LINKFILE.
 330    Both files should be local file names, BASEFILE of the referrering
 331    file, and LINKFILE of the referred file.
 332
 333    Examples:
 334
 335    cr("foo", "bar")         -> "bar"
 336    cr("A/foo", "A/bar")     -> "bar"
 337    cr("A/foo", "A/B/bar")   -> "B/bar"
 338    cr("A/X/foo", "A/Y/bar") -> "../Y/bar"
 339    cr("X/", "Y/bar")        -> "../Y/bar" (trailing slash does matter in BASE)
 340
 341    Both files should be absolute or relative, otherwise strange
 342    results might ensue.  The function makes no special efforts to
 343    handle "." and ".." in links, so make sure they're not there
 344    (e.g. using path_simplify).  */
 345
 346 static char *
 347 construct_relative (const char *basefile, const char *linkfile)
 348 {
 349   char *link;
 350   int basedirs;
 351   const char *b, *l;
 352   int i, start;
 353
 354   /* First, skip the initial directory components common to both
 355      files.  */
 356   start = 0;
 357   for (b = basefile, l = linkfile; *b == *l && *b != '\0'; ++b, ++l)
 358     {
 359       if (*b == '/')
 360         start = (b - basefile) + 1;
 361     }
 362   basefile += start;
 363   linkfile += start;
 364
 365   /* With common directories out of the way, the situation we have is
 366      as follows:
 367          b - b1/b2/[...]/bfile
 368          l - l1/l2/[...]/lfile
 369
 370      The link we're constructing needs to be:
 371        lnk - ../../l1/l2/[...]/lfile
 372
 373      Where the number of ".."'s equals the number of bN directory
 374      components in B.  */
 375
 376   /* Count the directory components in B. */
 377   basedirs = 0;
 378   for (b = basefile; *b; b++)
 379     {
 380       if (*b == '/')
 381         ++basedirs;
 382     }
 383
 384   /* Construct LINK as explained above. */
 385   link = xmalloc (3 * basedirs + strlen (linkfile) + 1);
 386   for (i = 0; i < basedirs; i++)
 387     memcpy (link + 3 * i, "../", 3);
 388   strcpy (link + 3 * i, linkfile);
 389   return link;
 390 }
 391
 392 /* Used by write_backup_file to remember which files have been
 393    written. */
 394 static struct hash_table *converted_files;
 395
 396 static void
 397 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
 398 {
 399   /* Rather than just writing over the original .html file with the
 400      converted version, save the former to *.orig.  Note we only do
 401      this for files we've _successfully_ downloaded, so we don't
 402      clobber .orig files sitting around from previous invocations. */
 403
 404   /* Construct the backup filename as the original name plus ".orig". */
 405   size_t         filename_len = strlen (file);
 406   char*          filename_plus_orig_suffix;
 407
 408   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
 409     {
 410       /* Just write "orig" over "html".  We need to do it this way
 411          because when we're checking to see if we've downloaded the
 412          file before (to see if we can skip downloading it), we don't
 413          know if it's a text/html file.  Therefore we don't know yet
 414          at that stage that -E is going to cause us to tack on
 415          ".html", so we need to compare vs. the original URL plus
 416          ".orig", not the original URL plus ".html.orig". */
 417       filename_plus_orig_suffix = alloca (filename_len + 1);
 418       strcpy (filename_plus_orig_suffix, file);
 419       strcpy ((filename_plus_orig_suffix + filename_len) - 4, "orig");
 420     }
 421   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
 422     {
 423       /* Append ".orig" to the name. */
 424       filename_plus_orig_suffix = alloca (filename_len + sizeof (".orig"));
 425       strcpy (filename_plus_orig_suffix, file);
 426       strcpy (filename_plus_orig_suffix + filename_len, ".orig");
 427     }
 428
 429   if (!converted_files)
 430     converted_files = make_string_hash_table (0);
 431
 432   /* We can get called twice on the same URL thanks to the
 433      convert_all_links() call in main().  If we write the .orig file
 434      each time in such a case, it'll end up containing the first-pass
 435      conversion, not the original file.  So, see if we've already been
 436      called on this file. */
 437   if (!string_set_contains (converted_files, file))
 438     {
 439       /* Rename <file> to <file>.orig before former gets written over. */
 440       if (rename (file, filename_plus_orig_suffix) != 0)
 441         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
 442                    file, filename_plus_orig_suffix, strerror (errno));
 443
 444       /* Remember that we've already written a .orig backup for this file.
 445          Note that we never free this memory since we need it till the
 446          convert_all_links() call, which is one of the last things the
 447          program does before terminating.  BTW, I'm not sure if it would be
 448          safe to just set 'converted_file_ptr->string' to 'file' below,
 449          rather than making a copy of the string...  Another note is that I
 450          thought I could just add a field to the urlpos structure saying
 451          that we'd written a .orig file for this URL, but that didn't work,
 452          so I had to make this separate list.
 453          -- Dan Harkless <wget@harkless.org>
 454
 455          This [adding a field to the urlpos structure] didn't work
 456          because convert_file() is called from convert_all_links at
 457          the end of the retrieval with a freshly built new urlpos
 458          list.
 459          -- Hrvoje Niksic <hniksic@xemacs.org>
 460       */
 461       string_set_add (converted_files, file);
 462     }
 463 }
 464
 465 static bool find_fragment (const char *, int, const char **, const char **);
 466
 467 /* Replace an attribute's original text with NEW_TEXT. */
 468
 469 static const char *
 470 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
 471 {
 472   bool quote_flag = false;
 473   char quote_char = '\"';       /* use "..." for quoting, unless the
 474                                    original value is quoted, in which
 475                                    case reuse its quoting char. */
 476   const char *frag_beg, *frag_end;
 477
 478   /* Structure of our string is:
 479        "...old-contents..."
 480        <---    size    --->  (with quotes)
 481      OR:
 482        ...old-contents...
 483        <---    size   -->    (no quotes)   */
 484
 485   if (*p == '\"' || *p == '\'')
 486     {
 487       quote_char = *p;
 488       quote_flag = true;
 489       ++p;
 490       size -= 2;                /* disregard opening and closing quote */
 491     }
 492   putc (quote_char, fp);
 493   fputs (new_text, fp);
 494
 495   /* Look for fragment identifier, if any. */
 496   if (find_fragment (p, size, &frag_beg, &frag_end))
 497     fwrite (frag_beg, 1, frag_end - frag_beg, fp);
 498   p += size;
 499   if (quote_flag)
 500     ++p;
 501   putc (quote_char, fp);
 502
 503   return p;
 504 }
 505
 506 /* The same as REPLACE_ATTR, but used when replacing
 507    <meta http-equiv=refresh content="new_text"> because we need to
 508    append "timeout_value; URL=" before the next_text.  */
 509
 510 static const char *
 511 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
 512                            const char *new_text, int timeout)
 513 {
 514   /* "0; URL=..." */
 515   char *new_with_timeout = (char *)alloca (numdigit (timeout)
 516                                            + 6 /* "; URL=" */
 517                                            + strlen (new_text)
 518                                            + 1);
 519   sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
 520
 521   return replace_attr (p, size, fp, new_with_timeout);
 522 }
 523
 524 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
 525    preceded by '&'.  If the character is not found, return zero.  If
 526    the character is found, return true and set BP and EP to point to
 527    the beginning and end of the region.
 528
 529    This is used for finding the fragment indentifiers in URLs.  */
 530
 531 static bool
 532 find_fragment (const char *beg, int size, const char **bp, const char **ep)
 533 {
 534   const char *end = beg + size;
 535   bool saw_amp = false;
 536   for (; beg < end; beg++)
 537     {
 538       switch (*beg)
 539         {
 540         case '&':
 541           saw_amp = true;
 542           break;
 543         case '#':
 544           if (!saw_amp)
 545             {
 546               *bp = beg;
 547               *ep = end;
 548               return true;
 549             }
 550           /* fallthrough */
 551         default:
 552           saw_amp = false;
 553         }
 554     }
 555   return false;
 556 }
 557
 558 /* Quote FILE for use as local reference to an HTML file.
 559
 560    We quote ? as %3F to avoid passing part of the file name as the
 561    parameter when browsing the converted file through HTTP.  However,
 562    it is safe to do this only when `--html-extension' is turned on.
 563    This is because converting "index.html?foo=bar" to
 564    "index.html%3Ffoo=bar" would break local browsing, as the latter
 565    isn't even recognized as an HTML file!  However, converting
 566    "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
 567    safe for both local and HTTP-served browsing.
 568
 569    We always quote "#" as "%23" and "%" as "%25" because those
 570    characters have special meanings in URLs.  */
 571
 572 static char *
 573 local_quote_string (const char *file)
 574 {
 575   const char *from;
 576   char *newname, *to;
 577
 578   char *any = strpbrk (file, "?#%");
 579   if (!any)
 580     return html_quote_string (file);
 581
 582   /* Allocate space assuming the worst-case scenario, each character
 583      having to be quoted.  */
 584   to = newname = (char *)alloca (3 * strlen (file) + 1);
 585   for (from = file; *from; from++)
 586     switch (*from)
 587       {
 588       case '%':
 589         *to++ = '%';
 590         *to++ = '2';
 591         *to++ = '5';
 592         break;
 593       case '#':
 594         *to++ = '%';
 595         *to++ = '2';
 596         *to++ = '3';
 597         break;
 598       case '?':
 599         if (opt.html_extension)
 600           {
 601             *to++ = '%';
 602             *to++ = '3';
 603             *to++ = 'F';
 604             break;
 605           }
 606         /* fallthrough */
 607       default:
 608         *to++ = *from;
 609       }
 610   *to = '\0';
 611
 612   return html_quote_string (newname);
 613 }
 614 \f
 615 /* Book-keeping code for dl_file_url_map, dl_url_file_map,
 616    downloaded_html_list, and downloaded_html_set.  Other code calls
 617    these functions to let us know that a file has been downloaded.  */
 618
 619 #define ENSURE_TABLES_EXIST do {                        \
 620   if (!dl_file_url_map)                                 \
 621     dl_file_url_map = make_string_hash_table (0);       \
 622   if (!dl_url_file_map)                                 \
 623     dl_url_file_map = make_string_hash_table (0);       \
 624 } while (0)
 625
 626 /* Return true if S1 and S2 are the same, except for "/index.html".
 627    The three cases in which it returns one are (substitute any
 628    substring for "foo"):
 629
 630    m("foo/index.html", "foo/")  ==> 1
 631    m("foo/", "foo/index.html")  ==> 1
 632    m("foo", "foo/index.html")   ==> 1
 633    m("foo", "foo/"              ==> 1
 634    m("foo", "foo")              ==> 1  */
 635
 636 static bool
 637 match_except_index (const char *s1, const char *s2)
 638 {
 639   int i;
 640   const char *lng;
 641
 642   /* Skip common substring. */
 643   for (i = 0; *s1 && *s2 && *s1 == *s2; s1++, s2++, i++)
 644     ;
 645   if (i == 0)
 646     /* Strings differ at the very beginning -- bail out.  We need to
 647        check this explicitly to avoid `lng - 1' reading outside the
 648        array.  */
 649     return false;
 650
 651   if (!*s1 && !*s2)
 652     /* Both strings hit EOF -- strings are equal. */
 653     return true;
 654   else if (*s1 && *s2)
 655     /* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux". */
 656     return false;
 657   else if (*s1)
 658     /* S1 is the longer one. */
 659     lng = s1;
 660   else
 661     /* S2 is the longer one. */
 662     lng = s2;
 663
 664   /* foo            */            /* foo/           */
 665   /* foo/index.html */  /* or */  /* foo/index.html */
 666   /*    ^           */            /*     ^          */
 667
 668   if (*lng != '/')
 669     /* The right-hand case. */
 670     --lng;
 671
 672   if (*lng == '/' && *(lng + 1) == '\0')
 673     /* foo  */
 674     /* foo/ */
 675     return true;
 676
 677   return 0 == strcmp (lng, "/index.html");
 678 }
 679
 680 static int
 681 dissociate_urls_from_file_mapper (void *key, void *value, void *arg)
 682 {
 683   char *mapping_url = (char *)key;
 684   char *mapping_file = (char *)value;
 685   char *file = (char *)arg;
 686
 687   if (0 == strcmp (mapping_file, file))
 688     {
 689       hash_table_remove (dl_url_file_map, mapping_url);
 690       xfree (mapping_url);
 691       xfree (mapping_file);
 692     }
 693
 694   /* Continue mapping. */
 695   return 0;
 696 }
 697
 698 /* Remove all associations from various URLs to FILE from dl_url_file_map. */
 699
 700 static void
 701 dissociate_urls_from_file (const char *file)
 702 {
 703   /* Can't use hash_table_iter_* because the table mutates while mapping.  */
 704   hash_table_for_each (dl_url_file_map, dissociate_urls_from_file_mapper,
 705                        (char *) file);
 706 }
 707
 708 /* Register that URL has been successfully downloaded to FILE.  This
 709    is used by the link conversion code to convert references to URLs
 710    to references to local files.  It is also being used to check if a
 711    URL has already been downloaded.  */
 712
 713 void
 714 register_download (const char *url, const char *file)
 715 {
 716   char *old_file, *old_url;
 717
 718   ENSURE_TABLES_EXIST;
 719
 720   /* With some forms of retrieval, it is possible, although not likely
 721      or particularly desirable.  If both are downloaded, the second
 722      download will override the first one.  When that happens,
 723      dissociate the old file name from the URL.  */
 724
 725   if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
 726     {
 727       if (0 == strcmp (url, old_url))
 728         /* We have somehow managed to download the same URL twice.
 729            Nothing to do.  */
 730         return;
 731
 732       if (match_except_index (url, old_url)
 733           && !hash_table_contains (dl_url_file_map, url))
 734         /* The two URLs differ only in the "index.html" ending.  For
 735            example, one is "http://www.server.com/", and the other is
 736            "http://www.server.com/index.html".  Don't remove the old
 737            one, just add the new one as a non-canonical entry.  */
 738         goto url_only;
 739
 740       hash_table_remove (dl_file_url_map, file);
 741       xfree (old_file);
 742       xfree (old_url);
 743
 744       /* Remove all the URLs that point to this file.  Yes, there can
 745          be more than one such URL, because we store redirections as
 746          multiple entries in dl_url_file_map.  For example, if URL1
 747          redirects to URL2 which gets downloaded to FILE, we map both
 748          URL1 and URL2 to FILE in dl_url_file_map.  (dl_file_url_map
 749          only points to URL2.)  When another URL gets loaded to FILE,
 750          we want both URL1 and URL2 dissociated from it.
 751
 752          This is a relatively expensive operation because it performs
 753          a linear search of the whole hash table, but it should be
 754          called very rarely, only when two URLs resolve to the same
 755          file name, *and* the "<file>.1" extensions are turned off.
 756          In other words, almost never.  */
 757       dissociate_urls_from_file (file);
 758     }
 759
 760   hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
 761
 762  url_only:
 763   /* A URL->FILE mapping is not possible without a FILE->URL mapping.
 764      If the latter were present, it should have been removed by the
 765      above `if'.  So we could write:
 766
 767          assert (!hash_table_contains (dl_url_file_map, url));
 768
 769      The above is correct when running in recursive mode where the
 770      same URL always resolves to the same file.  But if you do
 771      something like:
 772
 773          wget URL URL
 774
 775      then the first URL will resolve to "FILE", and the other to
 776      "FILE.1".  In that case, FILE.1 will not be found in
 777      dl_file_url_map, but URL will still point to FILE in
 778      dl_url_file_map.  */
 779   if (hash_table_get_pair (dl_url_file_map, url, &old_url, &old_file))
 780     {
 781       hash_table_remove (dl_url_file_map, url);
 782       xfree (old_url);
 783       xfree (old_file);
 784     }
 785
 786   hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
 787 }
 788
 789 /* Register that FROM has been redirected to TO.  This assumes that TO
 790    is successfully downloaded and already registered using
 791    register_download() above.  */
 792
 793 void
 794 register_redirection (const char *from, const char *to)
 795 {
 796   char *file;
 797
 798   ENSURE_TABLES_EXIST;
 799
 800   file = hash_table_get (dl_url_file_map, to);
 801   assert (file != NULL);
 802   if (!hash_table_contains (dl_url_file_map, from))
 803     hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
 804 }
 805
 806 /* Register that the file has been deleted. */
 807
 808 void
 809 register_delete_file (const char *file)
 810 {
 811   char *old_url, *old_file;
 812
 813   ENSURE_TABLES_EXIST;
 814
 815   if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
 816     return;
 817
 818   hash_table_remove (dl_file_url_map, file);
 819   xfree (old_file);
 820   xfree (old_url);
 821   dissociate_urls_from_file (file);
 822 }
 823
 824 /* Register that FILE is an HTML file that has been downloaded. */
 825
 826 void
 827 register_html (const char *url, const char *file)
 828 {
 829   if (!downloaded_html_set)
 830     downloaded_html_set = make_string_hash_table (0);
 831   string_set_add (downloaded_html_set, file);
 832 }
 833
 834 static void downloaded_files_free (void);
 835
 836 /* Cleanup the data structures associated with this file.  */
 837
 838 void
 839 convert_cleanup (void)
 840 {
 841   if (dl_file_url_map)
 842     {
 843       free_keys_and_values (dl_file_url_map);
 844       hash_table_destroy (dl_file_url_map);
 845       dl_file_url_map = NULL;
 846     }
 847   if (dl_url_file_map)
 848     {
 849       free_keys_and_values (dl_url_file_map);
 850       hash_table_destroy (dl_url_file_map);
 851       dl_url_file_map = NULL;
 852     }
 853   if (downloaded_html_set)
 854     string_set_free (downloaded_html_set);
 855   downloaded_files_free ();
 856   if (converted_files)
 857     string_set_free (converted_files);
 858 }
 859 \f
 860 /* Book-keeping code for downloaded files that enables extension
 861    hacks.  */
 862
 863 /* This table should really be merged with dl_file_url_map and
 864    downloaded_html_set.  This was originally a list, but I changed
 865    it to a hash table because it was actually taking a lot of time to
 866    find things in it.  */
 867
 868 static struct hash_table *downloaded_files_hash;
 869
 870 /* We're storing "modes" of type downloaded_file_t in the hash table.
 871    However, our hash tables only accept pointers for keys and values.
 872    So when we need a pointer, we use the address of a
 873    downloaded_file_t variable of static storage.  */
 874
 875 static downloaded_file_t *
 876 downloaded_mode_to_ptr (downloaded_file_t mode)
 877 {
 878   static downloaded_file_t
 879     v1 = FILE_NOT_ALREADY_DOWNLOADED,
 880     v2 = FILE_DOWNLOADED_NORMALLY,
 881     v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
 882     v4 = CHECK_FOR_FILE;
 883
 884   switch (mode)
 885     {
 886     case FILE_NOT_ALREADY_DOWNLOADED:
 887       return &v1;
 888     case FILE_DOWNLOADED_NORMALLY:
 889       return &v2;
 890     case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
 891       return &v3;
 892     case CHECK_FOR_FILE:
 893       return &v4;
 894     }
 895   return NULL;
 896 }
 897
 898 /* Remembers which files have been downloaded.  In the standard case,
 899    should be called with mode == FILE_DOWNLOADED_NORMALLY for each
 900    file we actually download successfully (i.e. not for ones we have
 901    failures on or that we skip due to -N).
 902
 903    When we've downloaded a file and tacked on a ".html" extension due
 904    to -E, call this function with
 905    FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
 906    FILE_DOWNLOADED_NORMALLY.
 907
 908    If you just want to check if a file has been previously added
 909    without adding it, call with mode == CHECK_FOR_FILE.  Please be
 910    sure to call this function with local filenames, not remote
 911    URLs.  */
 912
 913 downloaded_file_t
 914 downloaded_file (downloaded_file_t mode, const char *file)
 915 {
 916   downloaded_file_t *ptr;
 917
 918   if (mode == CHECK_FOR_FILE)
 919     {
 920       if (!downloaded_files_hash)
 921         return FILE_NOT_ALREADY_DOWNLOADED;
 922       ptr = hash_table_get (downloaded_files_hash, file);
 923       if (!ptr)
 924         return FILE_NOT_ALREADY_DOWNLOADED;
 925       return *ptr;
 926     }
 927
 928   if (!downloaded_files_hash)
 929     downloaded_files_hash = make_string_hash_table (0);
 930
 931   ptr = hash_table_get (downloaded_files_hash, file);
 932   if (ptr)
 933     return *ptr;
 934
 935   ptr = downloaded_mode_to_ptr (mode);
 936   hash_table_put (downloaded_files_hash, xstrdup (file), ptr);
 937
 938   return FILE_NOT_ALREADY_DOWNLOADED;
 939 }
 940
 941 static void
 942 downloaded_files_free (void)
 943 {
 944   if (downloaded_files_hash)
 945     {
 946       hash_table_iterator iter;
 947       for (hash_table_iterate (downloaded_files_hash, &iter);
 948            hash_table_iter_next (&iter);
 949            )
 950         xfree (iter.key);
 951       hash_table_destroy (downloaded_files_hash);
 952       downloaded_files_hash = NULL;
 953     }
 954 }
 955 \f
 956 /* The function returns the pointer to the malloc-ed quoted version of
 957    string s.  It will recognize and quote numeric and special graphic
 958    entities, as per RFC1866:
 959
 960    `&' -> `&amp;'
 961    `<' -> `&lt;'
 962    `>' -> `&gt;'
 963    `"' -> `&quot;'
 964    SP  -> `&#32;'
 965
 966    No other entities are recognized or replaced.  */
 967 char *
 968 html_quote_string (const char *s)
 969 {
 970   const char *b = s;
 971   char *p, *res;
 972   int i;
 973
 974   /* Pass through the string, and count the new size.  */
 975   for (i = 0; *s; s++, i++)
 976     {
 977       if (*s == '&')
 978         i += 4;                 /* `amp;' */
 979       else if (*s == '<' || *s == '>')
 980         i += 3;                 /* `lt;' and `gt;' */
 981       else if (*s == '\"')
 982         i += 5;                 /* `quot;' */
 983       else if (*s == ' ')
 984         i += 4;                 /* #32; */
 985     }
 986   res = xmalloc (i + 1);
 987   s = b;
 988   for (p = res; *s; s++)
 989     {
 990       switch (*s)
 991         {
 992         case '&':
 993           *p++ = '&';
 994           *p++ = 'a';
 995           *p++ = 'm';
 996           *p++ = 'p';
 997           *p++ = ';';
 998           break;
 999         case '<': case '>':
1000           *p++ = '&';
1001           *p++ = (*s == '<' ? 'l' : 'g');
1002           *p++ = 't';
1003           *p++ = ';';
1004           break;
1005         case '\"':
1006           *p++ = '&';
1007           *p++ = 'q';
1008           *p++ = 'u';
1009           *p++ = 'o';
1010           *p++ = 't';
1011           *p++ = ';';
1012           break;
1013         case ' ':
1014           *p++ = '&';
1015           *p++ = '#';
1016           *p++ = '3';
1017           *p++ = '2';
1018           *p++ = ';';
1019           break;
1020         default:
1021           *p++ = *s;
1022         }
1023     }
1024   *p = '\0';
1025   return res;
1026 }
1027
1028 /*
1029  * vim: et ts=2 sw=2
1030  */
1031