sjero.net Git - wget/blob - src/convert.c

   1 /* Conversion of links to local files.
   2    Copyright (C) 2003, 2004, 2005, 2006, 2007,
   3    2008 Free Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or
  10  (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include "wget.h"
  32
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 #ifdef HAVE_UNISTD_H
  37 # include <unistd.h>
  38 #endif /* HAVE_UNISTD_H */
  39 #include <errno.h>
  40 #include <assert.h>
  41 #include "convert.h"
  42 #include "url.h"
  43 #include "recur.h"
  44 #include "utils.h"
  45 #include "hash.h"
  46 #include "ptimer.h"
  47 #include "res.h"
  48
  49 static struct hash_table *dl_file_url_map;
  50 struct hash_table *dl_url_file_map;
  51
  52 /* Set of HTML files downloaded in this Wget run, used for link
  53    conversion after Wget is done.  */
  54 struct hash_table *downloaded_html_set;
  55
  56 static void convert_links (const char *, struct urlpos *);
  57
  58 /* This function is called when the retrieval is done to convert the
  59    links that have been downloaded.  It has to be called at the end of
  60    the retrieval, because only then does Wget know conclusively which
  61    URLs have been downloaded, and which not, so it can tell which
  62    direction to convert to.
  63
  64    The "direction" means that the URLs to the files that have been
  65    downloaded get converted to the relative URL which will point to
  66    that file.  And the other URLs get converted to the remote URL on
  67    the server.
  68
  69    All the downloaded HTMLs are kept in downloaded_html_files, and
  70    downloaded URLs in urls_downloaded.  All the information is
  71    extracted from these two lists.  */
  72
  73 void
  74 convert_all_links (void)
  75 {
  76   int i;
  77   double secs;
  78   int file_count = 0;
  79
  80   struct ptimer *timer = ptimer_new ();
  81
  82   int cnt;
  83   char **file_array;
  84
  85   cnt = 0;
  86   if (downloaded_html_set)
  87     cnt = hash_table_count (downloaded_html_set);
  88   if (cnt == 0)
  89     goto cleanup;
  90   file_array = alloca_array (char *, cnt);
  91   string_set_to_array (downloaded_html_set, file_array);
  92
  93   for (i = 0; i < cnt; i++)
  94     {
  95       struct urlpos *urls, *cur_url;
  96       char *url;
  97       char *file = file_array[i];
  98
  99       /* Determine the URL of the HTML file.  get_urls_html will need
 100          it.  */
 101       url = hash_table_get (dl_file_url_map, file);
 102       if (!url)
 103         {
 104           DEBUGP (("Apparently %s has been removed.\n", file));
 105           continue;
 106         }
 107
 108       DEBUGP (("Scanning %s (from %s)\n", file, url));
 109
 110       /* Parse the HTML file...  */
 111       urls = get_urls_html (file, url, NULL);
 112
 113       /* We don't respect meta_disallow_follow here because, even if
 114          the file is not followed, we might still want to convert the
 115          links that have been followed from other files.  */
 116
 117       for (cur_url = urls; cur_url; cur_url = cur_url->next)
 118         {
 119           char *local_name;
 120           struct url *u = cur_url->url;
 121
 122           if (cur_url->link_base_p)
 123             {
 124               /* Base references have been resolved by our parser, so
 125                  we turn the base URL into an empty string.  (Perhaps
 126                  we should remove the tag entirely?)  */
 127               cur_url->convert = CO_NULLIFY_BASE;
 128               continue;
 129             }
 130
 131           /* We decide the direction of conversion according to whether
 132              a URL was downloaded.  Downloaded URLs will be converted
 133              ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
 134           local_name = hash_table_get (dl_url_file_map, u->url);
 135
 136           /* Decide on the conversion type.  */
 137           if (local_name)
 138             {
 139               /* We've downloaded this URL.  Convert it to relative
 140                  form.  We do this even if the URL already is in
 141                  relative form, because our directory structure may
 142                  not be identical to that on the server (think `-nd',
 143                  `--cut-dirs', etc.)  */
 144               cur_url->convert = CO_CONVERT_TO_RELATIVE;
 145               cur_url->local_name = xstrdup (local_name);
 146               DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
 147             }
 148           else
 149             {
 150               /* We haven't downloaded this URL.  If it's not already
 151                  complete (including a full host name), convert it to
 152                  that form, so it can be reached while browsing this
 153                  HTML locally.  */
 154               if (!cur_url->link_complete_p)
 155                 cur_url->convert = CO_CONVERT_TO_COMPLETE;
 156               cur_url->local_name = NULL;
 157               DEBUGP (("will convert url %s to complete\n", u->url));
 158             }
 159         }
 160
 161       /* Convert the links in the file.  */
 162       convert_links (file, urls);
 163       ++file_count;
 164
 165       /* Free the data.  */
 166       free_urlpos (urls);
 167     }
 168
 169   secs = ptimer_measure (timer);
 170   logprintf (LOG_VERBOSE, _("Converted %d files in %s seconds.\n"),
 171              file_count, print_decimal (secs));
 172 cleanup:
 173   ptimer_destroy (timer);
 174 }
 175
 176 static void write_backup_file (const char *, downloaded_file_t);
 177 static const char *replace_attr (const char *, int, FILE *, const char *);
 178 static const char *replace_attr_refresh_hack (const char *, int, FILE *,
 179                                               const char *, int);
 180 static char *local_quote_string (const char *);
 181 static char *construct_relative (const char *, const char *);
 182
 183 /* Change the links in one HTML file.  LINKS is a list of links in the
 184    document, along with their positions and the desired direction of
 185    the conversion.  */
 186 static void
 187 convert_links (const char *file, struct urlpos *links)
 188 {
 189   struct file_memory *fm;
 190   FILE *fp;
 191   const char *p;
 192   downloaded_file_t downloaded_file_return;
 193
 194   struct urlpos *link;
 195   int to_url_count = 0, to_file_count = 0;
 196
 197   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
 198
 199   {
 200     /* First we do a "dry run": go through the list L and see whether
 201        any URL needs to be converted in the first place.  If not, just
 202        leave the file alone.  */
 203     int dry_count = 0;
 204     struct urlpos *dry;
 205     for (dry = links; dry; dry = dry->next)
 206       if (dry->convert != CO_NOCONVERT)
 207         ++dry_count;
 208     if (!dry_count)
 209       {
 210         logputs (LOG_VERBOSE, _("nothing to do.\n"));
 211         return;
 212       }
 213   }
 214
 215   fm = read_file (file);
 216   if (!fm)
 217     {
 218       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
 219                  file, strerror (errno));
 220       return;
 221     }
 222
 223   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
 224   if (opt.backup_converted && downloaded_file_return)
 225     write_backup_file (file, downloaded_file_return);
 226
 227   /* Before opening the file for writing, unlink the file.  This is
 228      important if the data in FM is mmaped.  In such case, nulling the
 229      file, which is what fopen() below does, would make us read all
 230      zeroes from the mmaped region.  */
 231   if (unlink (file) < 0 && errno != ENOENT)
 232     {
 233       logprintf (LOG_NOTQUIET, _("Unable to delete %s: %s\n"),
 234                  quote (file), strerror (errno));
 235       read_file_free (fm);
 236       return;
 237     }
 238   /* Now open the file for writing.  */
 239   fp = fopen (file, "wb");
 240   if (!fp)
 241     {
 242       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
 243                  file, strerror (errno));
 244       read_file_free (fm);
 245       return;
 246     }
 247
 248   /* Here we loop through all the URLs in file, replacing those of
 249      them that are downloaded with relative references.  */
 250   p = fm->content;
 251   for (link = links; link; link = link->next)
 252     {
 253       char *url_start = fm->content + link->pos;
 254
 255       if (link->pos >= fm->length)
 256         {
 257           DEBUGP (("Something strange is going on.  Please investigate."));
 258           break;
 259         }
 260       /* If the URL is not to be converted, skip it.  */
 261       if (link->convert == CO_NOCONVERT)
 262         {
 263           DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
 264           continue;
 265         }
 266
 267       /* Echo the file contents, up to the offending URL's opening
 268          quote, to the outfile.  */
 269       fwrite (p, 1, url_start - p, fp);
 270       p = url_start;
 271
 272       switch (link->convert)
 273         {
 274         case CO_CONVERT_TO_RELATIVE:
 275           /* Convert absolute URL to relative. */
 276           {
 277             char *newname = construct_relative (file, link->local_name);
 278             char *quoted_newname = local_quote_string (newname);
 279
 280             if (!link->link_refresh_p)
 281               p = replace_attr (p, link->size, fp, quoted_newname);
 282             else
 283               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
 284                                              link->refresh_timeout);
 285
 286             DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
 287                      link->url->url, newname, link->pos, file));
 288             xfree (newname);
 289             xfree (quoted_newname);
 290             ++to_file_count;
 291             break;
 292           }
 293         case CO_CONVERT_TO_COMPLETE:
 294           /* Convert the link to absolute URL. */
 295           {
 296             char *newlink = link->url->url;
 297             char *quoted_newlink = html_quote_string (newlink);
 298
 299             if (!link->link_refresh_p)
 300               p = replace_attr (p, link->size, fp, quoted_newlink);
 301             else
 302               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
 303                                              link->refresh_timeout);
 304
 305             DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
 306                      newlink, link->pos, file));
 307             xfree (quoted_newlink);
 308             ++to_url_count;
 309             break;
 310           }
 311         case CO_NULLIFY_BASE:
 312           /* Change the base href to "". */
 313           p = replace_attr (p, link->size, fp, "");
 314           break;
 315         case CO_NOCONVERT:
 316           abort ();
 317           break;
 318         }
 319     }
 320
 321   /* Output the rest of the file. */
 322   if (p - fm->content < fm->length)
 323     fwrite (p, 1, fm->length - (p - fm->content), fp);
 324   fclose (fp);
 325   read_file_free (fm);
 326
 327   logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
 328 }
 329
 330 /* Construct and return a link that points from BASEFILE to LINKFILE.
 331    Both files should be local file names, BASEFILE of the referrering
 332    file, and LINKFILE of the referred file.
 333
 334    Examples:
 335
 336    cr("foo", "bar")         -> "bar"
 337    cr("A/foo", "A/bar")     -> "bar"
 338    cr("A/foo", "A/B/bar")   -> "B/bar"
 339    cr("A/X/foo", "A/Y/bar") -> "../Y/bar"
 340    cr("X/", "Y/bar")        -> "../Y/bar" (trailing slash does matter in BASE)
 341
 342    Both files should be absolute or relative, otherwise strange
 343    results might ensue.  The function makes no special efforts to
 344    handle "." and ".." in links, so make sure they're not there
 345    (e.g. using path_simplify).  */
 346
 347 static char *
 348 construct_relative (const char *basefile, const char *linkfile)
 349 {
 350   char *link;
 351   int basedirs;
 352   const char *b, *l;
 353   int i, start;
 354
 355   /* First, skip the initial directory components common to both
 356      files.  */
 357   start = 0;
 358   for (b = basefile, l = linkfile; *b == *l && *b != '\0'; ++b, ++l)
 359     {
 360       if (*b == '/')
 361         start = (b - basefile) + 1;
 362     }
 363   basefile += start;
 364   linkfile += start;
 365
 366   /* With common directories out of the way, the situation we have is
 367      as follows:
 368          b - b1/b2/[...]/bfile
 369          l - l1/l2/[...]/lfile
 370
 371      The link we're constructing needs to be:
 372        lnk - ../../l1/l2/[...]/lfile
 373
 374      Where the number of ".."'s equals the number of bN directory
 375      components in B.  */
 376
 377   /* Count the directory components in B. */
 378   basedirs = 0;
 379   for (b = basefile; *b; b++)
 380     {
 381       if (*b == '/')
 382         ++basedirs;
 383     }
 384
 385   /* Construct LINK as explained above. */
 386   link = xmalloc (3 * basedirs + strlen (linkfile) + 1);
 387   for (i = 0; i < basedirs; i++)
 388     memcpy (link + 3 * i, "../", 3);
 389   strcpy (link + 3 * i, linkfile);
 390   return link;
 391 }
 392
 393 /* Used by write_backup_file to remember which files have been
 394    written. */
 395 static struct hash_table *converted_files;
 396
 397 static void
 398 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
 399 {
 400   /* Rather than just writing over the original .html file with the
 401      converted version, save the former to *.orig.  Note we only do
 402      this for files we've _successfully_ downloaded, so we don't
 403      clobber .orig files sitting around from previous invocations. */
 404
 405   /* Construct the backup filename as the original name plus ".orig". */
 406   size_t         filename_len = strlen (file);
 407   char*          filename_plus_orig_suffix;
 408
 409   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
 410     {
 411       /* Just write "orig" over "html".  We need to do it this way
 412          because when we're checking to see if we've downloaded the
 413          file before (to see if we can skip downloading it), we don't
 414          know if it's a text/html file.  Therefore we don't know yet
 415          at that stage that -E is going to cause us to tack on
 416          ".html", so we need to compare vs. the original URL plus
 417          ".orig", not the original URL plus ".html.orig". */
 418       filename_plus_orig_suffix = alloca (filename_len + 1);
 419       strcpy (filename_plus_orig_suffix, file);
 420       strcpy ((filename_plus_orig_suffix + filename_len) - 4, "orig");
 421     }
 422   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
 423     {
 424       /* Append ".orig" to the name. */
 425       filename_plus_orig_suffix = alloca (filename_len + sizeof (".orig"));
 426       strcpy (filename_plus_orig_suffix, file);
 427       strcpy (filename_plus_orig_suffix + filename_len, ".orig");
 428     }
 429
 430   if (!converted_files)
 431     converted_files = make_string_hash_table (0);
 432
 433   /* We can get called twice on the same URL thanks to the
 434      convert_all_links() call in main().  If we write the .orig file
 435      each time in such a case, it'll end up containing the first-pass
 436      conversion, not the original file.  So, see if we've already been
 437      called on this file. */
 438   if (!string_set_contains (converted_files, file))
 439     {
 440       /* Rename <file> to <file>.orig before former gets written over. */
 441       if (rename (file, filename_plus_orig_suffix) != 0)
 442         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
 443                    file, filename_plus_orig_suffix, strerror (errno));
 444
 445       /* Remember that we've already written a .orig backup for this file.
 446          Note that we never free this memory since we need it till the
 447          convert_all_links() call, which is one of the last things the
 448          program does before terminating.  BTW, I'm not sure if it would be
 449          safe to just set 'converted_file_ptr->string' to 'file' below,
 450          rather than making a copy of the string...  Another note is that I
 451          thought I could just add a field to the urlpos structure saying
 452          that we'd written a .orig file for this URL, but that didn't work,
 453          so I had to make this separate list.
 454          -- Dan Harkless <wget@harkless.org>
 455
 456          This [adding a field to the urlpos structure] didn't work
 457          because convert_file() is called from convert_all_links at
 458          the end of the retrieval with a freshly built new urlpos
 459          list.
 460          -- Hrvoje Niksic <hniksic@xemacs.org>
 461       */
 462       string_set_add (converted_files, file);
 463     }
 464 }
 465
 466 static bool find_fragment (const char *, int, const char **, const char **);
 467
 468 /* Replace an attribute's original text with NEW_TEXT. */
 469
 470 static const char *
 471 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
 472 {
 473   bool quote_flag = false;
 474   char quote_char = '\"';       /* use "..." for quoting, unless the
 475                                    original value is quoted, in which
 476                                    case reuse its quoting char. */
 477   const char *frag_beg, *frag_end;
 478
 479   /* Structure of our string is:
 480        "...old-contents..."
 481        <---    size    --->  (with quotes)
 482      OR:
 483        ...old-contents...
 484        <---    size   -->    (no quotes)   */
 485
 486   if (*p == '\"' || *p == '\'')
 487     {
 488       quote_char = *p;
 489       quote_flag = true;
 490       ++p;
 491       size -= 2;                /* disregard opening and closing quote */
 492     }
 493   putc (quote_char, fp);
 494   fputs (new_text, fp);
 495
 496   /* Look for fragment identifier, if any. */
 497   if (find_fragment (p, size, &frag_beg, &frag_end))
 498     fwrite (frag_beg, 1, frag_end - frag_beg, fp);
 499   p += size;
 500   if (quote_flag)
 501     ++p;
 502   putc (quote_char, fp);
 503
 504   return p;
 505 }
 506
 507 /* The same as REPLACE_ATTR, but used when replacing
 508    <meta http-equiv=refresh content="new_text"> because we need to
 509    append "timeout_value; URL=" before the next_text.  */
 510
 511 static const char *
 512 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
 513                            const char *new_text, int timeout)
 514 {
 515   /* "0; URL=..." */
 516   char *new_with_timeout = (char *)alloca (numdigit (timeout)
 517                                            + 6 /* "; URL=" */
 518                                            + strlen (new_text)
 519                                            + 1);
 520   sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
 521
 522   return replace_attr (p, size, fp, new_with_timeout);
 523 }
 524
 525 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
 526    preceded by '&'.  If the character is not found, return zero.  If
 527    the character is found, return true and set BP and EP to point to
 528    the beginning and end of the region.
 529
 530    This is used for finding the fragment indentifiers in URLs.  */
 531
 532 static bool
 533 find_fragment (const char *beg, int size, const char **bp, const char **ep)
 534 {
 535   const char *end = beg + size;
 536   bool saw_amp = false;
 537   for (; beg < end; beg++)
 538     {
 539       switch (*beg)
 540         {
 541         case '&':
 542           saw_amp = true;
 543           break;
 544         case '#':
 545           if (!saw_amp)
 546             {
 547               *bp = beg;
 548               *ep = end;
 549               return true;
 550             }
 551           /* fallthrough */
 552         default:
 553           saw_amp = false;
 554         }
 555     }
 556   return false;
 557 }
 558
 559 /* Quote FILE for use as local reference to an HTML file.
 560
 561    We quote ? as %3F to avoid passing part of the file name as the
 562    parameter when browsing the converted file through HTTP.  However,
 563    it is safe to do this only when `--html-extension' is turned on.
 564    This is because converting "index.html?foo=bar" to
 565    "index.html%3Ffoo=bar" would break local browsing, as the latter
 566    isn't even recognized as an HTML file!  However, converting
 567    "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
 568    safe for both local and HTTP-served browsing.
 569
 570    We always quote "#" as "%23" and "%" as "%25" because those
 571    characters have special meanings in URLs.  */
 572
 573 static char *
 574 local_quote_string (const char *file)
 575 {
 576   const char *from;
 577   char *newname, *to;
 578
 579   char *any = strpbrk (file, "?#%");
 580   if (!any)
 581     return html_quote_string (file);
 582
 583   /* Allocate space assuming the worst-case scenario, each character
 584      having to be quoted.  */
 585   to = newname = (char *)alloca (3 * strlen (file) + 1);
 586   for (from = file; *from; from++)
 587     switch (*from)
 588       {
 589       case '%':
 590         *to++ = '%';
 591         *to++ = '2';
 592         *to++ = '5';
 593         break;
 594       case '#':
 595         *to++ = '%';
 596         *to++ = '2';
 597         *to++ = '3';
 598         break;
 599       case '?':
 600         if (opt.html_extension)
 601           {
 602             *to++ = '%';
 603             *to++ = '3';
 604             *to++ = 'F';
 605             break;
 606           }
 607         /* fallthrough */
 608       default:
 609         *to++ = *from;
 610       }
 611   *to = '\0';
 612
 613   return html_quote_string (newname);
 614 }
 615 \f
 616 /* Book-keeping code for dl_file_url_map, dl_url_file_map,
 617    downloaded_html_list, and downloaded_html_set.  Other code calls
 618    these functions to let us know that a file has been downloaded.  */
 619
 620 #define ENSURE_TABLES_EXIST do {                        \
 621   if (!dl_file_url_map)                                 \
 622     dl_file_url_map = make_string_hash_table (0);       \
 623   if (!dl_url_file_map)                                 \
 624     dl_url_file_map = make_string_hash_table (0);       \
 625 } while (0)
 626
 627 /* Return true if S1 and S2 are the same, except for "/index.html".
 628    The three cases in which it returns one are (substitute any
 629    substring for "foo"):
 630
 631    m("foo/index.html", "foo/")  ==> 1
 632    m("foo/", "foo/index.html")  ==> 1
 633    m("foo", "foo/index.html")   ==> 1
 634    m("foo", "foo/"              ==> 1
 635    m("foo", "foo")              ==> 1  */
 636
 637 static bool
 638 match_except_index (const char *s1, const char *s2)
 639 {
 640   int i;
 641   const char *lng;
 642
 643   /* Skip common substring. */
 644   for (i = 0; *s1 && *s2 && *s1 == *s2; s1++, s2++, i++)
 645     ;
 646   if (i == 0)
 647     /* Strings differ at the very beginning -- bail out.  We need to
 648        check this explicitly to avoid `lng - 1' reading outside the
 649        array.  */
 650     return false;
 651
 652   if (!*s1 && !*s2)
 653     /* Both strings hit EOF -- strings are equal. */
 654     return true;
 655   else if (*s1 && *s2)
 656     /* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux". */
 657     return false;
 658   else if (*s1)
 659     /* S1 is the longer one. */
 660     lng = s1;
 661   else
 662     /* S2 is the longer one. */
 663     lng = s2;
 664
 665   /* foo            */            /* foo/           */
 666   /* foo/index.html */  /* or */  /* foo/index.html */
 667   /*    ^           */            /*     ^          */
 668
 669   if (*lng != '/')
 670     /* The right-hand case. */
 671     --lng;
 672
 673   if (*lng == '/' && *(lng + 1) == '\0')
 674     /* foo  */
 675     /* foo/ */
 676     return true;
 677
 678   return 0 == strcmp (lng, "/index.html");
 679 }
 680
 681 static int
 682 dissociate_urls_from_file_mapper (void *key, void *value, void *arg)
 683 {
 684   char *mapping_url = (char *)key;
 685   char *mapping_file = (char *)value;
 686   char *file = (char *)arg;
 687
 688   if (0 == strcmp (mapping_file, file))
 689     {
 690       hash_table_remove (dl_url_file_map, mapping_url);
 691       xfree (mapping_url);
 692       xfree (mapping_file);
 693     }
 694
 695   /* Continue mapping. */
 696   return 0;
 697 }
 698
 699 /* Remove all associations from various URLs to FILE from dl_url_file_map. */
 700
 701 static void
 702 dissociate_urls_from_file (const char *file)
 703 {
 704   /* Can't use hash_table_iter_* because the table mutates while mapping.  */
 705   hash_table_for_each (dl_url_file_map, dissociate_urls_from_file_mapper,
 706                        (char *) file);
 707 }
 708
 709 /* Register that URL has been successfully downloaded to FILE.  This
 710    is used by the link conversion code to convert references to URLs
 711    to references to local files.  It is also being used to check if a
 712    URL has already been downloaded.  */
 713
 714 void
 715 register_download (const char *url, const char *file)
 716 {
 717   char *old_file, *old_url;
 718
 719   ENSURE_TABLES_EXIST;
 720
 721   /* With some forms of retrieval, it is possible, although not likely
 722      or particularly desirable.  If both are downloaded, the second
 723      download will override the first one.  When that happens,
 724      dissociate the old file name from the URL.  */
 725
 726   if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
 727     {
 728       if (0 == strcmp (url, old_url))
 729         /* We have somehow managed to download the same URL twice.
 730            Nothing to do.  */
 731         return;
 732
 733       if (match_except_index (url, old_url)
 734           && !hash_table_contains (dl_url_file_map, url))
 735         /* The two URLs differ only in the "index.html" ending.  For
 736            example, one is "http://www.server.com/", and the other is
 737            "http://www.server.com/index.html".  Don't remove the old
 738            one, just add the new one as a non-canonical entry.  */
 739         goto url_only;
 740
 741       hash_table_remove (dl_file_url_map, file);
 742       xfree (old_file);
 743       xfree (old_url);
 744
 745       /* Remove all the URLs that point to this file.  Yes, there can
 746          be more than one such URL, because we store redirections as
 747          multiple entries in dl_url_file_map.  For example, if URL1
 748          redirects to URL2 which gets downloaded to FILE, we map both
 749          URL1 and URL2 to FILE in dl_url_file_map.  (dl_file_url_map
 750          only points to URL2.)  When another URL gets loaded to FILE,
 751          we want both URL1 and URL2 dissociated from it.
 752
 753          This is a relatively expensive operation because it performs
 754          a linear search of the whole hash table, but it should be
 755          called very rarely, only when two URLs resolve to the same
 756          file name, *and* the "<file>.1" extensions are turned off.
 757          In other words, almost never.  */
 758       dissociate_urls_from_file (file);
 759     }
 760
 761   hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
 762
 763  url_only:
 764   /* A URL->FILE mapping is not possible without a FILE->URL mapping.
 765      If the latter were present, it should have been removed by the
 766      above `if'.  So we could write:
 767
 768          assert (!hash_table_contains (dl_url_file_map, url));
 769
 770      The above is correct when running in recursive mode where the
 771      same URL always resolves to the same file.  But if you do
 772      something like:
 773
 774          wget URL URL
 775
 776      then the first URL will resolve to "FILE", and the other to
 777      "FILE.1".  In that case, FILE.1 will not be found in
 778      dl_file_url_map, but URL will still point to FILE in
 779      dl_url_file_map.  */
 780   if (hash_table_get_pair (dl_url_file_map, url, &old_url, &old_file))
 781     {
 782       hash_table_remove (dl_url_file_map, url);
 783       xfree (old_url);
 784       xfree (old_file);
 785     }
 786
 787   hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
 788 }
 789
 790 /* Register that FROM has been redirected to TO.  This assumes that TO
 791    is successfully downloaded and already registered using
 792    register_download() above.  */
 793
 794 void
 795 register_redirection (const char *from, const char *to)
 796 {
 797   char *file;
 798
 799   ENSURE_TABLES_EXIST;
 800
 801   file = hash_table_get (dl_url_file_map, to);
 802   assert (file != NULL);
 803   if (!hash_table_contains (dl_url_file_map, from))
 804     hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
 805 }
 806
 807 /* Register that the file has been deleted. */
 808
 809 void
 810 register_delete_file (const char *file)
 811 {
 812   char *old_url, *old_file;
 813
 814   ENSURE_TABLES_EXIST;
 815
 816   if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
 817     return;
 818
 819   hash_table_remove (dl_file_url_map, file);
 820   xfree (old_file);
 821   xfree (old_url);
 822   dissociate_urls_from_file (file);
 823 }
 824
 825 /* Register that FILE is an HTML file that has been downloaded. */
 826
 827 void
 828 register_html (const char *url, const char *file)
 829 {
 830   if (!downloaded_html_set)
 831     downloaded_html_set = make_string_hash_table (0);
 832   string_set_add (downloaded_html_set, file);
 833 }
 834
 835 static void downloaded_files_free (void);
 836
 837 /* Cleanup the data structures associated with this file.  */
 838
 839 void
 840 convert_cleanup (void)
 841 {
 842   if (dl_file_url_map)
 843     {
 844       free_keys_and_values (dl_file_url_map);
 845       hash_table_destroy (dl_file_url_map);
 846       dl_file_url_map = NULL;
 847     }
 848   if (dl_url_file_map)
 849     {
 850       free_keys_and_values (dl_url_file_map);
 851       hash_table_destroy (dl_url_file_map);
 852       dl_url_file_map = NULL;
 853     }
 854   if (downloaded_html_set)
 855     string_set_free (downloaded_html_set);
 856   downloaded_files_free ();
 857   if (converted_files)
 858     string_set_free (converted_files);
 859 }
 860 \f
 861 /* Book-keeping code for downloaded files that enables extension
 862    hacks.  */
 863
/* This table should really be merged with dl_file_url_map and
   downloaded_html_files.  This was originally a list, but I changed
   it to a hash table because it was actually taking a lot of time to
   find things in it.  */
 868
 869 static struct hash_table *downloaded_files_hash;
 870
 871 /* We're storing "modes" of type downloaded_file_t in the hash table.
 872    However, our hash tables only accept pointers for keys and values.
 873    So when we need a pointer, we use the address of a
 874    downloaded_file_t variable of static storage.  */
 875
 876 static downloaded_file_t *
 877 downloaded_mode_to_ptr (downloaded_file_t mode)
 878 {
 879   static downloaded_file_t
 880     v1 = FILE_NOT_ALREADY_DOWNLOADED,
 881     v2 = FILE_DOWNLOADED_NORMALLY,
 882     v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
 883     v4 = CHECK_FOR_FILE;
 884
 885   switch (mode)
 886     {
 887     case FILE_NOT_ALREADY_DOWNLOADED:
 888       return &v1;
 889     case FILE_DOWNLOADED_NORMALLY:
 890       return &v2;
 891     case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
 892       return &v3;
 893     case CHECK_FOR_FILE:
 894       return &v4;
 895     }
 896   return NULL;
 897 }
 898
 899 /* Remembers which files have been downloaded.  In the standard case,
 900    should be called with mode == FILE_DOWNLOADED_NORMALLY for each
 901    file we actually download successfully (i.e. not for ones we have
 902    failures on or that we skip due to -N).
 903
 904    When we've downloaded a file and tacked on a ".html" extension due
 905    to -E, call this function with
 906    FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
 907    FILE_DOWNLOADED_NORMALLY.
 908
 909    If you just want to check if a file has been previously added
 910    without adding it, call with mode == CHECK_FOR_FILE.  Please be
 911    sure to call this function with local filenames, not remote
 912    URLs.  */
 913
 914 downloaded_file_t
 915 downloaded_file (downloaded_file_t mode, const char *file)
 916 {
 917   downloaded_file_t *ptr;
 918
 919   if (mode == CHECK_FOR_FILE)
 920     {
 921       if (!downloaded_files_hash)
 922         return FILE_NOT_ALREADY_DOWNLOADED;
 923       ptr = hash_table_get (downloaded_files_hash, file);
 924       if (!ptr)
 925         return FILE_NOT_ALREADY_DOWNLOADED;
 926       return *ptr;
 927     }
 928
 929   if (!downloaded_files_hash)
 930     downloaded_files_hash = make_string_hash_table (0);
 931
 932   ptr = hash_table_get (downloaded_files_hash, file);
 933   if (ptr)
 934     return *ptr;
 935
 936   ptr = downloaded_mode_to_ptr (mode);
 937   hash_table_put (downloaded_files_hash, xstrdup (file), ptr);
 938
 939   return FILE_NOT_ALREADY_DOWNLOADED;
 940 }
 941
 942 static void
 943 downloaded_files_free (void)
 944 {
 945   if (downloaded_files_hash)
 946     {
 947       hash_table_iterator iter;
 948       for (hash_table_iterate (downloaded_files_hash, &iter);
 949            hash_table_iter_next (&iter);
 950            )
 951         xfree (iter.key);
 952       hash_table_destroy (downloaded_files_hash);
 953       downloaded_files_hash = NULL;
 954     }
 955 }
 956 \f
 957 /* The function returns the pointer to the malloc-ed quoted version of
 958    string s.  It will recognize and quote numeric and special graphic
 959    entities, as per RFC1866:
 960
 961    `&' -> `&amp;'
 962    `<' -> `&lt;'
 963    `>' -> `&gt;'
 964    `"' -> `&quot;'
 965    SP  -> `&#32;'
 966
 967    No other entities are recognized or replaced.  */
 968 char *
 969 html_quote_string (const char *s)
 970 {
 971   const char *b = s;
 972   char *p, *res;
 973   int i;
 974
 975   /* Pass through the string, and count the new size.  */
 976   for (i = 0; *s; s++, i++)
 977     {
 978       if (*s == '&')
 979         i += 4;                 /* `amp;' */
 980       else if (*s == '<' || *s == '>')
 981         i += 3;                 /* `lt;' and `gt;' */
 982       else if (*s == '\"')
 983         i += 5;                 /* `quot;' */
 984       else if (*s == ' ')
 985         i += 4;                 /* #32; */
 986     }
 987   res = xmalloc (i + 1);
 988   s = b;
 989   for (p = res; *s; s++)
 990     {
 991       switch (*s)
 992         {
 993         case '&':
 994           *p++ = '&';
 995           *p++ = 'a';
 996           *p++ = 'm';
 997           *p++ = 'p';
 998           *p++ = ';';
 999           break;
1000         case '<': case '>':
1001           *p++ = '&';
1002           *p++ = (*s == '<' ? 'l' : 'g');
1003           *p++ = 't';
1004           *p++ = ';';
1005           break;
1006         case '\"':
1007           *p++ = '&';
1008           *p++ = 'q';
1009           *p++ = 'u';
1010           *p++ = 'o';
1011           *p++ = 't';
1012           *p++ = ';';
1013           break;
1014         case ' ':
1015           *p++ = '&';
1016           *p++ = '#';
1017           *p++ = '3';
1018           *p++ = '2';
1019           *p++ = ';';
1020           break;
1021         default:
1022           *p++ = *s;
1023         }
1024     }
1025   *p = '\0';
1026   return res;
1027 }
1028
1029 /*
1030  * vim: et ts=2 sw=2
1031  */
1032