sjero.net Git - wget/blob - src/convert.c

   1 /* Conversion of links to local files.
   2    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
   3    Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or
  10  (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include "wget.h"
  32
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 #ifdef HAVE_UNISTD_H
  37 # include <unistd.h>
  38 #endif /* HAVE_UNISTD_H */
  39 #include <errno.h>
  40 #include <assert.h>
  41 #include "convert.h"
  42 #include "url.h"
  43 #include "recur.h"
  44 #include "utils.h"
  45 #include "hash.h"
  46 #include "ptimer.h"
  47 #include "res.h"
  48 #include "html-url.h"
  49 #include "css-url.h"
  50 #include "iri.h"
  51
  52 static struct hash_table *dl_file_url_map;
  53 struct hash_table *dl_url_file_map;
  54
  55 /* Set of HTML/CSS files downloaded in this Wget run, used for link
  56    conversion after Wget is done.  */
  57 struct hash_table *downloaded_html_set;
  58 struct hash_table *downloaded_css_set;
  59
  60 static void convert_links (const char *, struct urlpos *);
  61
  62
  63 void
  64 convert_links_in_hashtable (struct hash_table *downloaded_set,
  65                             int is_css,
  66                             int *file_count)
  67 {
  68   int i;
  69
  70   int cnt;
  71   char **file_array;
  72
  73   cnt = 0;
  74   if (downloaded_set)
  75     cnt = hash_table_count (downloaded_set);
  76   if (cnt == 0)
  77     return;
  78   file_array = alloca_array (char *, cnt);
  79   string_set_to_array (downloaded_set, file_array);
  80
  81   for (i = 0; i < cnt; i++)
  82     {
  83       struct urlpos *urls, *cur_url;
  84       char *url;
  85       char *file = file_array[i];
  86
  87       /* Determine the URL of the file.  get_urls_{html,css} will need
  88          it.  */
  89       url = hash_table_get (dl_file_url_map, file);
  90       if (!url)
  91         {
  92           DEBUGP (("Apparently %s has been removed.\n", file));
  93           continue;
  94         }
  95
  96       DEBUGP (("Scanning %s (from %s)\n", file, url));
  97
  98       /* Parse the file...  */
  99       urls = is_css ? get_urls_css_file (file, url) :
 100                       get_urls_html (file, url, NULL, NULL);
 101
 102       /* We don't respect meta_disallow_follow here because, even if
 103          the file is not followed, we might still want to convert the
 104          links that have been followed from other files.  */
 105
 106       for (cur_url = urls; cur_url; cur_url = cur_url->next)
 107         {
 108           char *local_name;
 109           struct url *u;
 110           struct iri *pi;
 111
 112           if (cur_url->link_base_p)
 113             {
 114               /* Base references have been resolved by our parser, so
 115                  we turn the base URL into an empty string.  (Perhaps
 116                  we should remove the tag entirely?)  */
 117               cur_url->convert = CO_NULLIFY_BASE;
 118               continue;
 119             }
 120
 121           /* We decide the direction of conversion according to whether
 122              a URL was downloaded.  Downloaded URLs will be converted
 123              ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
 124
 125           pi = iri_new ();
 126           set_uri_encoding (pi, opt.locale, true);
 127
 128           u = url_parse (cur_url->url->url, NULL, pi, true);
 129           local_name = hash_table_get (dl_url_file_map, u->url);
 130
 131           /* Decide on the conversion type.  */
 132           if (local_name)
 133             {
 134               /* We've downloaded this URL.  Convert it to relative
 135                  form.  We do this even if the URL already is in
 136                  relative form, because our directory structure may
 137                  not be identical to that on the server (think `-nd',
 138                  `--cut-dirs', etc.)  */
 139               cur_url->convert = CO_CONVERT_TO_RELATIVE;
 140               cur_url->local_name = xstrdup (local_name);
 141               DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
 142             }
 143           else
 144             {
 145               /* We haven't downloaded this URL.  If it's not already
 146                  complete (including a full host name), convert it to
 147                  that form, so it can be reached while browsing this
 148                  HTML locally.  */
 149               if (!cur_url->link_complete_p)
 150                 cur_url->convert = CO_CONVERT_TO_COMPLETE;
 151               cur_url->local_name = NULL;
 152               DEBUGP (("will convert url %s to complete\n", u->url));
 153             }
 154
 155           url_free (u);
 156           iri_free (pi);
 157         }
 158
 159       /* Convert the links in the file.  */
 160       convert_links (file, urls);
 161       ++*file_count;
 162
 163       /* Free the data.  */
 164       free_urlpos (urls);
 165     }
 166 }
 167
 168 /* This function is called when the retrieval is done to convert the
 169    links that have been downloaded.  It has to be called at the end of
 170    the retrieval, because only then does Wget know conclusively which
 171    URLs have been downloaded, and which not, so it can tell which
 172    direction to convert to.
 173
 174    The "direction" means that the URLs to the files that have been
 175    downloaded get converted to the relative URL which will point to
 176    that file.  And the other URLs get converted to the remote URL on
 177    the server.
 178
 179    All the downloaded HTMLs are kept in downloaded_html_files, and
 180    downloaded URLs in urls_downloaded.  All the information is
 181    extracted from these two lists.  */
 182
 183 void
 184 convert_all_links (void)
 185 {
 186   double secs;
 187   int file_count = 0;
 188
 189   struct ptimer *timer = ptimer_new ();
 190
 191   convert_links_in_hashtable (downloaded_html_set, 0, &file_count);
 192   convert_links_in_hashtable (downloaded_css_set, 1, &file_count);
 193
 194   secs = ptimer_measure (timer);
 195   logprintf (LOG_VERBOSE, _("Converted %d files in %s seconds.\n"),
 196              file_count, print_decimal (secs));
 197
 198   ptimer_destroy (timer);
 199 }
 200
 201 static void write_backup_file (const char *, downloaded_file_t);
 202 static const char *replace_plain (const char*, int, FILE*, const char *);
 203 static const char *replace_attr (const char *, int, FILE *, const char *);
 204 static const char *replace_attr_refresh_hack (const char *, int, FILE *,
 205                                               const char *, int);
 206 static char *local_quote_string (const char *);
 207 static char *construct_relative (const char *, const char *);
 208
 209 /* Change the links in one file.  LINKS is a list of links in the
 210    document, along with their positions and the desired direction of
 211    the conversion.  */
 212 static void
 213 convert_links (const char *file, struct urlpos *links)
 214 {
 215   struct file_memory *fm;
 216   FILE *fp;
 217   const char *p;
 218   downloaded_file_t downloaded_file_return;
 219
 220   struct urlpos *link;
 221   int to_url_count = 0, to_file_count = 0;
 222
 223   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
 224
 225   {
 226     /* First we do a "dry run": go through the list L and see whether
 227        any URL needs to be converted in the first place.  If not, just
 228        leave the file alone.  */
 229     int dry_count = 0;
 230     struct urlpos *dry;
 231     for (dry = links; dry; dry = dry->next)
 232       if (dry->convert != CO_NOCONVERT)
 233         ++dry_count;
 234     if (!dry_count)
 235       {
 236         logputs (LOG_VERBOSE, _("nothing to do.\n"));
 237         return;
 238       }
 239   }
 240
 241   fm = wget_read_file (file);
 242   if (!fm)
 243     {
 244       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
 245                  file, strerror (errno));
 246       return;
 247     }
 248
 249   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
 250   if (opt.backup_converted && downloaded_file_return)
 251     write_backup_file (file, downloaded_file_return);
 252
 253   /* Before opening the file for writing, unlink the file.  This is
 254      important if the data in FM is mmaped.  In such case, nulling the
 255      file, which is what fopen() below does, would make us read all
 256      zeroes from the mmaped region.  */
 257   if (unlink (file) < 0 && errno != ENOENT)
 258     {
 259       logprintf (LOG_NOTQUIET, _("Unable to delete %s: %s\n"),
 260                  quote (file), strerror (errno));
 261       wget_read_file_free (fm);
 262       return;
 263     }
 264   /* Now open the file for writing.  */
 265   fp = fopen (file, "wb");
 266   if (!fp)
 267     {
 268       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
 269                  file, strerror (errno));
 270       wget_read_file_free (fm);
 271       return;
 272     }
 273
 274   /* Here we loop through all the URLs in file, replacing those of
 275      them that are downloaded with relative references.  */
 276   p = fm->content;
 277   for (link = links; link; link = link->next)
 278     {
 279       char *url_start = fm->content + link->pos;
 280
 281       if (link->pos >= fm->length)
 282         {
 283           DEBUGP (("Something strange is going on.  Please investigate."));
 284           break;
 285         }
 286       /* If the URL is not to be converted, skip it.  */
 287       if (link->convert == CO_NOCONVERT)
 288         {
 289           DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
 290           continue;
 291         }
 292
 293       /* Echo the file contents, up to the offending URL's opening
 294          quote, to the outfile.  */
 295       fwrite (p, 1, url_start - p, fp);
 296       p = url_start;
 297
 298       switch (link->convert)
 299         {
 300         case CO_CONVERT_TO_RELATIVE:
 301           /* Convert absolute URL to relative. */
 302           {
 303             char *newname = construct_relative (file, link->local_name);
 304             char *quoted_newname = local_quote_string (newname);
 305
 306             if (link->link_css_p)
 307               p = replace_plain (p, link->size, fp, quoted_newname);
 308             else if (!link->link_refresh_p)
 309               p = replace_attr (p, link->size, fp, quoted_newname);
 310             else
 311               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
 312                                              link->refresh_timeout);
 313
 314             DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
 315                      link->url->url, newname, link->pos, file));
 316             xfree (newname);
 317             xfree (quoted_newname);
 318             ++to_file_count;
 319             break;
 320           }
 321         case CO_CONVERT_TO_COMPLETE:
 322           /* Convert the link to absolute URL. */
 323           {
 324             char *newlink = link->url->url;
 325             char *quoted_newlink = html_quote_string (newlink);
 326
 327             if (link->link_css_p)
 328               p = replace_plain (p, link->size, fp, quoted_newlink);
 329             else if (!link->link_refresh_p)
 330               p = replace_attr (p, link->size, fp, quoted_newlink);
 331             else
 332               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
 333                                              link->refresh_timeout);
 334
 335             DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
 336                      newlink, link->pos, file));
 337             xfree (quoted_newlink);
 338             ++to_url_count;
 339             break;
 340           }
 341         case CO_NULLIFY_BASE:
 342           /* Change the base href to "". */
 343           p = replace_attr (p, link->size, fp, "");
 344           break;
 345         case CO_NOCONVERT:
 346           abort ();
 347           break;
 348         }
 349     }
 350
 351   /* Output the rest of the file. */
 352   if (p - fm->content < fm->length)
 353     fwrite (p, 1, fm->length - (p - fm->content), fp);
 354   fclose (fp);
 355   wget_read_file_free (fm);
 356
 357   logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
 358 }
 359
 360 /* Construct and return a link that points from BASEFILE to LINKFILE.
 361    Both files should be local file names, BASEFILE of the referrering
 362    file, and LINKFILE of the referred file.
 363
 364    Examples:
 365
 366    cr("foo", "bar")         -> "bar"
 367    cr("A/foo", "A/bar")     -> "bar"
 368    cr("A/foo", "A/B/bar")   -> "B/bar"
 369    cr("A/X/foo", "A/Y/bar") -> "../Y/bar"
 370    cr("X/", "Y/bar")        -> "../Y/bar" (trailing slash does matter in BASE)
 371
 372    Both files should be absolute or relative, otherwise strange
 373    results might ensue.  The function makes no special efforts to
 374    handle "." and ".." in links, so make sure they're not there
 375    (e.g. using path_simplify).  */
 376
 377 static char *
 378 construct_relative (const char *basefile, const char *linkfile)
 379 {
 380   char *link;
 381   int basedirs;
 382   const char *b, *l;
 383   int i, start;
 384
 385   /* First, skip the initial directory components common to both
 386      files.  */
 387   start = 0;
 388   for (b = basefile, l = linkfile; *b == *l && *b != '\0'; ++b, ++l)
 389     {
 390       if (*b == '/')
 391         start = (b - basefile) + 1;
 392     }
 393   basefile += start;
 394   linkfile += start;
 395
 396   /* With common directories out of the way, the situation we have is
 397      as follows:
 398          b - b1/b2/[...]/bfile
 399          l - l1/l2/[...]/lfile
 400
 401      The link we're constructing needs to be:
 402        lnk - ../../l1/l2/[...]/lfile
 403
 404      Where the number of ".."'s equals the number of bN directory
 405      components in B.  */
 406
 407   /* Count the directory components in B. */
 408   basedirs = 0;
 409   for (b = basefile; *b; b++)
 410     {
 411       if (*b == '/')
 412         ++basedirs;
 413     }
 414
 415   /* Construct LINK as explained above. */
 416   link = xmalloc (3 * basedirs + strlen (linkfile) + 1);
 417   for (i = 0; i < basedirs; i++)
 418     memcpy (link + 3 * i, "../", 3);
 419   strcpy (link + 3 * i, linkfile);
 420   return link;
 421 }
 422
 423 /* Used by write_backup_file to remember which files have been
 424    written. */
 425 static struct hash_table *converted_files;
 426
 427 static void
 428 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
 429 {
 430   /* Rather than just writing over the original .html file with the
 431      converted version, save the former to *.orig.  Note we only do
 432      this for files we've _successfully_ downloaded, so we don't
 433      clobber .orig files sitting around from previous invocations.
 434      On VMS, use "_orig" instead of ".orig".  See "wget.h". */
 435
 436   /* Construct the backup filename as the original name plus ".orig". */
 437   size_t         filename_len = strlen (file);
 438   char*          filename_plus_orig_suffix;
 439
 440   /* TODO: hack this to work with css files */
 441   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
 442     {
 443       /* Just write "orig" over "html".  We need to do it this way
 444          because when we're checking to see if we've downloaded the
 445          file before (to see if we can skip downloading it), we don't
 446          know if it's a text/html file.  Therefore we don't know yet
 447          at that stage that -E is going to cause us to tack on
 448          ".html", so we need to compare vs. the original URL plus
 449          ".orig", not the original URL plus ".html.orig". */
 450       filename_plus_orig_suffix = alloca (filename_len + 1);
 451       strcpy (filename_plus_orig_suffix, file);
 452       strcpy ((filename_plus_orig_suffix + filename_len) - 4, "orig");
 453     }
 454   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
 455     {
 456       /* Append ".orig" to the name. */
 457       filename_plus_orig_suffix = alloca (filename_len + sizeof (ORIG_SFX));
 458       strcpy (filename_plus_orig_suffix, file);
 459       strcpy (filename_plus_orig_suffix + filename_len, ORIG_SFX);
 460     }
 461
 462   if (!converted_files)
 463     converted_files = make_string_hash_table (0);
 464
 465   /* We can get called twice on the same URL thanks to the
 466      convert_all_links() call in main().  If we write the .orig file
 467      each time in such a case, it'll end up containing the first-pass
 468      conversion, not the original file.  So, see if we've already been
 469      called on this file. */
 470   if (!string_set_contains (converted_files, file))
 471     {
 472       /* Rename <file> to <file>.orig before former gets written over. */
 473       if (rename (file, filename_plus_orig_suffix) != 0)
 474         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
 475                    file, filename_plus_orig_suffix, strerror (errno));
 476
 477       /* Remember that we've already written a .orig backup for this file.
 478          Note that we never free this memory since we need it till the
 479          convert_all_links() call, which is one of the last things the
 480          program does before terminating.  BTW, I'm not sure if it would be
 481          safe to just set 'converted_file_ptr->string' to 'file' below,
 482          rather than making a copy of the string...  Another note is that I
 483          thought I could just add a field to the urlpos structure saying
 484          that we'd written a .orig file for this URL, but that didn't work,
 485          so I had to make this separate list.
 486          -- Dan Harkless <wget@harkless.org>
 487
 488          This [adding a field to the urlpos structure] didn't work
 489          because convert_file() is called from convert_all_links at
 490          the end of the retrieval with a freshly built new urlpos
 491          list.
 492          -- Hrvoje Niksic <hniksic@xemacs.org>
 493       */
 494       string_set_add (converted_files, file);
 495     }
 496 }
 497
 498 static bool find_fragment (const char *, int, const char **, const char **);
 499
 500 /* Replace a string with NEW_TEXT.  Ignore quoting. */
 501 static const char *
 502 replace_plain (const char *p, int size, FILE *fp, const char *new_text)
 503 {
 504   fputs (new_text, fp);
 505   p += size;
 506   return p;
 507 }
 508
 509 /* Replace an attribute's original text with NEW_TEXT. */
 510
 511 static const char *
 512 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
 513 {
 514   bool quote_flag = false;
 515   char quote_char = '\"';       /* use "..." for quoting, unless the
 516                                    original value is quoted, in which
 517                                    case reuse its quoting char. */
 518   const char *frag_beg, *frag_end;
 519
 520   /* Structure of our string is:
 521        "...old-contents..."
 522        <---    size    --->  (with quotes)
 523      OR:
 524        ...old-contents...
 525        <---    size   -->    (no quotes)   */
 526
 527   if (*p == '\"' || *p == '\'')
 528     {
 529       quote_char = *p;
 530       quote_flag = true;
 531       ++p;
 532       size -= 2;                /* disregard opening and closing quote */
 533     }
 534   putc (quote_char, fp);
 535   fputs (new_text, fp);
 536
 537   /* Look for fragment identifier, if any. */
 538   if (find_fragment (p, size, &frag_beg, &frag_end))
 539     fwrite (frag_beg, 1, frag_end - frag_beg, fp);
 540   p += size;
 541   if (quote_flag)
 542     ++p;
 543   putc (quote_char, fp);
 544
 545   return p;
 546 }
 547
 548 /* The same as REPLACE_ATTR, but used when replacing
 549    <meta http-equiv=refresh content="new_text"> because we need to
 550    append "timeout_value; URL=" before the next_text.  */
 551
 552 static const char *
 553 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
 554                            const char *new_text, int timeout)
 555 {
 556   /* "0; URL=..." */
 557   char *new_with_timeout = (char *)alloca (numdigit (timeout)
 558                                            + 6 /* "; URL=" */
 559                                            + strlen (new_text)
 560                                            + 1);
 561   sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
 562
 563   return replace_attr (p, size, fp, new_with_timeout);
 564 }
 565
 566 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
 567    preceded by '&'.  If the character is not found, return zero.  If
 568    the character is found, return true and set BP and EP to point to
 569    the beginning and end of the region.
 570
 571    This is used for finding the fragment indentifiers in URLs.  */
 572
 573 static bool
 574 find_fragment (const char *beg, int size, const char **bp, const char **ep)
 575 {
 576   const char *end = beg + size;
 577   bool saw_amp = false;
 578   for (; beg < end; beg++)
 579     {
 580       switch (*beg)
 581         {
 582         case '&':
 583           saw_amp = true;
 584           break;
 585         case '#':
 586           if (!saw_amp)
 587             {
 588               *bp = beg;
 589               *ep = end;
 590               return true;
 591             }
 592           /* fallthrough */
 593         default:
 594           saw_amp = false;
 595         }
 596     }
 597   return false;
 598 }
 599
 600 /* Quote FILE for use as local reference to an HTML file.
 601
 602    We quote ? as %3F to avoid passing part of the file name as the
 603    parameter when browsing the converted file through HTTP.  However,
 604    it is safe to do this only when `--adjust-extension' is turned on.
 605    This is because converting "index.html?foo=bar" to
 606    "index.html%3Ffoo=bar" would break local browsing, as the latter
 607    isn't even recognized as an HTML file!  However, converting
 608    "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
 609    safe for both local and HTTP-served browsing.
 610
 611    We always quote "#" as "%23", "%" as "%25" and ";" as "%3B"
 612    because those characters have special meanings in URLs.  */
 613
 614 static char *
 615 local_quote_string (const char *file)
 616 {
 617   const char *from;
 618   char *newname, *to;
 619
 620   char *any = strpbrk (file, "?#%;");
 621   if (!any)
 622     return html_quote_string (file);
 623
 624   /* Allocate space assuming the worst-case scenario, each character
 625      having to be quoted.  */
 626   to = newname = (char *)alloca (3 * strlen (file) + 1);
 627   for (from = file; *from; from++)
 628     switch (*from)
 629       {
 630       case '%':
 631         *to++ = '%';
 632         *to++ = '2';
 633         *to++ = '5';
 634         break;
 635       case '#':
 636         *to++ = '%';
 637         *to++ = '2';
 638         *to++ = '3';
 639         break;
 640       case ';':
 641         *to++ = '%';
 642         *to++ = '3';
 643         *to++ = 'B';
 644         break;
 645       case '?':
 646         if (opt.adjust_extension)
 647           {
 648             *to++ = '%';
 649             *to++ = '3';
 650             *to++ = 'F';
 651             break;
 652           }
 653         /* fallthrough */
 654       default:
 655         *to++ = *from;
 656       }
 657   *to = '\0';
 658
 659   return html_quote_string (newname);
 660 }
 661 \f
/* Book-keeping code for dl_file_url_map, dl_url_file_map,
   downloaded_html_set, and downloaded_css_set.  Other code calls
   these functions to let us know that a file has been downloaded.  */
 665
 666 #define ENSURE_TABLES_EXIST do {                        \
 667   if (!dl_file_url_map)                                 \
 668     dl_file_url_map = make_string_hash_table (0);       \
 669   if (!dl_url_file_map)                                 \
 670     dl_url_file_map = make_string_hash_table (0);       \
 671 } while (0)
 672
 673 /* Return true if S1 and S2 are the same, except for "/index.html".
 674    The three cases in which it returns one are (substitute any
 675    substring for "foo"):
 676
 677    m("foo/index.html", "foo/")  ==> 1
 678    m("foo/", "foo/index.html")  ==> 1
 679    m("foo", "foo/index.html")   ==> 1
 680    m("foo", "foo/"              ==> 1
 681    m("foo", "foo")              ==> 1  */
 682
 683 static bool
 684 match_except_index (const char *s1, const char *s2)
 685 {
 686   int i;
 687   const char *lng;
 688
 689   /* Skip common substring. */
 690   for (i = 0; *s1 && *s2 && *s1 == *s2; s1++, s2++, i++)
 691     ;
 692   if (i == 0)
 693     /* Strings differ at the very beginning -- bail out.  We need to
 694        check this explicitly to avoid `lng - 1' reading outside the
 695        array.  */
 696     return false;
 697
 698   if (!*s1 && !*s2)
 699     /* Both strings hit EOF -- strings are equal. */
 700     return true;
 701   else if (*s1 && *s2)
 702     /* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux". */
 703     return false;
 704   else if (*s1)
 705     /* S1 is the longer one. */
 706     lng = s1;
 707   else
 708     /* S2 is the longer one. */
 709     lng = s2;
 710
 711   /* foo            */            /* foo/           */
 712   /* foo/index.html */  /* or */  /* foo/index.html */
 713   /*    ^           */            /*     ^          */
 714
 715   if (*lng != '/')
 716     /* The right-hand case. */
 717     --lng;
 718
 719   if (*lng == '/' && *(lng + 1) == '\0')
 720     /* foo  */
 721     /* foo/ */
 722     return true;
 723
 724   return 0 == strcmp (lng, "/index.html");
 725 }
 726
 727 static int
 728 dissociate_urls_from_file_mapper (void *key, void *value, void *arg)
 729 {
 730   char *mapping_url = (char *)key;
 731   char *mapping_file = (char *)value;
 732   char *file = (char *)arg;
 733
 734   if (0 == strcmp (mapping_file, file))
 735     {
 736       hash_table_remove (dl_url_file_map, mapping_url);
 737       xfree (mapping_url);
 738       xfree (mapping_file);
 739     }
 740
 741   /* Continue mapping. */
 742   return 0;
 743 }
 744
 745 /* Remove all associations from various URLs to FILE from dl_url_file_map. */
 746
 747 static void
 748 dissociate_urls_from_file (const char *file)
 749 {
 750   /* Can't use hash_table_iter_* because the table mutates while mapping.  */
 751   hash_table_for_each (dl_url_file_map, dissociate_urls_from_file_mapper,
 752                        (char *) file);
 753 }
 754
 755 /* Register that URL has been successfully downloaded to FILE.  This
 756    is used by the link conversion code to convert references to URLs
 757    to references to local files.  It is also being used to check if a
 758    URL has already been downloaded.  */
 759
 760 void
 761 register_download (const char *url, const char *file)
 762 {
 763   char *old_file, *old_url;
 764
 765   ENSURE_TABLES_EXIST;
 766
 767   /* With some forms of retrieval, it is possible, although not likely
 768      or particularly desirable.  If both are downloaded, the second
 769      download will override the first one.  When that happens,
 770      dissociate the old file name from the URL.  */
 771
 772   if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
 773     {
 774       if (0 == strcmp (url, old_url))
 775         /* We have somehow managed to download the same URL twice.
 776            Nothing to do.  */
 777         return;
 778
 779       if (match_except_index (url, old_url)
 780           && !hash_table_contains (dl_url_file_map, url))
 781         /* The two URLs differ only in the "index.html" ending.  For
 782            example, one is "http://www.server.com/", and the other is
 783            "http://www.server.com/index.html".  Don't remove the old
 784            one, just add the new one as a non-canonical entry.  */
 785         goto url_only;
 786
 787       hash_table_remove (dl_file_url_map, file);
 788       xfree (old_file);
 789       xfree (old_url);
 790
 791       /* Remove all the URLs that point to this file.  Yes, there can
 792          be more than one such URL, because we store redirections as
 793          multiple entries in dl_url_file_map.  For example, if URL1
 794          redirects to URL2 which gets downloaded to FILE, we map both
 795          URL1 and URL2 to FILE in dl_url_file_map.  (dl_file_url_map
 796          only points to URL2.)  When another URL gets loaded to FILE,
 797          we want both URL1 and URL2 dissociated from it.
 798
 799          This is a relatively expensive operation because it performs
 800          a linear search of the whole hash table, but it should be
 801          called very rarely, only when two URLs resolve to the same
 802          file name, *and* the "<file>.1" extensions are turned off.
 803          In other words, almost never.  */
 804       dissociate_urls_from_file (file);
 805     }
 806
 807   hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
 808
 809  url_only:
 810   /* A URL->FILE mapping is not possible without a FILE->URL mapping.
 811      If the latter were present, it should have been removed by the
 812      above `if'.  So we could write:
 813
 814          assert (!hash_table_contains (dl_url_file_map, url));
 815
 816      The above is correct when running in recursive mode where the
 817      same URL always resolves to the same file.  But if you do
 818      something like:
 819
 820          wget URL URL
 821
 822      then the first URL will resolve to "FILE", and the other to
 823      "FILE.1".  In that case, FILE.1 will not be found in
 824      dl_file_url_map, but URL will still point to FILE in
 825      dl_url_file_map.  */
 826   if (hash_table_get_pair (dl_url_file_map, url, &old_url, &old_file))
 827     {
 828       hash_table_remove (dl_url_file_map, url);
 829       xfree (old_url);
 830       xfree (old_file);
 831     }
 832
 833   hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
 834 }
 835
 836 /* Register that FROM has been redirected to TO.  This assumes that TO
 837    is successfully downloaded and already registered using
 838    register_download() above.  */
 839
 840 void
 841 register_redirection (const char *from, const char *to)
 842 {
 843   char *file;
 844
 845   ENSURE_TABLES_EXIST;
 846
 847   file = hash_table_get (dl_url_file_map, to);
 848   assert (file != NULL);
 849   if (!hash_table_contains (dl_url_file_map, from))
 850     hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
 851 }
 852
 853 /* Register that the file has been deleted. */
 854
 855 void
 856 register_delete_file (const char *file)
 857 {
 858   char *old_url, *old_file;
 859
 860   ENSURE_TABLES_EXIST;
 861
 862   if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
 863     return;
 864
 865   hash_table_remove (dl_file_url_map, file);
 866   xfree (old_file);
 867   xfree (old_url);
 868   dissociate_urls_from_file (file);
 869 }
 870
 871 /* Register that FILE is an HTML file that has been downloaded. */
 872
 873 void
 874 register_html (const char *url, const char *file)
 875 {
 876   if (!downloaded_html_set)
 877     downloaded_html_set = make_string_hash_table (0);
 878   string_set_add (downloaded_html_set, file);
 879 }
 880
 881 /* Register that FILE is a CSS file that has been downloaded. */
 882
 883 void
 884 register_css (const char *url, const char *file)
 885 {
 886   if (!downloaded_css_set)
 887     downloaded_css_set = make_string_hash_table (0);
 888   string_set_add (downloaded_css_set, file);
 889 }
 890
 891 static void downloaded_files_free (void);
 892
 893 /* Cleanup the data structures associated with this file.  */
 894
 895 void
 896 convert_cleanup (void)
 897 {
 898   if (dl_file_url_map)
 899     {
 900       free_keys_and_values (dl_file_url_map);
 901       hash_table_destroy (dl_file_url_map);
 902       dl_file_url_map = NULL;
 903     }
 904   if (dl_url_file_map)
 905     {
 906       free_keys_and_values (dl_url_file_map);
 907       hash_table_destroy (dl_url_file_map);
 908       dl_url_file_map = NULL;
 909     }
 910   if (downloaded_html_set)
 911     string_set_free (downloaded_html_set);
 912   downloaded_files_free ();
 913   if (converted_files)
 914     string_set_free (converted_files);
 915 }
 916 \f
 917 /* Book-keeping code for downloaded files that enables extension
 918    hacks.  */
 919
 920 /* This table should really be merged with dl_file_url_map and
 921    downloaded_html_files.  This was originally a list, but I changed
 922    it to a hash table because it was actually taking a lot of time to
 923    find things in it.  */
 924
 925 static struct hash_table *downloaded_files_hash;
 926
 927 /* We're storing "modes" of type downloaded_file_t in the hash table.
 928    However, our hash tables only accept pointers for keys and values.
 929    So when we need a pointer, we use the address of a
 930    downloaded_file_t variable of static storage.  */
 931
 932 static downloaded_file_t *
 933 downloaded_mode_to_ptr (downloaded_file_t mode)
 934 {
 935   static downloaded_file_t
 936     v1 = FILE_NOT_ALREADY_DOWNLOADED,
 937     v2 = FILE_DOWNLOADED_NORMALLY,
 938     v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
 939     v4 = CHECK_FOR_FILE;
 940
 941   switch (mode)
 942     {
 943     case FILE_NOT_ALREADY_DOWNLOADED:
 944       return &v1;
 945     case FILE_DOWNLOADED_NORMALLY:
 946       return &v2;
 947     case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
 948       return &v3;
 949     case CHECK_FOR_FILE:
 950       return &v4;
 951     }
 952   return NULL;
 953 }
 954
 955 /* Remembers which files have been downloaded.  In the standard case,
 956    should be called with mode == FILE_DOWNLOADED_NORMALLY for each
 957    file we actually download successfully (i.e. not for ones we have
 958    failures on or that we skip due to -N).
 959
 960    When we've downloaded a file and tacked on a ".html" extension due
 961    to -E, call this function with
 962    FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
 963    FILE_DOWNLOADED_NORMALLY.
 964
 965    If you just want to check if a file has been previously added
 966    without adding it, call with mode == CHECK_FOR_FILE.  Please be
 967    sure to call this function with local filenames, not remote
 968    URLs.  */
 969
 970 downloaded_file_t
 971 downloaded_file (downloaded_file_t mode, const char *file)
 972 {
 973   downloaded_file_t *ptr;
 974
 975   if (mode == CHECK_FOR_FILE)
 976     {
 977       if (!downloaded_files_hash)
 978         return FILE_NOT_ALREADY_DOWNLOADED;
 979       ptr = hash_table_get (downloaded_files_hash, file);
 980       if (!ptr)
 981         return FILE_NOT_ALREADY_DOWNLOADED;
 982       return *ptr;
 983     }
 984
 985   if (!downloaded_files_hash)
 986     downloaded_files_hash = make_string_hash_table (0);
 987
 988   ptr = hash_table_get (downloaded_files_hash, file);
 989   if (ptr)
 990     return *ptr;
 991
 992   ptr = downloaded_mode_to_ptr (mode);
 993   hash_table_put (downloaded_files_hash, xstrdup (file), ptr);
 994
 995   return FILE_NOT_ALREADY_DOWNLOADED;
 996 }
 997
 998 static void
 999 downloaded_files_free (void)
1000 {
1001   if (downloaded_files_hash)
1002     {
1003       hash_table_iterator iter;
1004       for (hash_table_iterate (downloaded_files_hash, &iter);
1005            hash_table_iter_next (&iter);
1006            )
1007         xfree (iter.key);
1008       hash_table_destroy (downloaded_files_hash);
1009       downloaded_files_hash = NULL;
1010     }
1011 }
1012 \f
/* Return a newly allocated (via xmalloc), NUL-terminated copy of S in
   which the following characters are replaced by HTML entities, as
   per RFC1866:

   `&' -> `&amp;'
   `<' -> `&lt;'
   `>' -> `&gt;'
   `"' -> `&quot;'
   SP  -> `&#32;'

   All other characters are copied unchanged; no other entities are
   recognized or replaced.  S must be a valid NUL-terminated string.
   The caller owns the returned buffer.  */
char *
html_quote_string (const char *s)
{
  const char *src;
  char *res, *dst;
  /* The output can be up to six times as long as the input, so the
     size is kept in a size_t; a signed int could overflow on very
     long strings.  Start at 1 to leave room for the final NUL.  */
  size_t size = 1;

  /* First pass: compute the size of the quoted string.  */
  for (src = s; *src; src++)
    switch (*src)
      {
      case '&':                 /* `&amp;' */
      case ' ':                 /* `&#32;' */
        size += 5;
        break;
      case '<':                 /* `&lt;' */
      case '>':                 /* `&gt;' */
        size += 4;
        break;
      case '\"':                /* `&quot;' */
        size += 6;
        break;
      default:
        size += 1;
        break;
      }

  res = xmalloc (size);

  /* Second pass: emit the quoted string.  */
  for (src = s, dst = res; *src; src++)
    switch (*src)
      {
      case '&':
        memcpy (dst, "&amp;", 5);
        dst += 5;
        break;
      case '<':
        memcpy (dst, "&lt;", 4);
        dst += 4;
        break;
      case '>':
        memcpy (dst, "&gt;", 4);
        dst += 4;
        break;
      case '\"':
        memcpy (dst, "&quot;", 6);
        dst += 6;
        break;
      case ' ':
        memcpy (dst, "&#32;", 5);
        dst += 5;
        break;
      default:
        *dst++ = *src;
        break;
      }
  *dst = '\0';
  return res;
}
1084
1085 /*
1086  * vim: et ts=2 sw=2
1087  */
1088