sjero.net Git - wget/blob - src/convert.c

   1 /* Conversion of links to local files.
   2    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
   3    Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or
  10  (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include "wget.h"
  32
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 #include <unistd.h>
  37 #include <errno.h>
  38 #include <assert.h>
  39 #include "convert.h"
  40 #include "url.h"
  41 #include "recur.h"
  42 #include "utils.h"
  43 #include "hash.h"
  44 #include "ptimer.h"
  45 #include "res.h"
  46 #include "html-url.h"
  47 #include "css-url.h"
  48 #include "iri.h"
  49
/* Maps a local file name to the URL it was downloaded from.  Entries
   are added by register_download().  */
static struct hash_table *dl_file_url_map;
/* Maps a URL to the local file name it was saved under.  Entries are
   added by register_download() and register_redirection(), so
   several URLs may map to the same file.  */
struct hash_table *dl_url_file_map;

/* Set of HTML/CSS files downloaded in this Wget run, used for link
   conversion after Wget is done.  */
struct hash_table *downloaded_html_set;
struct hash_table *downloaded_css_set;
  57
  58 static void convert_links (const char *, struct urlpos *);
  59
  60
  61 void
  62 convert_links_in_hashtable (struct hash_table *downloaded_set,
  63                             int is_css,
  64                             int *file_count)
  65 {
  66   int i;
  67
  68   int cnt;
  69   char **file_array;
  70
  71   cnt = 0;
  72   if (downloaded_set)
  73     cnt = hash_table_count (downloaded_set);
  74   if (cnt == 0)
  75     return;
  76   file_array = alloca_array (char *, cnt);
  77   string_set_to_array (downloaded_set, file_array);
  78
  79   for (i = 0; i < cnt; i++)
  80     {
  81       struct urlpos *urls, *cur_url;
  82       char *url;
  83       char *file = file_array[i];
  84
  85       /* Determine the URL of the file.  get_urls_{html,css} will need
  86          it.  */
  87       url = hash_table_get (dl_file_url_map, file);
  88       if (!url)
  89         {
  90           DEBUGP (("Apparently %s has been removed.\n", file));
  91           continue;
  92         }
  93
  94       DEBUGP (("Scanning %s (from %s)\n", file, url));
  95
  96       /* Parse the file...  */
  97       urls = is_css ? get_urls_css_file (file, url) :
  98                       get_urls_html (file, url, NULL, NULL);
  99
 100       /* We don't respect meta_disallow_follow here because, even if
 101          the file is not followed, we might still want to convert the
 102          links that have been followed from other files.  */
 103
 104       for (cur_url = urls; cur_url; cur_url = cur_url->next)
 105         {
 106           char *local_name;
 107           struct url *u;
 108           struct iri *pi;
 109
 110           if (cur_url->link_base_p)
 111             {
 112               /* Base references have been resolved by our parser, so
 113                  we turn the base URL into an empty string.  (Perhaps
 114                  we should remove the tag entirely?)  */
 115               cur_url->convert = CO_NULLIFY_BASE;
 116               continue;
 117             }
 118
 119           /* We decide the direction of conversion according to whether
 120              a URL was downloaded.  Downloaded URLs will be converted
 121              ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
 122
 123           pi = iri_new ();
 124           set_uri_encoding (pi, opt.locale, true);
 125
 126           u = url_parse (cur_url->url->url, NULL, pi, true);
 127           local_name = hash_table_get (dl_url_file_map, u->url);
 128
 129           /* Decide on the conversion type.  */
 130           if (local_name)
 131             {
 132               /* We've downloaded this URL.  Convert it to relative
 133                  form.  We do this even if the URL already is in
 134                  relative form, because our directory structure may
 135                  not be identical to that on the server (think `-nd',
 136                  `--cut-dirs', etc.)  */
 137               cur_url->convert = CO_CONVERT_TO_RELATIVE;
 138               cur_url->local_name = xstrdup (local_name);
 139               DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
 140             }
 141           else
 142             {
 143               /* We haven't downloaded this URL.  If it's not already
 144                  complete (including a full host name), convert it to
 145                  that form, so it can be reached while browsing this
 146                  HTML locally.  */
 147               if (!cur_url->link_complete_p)
 148                 cur_url->convert = CO_CONVERT_TO_COMPLETE;
 149               cur_url->local_name = NULL;
 150               DEBUGP (("will convert url %s to complete\n", u->url));
 151             }
 152
 153           url_free (u);
 154           iri_free (pi);
 155         }
 156
 157       /* Convert the links in the file.  */
 158       convert_links (file, urls);
 159       ++*file_count;
 160
 161       /* Free the data.  */
 162       free_urlpos (urls);
 163     }
 164 }
 165
 166 /* This function is called when the retrieval is done to convert the
 167    links that have been downloaded.  It has to be called at the end of
 168    the retrieval, because only then does Wget know conclusively which
 169    URLs have been downloaded, and which not, so it can tell which
 170    direction to convert to.
 171
 172    The "direction" means that the URLs to the files that have been
 173    downloaded get converted to the relative URL which will point to
 174    that file.  And the other URLs get converted to the remote URL on
 175    the server.
 176
 177    All the downloaded HTMLs are kept in downloaded_html_files, and
 178    downloaded URLs in urls_downloaded.  All the information is
 179    extracted from these two lists.  */
 180
 181 void
 182 convert_all_links (void)
 183 {
 184   double secs;
 185   int file_count = 0;
 186
 187   struct ptimer *timer = ptimer_new ();
 188
 189   convert_links_in_hashtable (downloaded_html_set, 0, &file_count);
 190   convert_links_in_hashtable (downloaded_css_set, 1, &file_count);
 191
 192   secs = ptimer_measure (timer);
 193   logprintf (LOG_VERBOSE, _("Converted %d files in %s seconds.\n"),
 194              file_count, print_decimal (secs));
 195
 196   ptimer_destroy (timer);
 197 }
 198
 199 static void write_backup_file (const char *, downloaded_file_t);
 200 static const char *replace_plain (const char*, int, FILE*, const char *);
 201 static const char *replace_attr (const char *, int, FILE *, const char *);
 202 static const char *replace_attr_refresh_hack (const char *, int, FILE *,
 203                                               const char *, int);
 204 static char *local_quote_string (const char *, bool);
 205 static char *construct_relative (const char *, const char *);
 206
 207 /* Change the links in one file.  LINKS is a list of links in the
 208    document, along with their positions and the desired direction of
 209    the conversion.  */
 210 static void
 211 convert_links (const char *file, struct urlpos *links)
 212 {
 213   struct file_memory *fm;
 214   FILE *fp;
 215   const char *p;
 216   downloaded_file_t downloaded_file_return;
 217
 218   struct urlpos *link;
 219   int to_url_count = 0, to_file_count = 0;
 220
 221   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
 222
 223   {
 224     /* First we do a "dry run": go through the list L and see whether
 225        any URL needs to be converted in the first place.  If not, just
 226        leave the file alone.  */
 227     int dry_count = 0;
 228     struct urlpos *dry;
 229     for (dry = links; dry; dry = dry->next)
 230       if (dry->convert != CO_NOCONVERT)
 231         ++dry_count;
 232     if (!dry_count)
 233       {
 234         logputs (LOG_VERBOSE, _("nothing to do.\n"));
 235         return;
 236       }
 237   }
 238
 239   fm = wget_read_file (file);
 240   if (!fm)
 241     {
 242       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
 243                  file, strerror (errno));
 244       return;
 245     }
 246
 247   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
 248   if (opt.backup_converted && downloaded_file_return)
 249     write_backup_file (file, downloaded_file_return);
 250
 251   /* Before opening the file for writing, unlink the file.  This is
 252      important if the data in FM is mmaped.  In such case, nulling the
 253      file, which is what fopen() below does, would make us read all
 254      zeroes from the mmaped region.  */
 255   if (unlink (file) < 0 && errno != ENOENT)
 256     {
 257       logprintf (LOG_NOTQUIET, _("Unable to delete %s: %s\n"),
 258                  quote (file), strerror (errno));
 259       wget_read_file_free (fm);
 260       return;
 261     }
 262   /* Now open the file for writing.  */
 263   fp = fopen (file, "wb");
 264   if (!fp)
 265     {
 266       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
 267                  file, strerror (errno));
 268       wget_read_file_free (fm);
 269       return;
 270     }
 271
 272   /* Here we loop through all the URLs in file, replacing those of
 273      them that are downloaded with relative references.  */
 274   p = fm->content;
 275   for (link = links; link; link = link->next)
 276     {
 277       char *url_start = fm->content + link->pos;
 278
 279       if (link->pos >= fm->length)
 280         {
 281           DEBUGP (("Something strange is going on.  Please investigate."));
 282           break;
 283         }
 284       /* If the URL is not to be converted, skip it.  */
 285       if (link->convert == CO_NOCONVERT)
 286         {
 287           DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
 288           continue;
 289         }
 290
 291       /* Echo the file contents, up to the offending URL's opening
 292          quote, to the outfile.  */
 293       fwrite (p, 1, url_start - p, fp);
 294       p = url_start;
 295
 296       switch (link->convert)
 297         {
 298         case CO_CONVERT_TO_RELATIVE:
 299           /* Convert absolute URL to relative. */
 300           {
 301             char *newname = construct_relative (file, link->local_name);
 302             char *quoted_newname = local_quote_string (newname,
 303                                                        link->link_css_p);
 304
 305             if (link->link_css_p)
 306               p = replace_plain (p, link->size, fp, quoted_newname);
 307             else if (!link->link_refresh_p)
 308               p = replace_attr (p, link->size, fp, quoted_newname);
 309             else
 310               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
 311                                              link->refresh_timeout);
 312
 313             DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
 314                      link->url->url, newname, link->pos, file));
 315             xfree (newname);
 316             xfree (quoted_newname);
 317             ++to_file_count;
 318             break;
 319           }
 320         case CO_CONVERT_TO_COMPLETE:
 321           /* Convert the link to absolute URL. */
 322           {
 323             char *newlink = link->url->url;
 324             char *quoted_newlink = html_quote_string (newlink);
 325
 326             if (link->link_css_p)
 327               p = replace_plain (p, link->size, fp, newlink);
 328             else if (!link->link_refresh_p)
 329               p = replace_attr (p, link->size, fp, quoted_newlink);
 330             else
 331               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
 332                                              link->refresh_timeout);
 333
 334             DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
 335                      newlink, link->pos, file));
 336             xfree (quoted_newlink);
 337             ++to_url_count;
 338             break;
 339           }
 340         case CO_NULLIFY_BASE:
 341           /* Change the base href to "". */
 342           p = replace_attr (p, link->size, fp, "");
 343           break;
 344         case CO_NOCONVERT:
 345           abort ();
 346           break;
 347         }
 348     }
 349
 350   /* Output the rest of the file. */
 351   if (p - fm->content < fm->length)
 352     fwrite (p, 1, fm->length - (p - fm->content), fp);
 353   fclose (fp);
 354   wget_read_file_free (fm);
 355
 356   logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
 357 }
 358
 359 /* Construct and return a link that points from BASEFILE to LINKFILE.
 360    Both files should be local file names, BASEFILE of the referrering
 361    file, and LINKFILE of the referred file.
 362
 363    Examples:
 364
 365    cr("foo", "bar")         -> "bar"
 366    cr("A/foo", "A/bar")     -> "bar"
 367    cr("A/foo", "A/B/bar")   -> "B/bar"
 368    cr("A/X/foo", "A/Y/bar") -> "../Y/bar"
 369    cr("X/", "Y/bar")        -> "../Y/bar" (trailing slash does matter in BASE)
 370
 371    Both files should be absolute or relative, otherwise strange
 372    results might ensue.  The function makes no special efforts to
 373    handle "." and ".." in links, so make sure they're not there
 374    (e.g. using path_simplify).  */
 375
 376 static char *
 377 construct_relative (const char *basefile, const char *linkfile)
 378 {
 379   char *link;
 380   int basedirs;
 381   const char *b, *l;
 382   int i, start;
 383
 384   /* First, skip the initial directory components common to both
 385      files.  */
 386   start = 0;
 387   for (b = basefile, l = linkfile; *b == *l && *b != '\0'; ++b, ++l)
 388     {
 389       if (*b == '/')
 390         start = (b - basefile) + 1;
 391     }
 392   basefile += start;
 393   linkfile += start;
 394
 395   /* With common directories out of the way, the situation we have is
 396      as follows:
 397          b - b1/b2/[...]/bfile
 398          l - l1/l2/[...]/lfile
 399
 400      The link we're constructing needs to be:
 401        lnk - ../../l1/l2/[...]/lfile
 402
 403      Where the number of ".."'s equals the number of bN directory
 404      components in B.  */
 405
 406   /* Count the directory components in B. */
 407   basedirs = 0;
 408   for (b = basefile; *b; b++)
 409     {
 410       if (*b == '/')
 411         ++basedirs;
 412     }
 413
 414   /* Construct LINK as explained above. */
 415   link = xmalloc (3 * basedirs + strlen (linkfile) + 1);
 416   for (i = 0; i < basedirs; i++)
 417     memcpy (link + 3 * i, "../", 3);
 418   strcpy (link + 3 * i, linkfile);
 419   return link;
 420 }
 421
 422 /* Used by write_backup_file to remember which files have been
 423    written. */
 424 static struct hash_table *converted_files;
 425
 426 static void
 427 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
 428 {
 429   /* Rather than just writing over the original .html file with the
 430      converted version, save the former to *.orig.  Note we only do
 431      this for files we've _successfully_ downloaded, so we don't
 432      clobber .orig files sitting around from previous invocations.
 433      On VMS, use "_orig" instead of ".orig".  See "wget.h". */
 434
 435   /* Construct the backup filename as the original name plus ".orig". */
 436   size_t         filename_len = strlen (file);
 437   char*          filename_plus_orig_suffix;
 438
 439   /* TODO: hack this to work with css files */
 440   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
 441     {
 442       /* Just write "orig" over "html".  We need to do it this way
 443          because when we're checking to see if we've downloaded the
 444          file before (to see if we can skip downloading it), we don't
 445          know if it's a text/html file.  Therefore we don't know yet
 446          at that stage that -E is going to cause us to tack on
 447          ".html", so we need to compare vs. the original URL plus
 448          ".orig", not the original URL plus ".html.orig". */
 449       filename_plus_orig_suffix = alloca (filename_len + 1);
 450       strcpy (filename_plus_orig_suffix, file);
 451       strcpy ((filename_plus_orig_suffix + filename_len) - 4, "orig");
 452     }
 453   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
 454     {
 455       /* Append ".orig" to the name. */
 456       filename_plus_orig_suffix = alloca (filename_len + sizeof (ORIG_SFX));
 457       strcpy (filename_plus_orig_suffix, file);
 458       strcpy (filename_plus_orig_suffix + filename_len, ORIG_SFX);
 459     }
 460
 461   if (!converted_files)
 462     converted_files = make_string_hash_table (0);
 463
 464   /* We can get called twice on the same URL thanks to the
 465      convert_all_links() call in main().  If we write the .orig file
 466      each time in such a case, it'll end up containing the first-pass
 467      conversion, not the original file.  So, see if we've already been
 468      called on this file. */
 469   if (!string_set_contains (converted_files, file))
 470     {
 471       /* Rename <file> to <file>.orig before former gets written over. */
 472       if (rename (file, filename_plus_orig_suffix) != 0)
 473         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
 474                    file, filename_plus_orig_suffix, strerror (errno));
 475
 476       /* Remember that we've already written a .orig backup for this file.
 477          Note that we never free this memory since we need it till the
 478          convert_all_links() call, which is one of the last things the
 479          program does before terminating.  BTW, I'm not sure if it would be
 480          safe to just set 'converted_file_ptr->string' to 'file' below,
 481          rather than making a copy of the string...  Another note is that I
 482          thought I could just add a field to the urlpos structure saying
 483          that we'd written a .orig file for this URL, but that didn't work,
 484          so I had to make this separate list.
 485          -- Dan Harkless <wget@harkless.org>
 486
 487          This [adding a field to the urlpos structure] didn't work
 488          because convert_file() is called from convert_all_links at
 489          the end of the retrieval with a freshly built new urlpos
 490          list.
 491          -- Hrvoje Niksic <hniksic@xemacs.org>
 492       */
 493       string_set_add (converted_files, file);
 494     }
 495 }
 496
 497 static bool find_fragment (const char *, int, const char **, const char **);
 498
 499 /* Replace a string with NEW_TEXT.  Ignore quoting. */
 500 static const char *
 501 replace_plain (const char *p, int size, FILE *fp, const char *new_text)
 502 {
 503   fputs (new_text, fp);
 504   p += size;
 505   return p;
 506 }
 507
 508 /* Replace an attribute's original text with NEW_TEXT. */
 509
 510 static const char *
 511 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
 512 {
 513   bool quote_flag = false;
 514   char quote_char = '\"';       /* use "..." for quoting, unless the
 515                                    original value is quoted, in which
 516                                    case reuse its quoting char. */
 517   const char *frag_beg, *frag_end;
 518
 519   /* Structure of our string is:
 520        "...old-contents..."
 521        <---    size    --->  (with quotes)
 522      OR:
 523        ...old-contents...
 524        <---    size   -->    (no quotes)   */
 525
 526   if (*p == '\"' || *p == '\'')
 527     {
 528       quote_char = *p;
 529       quote_flag = true;
 530       ++p;
 531       size -= 2;                /* disregard opening and closing quote */
 532     }
 533   putc (quote_char, fp);
 534   fputs (new_text, fp);
 535
 536   /* Look for fragment identifier, if any. */
 537   if (find_fragment (p, size, &frag_beg, &frag_end))
 538     fwrite (frag_beg, 1, frag_end - frag_beg, fp);
 539   p += size;
 540   if (quote_flag)
 541     ++p;
 542   putc (quote_char, fp);
 543
 544   return p;
 545 }
 546
 547 /* The same as REPLACE_ATTR, but used when replacing
 548    <meta http-equiv=refresh content="new_text"> because we need to
 549    append "timeout_value; URL=" before the next_text.  */
 550
 551 static const char *
 552 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
 553                            const char *new_text, int timeout)
 554 {
 555   /* "0; URL=..." */
 556   char *new_with_timeout = (char *)alloca (numdigit (timeout)
 557                                            + 6 /* "; URL=" */
 558                                            + strlen (new_text)
 559                                            + 1);
 560   sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
 561
 562   return replace_attr (p, size, fp, new_with_timeout);
 563 }
 564
 565 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
 566    preceded by '&'.  If the character is not found, return zero.  If
 567    the character is found, return true and set BP and EP to point to
 568    the beginning and end of the region.
 569
 570    This is used for finding the fragment indentifiers in URLs.  */
 571
 572 static bool
 573 find_fragment (const char *beg, int size, const char **bp, const char **ep)
 574 {
 575   const char *end = beg + size;
 576   bool saw_amp = false;
 577   for (; beg < end; beg++)
 578     {
 579       switch (*beg)
 580         {
 581         case '&':
 582           saw_amp = true;
 583           break;
 584         case '#':
 585           if (!saw_amp)
 586             {
 587               *bp = beg;
 588               *ep = end;
 589               return true;
 590             }
 591           /* fallthrough */
 592         default:
 593           saw_amp = false;
 594         }
 595     }
 596   return false;
 597 }
 598
 599 /* Quote FILE for use as local reference to an HTML file.
 600
 601    We quote ? as %3F to avoid passing part of the file name as the
 602    parameter when browsing the converted file through HTTP.  However,
 603    it is safe to do this only when `--adjust-extension' is turned on.
 604    This is because converting "index.html?foo=bar" to
 605    "index.html%3Ffoo=bar" would break local browsing, as the latter
 606    isn't even recognized as an HTML file!  However, converting
 607    "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
 608    safe for both local and HTTP-served browsing.
 609
 610    We always quote "#" as "%23", "%" as "%25" and ";" as "%3B"
 611    because those characters have special meanings in URLs.  */
 612
 613 static char *
 614 local_quote_string (const char *file, bool no_html_quote)
 615 {
 616   const char *from;
 617   char *newname, *to;
 618
 619   char *any = strpbrk (file, "?#%;");
 620   if (!any)
 621     return no_html_quote ? strdup (file) : html_quote_string (file);
 622
 623   /* Allocate space assuming the worst-case scenario, each character
 624      having to be quoted.  */
 625   to = newname = (char *)alloca (3 * strlen (file) + 1);
 626   for (from = file; *from; from++)
 627     switch (*from)
 628       {
 629       case '%':
 630         *to++ = '%';
 631         *to++ = '2';
 632         *to++ = '5';
 633         break;
 634       case '#':
 635         *to++ = '%';
 636         *to++ = '2';
 637         *to++ = '3';
 638         break;
 639       case ';':
 640         *to++ = '%';
 641         *to++ = '3';
 642         *to++ = 'B';
 643         break;
 644       case '?':
 645         if (opt.adjust_extension)
 646           {
 647             *to++ = '%';
 648             *to++ = '3';
 649             *to++ = 'F';
 650             break;
 651           }
 652         /* fallthrough */
 653       default:
 654         *to++ = *from;
 655       }
 656   *to = '\0';
 657
 658   return no_html_quote ? strdup (newname) : html_quote_string (newname);
 659 }
 660 \f
/* Book-keeping code for dl_file_url_map, dl_url_file_map,
   downloaded_html_set, and downloaded_css_set.  Other code calls
   these functions to let us know that a file has been downloaded.  */
 664
 665 #define ENSURE_TABLES_EXIST do {                        \
 666   if (!dl_file_url_map)                                 \
 667     dl_file_url_map = make_string_hash_table (0);       \
 668   if (!dl_url_file_map)                                 \
 669     dl_url_file_map = make_string_hash_table (0);       \
 670 } while (0)
 671
 672 /* Return true if S1 and S2 are the same, except for "/index.html".
 673    The three cases in which it returns one are (substitute any
 674    substring for "foo"):
 675
 676    m("foo/index.html", "foo/")  ==> 1
 677    m("foo/", "foo/index.html")  ==> 1
 678    m("foo", "foo/index.html")   ==> 1
 679    m("foo", "foo/"              ==> 1
 680    m("foo", "foo")              ==> 1  */
 681
 682 static bool
 683 match_except_index (const char *s1, const char *s2)
 684 {
 685   int i;
 686   const char *lng;
 687
 688   /* Skip common substring. */
 689   for (i = 0; *s1 && *s2 && *s1 == *s2; s1++, s2++, i++)
 690     ;
 691   if (i == 0)
 692     /* Strings differ at the very beginning -- bail out.  We need to
 693        check this explicitly to avoid `lng - 1' reading outside the
 694        array.  */
 695     return false;
 696
 697   if (!*s1 && !*s2)
 698     /* Both strings hit EOF -- strings are equal. */
 699     return true;
 700   else if (*s1 && *s2)
 701     /* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux". */
 702     return false;
 703   else if (*s1)
 704     /* S1 is the longer one. */
 705     lng = s1;
 706   else
 707     /* S2 is the longer one. */
 708     lng = s2;
 709
 710   /* foo            */            /* foo/           */
 711   /* foo/index.html */  /* or */  /* foo/index.html */
 712   /*    ^           */            /*     ^          */
 713
 714   if (*lng != '/')
 715     /* The right-hand case. */
 716     --lng;
 717
 718   if (*lng == '/' && *(lng + 1) == '\0')
 719     /* foo  */
 720     /* foo/ */
 721     return true;
 722
 723   return 0 == strcmp (lng, "/index.html");
 724 }
 725
 726 static int
 727 dissociate_urls_from_file_mapper (void *key, void *value, void *arg)
 728 {
 729   char *mapping_url = (char *)key;
 730   char *mapping_file = (char *)value;
 731   char *file = (char *)arg;
 732
 733   if (0 == strcmp (mapping_file, file))
 734     {
 735       hash_table_remove (dl_url_file_map, mapping_url);
 736       xfree (mapping_url);
 737       xfree (mapping_file);
 738     }
 739
 740   /* Continue mapping. */
 741   return 0;
 742 }
 743
 744 /* Remove all associations from various URLs to FILE from dl_url_file_map. */
 745
 746 static void
 747 dissociate_urls_from_file (const char *file)
 748 {
 749   /* Can't use hash_table_iter_* because the table mutates while mapping.  */
 750   hash_table_for_each (dl_url_file_map, dissociate_urls_from_file_mapper,
 751                        (char *) file);
 752 }
 753
 754 /* Register that URL has been successfully downloaded to FILE.  This
 755    is used by the link conversion code to convert references to URLs
 756    to references to local files.  It is also being used to check if a
 757    URL has already been downloaded.  */
 758
 759 void
 760 register_download (const char *url, const char *file)
 761 {
 762   char *old_file, *old_url;
 763
 764   ENSURE_TABLES_EXIST;
 765
 766   /* With some forms of retrieval, it is possible, although not likely
 767      or particularly desirable.  If both are downloaded, the second
 768      download will override the first one.  When that happens,
 769      dissociate the old file name from the URL.  */
 770
 771   if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
 772     {
 773       if (0 == strcmp (url, old_url))
 774         /* We have somehow managed to download the same URL twice.
 775            Nothing to do.  */
 776         return;
 777
 778       if (match_except_index (url, old_url)
 779           && !hash_table_contains (dl_url_file_map, url))
 780         /* The two URLs differ only in the "index.html" ending.  For
 781            example, one is "http://www.server.com/", and the other is
 782            "http://www.server.com/index.html".  Don't remove the old
 783            one, just add the new one as a non-canonical entry.  */
 784         goto url_only;
 785
 786       hash_table_remove (dl_file_url_map, file);
 787       xfree (old_file);
 788       xfree (old_url);
 789
 790       /* Remove all the URLs that point to this file.  Yes, there can
 791          be more than one such URL, because we store redirections as
 792          multiple entries in dl_url_file_map.  For example, if URL1
 793          redirects to URL2 which gets downloaded to FILE, we map both
 794          URL1 and URL2 to FILE in dl_url_file_map.  (dl_file_url_map
 795          only points to URL2.)  When another URL gets loaded to FILE,
 796          we want both URL1 and URL2 dissociated from it.
 797
 798          This is a relatively expensive operation because it performs
 799          a linear search of the whole hash table, but it should be
 800          called very rarely, only when two URLs resolve to the same
 801          file name, *and* the "<file>.1" extensions are turned off.
 802          In other words, almost never.  */
 803       dissociate_urls_from_file (file);
 804     }
 805
 806   hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
 807
 808  url_only:
 809   /* A URL->FILE mapping is not possible without a FILE->URL mapping.
 810      If the latter were present, it should have been removed by the
 811      above `if'.  So we could write:
 812
 813          assert (!hash_table_contains (dl_url_file_map, url));
 814
 815      The above is correct when running in recursive mode where the
 816      same URL always resolves to the same file.  But if you do
 817      something like:
 818
 819          wget URL URL
 820
 821      then the first URL will resolve to "FILE", and the other to
 822      "FILE.1".  In that case, FILE.1 will not be found in
 823      dl_file_url_map, but URL will still point to FILE in
 824      dl_url_file_map.  */
 825   if (hash_table_get_pair (dl_url_file_map, url, &old_url, &old_file))
 826     {
 827       hash_table_remove (dl_url_file_map, url);
 828       xfree (old_url);
 829       xfree (old_file);
 830     }
 831
 832   hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
 833 }
 834
 835 /* Register that FROM has been redirected to TO.  This assumes that TO
 836    is successfully downloaded and already registered using
 837    register_download() above.  */
 838
 839 void
 840 register_redirection (const char *from, const char *to)
 841 {
 842   char *file;
 843
 844   ENSURE_TABLES_EXIST;
 845
 846   file = hash_table_get (dl_url_file_map, to);
 847   assert (file != NULL);
 848   if (!hash_table_contains (dl_url_file_map, from))
 849     hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
 850 }
 851
 852 /* Register that the file has been deleted. */
 853
 854 void
 855 register_delete_file (const char *file)
 856 {
 857   char *old_url, *old_file;
 858
 859   ENSURE_TABLES_EXIST;
 860
 861   if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
 862     return;
 863
 864   hash_table_remove (dl_file_url_map, file);
 865   xfree (old_file);
 866   xfree (old_url);
 867   dissociate_urls_from_file (file);
 868 }
 869
 870 /* Register that FILE is an HTML file that has been downloaded. */
 871
 872 void
 873 register_html (const char *url, const char *file)
 874 {
 875   if (!downloaded_html_set)
 876     downloaded_html_set = make_string_hash_table (0);
 877   string_set_add (downloaded_html_set, file);
 878 }
 879
 880 /* Register that FILE is a CSS file that has been downloaded. */
 881
 882 void
 883 register_css (const char *url, const char *file)
 884 {
 885   if (!downloaded_css_set)
 886     downloaded_css_set = make_string_hash_table (0);
 887   string_set_add (downloaded_css_set, file);
 888 }
 889
 890 static void downloaded_files_free (void);
 891
 892 /* Cleanup the data structures associated with this file.  */
 893
 894 void
 895 convert_cleanup (void)
 896 {
 897   if (dl_file_url_map)
 898     {
 899       free_keys_and_values (dl_file_url_map);
 900       hash_table_destroy (dl_file_url_map);
 901       dl_file_url_map = NULL;
 902     }
 903   if (dl_url_file_map)
 904     {
 905       free_keys_and_values (dl_url_file_map);
 906       hash_table_destroy (dl_url_file_map);
 907       dl_url_file_map = NULL;
 908     }
 909   if (downloaded_html_set)
 910     string_set_free (downloaded_html_set);
 911   downloaded_files_free ();
 912   if (converted_files)
 913     string_set_free (converted_files);
 914 }
 915 \f
 916 /* Book-keeping code for downloaded files that enables extension
 917    hacks.  */
 918
 919 /* This table should really be merged with dl_file_url_map and
 920    downloaded_html_files.  This was originally a list, but I changed
 921    it to a hash table because it was actually taking a lot of time to
 922    find things in it.  */
 923
 924 static struct hash_table *downloaded_files_hash;
 925
 926 /* We're storing "modes" of type downloaded_file_t in the hash table.
 927    However, our hash tables only accept pointers for keys and values.
 928    So when we need a pointer, we use the address of a
 929    downloaded_file_t variable of static storage.  */
 930
 931 static downloaded_file_t *
 932 downloaded_mode_to_ptr (downloaded_file_t mode)
 933 {
 934   static downloaded_file_t
 935     v1 = FILE_NOT_ALREADY_DOWNLOADED,
 936     v2 = FILE_DOWNLOADED_NORMALLY,
 937     v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
 938     v4 = CHECK_FOR_FILE;
 939
 940   switch (mode)
 941     {
 942     case FILE_NOT_ALREADY_DOWNLOADED:
 943       return &v1;
 944     case FILE_DOWNLOADED_NORMALLY:
 945       return &v2;
 946     case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
 947       return &v3;
 948     case CHECK_FOR_FILE:
 949       return &v4;
 950     }
 951   return NULL;
 952 }
 953
 954 /* Remembers which files have been downloaded.  In the standard case,
 955    should be called with mode == FILE_DOWNLOADED_NORMALLY for each
 956    file we actually download successfully (i.e. not for ones we have
 957    failures on or that we skip due to -N).
 958
 959    When we've downloaded a file and tacked on a ".html" extension due
 960    to -E, call this function with
 961    FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
 962    FILE_DOWNLOADED_NORMALLY.
 963
 964    If you just want to check if a file has been previously added
 965    without adding it, call with mode == CHECK_FOR_FILE.  Please be
 966    sure to call this function with local filenames, not remote
 967    URLs.  */
 968
 969 downloaded_file_t
 970 downloaded_file (downloaded_file_t mode, const char *file)
 971 {
 972   downloaded_file_t *ptr;
 973
 974   if (mode == CHECK_FOR_FILE)
 975     {
 976       if (!downloaded_files_hash)
 977         return FILE_NOT_ALREADY_DOWNLOADED;
 978       ptr = hash_table_get (downloaded_files_hash, file);
 979       if (!ptr)
 980         return FILE_NOT_ALREADY_DOWNLOADED;
 981       return *ptr;
 982     }
 983
 984   if (!downloaded_files_hash)
 985     downloaded_files_hash = make_string_hash_table (0);
 986
 987   ptr = hash_table_get (downloaded_files_hash, file);
 988   if (ptr)
 989     return *ptr;
 990
 991   ptr = downloaded_mode_to_ptr (mode);
 992   hash_table_put (downloaded_files_hash, xstrdup (file), ptr);
 993
 994   return FILE_NOT_ALREADY_DOWNLOADED;
 995 }
 996
 997 static void
 998 downloaded_files_free (void)
 999 {
1000   if (downloaded_files_hash)
1001     {
1002       hash_table_iterator iter;
1003       for (hash_table_iterate (downloaded_files_hash, &iter);
1004            hash_table_iter_next (&iter);
1005            )
1006         xfree (iter.key);
1007       hash_table_destroy (downloaded_files_hash);
1008       downloaded_files_hash = NULL;
1009     }
1010 }
1011 \f
/* Return a newly allocated copy of the string S in which the
   following characters are replaced by HTML entities, as per RFC1866:

   `&' -> `&amp;'
   `<' -> `&lt;'
   `>' -> `&gt;'
   `"' -> `&quot;'
   SP  -> `&#32;'

   All other characters are copied unchanged.  The result is obtained
   from xmalloc and is NUL-terminated; the caller owns it.

   The output size is accumulated in a size_t rather than an int, so
   that a very long input cannot overflow a signed counter and lead to
   an undersized allocation.  */
char *
html_quote_string (const char *s)
{
  const char *src;
  char *res, *dst;
  size_t size = 1;              /* room for the terminating NUL */

  /* First pass: compute the size of the quoted string.  */
  for (src = s; *src; src++)
    switch (*src)
      {
      case '&':
      case ' ':
        size += 5;              /* `&amp;' and `&#32;' */
        break;
      case '<':
      case '>':
        size += 4;              /* `&lt;' and `&gt;' */
        break;
      case '\"':
        size += 6;              /* `&quot;' */
        break;
      default:
        size++;
        break;
      }

  res = xmalloc (size);

  /* Second pass: emit the characters and entities.  */
  dst = res;
  for (src = s; *src; src++)
    switch (*src)
      {
      case '&':
        memcpy (dst, "&amp;", 5);
        dst += 5;
        break;
      case '<':
        memcpy (dst, "&lt;", 4);
        dst += 4;
        break;
      case '>':
        memcpy (dst, "&gt;", 4);
        dst += 4;
        break;
      case '\"':
        memcpy (dst, "&quot;", 6);
        dst += 6;
        break;
      case ' ':
        memcpy (dst, "&#32;", 5);
        dst += 5;
        break;
      default:
        *dst++ = *src;
        break;
      }
  *dst = '\0';
  return res;
}
1083
1084 /*
1085  * vim: et ts=2 sw=2
1086  */
1087