sjero.net Git - wget/blob - src/convert.c

   1 /* Conversion of links to local files.
   2    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
   3    Software Foundation, Inc.
   4
   5 This file is part of GNU Wget.
   6
   7 GNU Wget is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or
  10  (at your option) any later version.
  11
  12 GNU Wget is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 Additional permission under GNU GPL version 3 section 7
  21
  22 If you modify this program, or any covered work, by linking or
  23 combining it with the OpenSSL project's OpenSSL library (or a
  24 modified version of that library), containing parts covered by the
  25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  26 grants you additional permission to convey the resulting work.
  27 Corresponding Source for a non-source form of such a combination
  28 shall include the source code for the parts of OpenSSL used as well
  29 as that of the covered work.  */
  30
  31 #include "wget.h"
  32
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 #ifdef HAVE_UNISTD_H
  37 # include <unistd.h>
  38 #endif /* HAVE_UNISTD_H */
  39 #include <errno.h>
  40 #include <assert.h>
  41 #include "convert.h"
  42 #include "url.h"
  43 #include "recur.h"
  44 #include "utils.h"
  45 #include "hash.h"
  46 #include "ptimer.h"
  47 #include "res.h"
  48 #include "html-url.h"
  49 #include "css-url.h"
  50 #include "iri.h"
  51
  52 static struct hash_table *dl_file_url_map;
  53 struct hash_table *dl_url_file_map;
  54
  55 /* Set of HTML/CSS files downloaded in this Wget run, used for link
  56    conversion after Wget is done.  */
  57 struct hash_table *downloaded_html_set;
  58 struct hash_table *downloaded_css_set;
  59
  60 static void convert_links (const char *, struct urlpos *);
  61
  62
  63 void
  64 convert_links_in_hashtable (struct hash_table *downloaded_set,
  65                             int is_css,
  66                             int *file_count)
  67 {
  68   int i;
  69
  70   int cnt;
  71   char **file_array;
  72
  73   cnt = 0;
  74   if (downloaded_set)
  75     cnt = hash_table_count (downloaded_set);
  76   if (cnt == 0)
  77     return;
  78   file_array = alloca_array (char *, cnt);
  79   string_set_to_array (downloaded_set, file_array);
  80
  81   for (i = 0; i < cnt; i++)
  82     {
  83       struct urlpos *urls, *cur_url;
  84       char *url;
  85       char *file = file_array[i];
  86
  87       /* Determine the URL of the file.  get_urls_{html,css} will need
  88          it.  */
  89       url = hash_table_get (dl_file_url_map, file);
  90       if (!url)
  91         {
  92           DEBUGP (("Apparently %s has been removed.\n", file));
  93           continue;
  94         }
  95
  96       DEBUGP (("Scanning %s (from %s)\n", file, url));
  97
  98       /* Parse the file...  */
  99       urls = is_css ? get_urls_css_file (file, url) :
 100                       get_urls_html (file, url, NULL, NULL);
 101
 102       /* We don't respect meta_disallow_follow here because, even if
 103          the file is not followed, we might still want to convert the
 104          links that have been followed from other files.  */
 105
 106       for (cur_url = urls; cur_url; cur_url = cur_url->next)
 107         {
 108           char *local_name;
 109           struct url *u;
 110           struct iri *pi;
 111
 112           if (cur_url->link_base_p)
 113             {
 114               /* Base references have been resolved by our parser, so
 115                  we turn the base URL into an empty string.  (Perhaps
 116                  we should remove the tag entirely?)  */
 117               cur_url->convert = CO_NULLIFY_BASE;
 118               continue;
 119             }
 120
 121           /* We decide the direction of conversion according to whether
 122              a URL was downloaded.  Downloaded URLs will be converted
 123              ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
 124
 125           pi = iri_new ();
 126           set_uri_encoding (pi, opt.locale, true);
 127
 128           u = url_parse (cur_url->url->url, NULL, pi, true);
 129           local_name = hash_table_get (dl_url_file_map, u->url);
 130
 131           /* Decide on the conversion type.  */
 132           if (local_name)
 133             {
 134               /* We've downloaded this URL.  Convert it to relative
 135                  form.  We do this even if the URL already is in
 136                  relative form, because our directory structure may
 137                  not be identical to that on the server (think `-nd',
 138                  `--cut-dirs', etc.)  */
 139               cur_url->convert = CO_CONVERT_TO_RELATIVE;
 140               cur_url->local_name = xstrdup (local_name);
 141               DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
 142             }
 143           else
 144             {
 145               /* We haven't downloaded this URL.  If it's not already
 146                  complete (including a full host name), convert it to
 147                  that form, so it can be reached while browsing this
 148                  HTML locally.  */
 149               if (!cur_url->link_complete_p)
 150                 cur_url->convert = CO_CONVERT_TO_COMPLETE;
 151               cur_url->local_name = NULL;
 152               DEBUGP (("will convert url %s to complete\n", u->url));
 153             }
 154
 155           url_free (u);
 156           iri_free (pi);
 157         }
 158
 159       /* Convert the links in the file.  */
 160       convert_links (file, urls);
 161       ++*file_count;
 162
 163       /* Free the data.  */
 164       free_urlpos (urls);
 165     }
 166 }
 167
 168 /* This function is called when the retrieval is done to convert the
 169    links that have been downloaded.  It has to be called at the end of
 170    the retrieval, because only then does Wget know conclusively which
 171    URLs have been downloaded, and which not, so it can tell which
 172    direction to convert to.
 173
 174    The "direction" means that the URLs to the files that have been
 175    downloaded get converted to the relative URL which will point to
 176    that file.  And the other URLs get converted to the remote URL on
 177    the server.
 178
 179    All the downloaded HTMLs are kept in downloaded_html_files, and
 180    downloaded URLs in urls_downloaded.  All the information is
 181    extracted from these two lists.  */
 182
 183 void
 184 convert_all_links (void)
 185 {
 186   double secs;
 187   int file_count = 0;
 188
 189   struct ptimer *timer = ptimer_new ();
 190
 191   convert_links_in_hashtable (downloaded_html_set, 0, &file_count);
 192   convert_links_in_hashtable (downloaded_css_set, 1, &file_count);
 193
 194   secs = ptimer_measure (timer);
 195   logprintf (LOG_VERBOSE, _("Converted %d files in %s seconds.\n"),
 196              file_count, print_decimal (secs));
 197
 198   ptimer_destroy (timer);
 199 }
 200
 201 static void write_backup_file (const char *, downloaded_file_t);
 202 static const char *replace_plain (const char*, int, FILE*, const char *);
 203 static const char *replace_attr (const char *, int, FILE *, const char *);
 204 static const char *replace_attr_refresh_hack (const char *, int, FILE *,
 205                                               const char *, int);
 206 static char *local_quote_string (const char *, bool);
 207 static char *construct_relative (const char *, const char *);
 208
 209 /* Change the links in one file.  LINKS is a list of links in the
 210    document, along with their positions and the desired direction of
 211    the conversion.  */
 212 static void
 213 convert_links (const char *file, struct urlpos *links)
 214 {
 215   struct file_memory *fm;
 216   FILE *fp;
 217   const char *p;
 218   downloaded_file_t downloaded_file_return;
 219
 220   struct urlpos *link;
 221   int to_url_count = 0, to_file_count = 0;
 222
 223   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
 224
 225   {
 226     /* First we do a "dry run": go through the list L and see whether
 227        any URL needs to be converted in the first place.  If not, just
 228        leave the file alone.  */
 229     int dry_count = 0;
 230     struct urlpos *dry;
 231     for (dry = links; dry; dry = dry->next)
 232       if (dry->convert != CO_NOCONVERT)
 233         ++dry_count;
 234     if (!dry_count)
 235       {
 236         logputs (LOG_VERBOSE, _("nothing to do.\n"));
 237         return;
 238       }
 239   }
 240
 241   fm = wget_read_file (file);
 242   if (!fm)
 243     {
 244       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
 245                  file, strerror (errno));
 246       return;
 247     }
 248
 249   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
 250   if (opt.backup_converted && downloaded_file_return)
 251     write_backup_file (file, downloaded_file_return);
 252
 253   /* Before opening the file for writing, unlink the file.  This is
 254      important if the data in FM is mmaped.  In such case, nulling the
 255      file, which is what fopen() below does, would make us read all
 256      zeroes from the mmaped region.  */
 257   if (unlink (file) < 0 && errno != ENOENT)
 258     {
 259       logprintf (LOG_NOTQUIET, _("Unable to delete %s: %s\n"),
 260                  quote (file), strerror (errno));
 261       wget_read_file_free (fm);
 262       return;
 263     }
 264   /* Now open the file for writing.  */
 265   fp = fopen (file, "wb");
 266   if (!fp)
 267     {
 268       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
 269                  file, strerror (errno));
 270       wget_read_file_free (fm);
 271       return;
 272     }
 273
 274   /* Here we loop through all the URLs in file, replacing those of
 275      them that are downloaded with relative references.  */
 276   p = fm->content;
 277   for (link = links; link; link = link->next)
 278     {
 279       char *url_start = fm->content + link->pos;
 280
 281       if (link->pos >= fm->length)
 282         {
 283           DEBUGP (("Something strange is going on.  Please investigate."));
 284           break;
 285         }
 286       /* If the URL is not to be converted, skip it.  */
 287       if (link->convert == CO_NOCONVERT)
 288         {
 289           DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
 290           continue;
 291         }
 292
 293       /* Echo the file contents, up to the offending URL's opening
 294          quote, to the outfile.  */
 295       fwrite (p, 1, url_start - p, fp);
 296       p = url_start;
 297
 298       switch (link->convert)
 299         {
 300         case CO_CONVERT_TO_RELATIVE:
 301           /* Convert absolute URL to relative. */
 302           {
 303             char *newname = construct_relative (file, link->local_name);
 304             char *quoted_newname = local_quote_string (newname,
 305                                                        link->link_css_p);
 306
 307             if (link->link_css_p)
 308               p = replace_plain (p, link->size, fp, quoted_newname);
 309             else if (!link->link_refresh_p)
 310               p = replace_attr (p, link->size, fp, quoted_newname);
 311             else
 312               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
 313                                              link->refresh_timeout);
 314
 315             DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
 316                      link->url->url, newname, link->pos, file));
 317             xfree (newname);
 318             xfree (quoted_newname);
 319             ++to_file_count;
 320             break;
 321           }
 322         case CO_CONVERT_TO_COMPLETE:
 323           /* Convert the link to absolute URL. */
 324           {
 325             char *newlink = link->url->url;
 326             char *quoted_newlink = html_quote_string (newlink);
 327
 328             if (link->link_css_p)
 329               p = replace_plain (p, link->size, fp, newlink);
 330             else if (!link->link_refresh_p)
 331               p = replace_attr (p, link->size, fp, quoted_newlink);
 332             else
 333               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
 334                                              link->refresh_timeout);
 335
 336             DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
 337                      newlink, link->pos, file));
 338             xfree (quoted_newlink);
 339             ++to_url_count;
 340             break;
 341           }
 342         case CO_NULLIFY_BASE:
 343           /* Change the base href to "". */
 344           p = replace_attr (p, link->size, fp, "");
 345           break;
 346         case CO_NOCONVERT:
 347           abort ();
 348           break;
 349         }
 350     }
 351
 352   /* Output the rest of the file. */
 353   if (p - fm->content < fm->length)
 354     fwrite (p, 1, fm->length - (p - fm->content), fp);
 355   fclose (fp);
 356   wget_read_file_free (fm);
 357
 358   logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
 359 }
 360
 361 /* Construct and return a link that points from BASEFILE to LINKFILE.
 362    Both files should be local file names, BASEFILE of the referrering
 363    file, and LINKFILE of the referred file.
 364
 365    Examples:
 366
 367    cr("foo", "bar")         -> "bar"
 368    cr("A/foo", "A/bar")     -> "bar"
 369    cr("A/foo", "A/B/bar")   -> "B/bar"
 370    cr("A/X/foo", "A/Y/bar") -> "../Y/bar"
 371    cr("X/", "Y/bar")        -> "../Y/bar" (trailing slash does matter in BASE)
 372
 373    Both files should be absolute or relative, otherwise strange
 374    results might ensue.  The function makes no special efforts to
 375    handle "." and ".." in links, so make sure they're not there
 376    (e.g. using path_simplify).  */
 377
 378 static char *
 379 construct_relative (const char *basefile, const char *linkfile)
 380 {
 381   char *link;
 382   int basedirs;
 383   const char *b, *l;
 384   int i, start;
 385
 386   /* First, skip the initial directory components common to both
 387      files.  */
 388   start = 0;
 389   for (b = basefile, l = linkfile; *b == *l && *b != '\0'; ++b, ++l)
 390     {
 391       if (*b == '/')
 392         start = (b - basefile) + 1;
 393     }
 394   basefile += start;
 395   linkfile += start;
 396
 397   /* With common directories out of the way, the situation we have is
 398      as follows:
 399          b - b1/b2/[...]/bfile
 400          l - l1/l2/[...]/lfile
 401
 402      The link we're constructing needs to be:
 403        lnk - ../../l1/l2/[...]/lfile
 404
 405      Where the number of ".."'s equals the number of bN directory
 406      components in B.  */
 407
 408   /* Count the directory components in B. */
 409   basedirs = 0;
 410   for (b = basefile; *b; b++)
 411     {
 412       if (*b == '/')
 413         ++basedirs;
 414     }
 415
 416   /* Construct LINK as explained above. */
 417   link = xmalloc (3 * basedirs + strlen (linkfile) + 1);
 418   for (i = 0; i < basedirs; i++)
 419     memcpy (link + 3 * i, "../", 3);
 420   strcpy (link + 3 * i, linkfile);
 421   return link;
 422 }
 423
 424 /* Used by write_backup_file to remember which files have been
 425    written. */
 426 static struct hash_table *converted_files;
 427
 428 static void
 429 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
 430 {
 431   /* Rather than just writing over the original .html file with the
 432      converted version, save the former to *.orig.  Note we only do
 433      this for files we've _successfully_ downloaded, so we don't
 434      clobber .orig files sitting around from previous invocations.
 435      On VMS, use "_orig" instead of ".orig".  See "wget.h". */
 436
 437   /* Construct the backup filename as the original name plus ".orig". */
 438   size_t         filename_len = strlen (file);
 439   char*          filename_plus_orig_suffix;
 440
 441   /* TODO: hack this to work with css files */
 442   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
 443     {
 444       /* Just write "orig" over "html".  We need to do it this way
 445          because when we're checking to see if we've downloaded the
 446          file before (to see if we can skip downloading it), we don't
 447          know if it's a text/html file.  Therefore we don't know yet
 448          at that stage that -E is going to cause us to tack on
 449          ".html", so we need to compare vs. the original URL plus
 450          ".orig", not the original URL plus ".html.orig". */
 451       filename_plus_orig_suffix = alloca (filename_len + 1);
 452       strcpy (filename_plus_orig_suffix, file);
 453       strcpy ((filename_plus_orig_suffix + filename_len) - 4, "orig");
 454     }
 455   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
 456     {
 457       /* Append ".orig" to the name. */
 458       filename_plus_orig_suffix = alloca (filename_len + sizeof (ORIG_SFX));
 459       strcpy (filename_plus_orig_suffix, file);
 460       strcpy (filename_plus_orig_suffix + filename_len, ORIG_SFX);
 461     }
 462
 463   if (!converted_files)
 464     converted_files = make_string_hash_table (0);
 465
 466   /* We can get called twice on the same URL thanks to the
 467      convert_all_links() call in main().  If we write the .orig file
 468      each time in such a case, it'll end up containing the first-pass
 469      conversion, not the original file.  So, see if we've already been
 470      called on this file. */
 471   if (!string_set_contains (converted_files, file))
 472     {
 473       /* Rename <file> to <file>.orig before former gets written over. */
 474       if (rename (file, filename_plus_orig_suffix) != 0)
 475         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
 476                    file, filename_plus_orig_suffix, strerror (errno));
 477
 478       /* Remember that we've already written a .orig backup for this file.
 479          Note that we never free this memory since we need it till the
 480          convert_all_links() call, which is one of the last things the
 481          program does before terminating.  BTW, I'm not sure if it would be
 482          safe to just set 'converted_file_ptr->string' to 'file' below,
 483          rather than making a copy of the string...  Another note is that I
 484          thought I could just add a field to the urlpos structure saying
 485          that we'd written a .orig file for this URL, but that didn't work,
 486          so I had to make this separate list.
 487          -- Dan Harkless <wget@harkless.org>
 488
 489          This [adding a field to the urlpos structure] didn't work
 490          because convert_file() is called from convert_all_links at
 491          the end of the retrieval with a freshly built new urlpos
 492          list.
 493          -- Hrvoje Niksic <hniksic@xemacs.org>
 494       */
 495       string_set_add (converted_files, file);
 496     }
 497 }
 498
 499 static bool find_fragment (const char *, int, const char **, const char **);
 500
 501 /* Replace a string with NEW_TEXT.  Ignore quoting. */
 502 static const char *
 503 replace_plain (const char *p, int size, FILE *fp, const char *new_text)
 504 {
 505   fputs (new_text, fp);
 506   p += size;
 507   return p;
 508 }
 509
 510 /* Replace an attribute's original text with NEW_TEXT. */
 511
 512 static const char *
 513 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
 514 {
 515   bool quote_flag = false;
 516   char quote_char = '\"';       /* use "..." for quoting, unless the
 517                                    original value is quoted, in which
 518                                    case reuse its quoting char. */
 519   const char *frag_beg, *frag_end;
 520
 521   /* Structure of our string is:
 522        "...old-contents..."
 523        <---    size    --->  (with quotes)
 524      OR:
 525        ...old-contents...
 526        <---    size   -->    (no quotes)   */
 527
 528   if (*p == '\"' || *p == '\'')
 529     {
 530       quote_char = *p;
 531       quote_flag = true;
 532       ++p;
 533       size -= 2;                /* disregard opening and closing quote */
 534     }
 535   putc (quote_char, fp);
 536   fputs (new_text, fp);
 537
 538   /* Look for fragment identifier, if any. */
 539   if (find_fragment (p, size, &frag_beg, &frag_end))
 540     fwrite (frag_beg, 1, frag_end - frag_beg, fp);
 541   p += size;
 542   if (quote_flag)
 543     ++p;
 544   putc (quote_char, fp);
 545
 546   return p;
 547 }
 548
 549 /* The same as REPLACE_ATTR, but used when replacing
 550    <meta http-equiv=refresh content="new_text"> because we need to
 551    append "timeout_value; URL=" before the next_text.  */
 552
 553 static const char *
 554 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
 555                            const char *new_text, int timeout)
 556 {
 557   /* "0; URL=..." */
 558   char *new_with_timeout = (char *)alloca (numdigit (timeout)
 559                                            + 6 /* "; URL=" */
 560                                            + strlen (new_text)
 561                                            + 1);
 562   sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
 563
 564   return replace_attr (p, size, fp, new_with_timeout);
 565 }
 566
 567 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
 568    preceded by '&'.  If the character is not found, return zero.  If
 569    the character is found, return true and set BP and EP to point to
 570    the beginning and end of the region.
 571
 572    This is used for finding the fragment indentifiers in URLs.  */
 573
 574 static bool
 575 find_fragment (const char *beg, int size, const char **bp, const char **ep)
 576 {
 577   const char *end = beg + size;
 578   bool saw_amp = false;
 579   for (; beg < end; beg++)
 580     {
 581       switch (*beg)
 582         {
 583         case '&':
 584           saw_amp = true;
 585           break;
 586         case '#':
 587           if (!saw_amp)
 588             {
 589               *bp = beg;
 590               *ep = end;
 591               return true;
 592             }
 593           /* fallthrough */
 594         default:
 595           saw_amp = false;
 596         }
 597     }
 598   return false;
 599 }
 600
 601 /* Quote FILE for use as local reference to an HTML file.
 602
 603    We quote ? as %3F to avoid passing part of the file name as the
 604    parameter when browsing the converted file through HTTP.  However,
 605    it is safe to do this only when `--adjust-extension' is turned on.
 606    This is because converting "index.html?foo=bar" to
 607    "index.html%3Ffoo=bar" would break local browsing, as the latter
 608    isn't even recognized as an HTML file!  However, converting
 609    "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
 610    safe for both local and HTTP-served browsing.
 611
 612    We always quote "#" as "%23", "%" as "%25" and ";" as "%3B"
 613    because those characters have special meanings in URLs.  */
 614
 615 static char *
 616 local_quote_string (const char *file, bool no_html_quote)
 617 {
 618   const char *from;
 619   char *newname, *to;
 620
 621   char *any = strpbrk (file, "?#%;");
 622   if (!any)
 623     return no_html_quote ? strdup (file) : html_quote_string (file);
 624
 625   /* Allocate space assuming the worst-case scenario, each character
 626      having to be quoted.  */
 627   to = newname = (char *)alloca (3 * strlen (file) + 1);
 628   for (from = file; *from; from++)
 629     switch (*from)
 630       {
 631       case '%':
 632         *to++ = '%';
 633         *to++ = '2';
 634         *to++ = '5';
 635         break;
 636       case '#':
 637         *to++ = '%';
 638         *to++ = '2';
 639         *to++ = '3';
 640         break;
 641       case ';':
 642         *to++ = '%';
 643         *to++ = '3';
 644         *to++ = 'B';
 645         break;
 646       case '?':
 647         if (opt.adjust_extension)
 648           {
 649             *to++ = '%';
 650             *to++ = '3';
 651             *to++ = 'F';
 652             break;
 653           }
 654         /* fallthrough */
 655       default:
 656         *to++ = *from;
 657       }
 658   *to = '\0';
 659
 660   return no_html_quote ? strdup (newname) : html_quote_string (newname);
 661 }
 662 \f
 663 /* Book-keeping code for dl_file_url_map, dl_url_file_map,
 664    downloaded_html_list, and downloaded_html_set.  Other code calls
 665    these functions to let us know that a file has been downloaded.  */
 666
 667 #define ENSURE_TABLES_EXIST do {                        \
 668   if (!dl_file_url_map)                                 \
 669     dl_file_url_map = make_string_hash_table (0);       \
 670   if (!dl_url_file_map)                                 \
 671     dl_url_file_map = make_string_hash_table (0);       \
 672 } while (0)
 673
 674 /* Return true if S1 and S2 are the same, except for "/index.html".
 675    The three cases in which it returns one are (substitute any
 676    substring for "foo"):
 677
 678    m("foo/index.html", "foo/")  ==> 1
 679    m("foo/", "foo/index.html")  ==> 1
 680    m("foo", "foo/index.html")   ==> 1
 681    m("foo", "foo/"              ==> 1
 682    m("foo", "foo")              ==> 1  */
 683
 684 static bool
 685 match_except_index (const char *s1, const char *s2)
 686 {
 687   int i;
 688   const char *lng;
 689
 690   /* Skip common substring. */
 691   for (i = 0; *s1 && *s2 && *s1 == *s2; s1++, s2++, i++)
 692     ;
 693   if (i == 0)
 694     /* Strings differ at the very beginning -- bail out.  We need to
 695        check this explicitly to avoid `lng - 1' reading outside the
 696        array.  */
 697     return false;
 698
 699   if (!*s1 && !*s2)
 700     /* Both strings hit EOF -- strings are equal. */
 701     return true;
 702   else if (*s1 && *s2)
 703     /* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux". */
 704     return false;
 705   else if (*s1)
 706     /* S1 is the longer one. */
 707     lng = s1;
 708   else
 709     /* S2 is the longer one. */
 710     lng = s2;
 711
 712   /* foo            */            /* foo/           */
 713   /* foo/index.html */  /* or */  /* foo/index.html */
 714   /*    ^           */            /*     ^          */
 715
 716   if (*lng != '/')
 717     /* The right-hand case. */
 718     --lng;
 719
 720   if (*lng == '/' && *(lng + 1) == '\0')
 721     /* foo  */
 722     /* foo/ */
 723     return true;
 724
 725   return 0 == strcmp (lng, "/index.html");
 726 }
 727
 728 static int
 729 dissociate_urls_from_file_mapper (void *key, void *value, void *arg)
 730 {
 731   char *mapping_url = (char *)key;
 732   char *mapping_file = (char *)value;
 733   char *file = (char *)arg;
 734
 735   if (0 == strcmp (mapping_file, file))
 736     {
 737       hash_table_remove (dl_url_file_map, mapping_url);
 738       xfree (mapping_url);
 739       xfree (mapping_file);
 740     }
 741
 742   /* Continue mapping. */
 743   return 0;
 744 }
 745
 746 /* Remove all associations from various URLs to FILE from dl_url_file_map. */
 747
 748 static void
 749 dissociate_urls_from_file (const char *file)
 750 {
 751   /* Can't use hash_table_iter_* because the table mutates while mapping.  */
 752   hash_table_for_each (dl_url_file_map, dissociate_urls_from_file_mapper,
 753                        (char *) file);
 754 }
 755
 756 /* Register that URL has been successfully downloaded to FILE.  This
 757    is used by the link conversion code to convert references to URLs
 758    to references to local files.  It is also being used to check if a
 759    URL has already been downloaded.  */
 760
 761 void
 762 register_download (const char *url, const char *file)
 763 {
 764   char *old_file, *old_url;
 765
 766   ENSURE_TABLES_EXIST;
 767
 768   /* With some forms of retrieval, it is possible, although not likely
 769      or particularly desirable.  If both are downloaded, the second
 770      download will override the first one.  When that happens,
 771      dissociate the old file name from the URL.  */
 772
 773   if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
 774     {
 775       if (0 == strcmp (url, old_url))
 776         /* We have somehow managed to download the same URL twice.
 777            Nothing to do.  */
 778         return;
 779
 780       if (match_except_index (url, old_url)
 781           && !hash_table_contains (dl_url_file_map, url))
 782         /* The two URLs differ only in the "index.html" ending.  For
 783            example, one is "http://www.server.com/", and the other is
 784            "http://www.server.com/index.html".  Don't remove the old
 785            one, just add the new one as a non-canonical entry.  */
 786         goto url_only;
 787
 788       hash_table_remove (dl_file_url_map, file);
 789       xfree (old_file);
 790       xfree (old_url);
 791
 792       /* Remove all the URLs that point to this file.  Yes, there can
 793          be more than one such URL, because we store redirections as
 794          multiple entries in dl_url_file_map.  For example, if URL1
 795          redirects to URL2 which gets downloaded to FILE, we map both
 796          URL1 and URL2 to FILE in dl_url_file_map.  (dl_file_url_map
 797          only points to URL2.)  When another URL gets loaded to FILE,
 798          we want both URL1 and URL2 dissociated from it.
 799
 800          This is a relatively expensive operation because it performs
 801          a linear search of the whole hash table, but it should be
 802          called very rarely, only when two URLs resolve to the same
 803          file name, *and* the "<file>.1" extensions are turned off.
 804          In other words, almost never.  */
 805       dissociate_urls_from_file (file);
 806     }
 807
 808   hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
 809
 810  url_only:
 811   /* A URL->FILE mapping is not possible without a FILE->URL mapping.
 812      If the latter were present, it should have been removed by the
 813      above `if'.  So we could write:
 814
 815          assert (!hash_table_contains (dl_url_file_map, url));
 816
 817      The above is correct when running in recursive mode where the
 818      same URL always resolves to the same file.  But if you do
 819      something like:
 820
 821          wget URL URL
 822
 823      then the first URL will resolve to "FILE", and the other to
 824      "FILE.1".  In that case, FILE.1 will not be found in
 825      dl_file_url_map, but URL will still point to FILE in
 826      dl_url_file_map.  */
 827   if (hash_table_get_pair (dl_url_file_map, url, &old_url, &old_file))
 828     {
 829       hash_table_remove (dl_url_file_map, url);
 830       xfree (old_url);
 831       xfree (old_file);
 832     }
 833
 834   hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
 835 }
 836
 837 /* Register that FROM has been redirected to TO.  This assumes that TO
 838    is successfully downloaded and already registered using
 839    register_download() above.  */
 840
 841 void
 842 register_redirection (const char *from, const char *to)
 843 {
 844   char *file;
 845
 846   ENSURE_TABLES_EXIST;
 847
 848   file = hash_table_get (dl_url_file_map, to);
 849   assert (file != NULL);
 850   if (!hash_table_contains (dl_url_file_map, from))
 851     hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
 852 }
 853
 854 /* Register that the file has been deleted. */
 855
 856 void
 857 register_delete_file (const char *file)
 858 {
 859   char *old_url, *old_file;
 860
 861   ENSURE_TABLES_EXIST;
 862
 863   if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
 864     return;
 865
 866   hash_table_remove (dl_file_url_map, file);
 867   xfree (old_file);
 868   xfree (old_url);
 869   dissociate_urls_from_file (file);
 870 }
 871
 872 /* Register that FILE is an HTML file that has been downloaded. */
 873
 874 void
 875 register_html (const char *url, const char *file)
 876 {
 877   if (!downloaded_html_set)
 878     downloaded_html_set = make_string_hash_table (0);
 879   string_set_add (downloaded_html_set, file);
 880 }
 881
 882 /* Register that FILE is a CSS file that has been downloaded. */
 883
 884 void
 885 register_css (const char *url, const char *file)
 886 {
 887   if (!downloaded_css_set)
 888     downloaded_css_set = make_string_hash_table (0);
 889   string_set_add (downloaded_css_set, file);
 890 }
 891
 892 static void downloaded_files_free (void);
 893
 894 /* Cleanup the data structures associated with this file.  */
 895
 896 void
 897 convert_cleanup (void)
 898 {
 899   if (dl_file_url_map)
 900     {
 901       free_keys_and_values (dl_file_url_map);
 902       hash_table_destroy (dl_file_url_map);
 903       dl_file_url_map = NULL;
 904     }
 905   if (dl_url_file_map)
 906     {
 907       free_keys_and_values (dl_url_file_map);
 908       hash_table_destroy (dl_url_file_map);
 909       dl_url_file_map = NULL;
 910     }
 911   if (downloaded_html_set)
 912     string_set_free (downloaded_html_set);
 913   downloaded_files_free ();
 914   if (converted_files)
 915     string_set_free (converted_files);
 916 }
 917 \f
 918 /* Book-keeping code for downloaded files that enables extension
 919    hacks.  */
 920
 921 /* This table should really be merged with dl_file_url_map and
 922    downloaded_html_files.  This was originally a list, but I changed
 923    it to a hash table because it was actually taking a lot of time to
 924    find things in it.  */
 925
/* Maps a local file name to its recorded download mode.  Keys are
   heap copies of the file name; values are pointers obtained from
   downloaded_mode_to_ptr().  The table is created on first insertion
   by downloaded_file() and reset to NULL by downloaded_files_free().  */
static struct hash_table *downloaded_files_hash;
 927
 928 /* We're storing "modes" of type downloaded_file_t in the hash table.
 929    However, our hash tables only accept pointers for keys and values.
 930    So when we need a pointer, we use the address of a
 931    downloaded_file_t variable of static storage.  */
 932
 933 static downloaded_file_t *
 934 downloaded_mode_to_ptr (downloaded_file_t mode)
 935 {
 936   static downloaded_file_t
 937     v1 = FILE_NOT_ALREADY_DOWNLOADED,
 938     v2 = FILE_DOWNLOADED_NORMALLY,
 939     v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
 940     v4 = CHECK_FOR_FILE;
 941
 942   switch (mode)
 943     {
 944     case FILE_NOT_ALREADY_DOWNLOADED:
 945       return &v1;
 946     case FILE_DOWNLOADED_NORMALLY:
 947       return &v2;
 948     case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
 949       return &v3;
 950     case CHECK_FOR_FILE:
 951       return &v4;
 952     }
 953   return NULL;
 954 }
 955
 956 /* Remembers which files have been downloaded.  In the standard case,
 957    should be called with mode == FILE_DOWNLOADED_NORMALLY for each
 958    file we actually download successfully (i.e. not for ones we have
 959    failures on or that we skip due to -N).
 960
 961    When we've downloaded a file and tacked on a ".html" extension due
 962    to -E, call this function with
 963    FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
 964    FILE_DOWNLOADED_NORMALLY.
 965
 966    If you just want to check if a file has been previously added
 967    without adding it, call with mode == CHECK_FOR_FILE.  Please be
 968    sure to call this function with local filenames, not remote
 969    URLs.  */
 970
 971 downloaded_file_t
 972 downloaded_file (downloaded_file_t mode, const char *file)
 973 {
 974   downloaded_file_t *ptr;
 975
 976   if (mode == CHECK_FOR_FILE)
 977     {
 978       if (!downloaded_files_hash)
 979         return FILE_NOT_ALREADY_DOWNLOADED;
 980       ptr = hash_table_get (downloaded_files_hash, file);
 981       if (!ptr)
 982         return FILE_NOT_ALREADY_DOWNLOADED;
 983       return *ptr;
 984     }
 985
 986   if (!downloaded_files_hash)
 987     downloaded_files_hash = make_string_hash_table (0);
 988
 989   ptr = hash_table_get (downloaded_files_hash, file);
 990   if (ptr)
 991     return *ptr;
 992
 993   ptr = downloaded_mode_to_ptr (mode);
 994   hash_table_put (downloaded_files_hash, xstrdup (file), ptr);
 995
 996   return FILE_NOT_ALREADY_DOWNLOADED;
 997 }
 998
 999 static void
1000 downloaded_files_free (void)
1001 {
1002   if (downloaded_files_hash)
1003     {
1004       hash_table_iterator iter;
1005       for (hash_table_iterate (downloaded_files_hash, &iter);
1006            hash_table_iter_next (&iter);
1007            )
1008         xfree (iter.key);
1009       hash_table_destroy (downloaded_files_hash);
1010       downloaded_files_hash = NULL;
1011     }
1012 }
1013 \f
/* Return the entity that replaces character C in quoted HTML output,
   or NULL if C is copied through unchanged.  Both passes of
   html_quote_string use this, so the computed size and the emitted
   text cannot disagree.  */
static const char *
html_quote_entity (char c)
{
  switch (c)
    {
    case '&':
      return "&amp;";
    case '<':
      return "&lt;";
    case '>':
      return "&gt;";
    case '\"':
      return "&quot;";
    case ' ':
      return "&#32;";
    default:
      return NULL;
    }
}

/* The function returns the pointer to the malloc-ed quoted version of
   string s.  It will recognize and quote numeric and special graphic
   entities, as per RFC1866:

   `&' -> `&amp;'
   `<' -> `&lt;'
   `>' -> `&gt;'
   `"' -> `&quot;'
   SP  -> `&#32;'

   No other entities are recognized or replaced.

   S must be a NUL-terminated string.  The result is allocated with
   xmalloc and is owned by the caller.  The output length is tracked
   in a size_t so that very long inputs cannot overflow a signed
   counter and produce an undersized buffer.  */
char *
html_quote_string (const char *s)
{
  const char *in;
  const char *entity;
  char *out, *res;
  size_t len = 0;

  /* Pass through the string, and count the new size.  */
  for (in = s; *in; in++)
    {
      entity = html_quote_entity (*in);
      len += entity ? strlen (entity) : 1;
    }

  res = xmalloc (len + 1);

  /* Second pass: emit either the entity or the character itself.  */
  out = res;
  for (in = s; *in; in++)
    {
      entity = html_quote_entity (*in);
      if (entity)
        {
          size_t n = strlen (entity);
          memcpy (out, entity, n);
          out += n;
        }
      else
        *out++ = *in;
    }
  *out = '\0';
  return res;
}
1085
1086 /*
1087  * vim: et ts=2 sw=2
1088  */
1089