sjero.net Git - wget/blob - src/convert.c

   1 /* Conversion of links to local files.
   2    Copyright (C) 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9  (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #ifdef HAVE_STRING_H
  35 # include <string.h>
  36 #else
  37 # include <strings.h>
  38 #endif /* HAVE_STRING_H */
  39 #ifdef HAVE_UNISTD_H
  40 # include <unistd.h>
  41 #endif /* HAVE_UNISTD_H */
  42 #include <errno.h>
  43 #include <assert.h>
  44 #include <sys/types.h>
  45
  46 #include "wget.h"
  47 #include "convert.h"
  48 #include "url.h"
  49 #include "recur.h"
  50 #include "utils.h"
  51 #include "hash.h"
  52
  53 static struct hash_table *dl_file_url_map;   /* local file name -> URL it was downloaded from */
  54 struct hash_table *dl_url_file_map;          /* URL -> local file name; several URLs may map to one file */
  55
  56 /* List of HTML files downloaded in this Wget run, used for link
  57    conversion after Wget is done.  The list and the set contain the
  58    same file names, except that the list also maintains the order.
  59    Perhaps the list should go away; it's there for historical reasons.  */
  60 static slist *downloaded_html_list;
  61 struct hash_table *downloaded_html_set;
  62
  63 static void convert_links PARAMS ((const char *, struct urlpos *));
  64
  65 /* This function is called when the retrieval is done to convert the
  66    links that have been downloaded.  It has to be called at the end of
  67    the retrieval, because only then does Wget know conclusively which
  68    URLs have been downloaded, and which not, so it can tell which
  69    direction to convert to.
  70
  71    The "direction" means that the URLs to the files that have been
  72    downloaded get converted to the relative URL which will point to
  73    that file.  And the other URLs get converted to the remote URL on
  74    the server.
  75
  76    All the downloaded HTMLs are kept in downloaded_html_files, and
  77    downloaded URLs in urls_downloaded.  All the information is
  78    extracted from these two lists.  */
  79
  80 void
  81 convert_all_links (void)
  82 {
  83   slist *html;
  84   long msecs;
  85   int file_count = 0;
  86
  87   struct wget_timer *timer = wtimer_new ();
  88
  89   /* Destructively reverse downloaded_html_files to get it in the right order.
  90      recursive_retrieve() used slist_prepend() consistently.  */
  91   downloaded_html_list = slist_nreverse (downloaded_html_list);
  92
  93   for (html = downloaded_html_list; html; html = html->next)
  94     {
  95       struct urlpos *urls, *cur_url;
  96       char *url;
  97       char *file = html->string;
  98
  99       /* Determine the URL of the HTML file.  get_urls_html will need
 100          it.  */
 101       url = hash_table_get (dl_file_url_map, file);
 102       if (!url)
 103         {
 104           DEBUGP (("Apparently %s has been removed.\n", file));
 105           continue;
 106         }
 107
 108       DEBUGP (("Scanning %s (from %s)\n", file, url));
 109
 110       /* Parse the HTML file...  */
 111       urls = get_urls_html (file, url, NULL);
 112
 113       /* We don't respect meta_disallow_follow here because, even if
 114          the file is not followed, we might still want to convert the
 115          links that have been followed from other files.  */
 116
 117       for (cur_url = urls; cur_url; cur_url = cur_url->next)
 118         {
 119           char *local_name;
 120           struct url *u = cur_url->url;
 121
 122           if (cur_url->link_base_p)
 123             {
 124               /* Base references have been resolved by our parser, so
 125                  we turn the base URL into an empty string.  (Perhaps
 126                  we should remove the tag entirely?)  */
 127               cur_url->convert = CO_NULLIFY_BASE;
 128               continue;
 129             }
 130
 131           /* We decide the direction of conversion according to whether
 132              a URL was downloaded.  Downloaded URLs will be converted
 133              ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
 134           local_name = hash_table_get (dl_url_file_map, u->url);
 135
 136           /* Decide on the conversion type.  */
 137           if (local_name)
 138             {
 139               /* We've downloaded this URL.  Convert it to relative
 140                  form.  We do this even if the URL already is in
 141                  relative form, because our directory structure may
 142                  not be identical to that on the server (think `-nd',
 143                  `--cut-dirs', etc.)  */
 144               cur_url->convert = CO_CONVERT_TO_RELATIVE;
 145               cur_url->local_name = xstrdup (local_name);
 146               DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
 147             }
 148           else
 149             {
 150               /* We haven't downloaded this URL.  If it's not already
 151                  complete (including a full host name), convert it to
 152                  that form, so it can be reached while browsing this
 153                  HTML locally.  */
 154               if (!cur_url->link_complete_p)
 155                 cur_url->convert = CO_CONVERT_TO_COMPLETE;
 156               cur_url->local_name = NULL;
 157               DEBUGP (("will convert url %s to complete\n", u->url));
 158             }
 159         }
 160
 161       /* Convert the links in the file.  */
 162       convert_links (file, urls);
 163       ++file_count;
 164
 165       /* Free the data.  */
 166       free_urlpos (urls);
 167     }
 168
 169   msecs = wtimer_elapsed (timer);
 170   wtimer_delete (timer);
 171   logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
 172              file_count, (double)msecs / 1000);
 173 }
 174
 175 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
 176 static const char *replace_attr PARAMS ((const char *, int, FILE *,
 177                                          const char *));
 178 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
 179                                                       const char *, int));
 180 static char *local_quote_string PARAMS ((const char *));
 181 static char *construct_relative PARAMS ((const char *, const char *));
 182
 183 /* Change the links in one HTML file.  LINKS is a list of links in the
 184    document, along with their positions and the desired direction of
 185    the conversion.  */
 186 static void
 187 convert_links (const char *file, struct urlpos *links)
 188 {
 189   struct file_memory *fm;
 190   FILE *fp;
 191   const char *p;
 192   downloaded_file_t downloaded_file_return;
 193
 194   struct urlpos *link;
 195   int to_url_count = 0, to_file_count = 0;
 196
 197   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
 198
 199   {
 200     /* First we do a "dry run": go through the list L and see whether
 201        any URL needs to be converted in the first place.  If not, just
 202        leave the file alone.  */
 203     int dry_count = 0;
 204     struct urlpos *dry = links;
 205     for (dry = links; dry; dry = dry->next)
 206       if (dry->convert != CO_NOCONVERT)
 207         ++dry_count;
 208     if (!dry_count)
 209       {
 210         logputs (LOG_VERBOSE, _("nothing to do.\n"));
 211         return;
 212       }
 213   }
 214
 215   fm = read_file (file);
 216   if (!fm)
 217     {
 218       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
 219                  file, strerror (errno));
 220       return;
 221     }
 222
 223   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
 224   if (opt.backup_converted && downloaded_file_return)
 225     write_backup_file (file, downloaded_file_return);
 226
 227   /* Before opening the file for writing, unlink the file.  This is
 228      important if the data in FM is mmaped.  In such case, nulling the
 229      file, which is what fopen() below does, would make us read all
 230      zeroes from the mmaped region.  */
 231   if (unlink (file) < 0 && errno != ENOENT)
 232     {
 233       logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
 234                  file, strerror (errno));
 235       read_file_free (fm);
 236       return;
 237     }
 238   /* Now open the file for writing.  */
 239   fp = fopen (file, "wb");
 240   if (!fp)
 241     {
 242       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
 243                  file, strerror (errno));
 244       read_file_free (fm);
 245       return;
 246     }
 247
 248   /* Here we loop through all the URLs in file, replacing those of
 249      them that are downloaded with relative references.  */
 250   p = fm->content;
 251   for (link = links; link; link = link->next)
 252     {
 253       char *url_start = fm->content + link->pos;
 254
 255       if (link->pos >= fm->length)
 256         {
 257           DEBUGP (("Something strange is going on.  Please investigate."));
 258           break;
 259         }
 260       /* If the URL is not to be converted, skip it.  */
 261       if (link->convert == CO_NOCONVERT)
 262         {
 263           DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
 264           continue;
 265         }
 266
 267       /* Echo the file contents, up to the offending URL's opening
 268          quote, to the outfile.  */
 269       fwrite (p, 1, url_start - p, fp);
 270       p = url_start;
 271
 272       switch (link->convert)
 273         {
 274         case CO_CONVERT_TO_RELATIVE:
 275           /* Convert absolute URL to relative. */
 276           {
 277             char *newname = construct_relative (file, link->local_name);
 278             char *quoted_newname = local_quote_string (newname);
 279
 280             if (!link->link_refresh_p)
 281               p = replace_attr (p, link->size, fp, quoted_newname);
 282             else
 283               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
 284                                              link->refresh_timeout);
 285
 286             DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
 287                      link->url->url, newname, link->pos, file));
 288             xfree (newname);
 289             xfree (quoted_newname);
 290             ++to_file_count;
 291             break;
 292           }
 293         case CO_CONVERT_TO_COMPLETE:
 294           /* Convert the link to absolute URL. */
 295           {
 296             char *newlink = link->url->url;
 297             char *quoted_newlink = html_quote_string (newlink);
 298
 299             if (!link->link_refresh_p)
 300               p = replace_attr (p, link->size, fp, quoted_newlink);
 301             else
 302               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
 303                                              link->refresh_timeout);
 304
 305             DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
 306                      newlink, link->pos, file));
 307             xfree (quoted_newlink);
 308             ++to_url_count;
 309             break;
 310           }
 311         case CO_NULLIFY_BASE:
 312           /* Change the base href to "". */
 313           p = replace_attr (p, link->size, fp, "");
 314           break;
 315         case CO_NOCONVERT:
 316           abort ();
 317           break;
 318         }
 319     }
 320
 321   /* Output the rest of the file. */
 322   if (p - fm->content < fm->length)
 323     fwrite (p, 1, fm->length - (p - fm->content), fp);
 324   fclose (fp);
 325   read_file_free (fm);
 326
 327   logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
 328 }
 329
 330 /* Construct and return a link that points from BASEFILE to LINKFILE.
 331    Both files should be local file names, BASEFILE of the referrering
 332    file, and LINKFILE of the referred file.
 333
 334    Examples:
 335
 336    cr("foo", "bar")         -> "bar"
 337    cr("A/foo", "A/bar")     -> "bar"
 338    cr("A/foo", "A/B/bar")   -> "B/bar"
 339    cr("A/X/foo", "A/Y/bar") -> "../Y/bar"
 340    cr("X/", "Y/bar")        -> "../Y/bar" (trailing slash does matter in BASE)
 341
 342    Both files should be absolute or relative, otherwise strange
 343    results might ensue.  The function makes no special efforts to
 344    handle "." and ".." in links, so make sure they're not there
 345    (e.g. using path_simplify).  */
 346
 347 static char *
 348 construct_relative (const char *basefile, const char *linkfile)
 349 {
 350   char *link;
 351   int basedirs;
 352   const char *b, *l;
 353   int i, start;
 354
 355   /* First, skip the initial directory components common to both
 356      files.  */
 357   start = 0;
 358   for (b = basefile, l = linkfile; *b == *l && *b != '\0'; ++b, ++l)
 359     {
 360       if (*b == '/')
 361         start = (b - basefile) + 1;
 362     }
 363   basefile += start;
 364   linkfile += start;
 365
 366   /* With common directories out of the way, the situation we have is
 367      as follows:
 368          b - b1/b2/[...]/bfile
 369          l - l1/l2/[...]/lfile
 370
 371      The link we're constructing needs to be:
 372        lnk - ../../l1/l2/[...]/lfile
 373
 374      Where the number of ".."'s equals the number of bN directory
 375      components in B.  */
 376
 377   /* Count the directory components in B. */
 378   basedirs = 0;
 379   for (b = basefile; *b; b++)
 380     {
 381       if (*b == '/')
 382         ++basedirs;
 383     }
 384
 385   /* Construct LINK as explained above. */
 386   link = (char *)xmalloc (3 * basedirs + strlen (linkfile) + 1);
 387   for (i = 0; i < basedirs; i++)
 388     memcpy (link + 3 * i, "../", 3);
 389   strcpy (link + 3 * i, linkfile);
 390   return link;
 391 }
 392
 393 static void
 394 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
 395 {
 396   /* Rather than just writing over the original .html file with the
 397      converted version, save the former to *.orig.  Note we only do
 398      this for files we've _successfully_ downloaded, so we don't
 399      clobber .orig files sitting around from previous invocations. */
 400
 401   /* Construct the backup filename as the original name plus ".orig". */
 402   size_t         filename_len = strlen(file);
 403   char*          filename_plus_orig_suffix;
 404   boolean        already_wrote_backup_file = FALSE;
 405   slist*         converted_file_ptr;
 406   static slist*  converted_files = NULL;
 407
 408   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
 409     {
 410       /* Just write "orig" over "html".  We need to do it this way
 411          because when we're checking to see if we've downloaded the
 412          file before (to see if we can skip downloading it), we don't
 413          know if it's a text/html file.  Therefore we don't know yet
 414          at that stage that -E is going to cause us to tack on
 415          ".html", so we need to compare vs. the original URL plus
 416          ".orig", not the original URL plus ".html.orig". */
 417       filename_plus_orig_suffix = alloca (filename_len + 1);
 418       strcpy(filename_plus_orig_suffix, file);
 419       strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
 420     }
 421   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
 422     {
 423       /* Append ".orig" to the name. */
 424       filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
 425       strcpy(filename_plus_orig_suffix, file);
 426       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
 427     }
 428
 429   /* We can get called twice on the same URL thanks to the
 430      convert_all_links() call in main().  If we write the .orig file
 431      each time in such a case, it'll end up containing the first-pass
 432      conversion, not the original file.  So, see if we've already been
 433      called on this file. */
 434   converted_file_ptr = converted_files;
 435   while (converted_file_ptr != NULL)
 436     if (strcmp(converted_file_ptr->string, file) == 0)
 437       {
 438         already_wrote_backup_file = TRUE;
 439         break;
 440       }
 441     else
 442       converted_file_ptr = converted_file_ptr->next;
 443
 444   if (!already_wrote_backup_file)
 445     {
 446       /* Rename <file> to <file>.orig before former gets written over. */
 447       if (rename(file, filename_plus_orig_suffix) != 0)
 448         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
 449                    file, filename_plus_orig_suffix, strerror (errno));
 450
 451       /* Remember that we've already written a .orig backup for this file.
 452          Note that we never free this memory since we need it till the
 453          convert_all_links() call, which is one of the last things the
 454          program does before terminating.  BTW, I'm not sure if it would be
 455          safe to just set 'converted_file_ptr->string' to 'file' below,
 456          rather than making a copy of the string...  Another note is that I
 457          thought I could just add a field to the urlpos structure saying
 458          that we'd written a .orig file for this URL, but that didn't work,
 459          so I had to make this separate list.
 460          -- Dan Harkless <wget@harkless.org>
 461
 462          This [adding a field to the urlpos structure] didn't work
 463          because convert_file() is called from convert_all_links at
 464          the end of the retrieval with a freshly built new urlpos
 465          list.
 466          -- Hrvoje Niksic <hniksic@xemacs.org>
 467       */
 468       converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
 469       converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
 470       converted_file_ptr->next = converted_files;
 471       converted_files = converted_file_ptr;
 472     }
 473 }
 474
 475 static int find_fragment PARAMS ((const char *, int, const char **,
 476                                   const char **));
 477
 478 /* Replace an attribute's original text with NEW_TEXT. */
 479
 480 static const char *
 481 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
 482 {
 483   int quote_flag = 0;
 484   char quote_char = '\"';       /* use "..." for quoting, unless the
 485                                    original value is quoted, in which
 486                                    case reuse its quoting char. */
 487   const char *frag_beg, *frag_end;
 488
 489   /* Structure of our string is:
 490        "...old-contents..."
 491        <---    size    --->  (with quotes)
 492      OR:
 493        ...old-contents...
 494        <---    size   -->    (no quotes)   */
 495
 496   if (*p == '\"' || *p == '\'')
 497     {
 498       quote_char = *p;
 499       quote_flag = 1;
 500       ++p;
 501       size -= 2;                /* disregard opening and closing quote */
 502     }
 503   putc (quote_char, fp);
 504   fputs (new_text, fp);
 505
 506   /* Look for fragment identifier, if any. */
 507   if (find_fragment (p, size, &frag_beg, &frag_end))
 508     fwrite (frag_beg, 1, frag_end - frag_beg, fp);
 509   p += size;
 510   if (quote_flag)
 511     ++p;
 512   putc (quote_char, fp);
 513
 514   return p;
 515 }
 516
 517 /* The same as REPLACE_ATTR, but used when replacing
 518    <meta http-equiv=refresh content="new_text"> because we need to
 519    append "timeout_value; URL=" before the next_text.  */
 520
 521 static const char *
 522 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
 523                            const char *new_text, int timeout)
 524 {
 525   /* "0; URL=..." */
 526   char *new_with_timeout = (char *)alloca (numdigit (timeout)
 527                                            + 6 /* "; URL=" */
 528                                            + strlen (new_text)
 529                                            + 1);
 530   sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
 531
 532   return replace_attr (p, size, fp, new_with_timeout);
 533 }
 534
 535 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
 536    preceded by '&'.  If the character is not found, return zero.  If
 537    the character is found, return 1 and set BP and EP to point to the
 538    beginning and end of the region.
 539
 540    This is used for finding the fragment indentifiers in URLs.  */
 541
 542 static int
 543 find_fragment (const char *beg, int size, const char **bp, const char **ep)
 544 {
 545   const char *end = beg + size;
 546   int saw_amp = 0;
 547   for (; beg < end; beg++)
 548     {
 549       switch (*beg)
 550         {
 551         case '&':
 552           saw_amp = 1;
 553           break;
 554         case '#':
 555           if (!saw_amp)
 556             {
 557               *bp = beg;
 558               *ep = end;
 559               return 1;
 560             }
 561           /* fallthrough */
 562         default:
 563           saw_amp = 0;
 564         }
 565     }
 566   return 0;
 567 }
 568
 569 /* Quote FILE for use as local reference to an HTML file.
 570
 571    We quote ? as %3F to avoid passing part of the file name as the
 572    parameter when browsing the converted file through HTTP.  However,
 573    it is safe to do this only when `--html-extension' is turned on.
 574    This is because converting "index.html?foo=bar" to
 575    "index.html%3Ffoo=bar" would break local browsing, as the latter
 576    isn't even recognized as an HTML file!  However, converting
 577    "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
 578    safe for both local and HTTP-served browsing.  */
 579
 580 static char *
 581 local_quote_string (const char *file)
 582 {
 583   const char *file_sans_qmark;
 584   int qm;
 585
 586   if (!opt.html_extension)
 587     return html_quote_string (file);
 588
 589   qm = count_char (file, '?');
 590
 591   if (qm)
 592     {
 593       const char *from = file;
 594       char *to, *newname;
 595
 596       /* qm * 2 because we replace each question mark with "%3F",
 597          i.e. replace one char with three, hence two more.  */
 598       int fsqlen = strlen (file) + qm * 2;
 599
 600       to = newname = (char *)alloca (fsqlen + 1);
 601       for (; *from; from++)
 602         {
 603           if (*from != '?')
 604             *to++ = *from;
 605           else
 606             {
 607               *to++ = '%';
 608               *to++ = '3';
 609               *to++ = 'F';
 610             }
 611         }
 612       assert (to - newname == fsqlen);
 613       *to = '\0';
 614
 615       file_sans_qmark = newname;
 616     }
 617   else
 618     file_sans_qmark = file;
 619
 620   return html_quote_string (file_sans_qmark);
 621 }
 622 \f
 623 /* Book-keeping code for dl_file_url_map, dl_url_file_map,
 624    downloaded_html_list, and downloaded_html_set.  Other code calls
 625    these functions to let us know that a file has been downloaded.  */
 626
     /* Create dl_file_url_map and dl_url_file_map if they do not exist
        yet.  Each table is created only while its pointer is still
        NULL, so the macro may be invoked any number of times.  */
 627 #define ENSURE_TABLES_EXIST do {                        \
 628   if (!dl_file_url_map)                                 \
 629     dl_file_url_map = make_string_hash_table (0);       \
 630   if (!dl_url_file_map)                                 \
 631     dl_url_file_map = make_string_hash_table (0);       \
 632 } while (0)
 633
 634 /* Return 1 if S1 and S2 are the same, except for "/index.html".  The
 635    three cases in which it returns one are (substitute any substring
 636    for "foo"):
 637
 638    m("foo/index.html", "foo/")  ==> 1
 639    m("foo/", "foo/index.html")  ==> 1
 640    m("foo", "foo/index.html")   ==> 1
 641    m("foo", "foo/"              ==> 1
 642    m("foo", "foo")              ==> 1  */
 643
 644 static int
 645 match_except_index (const char *s1, const char *s2)
 646 {
 647   int i;
 648   const char *lng;
 649
 650   /* Skip common substring. */
 651   for (i = 0; *s1 && *s2 && *s1 == *s2; s1++, s2++, i++)
 652     ;
 653   if (i == 0)
 654     /* Strings differ at the very beginning -- bail out.  We need to
 655        check this explicitly to avoid `lng - 1' reading outside the
 656        array.  */
 657     return 0;
 658
 659   if (!*s1 && !*s2)
 660     /* Both strings hit EOF -- strings are equal. */
 661     return 1;
 662   else if (*s1 && *s2)
 663     /* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux". */
 664     return 0;
 665   else if (*s1)
 666     /* S1 is the longer one. */
 667     lng = s1;
 668   else
 669     /* S2 is the longer one. */
 670     lng = s2;
 671
 672   /* foo            */            /* foo/           */
 673   /* foo/index.html */  /* or */  /* foo/index.html */
 674   /*    ^           */            /*     ^          */
 675
 676   if (*lng != '/')
 677     /* The right-hand case. */
 678     --lng;
 679
 680   if (*lng == '/' && *(lng + 1) == '\0')
 681     /* foo  */
 682     /* foo/ */
 683     return 1;
 684
 685   return 0 == strcmp (lng, "/index.html");
 686 }
 687
 688 static int
 689 dissociate_urls_from_file_mapper (void *key, void *value, void *arg)
 690 {
 691   char *mapping_url = (char *)key;
 692   char *mapping_file = (char *)value;
 693   char *file = (char *)arg;
 694
 695   if (0 == strcmp (mapping_file, file))
 696     {
 697       hash_table_remove (dl_url_file_map, mapping_url);
 698       xfree (mapping_url);
 699       xfree (mapping_file);
 700     }
 701
 702   /* Continue mapping. */
 703   return 0;
 704 }
 705
 706 /* Remove all associations from various URLs to FILE from dl_url_file_map. */
 707
 708 static void
 709 dissociate_urls_from_file (const char *file)
 710 {
 711   hash_table_map (dl_url_file_map, dissociate_urls_from_file_mapper,
 712                   (char *)file);
 713 }
 714
 715 /* Register that URL has been successfully downloaded to FILE.  This
 716    is used by the link conversion code to convert references to URLs
 717    to references to local files.  It is also being used to check if a
 718    URL has already been downloaded.  */
 719
 720 void
 721 register_download (const char *url, const char *file)
 722 {
 723   char *old_file, *old_url;
 724
 725   ENSURE_TABLES_EXIST;
 726
 727   /* With some forms of retrieval, it is possible, although not likely
 728      or particularly desirable.  If both are downloaded, the second
 729      download will override the first one.  When that happens,
 730      dissociate the old file name from the URL.  */
 731
 732   if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
 733     {
 734       if (0 == strcmp (url, old_url))
 735         /* We have somehow managed to download the same URL twice.
 736            Nothing to do.  */
 737         return;
 738
 739       if (match_except_index (url, old_url)
 740           && !hash_table_contains (dl_url_file_map, url))
 741         /* The two URLs differ only in the "index.html" ending.  For
 742            example, one is "http://www.server.com/", and the other is
 743            "http://www.server.com/index.html".  Don't remove the old
 744            one, just add the new one as a non-canonical entry.  */
 745         goto url_only;
 746
 747       hash_table_remove (dl_file_url_map, file);
 748       xfree (old_file);
 749       xfree (old_url);
 750
 751       /* Remove all the URLs that point to this file.  Yes, there can
 752          be more than one such URL, because we store redirections as
 753          multiple entries in dl_url_file_map.  For example, if URL1
 754          redirects to URL2 which gets downloaded to FILE, we map both
 755          URL1 and URL2 to FILE in dl_url_file_map.  (dl_file_url_map
 756          only points to URL2.)  When another URL gets loaded to FILE,
 757          we want both URL1 and URL2 dissociated from it.
 758
 759          This is a relatively expensive operation because it performs
 760          a linear search of the whole hash table, but it should be
 761          called very rarely, only when two URLs resolve to the same
 762          file name, *and* the "<file>.1" extensions are turned off.
 763          In other words, almost never.  */
 764       dissociate_urls_from_file (file);
 765     }
 766
 767   hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
 768
 769  url_only:
 770   /* A URL->FILE mapping is not possible without a FILE->URL mapping.
 771      If the latter were present, it should have been removed by the
 772      above `if'.  So we could write:
 773
 774          assert (!hash_table_contains (dl_url_file_map, url));
 775
 776      The above is correct when running in recursive mode where the
 777      same URL always resolves to the same file.  But if you do
 778      something like:
 779
 780          wget URL URL
 781
 782      then the first URL will resolve to "FILE", and the other to
 783      "FILE.1".  In that case, FILE.1 will not be found in
 784      dl_file_url_map, but URL will still point to FILE in
 785      dl_url_file_map.  */
 786   if (hash_table_get_pair (dl_url_file_map, url, &old_url, &old_file))
 787     {
 788       hash_table_remove (dl_url_file_map, url);
 789       xfree (old_url);
 790       xfree (old_file);
 791     }
 792
 793   hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
 794 }
 795
 796 /* Register that FROM has been redirected to TO.  This assumes that TO
 797    is successfully downloaded and already registered using
 798    register_download() above.  */
 799
 800 void
 801 register_redirection (const char *from, const char *to)
 802 {
 803   char *file;
 804
 805   ENSURE_TABLES_EXIST;
 806
 807   file = hash_table_get (dl_url_file_map, to);
 808   assert (file != NULL);
 809   if (!hash_table_contains (dl_url_file_map, from))
 810     hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
 811 }
 812
 813 /* Register that the file has been deleted. */
 814
 815 void
 816 register_delete_file (const char *file)
 817 {
 818   char *old_url, *old_file;
 819
 820   ENSURE_TABLES_EXIST;
 821
 822   if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
 823     return;
 824
 825   hash_table_remove (dl_file_url_map, file);
 826   xfree (old_file);
 827   xfree (old_url);
 828   dissociate_urls_from_file (file);
 829 }
 830
 831 /* Register that FILE is an HTML file that has been downloaded. */
 832
 833 void
 834 register_html (const char *url, const char *file)
 835 {
 836   if (!downloaded_html_set)
 837     downloaded_html_set = make_string_hash_table (0);
 838   else if (hash_table_contains (downloaded_html_set, file))
 839     return;
 840
 841   /* The set and the list should use the same copy of FILE, but the
 842      slist interface insists on strduping the string it gets.  Oh
 843      well. */
 844   string_set_add (downloaded_html_set, file);
 845   downloaded_html_list = slist_prepend (downloaded_html_list, file);
 846 }
 847
 848 /* Cleanup the data structures associated with recursive retrieving
 849    (the variables above).  */
 850 void
 851 convert_cleanup (void)
 852 {
 853   if (dl_file_url_map)
 854     {
 855       free_keys_and_values (dl_file_url_map);
 856       hash_table_destroy (dl_file_url_map);
 857       dl_file_url_map = NULL;
 858     }
 859   if (dl_url_file_map)
 860     {
 861       free_keys_and_values (dl_url_file_map);
 862       hash_table_destroy (dl_url_file_map);
 863       dl_url_file_map = NULL;
 864     }
 865   if (downloaded_html_set)
 866     string_set_free (downloaded_html_set);
 867   slist_free (downloaded_html_list);
 868   downloaded_html_list = NULL;
 869 }
 870 \f
 871 /* Book-keeping code for downloaded files that enables extension
 872    hacks.  */
 873
 874 /* This table should really be merged with dl_file_url_map and
 875    downloaded_html_files.  This was originally a list, but I changed
 876    it to a hash table because it was actually taking a lot of time to
 877    find things in it.  */
 878
 879 static struct hash_table *downloaded_files_hash;
 880
 881 /* We're storing "modes" of type downloaded_file_t in the hash table.
 882    However, our hash tables only accept pointers for keys and values.
 883    So when we need a pointer, we use the address of a
 884    downloaded_file_t variable of static storage.  */
 885
 886 static downloaded_file_t *
 887 downloaded_mode_to_ptr (downloaded_file_t mode)
 888 {
 889   static downloaded_file_t
 890     v1 = FILE_NOT_ALREADY_DOWNLOADED,
 891     v2 = FILE_DOWNLOADED_NORMALLY,
 892     v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
 893     v4 = CHECK_FOR_FILE;
 894
 895   switch (mode)
 896     {
 897     case FILE_NOT_ALREADY_DOWNLOADED:
 898       return &v1;
 899     case FILE_DOWNLOADED_NORMALLY:
 900       return &v2;
 901     case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
 902       return &v3;
 903     case CHECK_FOR_FILE:
 904       return &v4;
 905     }
 906   return NULL;
 907 }
 908
 909 /* Remembers which files have been downloaded.  In the standard case,
 910    should be called with mode == FILE_DOWNLOADED_NORMALLY for each
 911    file we actually download successfully (i.e. not for ones we have
 912    failures on or that we skip due to -N).
 913
 914    When we've downloaded a file and tacked on a ".html" extension due
 915    to -E, call this function with
 916    FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
 917    FILE_DOWNLOADED_NORMALLY.
 918
 919    If you just want to check if a file has been previously added
 920    without adding it, call with mode == CHECK_FOR_FILE.  Please be
 921    sure to call this function with local filenames, not remote
 922    URLs.  */
 923
 924 downloaded_file_t
 925 downloaded_file (downloaded_file_t mode, const char *file)
 926 {
 927   downloaded_file_t *ptr;
 928
 929   if (mode == CHECK_FOR_FILE)
 930     {
 931       if (!downloaded_files_hash)
 932         return FILE_NOT_ALREADY_DOWNLOADED;
 933       ptr = hash_table_get (downloaded_files_hash, file);
 934       if (!ptr)
 935         return FILE_NOT_ALREADY_DOWNLOADED;
 936       return *ptr;
 937     }
 938
 939   if (!downloaded_files_hash)
 940     downloaded_files_hash = make_string_hash_table (0);
 941
 942   ptr = hash_table_get (downloaded_files_hash, file);
 943   if (ptr)
 944     return *ptr;
 945
 946   ptr = downloaded_mode_to_ptr (mode);
 947   hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
 948
 949   return FILE_NOT_ALREADY_DOWNLOADED;
 950 }
 951
 952 static int
 953 df_free_mapper (void *key, void *value, void *ignored)
 954 {
 955   xfree (key);
 956   return 0;
 957 }
 958
 959 void
 960 downloaded_files_free (void)
 961 {
 962   if (downloaded_files_hash)
 963     {
 964       hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
 965       hash_table_destroy (downloaded_files_hash);
 966       downloaded_files_hash = NULL;
 967     }
 968 }