sjero.net Git - wget/blob - src/convert.c

   1 /* Conversion of links to local files.
   2    Copyright (C) 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9  (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #ifdef HAVE_STRING_H
  35 # include <string.h>
  36 #else
  37 # include <strings.h>
  38 #endif /* HAVE_STRING_H */
  39 #ifdef HAVE_UNISTD_H
  40 # include <unistd.h>
  41 #endif /* HAVE_UNISTD_H */
  42 #include <errno.h>
  43 #include <assert.h>
  44 #include <sys/types.h>
  45
  46 #include "wget.h"
  47 #include "convert.h"
  48 #include "url.h"
  49 #include "recur.h"
  50 #include "utils.h"
  51 #include "hash.h"
  52
     /* Two maps kept in step by register_download(): local file name ->
        URL it was downloaded from, and URL -> local file name.  Keys and
        values are heap copies made with xstrdup(); convert_cleanup()
        releases them.  */
  53 static struct hash_table *dl_file_url_map;
  54 struct hash_table *dl_url_file_map;
  55
  56 /* List of HTML files downloaded in this Wget run, used for link
  57    conversion after Wget is done.  The list and the set contain the
  58    same information, except the list maintains the order.  Perhaps I
  59    should get rid of the list, it's there for historical reasons.  */
  60 static slist *downloaded_html_list;     /* filled by register_html() via slist_prepend() */
  61 struct hash_table *downloaded_html_set; /* same names as a set; lets register_html() skip duplicates */
  62
     /* Forward declaration: convert_links() is defined after its caller,
        convert_all_links().  */
  63 static void convert_links PARAMS ((const char *, struct urlpos *));
  64
  65 /* This function is called when the retrieval is done to convert the
  66    links that have been downloaded.  It has to be called at the end of
  67    the retrieval, because only then does Wget know conclusively which
  68    URLs have been downloaded, and which not, so it can tell which
  69    direction to convert to.
  70
  71    The "direction" means that the URLs to the files that have been
  72    downloaded get converted to the relative URL which will point to
  73    that file.  And the other URLs get converted to the remote URL on
  74    the server.
  75
  76    All the downloaded HTMLs are kept in downloaded_html_files, and
  77    downloaded URLs in urls_downloaded.  All the information is
  78    extracted from these two lists.  */
  79
  80 void
  81 convert_all_links (void)
  82 {
  83   slist *html;
  84   long msecs;
  85   int file_count = 0;
  86
  87   struct wget_timer *timer = wtimer_new ();
  88
  89   /* Destructively reverse downloaded_html_files to get it in the right order.
  90      recursive_retrieve() used slist_prepend() consistently.  */
  91   downloaded_html_list = slist_nreverse (downloaded_html_list);
  92
  93   for (html = downloaded_html_list; html; html = html->next)
  94     {
  95       struct urlpos *urls, *cur_url;
  96       char *url;
  97       char *file = html->string;
  98
  99       /* Determine the URL of the HTML file.  get_urls_html will need
 100          it.  */
 101       url = hash_table_get (dl_file_url_map, file);
 102       if (!url)
 103         {
 104           DEBUGP (("Apparently %s has been removed.\n", file));
 105           continue;
 106         }
 107
 108       DEBUGP (("Scanning %s (from %s)\n", file, url));
 109
 110       /* Parse the HTML file...  */
 111       urls = get_urls_html (file, url, NULL);
 112
 113       /* We don't respect meta_disallow_follow here because, even if
 114          the file is not followed, we might still want to convert the
 115          links that have been followed from other files.  */
 116
 117       for (cur_url = urls; cur_url; cur_url = cur_url->next)
 118         {
 119           char *local_name;
 120           struct url *u = cur_url->url;
 121
 122           if (cur_url->link_base_p)
 123             {
 124               /* Base references have been resolved by our parser, so
 125                  we turn the base URL into an empty string.  (Perhaps
 126                  we should remove the tag entirely?)  */
 127               cur_url->convert = CO_NULLIFY_BASE;
 128               continue;
 129             }
 130
 131           /* We decide the direction of conversion according to whether
 132              a URL was downloaded.  Downloaded URLs will be converted
 133              ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
 134           local_name = hash_table_get (dl_url_file_map, u->url);
 135
 136           /* Decide on the conversion type.  */
 137           if (local_name)
 138             {
 139               /* We've downloaded this URL.  Convert it to relative
 140                  form.  We do this even if the URL already is in
 141                  relative form, because our directory structure may
 142                  not be identical to that on the server (think `-nd',
 143                  `--cut-dirs', etc.)  */
 144               cur_url->convert = CO_CONVERT_TO_RELATIVE;
 145               cur_url->local_name = xstrdup (local_name);
 146               DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
 147             }
 148           else
 149             {
 150               /* We haven't downloaded this URL.  If it's not already
 151                  complete (including a full host name), convert it to
 152                  that form, so it can be reached while browsing this
 153                  HTML locally.  */
 154               if (!cur_url->link_complete_p)
 155                 cur_url->convert = CO_CONVERT_TO_COMPLETE;
 156               cur_url->local_name = NULL;
 157               DEBUGP (("will convert url %s to complete\n", u->url));
 158             }
 159         }
 160
 161       /* Convert the links in the file.  */
 162       convert_links (file, urls);
 163       ++file_count;
 164
 165       /* Free the data.  */
 166       free_urlpos (urls);
 167     }
 168
 169   msecs = wtimer_elapsed (timer);
 170   wtimer_delete (timer);
 171   logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
 172              file_count, (double)msecs / 1000);
 173 }
 174
     /* Forward declarations of the static helpers that convert_links()
        below calls before their definitions appear.  */
 175 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
 176 static const char *replace_attr PARAMS ((const char *, int, FILE *,
 177                                          const char *));
 178 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
 179                                                       const char *, int));
 180 static char *local_quote_string PARAMS ((const char *));
 181 static char *construct_relative PARAMS ((const char *, const char *));
 182
 183 /* Change the links in one HTML file.  LINKS is a list of links in the
 184    document, along with their positions and the desired direction of
 185    the conversion.  */
 186 static void
 187 convert_links (const char *file, struct urlpos *links)
 188 {
 189   struct file_memory *fm;
 190   FILE *fp;
 191   const char *p;
 192   downloaded_file_t downloaded_file_return;
 193
 194   struct urlpos *link;
 195   int to_url_count = 0, to_file_count = 0;
 196
 197   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
 198
 199   {
 200     /* First we do a "dry run": go through the list L and see whether
 201        any URL needs to be converted in the first place.  If not, just
 202        leave the file alone.  */
 203     int dry_count = 0;
 204     struct urlpos *dry = links;
 205     for (dry = links; dry; dry = dry->next)
 206       if (dry->convert != CO_NOCONVERT)
 207         ++dry_count;
 208     if (!dry_count)
 209       {
 210         logputs (LOG_VERBOSE, _("nothing to do.\n"));
 211         return;
 212       }
 213   }
 214
 215   fm = read_file (file);
 216   if (!fm)
 217     {
 218       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
 219                  file, strerror (errno));
 220       return;
 221     }
 222
 223   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
 224   if (opt.backup_converted && downloaded_file_return)
 225     write_backup_file (file, downloaded_file_return);
 226
 227   /* Before opening the file for writing, unlink the file.  This is
 228      important if the data in FM is mmaped.  In such case, nulling the
 229      file, which is what fopen() below does, would make us read all
 230      zeroes from the mmaped region.  */
 231   if (unlink (file) < 0 && errno != ENOENT)
 232     {
 233       logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
 234                  file, strerror (errno));
 235       read_file_free (fm);
 236       return;
 237     }
 238   /* Now open the file for writing.  */
 239   fp = fopen (file, "wb");
 240   if (!fp)
 241     {
 242       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
 243                  file, strerror (errno));
 244       read_file_free (fm);
 245       return;
 246     }
 247
 248   /* Here we loop through all the URLs in file, replacing those of
 249      them that are downloaded with relative references.  */
 250   p = fm->content;
 251   for (link = links; link; link = link->next)
 252     {
 253       char *url_start = fm->content + link->pos;
 254
 255       if (link->pos >= fm->length)
 256         {
 257           DEBUGP (("Something strange is going on.  Please investigate."));
 258           break;
 259         }
 260       /* If the URL is not to be converted, skip it.  */
 261       if (link->convert == CO_NOCONVERT)
 262         {
 263           DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
 264           continue;
 265         }
 266
 267       /* Echo the file contents, up to the offending URL's opening
 268          quote, to the outfile.  */
 269       fwrite (p, 1, url_start - p, fp);
 270       p = url_start;
 271
 272       switch (link->convert)
 273         {
 274         case CO_CONVERT_TO_RELATIVE:
 275           /* Convert absolute URL to relative. */
 276           {
 277             char *newname = construct_relative (file, link->local_name);
 278             char *quoted_newname = local_quote_string (newname);
 279
 280             if (!link->link_refresh_p)
 281               p = replace_attr (p, link->size, fp, quoted_newname);
 282             else
 283               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
 284                                              link->refresh_timeout);
 285
 286             DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
 287                      link->url->url, newname, link->pos, file));
 288             xfree (newname);
 289             xfree (quoted_newname);
 290             ++to_file_count;
 291             break;
 292           }
 293         case CO_CONVERT_TO_COMPLETE:
 294           /* Convert the link to absolute URL. */
 295           {
 296             char *newlink = link->url->url;
 297             char *quoted_newlink = html_quote_string (newlink);
 298
 299             if (!link->link_refresh_p)
 300               p = replace_attr (p, link->size, fp, quoted_newlink);
 301             else
 302               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
 303                                              link->refresh_timeout);
 304
 305             DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
 306                      newlink, link->pos, file));
 307             xfree (quoted_newlink);
 308             ++to_url_count;
 309             break;
 310           }
 311         case CO_NULLIFY_BASE:
 312           /* Change the base href to "". */
 313           p = replace_attr (p, link->size, fp, "");
 314           break;
 315         case CO_NOCONVERT:
 316           abort ();
 317           break;
 318         }
 319     }
 320
 321   /* Output the rest of the file. */
 322   if (p - fm->content < fm->length)
 323     fwrite (p, 1, fm->length - (p - fm->content), fp);
 324   fclose (fp);
 325   read_file_free (fm);
 326
 327   logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
 328 }
 329
 330 /* Construct and return a malloced copy of the relative link from two
 331    pieces of information: local name S1 of the referring file and
 332    local name S2 of the referred file.
 333
 334    So, if S1 is "jagor.srce.hr/index.html" and S2 is
 335    "jagor.srce.hr/images/news.gif", the function will return
 336    "images/news.gif".
 337
 338    Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
 339    "fly.cc.fer.hr/images/fly.gif", the function will return
 340    "../images/fly.gif".
 341
 342    Caveats: S1 should not begin with `/', unless S2 also begins with
 343    '/'.  S1 should not contain things like ".." and such --
 344    construct_relative ("fly/ioccc/../index.html",
 345    "fly/images/fly.gif") will fail.  (A workaround is to call
 346    something like path_simplify() on S1).  */
 347 static char *
 348 construct_relative (const char *s1, const char *s2)
 349 {
 350   int i, cnt, sepdirs1;
 351   char *res;
 352
 353   if (*s2 == '/')
 354     return xstrdup (s2);
 355   /* S1 should *not* be absolute, if S2 wasn't.  */
 356   assert (*s1 != '/');
 357   i = cnt = 0;
 358   /* Skip the directories common to both strings.  */
 359   while (1)
 360     {
 361       while (s1[i] && s2[i]
 362              && (s1[i] == s2[i])
 363              && (s1[i] != '/')
 364              && (s2[i] != '/'))
 365         ++i;
 366       if (s1[i] == '/' && s2[i] == '/')
 367         cnt = ++i;
 368       else
 369         break;
 370     }
 371   for (sepdirs1 = 0; s1[i]; i++)
 372     if (s1[i] == '/')
 373       ++sepdirs1;
 374   /* Now, construct the file as of:
 375      - ../ repeated sepdirs1 time
 376      - all the non-mutual directories of S2.  */
 377   res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
 378   for (i = 0; i < sepdirs1; i++)
 379     memcpy (res + 3 * i, "../", 3);
 380   strcpy (res + 3 * i, s2 + cnt);
 381   return res;
 382 }
 383
 384 static void
 385 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
 386 {
 387   /* Rather than just writing over the original .html file with the
 388      converted version, save the former to *.orig.  Note we only do
 389      this for files we've _successfully_ downloaded, so we don't
 390      clobber .orig files sitting around from previous invocations. */
 391
 392   /* Construct the backup filename as the original name plus ".orig". */
 393   size_t         filename_len = strlen(file);
 394   char*          filename_plus_orig_suffix;
 395   boolean        already_wrote_backup_file = FALSE;
 396   slist*         converted_file_ptr;
 397   static slist*  converted_files = NULL;
 398
 399   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
 400     {
 401       /* Just write "orig" over "html".  We need to do it this way
 402          because when we're checking to see if we've downloaded the
 403          file before (to see if we can skip downloading it), we don't
 404          know if it's a text/html file.  Therefore we don't know yet
 405          at that stage that -E is going to cause us to tack on
 406          ".html", so we need to compare vs. the original URL plus
 407          ".orig", not the original URL plus ".html.orig". */
 408       filename_plus_orig_suffix = alloca (filename_len + 1);
 409       strcpy(filename_plus_orig_suffix, file);
 410       strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
 411     }
 412   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
 413     {
 414       /* Append ".orig" to the name. */
 415       filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
 416       strcpy(filename_plus_orig_suffix, file);
 417       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
 418     }
 419
 420   /* We can get called twice on the same URL thanks to the
 421      convert_all_links() call in main().  If we write the .orig file
 422      each time in such a case, it'll end up containing the first-pass
 423      conversion, not the original file.  So, see if we've already been
 424      called on this file. */
 425   converted_file_ptr = converted_files;
 426   while (converted_file_ptr != NULL)
 427     if (strcmp(converted_file_ptr->string, file) == 0)
 428       {
 429         already_wrote_backup_file = TRUE;
 430         break;
 431       }
 432     else
 433       converted_file_ptr = converted_file_ptr->next;
 434
 435   if (!already_wrote_backup_file)
 436     {
 437       /* Rename <file> to <file>.orig before former gets written over. */
 438       if (rename(file, filename_plus_orig_suffix) != 0)
 439         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
 440                    file, filename_plus_orig_suffix, strerror (errno));
 441
 442       /* Remember that we've already written a .orig backup for this file.
 443          Note that we never free this memory since we need it till the
 444          convert_all_links() call, which is one of the last things the
 445          program does before terminating.  BTW, I'm not sure if it would be
 446          safe to just set 'converted_file_ptr->string' to 'file' below,
 447          rather than making a copy of the string...  Another note is that I
 448          thought I could just add a field to the urlpos structure saying
 449          that we'd written a .orig file for this URL, but that didn't work,
 450          so I had to make this separate list.
 451          -- Dan Harkless <wget@harkless.org>
 452
 453          This [adding a field to the urlpos structure] didn't work
 454          because convert_file() is called from convert_all_links at
 455          the end of the retrieval with a freshly built new urlpos
 456          list.
 457          -- Hrvoje Niksic <hniksic@xemacs.org>
 458       */
 459       converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
 460       converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
 461       converted_file_ptr->next = converted_files;
 462       converted_files = converted_file_ptr;
 463     }
 464 }
 465
     /* Forward declaration: find_fragment() is defined further down but
        is called by replace_attr() just below.  */
 466 static int find_fragment PARAMS ((const char *, int, const char **,
 467                                   const char **));
 468
 469 /* Replace an attribute's original text with NEW_TEXT. */
 470
 471 static const char *
 472 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
 473 {
 474   int quote_flag = 0;
 475   char quote_char = '\"';       /* use "..." for quoting, unless the
 476                                    original value is quoted, in which
 477                                    case reuse its quoting char. */
 478   const char *frag_beg, *frag_end;
 479
 480   /* Structure of our string is:
 481        "...old-contents..."
 482        <---    size    --->  (with quotes)
 483      OR:
 484        ...old-contents...
 485        <---    size   -->    (no quotes)   */
 486
 487   if (*p == '\"' || *p == '\'')
 488     {
 489       quote_char = *p;
 490       quote_flag = 1;
 491       ++p;
 492       size -= 2;                /* disregard opening and closing quote */
 493     }
 494   putc (quote_char, fp);
 495   fputs (new_text, fp);
 496
 497   /* Look for fragment identifier, if any. */
 498   if (find_fragment (p, size, &frag_beg, &frag_end))
 499     fwrite (frag_beg, 1, frag_end - frag_beg, fp);
 500   p += size;
 501   if (quote_flag)
 502     ++p;
 503   putc (quote_char, fp);
 504
 505   return p;
 506 }
 507
 508 /* The same as REPLACE_ATTR, but used when replacing
 509    <meta http-equiv=refresh content="new_text"> because we need to
 510    append "timeout_value; URL=" before the next_text.  */
 511
 512 static const char *
 513 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
 514                            const char *new_text, int timeout)
 515 {
 516   /* "0; URL=..." */
 517   char *new_with_timeout = (char *)alloca (numdigit (timeout)
 518                                            + 6 /* "; URL=" */
 519                                            + strlen (new_text)
 520                                            + 1);
 521   sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
 522
 523   return replace_attr (p, size, fp, new_with_timeout);
 524 }
 525
 526 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
 527    preceded by '&'.  If the character is not found, return zero.  If
 528    the character is found, return 1 and set BP and EP to point to the
 529    beginning and end of the region.
 530
 531    This is used for finding the fragment indentifiers in URLs.  */
 532
 533 static int
 534 find_fragment (const char *beg, int size, const char **bp, const char **ep)
 535 {
 536   const char *end = beg + size;
 537   int saw_amp = 0;
 538   for (; beg < end; beg++)
 539     {
 540       switch (*beg)
 541         {
 542         case '&':
 543           saw_amp = 1;
 544           break;
 545         case '#':
 546           if (!saw_amp)
 547             {
 548               *bp = beg;
 549               *ep = end;
 550               return 1;
 551             }
 552           /* fallthrough */
 553         default:
 554           saw_amp = 0;
 555         }
 556     }
 557   return 0;
 558 }
 559
 560 /* Quote FILE for use as local reference to an HTML file.
 561
 562    We quote ? as %3F to avoid passing part of the file name as the
 563    parameter when browsing the converted file through HTTP.  However,
 564    it is safe to do this only when `--html-extension' is turned on.
 565    This is because converting "index.html?foo=bar" to
 566    "index.html%3Ffoo=bar" would break local browsing, as the latter
 567    isn't even recognized as an HTML file!  However, converting
 568    "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
 569    safe for both local and HTTP-served browsing.  */
 570
 571 static char *
 572 local_quote_string (const char *file)
 573 {
 574   const char *file_sans_qmark;
 575   int qm;
 576
 577   if (!opt.html_extension)
 578     return html_quote_string (file);
 579
 580   qm = count_char (file, '?');
 581
 582   if (qm)
 583     {
 584       const char *from = file;
 585       char *to, *newname;
 586
 587       /* qm * 2 because we replace each question mark with "%3F",
 588          i.e. replace one char with three, hence two more.  */
 589       int fsqlen = strlen (file) + qm * 2;
 590
 591       to = newname = (char *)alloca (fsqlen + 1);
 592       for (; *from; from++)
 593         {
 594           if (*from != '?')
 595             *to++ = *from;
 596           else
 597             {
 598               *to++ = '%';
 599               *to++ = '3';
 600               *to++ = 'F';
 601             }
 602         }
 603       assert (to - newname == fsqlen);
 604       *to = '\0';
 605
 606       file_sans_qmark = newname;
 607     }
 608   else
 609     file_sans_qmark = file;
 610
 611   return html_quote_string (file_sans_qmark);
 612 }
 613 \f
 614 /* Book-keeping code for dl_file_url_map, dl_url_file_map,
 615    downloaded_html_list, and downloaded_html_set.  Other code calls
 616    these functions to let us know that a file has been downloaded.  */
 617
 618 #define ENSURE_TABLES_EXIST do {                        \
 619   if (!dl_file_url_map)                                 \
 620     dl_file_url_map = make_string_hash_table (0);       \
 621   if (!dl_url_file_map)                                 \
 622     dl_url_file_map = make_string_hash_table (0);       \
 623 } while (0)
 624
 625 /* Return 1 if S1 and S2 are the same, except for "/index.html".  The
 626    three cases in which it returns one are (substitute any substring
 627    for "foo"):
 628
 629    m("foo/index.html", "foo/")  ==> 1
 630    m("foo/", "foo/index.html")  ==> 1
 631    m("foo", "foo/index.html")   ==> 1
 632    m("foo", "foo/"              ==> 1
 633    m("foo", "foo")              ==> 1  */
 634
 635 static int
 636 match_except_index (const char *s1, const char *s2)
 637 {
 638   int i;
 639   const char *lng;
 640
 641   /* Skip common substring. */
 642   for (i = 0; *s1 && *s2 && *s1 == *s2; s1++, s2++, i++)
 643     ;
 644   if (i == 0)
 645     /* Strings differ at the very beginning -- bail out.  We need to
 646        check this explicitly to avoid `lng - 1' reading outside the
 647        array.  */
 648     return 0;
 649
 650   if (!*s1 && !*s2)
 651     /* Both strings hit EOF -- strings are equal. */
 652     return 1;
 653   else if (*s1 && *s2)
 654     /* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux". */
 655     return 0;
 656   else if (*s1)
 657     /* S1 is the longer one. */
 658     lng = s1;
 659   else
 660     /* S2 is the longer one. */
 661     lng = s2;
 662
 663   /* foo            */            /* foo/           */
 664   /* foo/index.html */  /* or */  /* foo/index.html */
 665   /*    ^           */            /*     ^          */
 666
 667   if (*lng != '/')
 668     /* The right-hand case. */
 669     --lng;
 670
 671   if (*lng == '/' && *(lng + 1) == '\0')
 672     /* foo  */
 673     /* foo/ */
 674     return 1;
 675
 676   return 0 == strcmp (lng, "/index.html");
 677 }
 678
 679 static int
 680 dissociate_urls_from_file_mapper (void *key, void *value, void *arg)
 681 {
 682   char *mapping_url = (char *)key;
 683   char *mapping_file = (char *)value;
 684   char *file = (char *)arg;
 685
 686   if (0 == strcmp (mapping_file, file))
 687     {
 688       hash_table_remove (dl_url_file_map, mapping_url);
 689       xfree (mapping_url);
 690       xfree (mapping_file);
 691     }
 692
 693   /* Continue mapping. */
 694   return 0;
 695 }
 696
 697 /* Remove all associations from various URLs to FILE from dl_url_file_map. */
 698
 699 static void
 700 dissociate_urls_from_file (const char *file)
 701 {
 702   hash_table_map (dl_url_file_map, dissociate_urls_from_file_mapper,
 703                   (char *)file);
 704 }
 705
 706 /* Register that URL has been successfully downloaded to FILE.  This
 707    is used by the link conversion code to convert references to URLs
 708    to references to local files.  It is also being used to check if a
 709    URL has already been downloaded.  */
 710
 711 void
 712 register_download (const char *url, const char *file)
 713 {
 714   char *old_file, *old_url;
 715
 716   ENSURE_TABLES_EXIST;
 717
 718   /* With some forms of retrieval, it is possible, although not likely
 719      or particularly desirable.  If both are downloaded, the second
 720      download will override the first one.  When that happens,
 721      dissociate the old file name from the URL.  */
 722
 723   if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
 724     {
 725       if (0 == strcmp (url, old_url))
 726         /* We have somehow managed to download the same URL twice.
 727            Nothing to do.  */
 728         return;
 729
 730       if (match_except_index (url, old_url)
 731           && !hash_table_contains (dl_url_file_map, url))
 732         /* The two URLs differ only in the "index.html" ending.  For
 733            example, one is "http://www.server.com/", and the other is
 734            "http://www.server.com/index.html".  Don't remove the old
 735            one, just add the new one as a non-canonical entry.  */
 736         goto url_only;
 737
 738       hash_table_remove (dl_file_url_map, file);
 739       xfree (old_file);
 740       xfree (old_url);
 741
 742       /* Remove all the URLs that point to this file.  Yes, there can
 743          be more than one such URL, because we store redirections as
 744          multiple entries in dl_url_file_map.  For example, if URL1
 745          redirects to URL2 which gets downloaded to FILE, we map both
 746          URL1 and URL2 to FILE in dl_url_file_map.  (dl_file_url_map
 747          only points to URL2.)  When another URL gets loaded to FILE,
 748          we want both URL1 and URL2 dissociated from it.
 749
 750          This is a relatively expensive operation because it performs
 751          a linear search of the whole hash table, but it should be
 752          called very rarely, only when two URLs resolve to the same
 753          file name, *and* the "<file>.1" extensions are turned off.
 754          In other words, almost never.  */
 755       dissociate_urls_from_file (file);
 756     }
 757
 758   hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
 759
 760  url_only:
 761   /* A URL->FILE mapping is not possible without a FILE->URL mapping.
 762      If the latter were present, it should have been removed by the
 763      above `if'.  So we could write:
 764
 765          assert (!hash_table_contains (dl_url_file_map, url));
 766
 767      The above is correct when running in recursive mode where the
 768      same URL always resolves to the same file.  But if you do
 769      something like:
 770
 771          wget URL URL
 772
 773      then the first URL will resolve to "FILE", and the other to
 774      "FILE.1".  In that case, FILE.1 will not be found in
 775      dl_file_url_map, but URL will still point to FILE in
 776      dl_url_file_map.  */
 777   if (hash_table_get_pair (dl_url_file_map, url, &old_url, &old_file))
 778     {
 779       hash_table_remove (dl_url_file_map, url);
 780       xfree (old_url);
 781       xfree (old_file);
 782     }
 783
 784   hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
 785 }
 786
 787 /* Register that FROM has been redirected to TO.  This assumes that TO
 788    is successfully downloaded and already registered using
 789    register_download() above.  */
 790
 791 void
 792 register_redirection (const char *from, const char *to)
 793 {
 794   char *file;
 795
 796   ENSURE_TABLES_EXIST;
 797
 798   file = hash_table_get (dl_url_file_map, to);
 799   assert (file != NULL);
 800   if (!hash_table_contains (dl_url_file_map, from))
 801     hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
 802 }
 803
 804 /* Register that the file has been deleted. */
 805
 806 void
 807 register_delete_file (const char *file)
 808 {
 809   char *old_url, *old_file;
 810
 811   ENSURE_TABLES_EXIST;
 812
 813   if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
 814     return;
 815
 816   hash_table_remove (dl_file_url_map, file);
 817   xfree (old_file);
 818   xfree (old_url);
 819   dissociate_urls_from_file (file);
 820 }
 821
 822 /* Register that FILE is an HTML file that has been downloaded. */
 823
 824 void
 825 register_html (const char *url, const char *file)
 826 {
 827   if (!downloaded_html_set)
 828     downloaded_html_set = make_string_hash_table (0);
 829   else if (hash_table_contains (downloaded_html_set, file))
 830     return;
 831
 832   /* The set and the list should use the same copy of FILE, but the
 833      slist interface insists on strduping the string it gets.  Oh
 834      well. */
 835   string_set_add (downloaded_html_set, file);
 836   downloaded_html_list = slist_prepend (downloaded_html_list, file);
 837 }
 838
 839 /* Cleanup the data structures associated with recursive retrieving
 840    (the variables above).  */
 841 void
 842 convert_cleanup (void)
 843 {
 844   if (dl_file_url_map)
 845     {
 846       free_keys_and_values (dl_file_url_map);
 847       hash_table_destroy (dl_file_url_map);
 848       dl_file_url_map = NULL;
 849     }
 850   if (dl_url_file_map)
 851     {
 852       free_keys_and_values (dl_url_file_map);
 853       hash_table_destroy (dl_url_file_map);
 854       dl_url_file_map = NULL;
 855     }
 856   if (downloaded_html_set)
 857     string_set_free (downloaded_html_set);
 858   slist_free (downloaded_html_list);
 859   downloaded_html_list = NULL;
 860 }
 861 \f
/* Book-keeping code for downloaded files that enables extension
   hacks.  */

/* This table should really be merged with dl_file_url_map and
   downloaded_html_files.  This was originally a list, but I changed
   it to a hash table because it was actually taking a lot of time to
   find things in it.

   Keys are heap-allocated copies of local file names; values are
   read back as pointers to downloaded_file_t (see downloaded_file
   below).  The table is created lazily and stays NULL until the
   first file is recorded.  */

static struct hash_table *downloaded_files_hash;
 871
 872 /* We're storing "modes" of type downloaded_file_t in the hash table.
 873    However, our hash tables only accept pointers for keys and values.
 874    So when we need a pointer, we use the address of a
 875    downloaded_file_t variable of static storage.  */
 876
 877 static downloaded_file_t *
 878 downloaded_mode_to_ptr (downloaded_file_t mode)
 879 {
 880   static downloaded_file_t
 881     v1 = FILE_NOT_ALREADY_DOWNLOADED,
 882     v2 = FILE_DOWNLOADED_NORMALLY,
 883     v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
 884     v4 = CHECK_FOR_FILE;
 885
 886   switch (mode)
 887     {
 888     case FILE_NOT_ALREADY_DOWNLOADED:
 889       return &v1;
 890     case FILE_DOWNLOADED_NORMALLY:
 891       return &v2;
 892     case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
 893       return &v3;
 894     case CHECK_FOR_FILE:
 895       return &v4;
 896     }
 897   return NULL;
 898 }
 899
 900 /* Remembers which files have been downloaded.  In the standard case,
 901    should be called with mode == FILE_DOWNLOADED_NORMALLY for each
 902    file we actually download successfully (i.e. not for ones we have
 903    failures on or that we skip due to -N).
 904
 905    When we've downloaded a file and tacked on a ".html" extension due
 906    to -E, call this function with
 907    FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
 908    FILE_DOWNLOADED_NORMALLY.
 909
 910    If you just want to check if a file has been previously added
 911    without adding it, call with mode == CHECK_FOR_FILE.  Please be
 912    sure to call this function with local filenames, not remote
 913    URLs.  */
 914
 915 downloaded_file_t
 916 downloaded_file (downloaded_file_t mode, const char *file)
 917 {
 918   downloaded_file_t *ptr;
 919
 920   if (mode == CHECK_FOR_FILE)
 921     {
 922       if (!downloaded_files_hash)
 923         return FILE_NOT_ALREADY_DOWNLOADED;
 924       ptr = hash_table_get (downloaded_files_hash, file);
 925       if (!ptr)
 926         return FILE_NOT_ALREADY_DOWNLOADED;
 927       return *ptr;
 928     }
 929
 930   if (!downloaded_files_hash)
 931     downloaded_files_hash = make_string_hash_table (0);
 932
 933   ptr = hash_table_get (downloaded_files_hash, file);
 934   if (ptr)
 935     return *ptr;
 936
 937   ptr = downloaded_mode_to_ptr (mode);
 938   hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
 939
 940   return FILE_NOT_ALREADY_DOWNLOADED;
 941 }
 942
 943 static int
 944 df_free_mapper (void *key, void *value, void *ignored)
 945 {
 946   xfree (key);
 947   return 0;
 948 }
 949
 950 void
 951 downloaded_files_free (void)
 952 {
 953   if (downloaded_files_hash)
 954     {
 955       hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
 956       hash_table_destroy (downloaded_files_hash);
 957       downloaded_files_hash = NULL;
 958     }
 959 }