sjero.net Git - wget/blob - src/convert.c

   1 /* Conversion of links to local files.
   2    Copyright (C) 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9  (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 In addition, as a special exception, the Free Software Foundation
  21 gives permission to link the code of its release of Wget with the
  22 OpenSSL project's "OpenSSL" library (or with modified versions of it
  23 that use the same license as the "OpenSSL" library), and distribute
  24 the linked executables.  You must obey the GNU General Public License
  25 in all respects for all of the code used other than "OpenSSL".  If you
  26 modify this file, you may extend this exception to your version of the
  27 file, but you are not obligated to do so.  If you do not wish to do
  28 so, delete this exception statement from your version.  */
  29
  30 #include <config.h>
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #ifdef HAVE_STRING_H
  35 # include <string.h>
  36 #else
  37 # include <strings.h>
  38 #endif /* HAVE_STRING_H */
  39 #ifdef HAVE_UNISTD_H
  40 # include <unistd.h>
  41 #endif /* HAVE_UNISTD_H */
  42 #include <errno.h>
  43 #include <assert.h>
  44 #include <sys/types.h>
  45
  46 #include "wget.h"
  47 #include "convert.h"
  48 #include "url.h"
  49 #include "recur.h"
  50 #include "utils.h"
  51 #include "hash.h"
  52
/* Map from local file name to the URL it was downloaded from.
   register_download() stores heap copies of both strings.  */
static struct hash_table *dl_file_url_map;
/* Map from URL to local file name.  Several URLs may map to the same
   file, because register_redirection() adds the redirecting URL as
   an extra key.  Keys and values are heap copies.  */
struct hash_table *dl_url_file_map;

/* List of HTML files downloaded in this Wget run, used for link
   conversion after Wget is done.  The list and the set contain the
   same information, except the list maintains the order.  Perhaps I
   should get rid of the list, it's there for historical reasons.  */
static slist *downloaded_html_list;
struct hash_table *downloaded_html_set;
  62
  63 static void convert_links PARAMS ((const char *, struct urlpos *));
  64
  65 /* This function is called when the retrieval is done to convert the
  66    links that have been downloaded.  It has to be called at the end of
  67    the retrieval, because only then does Wget know conclusively which
  68    URLs have been downloaded, and which not, so it can tell which
  69    direction to convert to.
  70
  71    The "direction" means that the URLs to the files that have been
  72    downloaded get converted to the relative URL which will point to
  73    that file.  And the other URLs get converted to the remote URL on
  74    the server.
  75
  76    All the downloaded HTMLs are kept in downloaded_html_files, and
  77    downloaded URLs in urls_downloaded.  All the information is
  78    extracted from these two lists.  */
  79
  80 void
  81 convert_all_links (void)
  82 {
  83   slist *html;
  84   long msecs;
  85   int file_count = 0;
  86
  87   struct wget_timer *timer = wtimer_new ();
  88
  89   /* Destructively reverse downloaded_html_files to get it in the right order.
  90      recursive_retrieve() used slist_prepend() consistently.  */
  91   downloaded_html_list = slist_nreverse (downloaded_html_list);
  92
  93   for (html = downloaded_html_list; html; html = html->next)
  94     {
  95       struct urlpos *urls, *cur_url;
  96       char *url;
  97       char *file = html->string;
  98
  99       /* Determine the URL of the HTML file.  get_urls_html will need
 100          it.  */
 101       url = hash_table_get (dl_file_url_map, file);
 102       if (!url)
 103         {
 104           DEBUGP (("Apparently %s has been removed.\n", file));
 105           continue;
 106         }
 107
 108       DEBUGP (("Scanning %s (from %s)\n", file, url));
 109
 110       /* Parse the HTML file...  */
 111       urls = get_urls_html (file, url, NULL);
 112
 113       /* We don't respect meta_disallow_follow here because, even if
 114          the file is not followed, we might still want to convert the
 115          links that have been followed from other files.  */
 116
 117       for (cur_url = urls; cur_url; cur_url = cur_url->next)
 118         {
 119           char *local_name;
 120           struct url *u = cur_url->url;
 121
 122           if (cur_url->link_base_p)
 123             {
 124               /* Base references have been resolved by our parser, so
 125                  we turn the base URL into an empty string.  (Perhaps
 126                  we should remove the tag entirely?)  */
 127               cur_url->convert = CO_NULLIFY_BASE;
 128               continue;
 129             }
 130
 131           /* We decide the direction of conversion according to whether
 132              a URL was downloaded.  Downloaded URLs will be converted
 133              ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
 134           local_name = hash_table_get (dl_url_file_map, u->url);
 135
 136           /* Decide on the conversion type.  */
 137           if (local_name)
 138             {
 139               /* We've downloaded this URL.  Convert it to relative
 140                  form.  We do this even if the URL already is in
 141                  relative form, because our directory structure may
 142                  not be identical to that on the server (think `-nd',
 143                  `--cut-dirs', etc.)  */
 144               cur_url->convert = CO_CONVERT_TO_RELATIVE;
 145               cur_url->local_name = xstrdup (local_name);
 146               DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
 147             }
 148           else
 149             {
 150               /* We haven't downloaded this URL.  If it's not already
 151                  complete (including a full host name), convert it to
 152                  that form, so it can be reached while browsing this
 153                  HTML locally.  */
 154               if (!cur_url->link_complete_p)
 155                 cur_url->convert = CO_CONVERT_TO_COMPLETE;
 156               cur_url->local_name = NULL;
 157               DEBUGP (("will convert url %s to complete\n", u->url));
 158             }
 159         }
 160
 161       /* Convert the links in the file.  */
 162       convert_links (file, urls);
 163       ++file_count;
 164
 165       /* Free the data.  */
 166       free_urlpos (urls);
 167     }
 168
 169   msecs = wtimer_elapsed (timer);
 170   wtimer_delete (timer);
 171   logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
 172              file_count, (double)msecs / 1000);
 173 }
 174
 175 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
 176 static const char *replace_attr PARAMS ((const char *, int, FILE *,
 177                                          const char *));
 178 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
 179                                                       const char *, int));
 180 static char *local_quote_string PARAMS ((const char *));
 181 static char *construct_relative PARAMS ((const char *, const char *));
 182
 183 /* Change the links in one HTML file.  LINKS is a list of links in the
 184    document, along with their positions and the desired direction of
 185    the conversion.  */
 186 static void
 187 convert_links (const char *file, struct urlpos *links)
 188 {
 189   struct file_memory *fm;
 190   FILE *fp;
 191   const char *p;
 192   downloaded_file_t downloaded_file_return;
 193
 194   struct urlpos *link;
 195   int to_url_count = 0, to_file_count = 0;
 196
 197   logprintf (LOG_VERBOSE, _("Converting %s... "), file);
 198
 199   {
 200     /* First we do a "dry run": go through the list L and see whether
 201        any URL needs to be converted in the first place.  If not, just
 202        leave the file alone.  */
 203     int dry_count = 0;
 204     struct urlpos *dry = links;
 205     for (dry = links; dry; dry = dry->next)
 206       if (dry->convert != CO_NOCONVERT)
 207         ++dry_count;
 208     if (!dry_count)
 209       {
 210         logputs (LOG_VERBOSE, _("nothing to do.\n"));
 211         return;
 212       }
 213   }
 214
 215   fm = read_file (file);
 216   if (!fm)
 217     {
 218       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
 219                  file, strerror (errno));
 220       return;
 221     }
 222
 223   downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
 224   if (opt.backup_converted && downloaded_file_return)
 225     write_backup_file (file, downloaded_file_return);
 226
 227   /* Before opening the file for writing, unlink the file.  This is
 228      important if the data in FM is mmaped.  In such case, nulling the
 229      file, which is what fopen() below does, would make us read all
 230      zeroes from the mmaped region.  */
 231   if (unlink (file) < 0 && errno != ENOENT)
 232     {
 233       logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
 234                  file, strerror (errno));
 235       read_file_free (fm);
 236       return;
 237     }
 238   /* Now open the file for writing.  */
 239   fp = fopen (file, "wb");
 240   if (!fp)
 241     {
 242       logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
 243                  file, strerror (errno));
 244       read_file_free (fm);
 245       return;
 246     }
 247
 248   /* Here we loop through all the URLs in file, replacing those of
 249      them that are downloaded with relative references.  */
 250   p = fm->content;
 251   for (link = links; link; link = link->next)
 252     {
 253       char *url_start = fm->content + link->pos;
 254
 255       if (link->pos >= fm->length)
 256         {
 257           DEBUGP (("Something strange is going on.  Please investigate."));
 258           break;
 259         }
 260       /* If the URL is not to be converted, skip it.  */
 261       if (link->convert == CO_NOCONVERT)
 262         {
 263           DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
 264           continue;
 265         }
 266
 267       /* Echo the file contents, up to the offending URL's opening
 268          quote, to the outfile.  */
 269       fwrite (p, 1, url_start - p, fp);
 270       p = url_start;
 271
 272       switch (link->convert)
 273         {
 274         case CO_CONVERT_TO_RELATIVE:
 275           /* Convert absolute URL to relative. */
 276           {
 277             char *newname = construct_relative (file, link->local_name);
 278             char *quoted_newname = local_quote_string (newname);
 279
 280             if (!link->link_refresh_p)
 281               p = replace_attr (p, link->size, fp, quoted_newname);
 282             else
 283               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
 284                                              link->refresh_timeout);
 285
 286             DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
 287                      link->url->url, newname, link->pos, file));
 288             xfree (newname);
 289             xfree (quoted_newname);
 290             ++to_file_count;
 291             break;
 292           }
 293         case CO_CONVERT_TO_COMPLETE:
 294           /* Convert the link to absolute URL. */
 295           {
 296             char *newlink = link->url->url;
 297             char *quoted_newlink = html_quote_string (newlink);
 298
 299             if (!link->link_refresh_p)
 300               p = replace_attr (p, link->size, fp, quoted_newlink);
 301             else
 302               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
 303                                              link->refresh_timeout);
 304
 305             DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
 306                      newlink, link->pos, file));
 307             xfree (quoted_newlink);
 308             ++to_url_count;
 309             break;
 310           }
 311         case CO_NULLIFY_BASE:
 312           /* Change the base href to "". */
 313           p = replace_attr (p, link->size, fp, "");
 314           break;
 315         case CO_NOCONVERT:
 316           abort ();
 317           break;
 318         }
 319     }
 320
 321   /* Output the rest of the file. */
 322   if (p - fm->content < fm->length)
 323     fwrite (p, 1, fm->length - (p - fm->content), fp);
 324   fclose (fp);
 325   read_file_free (fm);
 326
 327   logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
 328 }
 329
 330 /* Construct and return a link that points from S1's position to S2.
 331    Both files should be local file names, S1 of the referrering file,
 332    and S2 of the referred file.
 333
 334    So, if S1 is "H/index.html" and S2 is "H/images/news.gif", this
 335    function will return "images/news.gif".  On the other hand, if S1
 336    is "H/ioccc/index.html", and S2 is "H/images/fly.gif", it will
 337    return "../images/fly.gif".
 338
 339    Caveats: S1 should not begin with `/', unless S2 also begins with
 340    '/'.  S1 should not contain things like ".." and such --
 341    construct_relative ("fly/ioccc/../index.html",
 342    "fly/images/fly.gif") will fail.  (A workaround is to call
 343    something like path_simplify() on S1).  */
 344
 345 static char *
 346 construct_relative (const char *s1, const char *s2)
 347 {
 348   int i, cnt, sepdirs1;
 349   char *res;
 350
 351   i = cnt = 0;
 352   /* Skip the directories common to both strings.  */
 353   while (1)
 354     {
 355       while (s1[i] && s2[i]
 356              && (s1[i] == s2[i])
 357              && (s1[i] != '/')
 358              && (s2[i] != '/'))
 359         ++i;
 360       if (s1[i] == '/' && s2[i] == '/')
 361         cnt = ++i;
 362       else
 363         break;
 364     }
 365   for (sepdirs1 = 0; s1[i]; i++)
 366     if (s1[i] == '/')
 367       ++sepdirs1;
 368   /* Now, construct the file as of:
 369      - ../ repeated sepdirs1 time
 370      - all the non-mutual directories of S2.  */
 371   res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
 372   for (i = 0; i < sepdirs1; i++)
 373     memcpy (res + 3 * i, "../", 3);
 374   strcpy (res + 3 * i, s2 + cnt);
 375   return res;
 376 }
 377
 378 static void
 379 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
 380 {
 381   /* Rather than just writing over the original .html file with the
 382      converted version, save the former to *.orig.  Note we only do
 383      this for files we've _successfully_ downloaded, so we don't
 384      clobber .orig files sitting around from previous invocations. */
 385
 386   /* Construct the backup filename as the original name plus ".orig". */
 387   size_t         filename_len = strlen(file);
 388   char*          filename_plus_orig_suffix;
 389   boolean        already_wrote_backup_file = FALSE;
 390   slist*         converted_file_ptr;
 391   static slist*  converted_files = NULL;
 392
 393   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
 394     {
 395       /* Just write "orig" over "html".  We need to do it this way
 396          because when we're checking to see if we've downloaded the
 397          file before (to see if we can skip downloading it), we don't
 398          know if it's a text/html file.  Therefore we don't know yet
 399          at that stage that -E is going to cause us to tack on
 400          ".html", so we need to compare vs. the original URL plus
 401          ".orig", not the original URL plus ".html.orig". */
 402       filename_plus_orig_suffix = alloca (filename_len + 1);
 403       strcpy(filename_plus_orig_suffix, file);
 404       strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
 405     }
 406   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
 407     {
 408       /* Append ".orig" to the name. */
 409       filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
 410       strcpy(filename_plus_orig_suffix, file);
 411       strcpy(filename_plus_orig_suffix + filename_len, ".orig");
 412     }
 413
 414   /* We can get called twice on the same URL thanks to the
 415      convert_all_links() call in main().  If we write the .orig file
 416      each time in such a case, it'll end up containing the first-pass
 417      conversion, not the original file.  So, see if we've already been
 418      called on this file. */
 419   converted_file_ptr = converted_files;
 420   while (converted_file_ptr != NULL)
 421     if (strcmp(converted_file_ptr->string, file) == 0)
 422       {
 423         already_wrote_backup_file = TRUE;
 424         break;
 425       }
 426     else
 427       converted_file_ptr = converted_file_ptr->next;
 428
 429   if (!already_wrote_backup_file)
 430     {
 431       /* Rename <file> to <file>.orig before former gets written over. */
 432       if (rename(file, filename_plus_orig_suffix) != 0)
 433         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
 434                    file, filename_plus_orig_suffix, strerror (errno));
 435
 436       /* Remember that we've already written a .orig backup for this file.
 437          Note that we never free this memory since we need it till the
 438          convert_all_links() call, which is one of the last things the
 439          program does before terminating.  BTW, I'm not sure if it would be
 440          safe to just set 'converted_file_ptr->string' to 'file' below,
 441          rather than making a copy of the string...  Another note is that I
 442          thought I could just add a field to the urlpos structure saying
 443          that we'd written a .orig file for this URL, but that didn't work,
 444          so I had to make this separate list.
 445          -- Dan Harkless <wget@harkless.org>
 446
 447          This [adding a field to the urlpos structure] didn't work
 448          because convert_file() is called from convert_all_links at
 449          the end of the retrieval with a freshly built new urlpos
 450          list.
 451          -- Hrvoje Niksic <hniksic@xemacs.org>
 452       */
 453       converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
 454       converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
 455       converted_file_ptr->next = converted_files;
 456       converted_files = converted_file_ptr;
 457     }
 458 }
 459
 460 static int find_fragment PARAMS ((const char *, int, const char **,
 461                                   const char **));
 462
 463 /* Replace an attribute's original text with NEW_TEXT. */
 464
 465 static const char *
 466 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
 467 {
 468   int quote_flag = 0;
 469   char quote_char = '\"';       /* use "..." for quoting, unless the
 470                                    original value is quoted, in which
 471                                    case reuse its quoting char. */
 472   const char *frag_beg, *frag_end;
 473
 474   /* Structure of our string is:
 475        "...old-contents..."
 476        <---    size    --->  (with quotes)
 477      OR:
 478        ...old-contents...
 479        <---    size   -->    (no quotes)   */
 480
 481   if (*p == '\"' || *p == '\'')
 482     {
 483       quote_char = *p;
 484       quote_flag = 1;
 485       ++p;
 486       size -= 2;                /* disregard opening and closing quote */
 487     }
 488   putc (quote_char, fp);
 489   fputs (new_text, fp);
 490
 491   /* Look for fragment identifier, if any. */
 492   if (find_fragment (p, size, &frag_beg, &frag_end))
 493     fwrite (frag_beg, 1, frag_end - frag_beg, fp);
 494   p += size;
 495   if (quote_flag)
 496     ++p;
 497   putc (quote_char, fp);
 498
 499   return p;
 500 }
 501
 502 /* The same as REPLACE_ATTR, but used when replacing
 503    <meta http-equiv=refresh content="new_text"> because we need to
 504    append "timeout_value; URL=" before the next_text.  */
 505
 506 static const char *
 507 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
 508                            const char *new_text, int timeout)
 509 {
 510   /* "0; URL=..." */
 511   char *new_with_timeout = (char *)alloca (numdigit (timeout)
 512                                            + 6 /* "; URL=" */
 513                                            + strlen (new_text)
 514                                            + 1);
 515   sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
 516
 517   return replace_attr (p, size, fp, new_with_timeout);
 518 }
 519
 520 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
 521    preceded by '&'.  If the character is not found, return zero.  If
 522    the character is found, return 1 and set BP and EP to point to the
 523    beginning and end of the region.
 524
 525    This is used for finding the fragment indentifiers in URLs.  */
 526
 527 static int
 528 find_fragment (const char *beg, int size, const char **bp, const char **ep)
 529 {
 530   const char *end = beg + size;
 531   int saw_amp = 0;
 532   for (; beg < end; beg++)
 533     {
 534       switch (*beg)
 535         {
 536         case '&':
 537           saw_amp = 1;
 538           break;
 539         case '#':
 540           if (!saw_amp)
 541             {
 542               *bp = beg;
 543               *ep = end;
 544               return 1;
 545             }
 546           /* fallthrough */
 547         default:
 548           saw_amp = 0;
 549         }
 550     }
 551   return 0;
 552 }
 553
 554 /* Quote FILE for use as local reference to an HTML file.
 555
 556    We quote ? as %3F to avoid passing part of the file name as the
 557    parameter when browsing the converted file through HTTP.  However,
 558    it is safe to do this only when `--html-extension' is turned on.
 559    This is because converting "index.html?foo=bar" to
 560    "index.html%3Ffoo=bar" would break local browsing, as the latter
 561    isn't even recognized as an HTML file!  However, converting
 562    "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
 563    safe for both local and HTTP-served browsing.  */
 564
 565 static char *
 566 local_quote_string (const char *file)
 567 {
 568   const char *file_sans_qmark;
 569   int qm;
 570
 571   if (!opt.html_extension)
 572     return html_quote_string (file);
 573
 574   qm = count_char (file, '?');
 575
 576   if (qm)
 577     {
 578       const char *from = file;
 579       char *to, *newname;
 580
 581       /* qm * 2 because we replace each question mark with "%3F",
 582          i.e. replace one char with three, hence two more.  */
 583       int fsqlen = strlen (file) + qm * 2;
 584
 585       to = newname = (char *)alloca (fsqlen + 1);
 586       for (; *from; from++)
 587         {
 588           if (*from != '?')
 589             *to++ = *from;
 590           else
 591             {
 592               *to++ = '%';
 593               *to++ = '3';
 594               *to++ = 'F';
 595             }
 596         }
 597       assert (to - newname == fsqlen);
 598       *to = '\0';
 599
 600       file_sans_qmark = newname;
 601     }
 602   else
 603     file_sans_qmark = file;
 604
 605   return html_quote_string (file_sans_qmark);
 606 }
 607 \f
 608 /* Book-keeping code for dl_file_url_map, dl_url_file_map,
 609    downloaded_html_list, and downloaded_html_set.  Other code calls
 610    these functions to let us know that a file has been downloaded.  */
 611
 612 #define ENSURE_TABLES_EXIST do {                        \
 613   if (!dl_file_url_map)                                 \
 614     dl_file_url_map = make_string_hash_table (0);       \
 615   if (!dl_url_file_map)                                 \
 616     dl_url_file_map = make_string_hash_table (0);       \
 617 } while (0)
 618
 619 /* Return 1 if S1 and S2 are the same, except for "/index.html".  The
 620    three cases in which it returns one are (substitute any substring
 621    for "foo"):
 622
 623    m("foo/index.html", "foo/")  ==> 1
 624    m("foo/", "foo/index.html")  ==> 1
 625    m("foo", "foo/index.html")   ==> 1
 626    m("foo", "foo/"              ==> 1
 627    m("foo", "foo")              ==> 1  */
 628
 629 static int
 630 match_except_index (const char *s1, const char *s2)
 631 {
 632   int i;
 633   const char *lng;
 634
 635   /* Skip common substring. */
 636   for (i = 0; *s1 && *s2 && *s1 == *s2; s1++, s2++, i++)
 637     ;
 638   if (i == 0)
 639     /* Strings differ at the very beginning -- bail out.  We need to
 640        check this explicitly to avoid `lng - 1' reading outside the
 641        array.  */
 642     return 0;
 643
 644   if (!*s1 && !*s2)
 645     /* Both strings hit EOF -- strings are equal. */
 646     return 1;
 647   else if (*s1 && *s2)
 648     /* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux". */
 649     return 0;
 650   else if (*s1)
 651     /* S1 is the longer one. */
 652     lng = s1;
 653   else
 654     /* S2 is the longer one. */
 655     lng = s2;
 656
 657   /* foo            */            /* foo/           */
 658   /* foo/index.html */  /* or */  /* foo/index.html */
 659   /*    ^           */            /*     ^          */
 660
 661   if (*lng != '/')
 662     /* The right-hand case. */
 663     --lng;
 664
 665   if (*lng == '/' && *(lng + 1) == '\0')
 666     /* foo  */
 667     /* foo/ */
 668     return 1;
 669
 670   return 0 == strcmp (lng, "/index.html");
 671 }
 672
 673 static int
 674 dissociate_urls_from_file_mapper (void *key, void *value, void *arg)
 675 {
 676   char *mapping_url = (char *)key;
 677   char *mapping_file = (char *)value;
 678   char *file = (char *)arg;
 679
 680   if (0 == strcmp (mapping_file, file))
 681     {
 682       hash_table_remove (dl_url_file_map, mapping_url);
 683       xfree (mapping_url);
 684       xfree (mapping_file);
 685     }
 686
 687   /* Continue mapping. */
 688   return 0;
 689 }
 690
 691 /* Remove all associations from various URLs to FILE from dl_url_file_map. */
 692
 693 static void
 694 dissociate_urls_from_file (const char *file)
 695 {
 696   hash_table_map (dl_url_file_map, dissociate_urls_from_file_mapper,
 697                   (char *)file);
 698 }
 699
 700 /* Register that URL has been successfully downloaded to FILE.  This
 701    is used by the link conversion code to convert references to URLs
 702    to references to local files.  It is also being used to check if a
 703    URL has already been downloaded.  */
 704
 705 void
 706 register_download (const char *url, const char *file)
 707 {
 708   char *old_file, *old_url;
 709
 710   ENSURE_TABLES_EXIST;
 711
 712   /* With some forms of retrieval, it is possible, although not likely
 713      or particularly desirable.  If both are downloaded, the second
 714      download will override the first one.  When that happens,
 715      dissociate the old file name from the URL.  */
 716
 717   if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
 718     {
 719       if (0 == strcmp (url, old_url))
 720         /* We have somehow managed to download the same URL twice.
 721            Nothing to do.  */
 722         return;
 723
 724       if (match_except_index (url, old_url)
 725           && !hash_table_contains (dl_url_file_map, url))
 726         /* The two URLs differ only in the "index.html" ending.  For
 727            example, one is "http://www.server.com/", and the other is
 728            "http://www.server.com/index.html".  Don't remove the old
 729            one, just add the new one as a non-canonical entry.  */
 730         goto url_only;
 731
 732       hash_table_remove (dl_file_url_map, file);
 733       xfree (old_file);
 734       xfree (old_url);
 735
 736       /* Remove all the URLs that point to this file.  Yes, there can
 737          be more than one such URL, because we store redirections as
 738          multiple entries in dl_url_file_map.  For example, if URL1
 739          redirects to URL2 which gets downloaded to FILE, we map both
 740          URL1 and URL2 to FILE in dl_url_file_map.  (dl_file_url_map
 741          only points to URL2.)  When another URL gets loaded to FILE,
 742          we want both URL1 and URL2 dissociated from it.
 743
 744          This is a relatively expensive operation because it performs
 745          a linear search of the whole hash table, but it should be
 746          called very rarely, only when two URLs resolve to the same
 747          file name, *and* the "<file>.1" extensions are turned off.
 748          In other words, almost never.  */
 749       dissociate_urls_from_file (file);
 750     }
 751
 752   hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
 753
 754  url_only:
 755   /* A URL->FILE mapping is not possible without a FILE->URL mapping.
 756      If the latter were present, it should have been removed by the
 757      above `if'.  So we could write:
 758
 759          assert (!hash_table_contains (dl_url_file_map, url));
 760
 761      The above is correct when running in recursive mode where the
 762      same URL always resolves to the same file.  But if you do
 763      something like:
 764
 765          wget URL URL
 766
 767      then the first URL will resolve to "FILE", and the other to
 768      "FILE.1".  In that case, FILE.1 will not be found in
 769      dl_file_url_map, but URL will still point to FILE in
 770      dl_url_file_map.  */
 771   if (hash_table_get_pair (dl_url_file_map, url, &old_url, &old_file))
 772     {
 773       hash_table_remove (dl_url_file_map, url);
 774       xfree (old_url);
 775       xfree (old_file);
 776     }
 777
 778   hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
 779 }
 780
 781 /* Register that FROM has been redirected to TO.  This assumes that TO
 782    is successfully downloaded and already registered using
 783    register_download() above.  */
 784
 785 void
 786 register_redirection (const char *from, const char *to)
 787 {
 788   char *file;
 789
 790   ENSURE_TABLES_EXIST;
 791
 792   file = hash_table_get (dl_url_file_map, to);
 793   assert (file != NULL);
 794   if (!hash_table_contains (dl_url_file_map, from))
 795     hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
 796 }
 797
 798 /* Register that the file has been deleted. */
 799
 800 void
 801 register_delete_file (const char *file)
 802 {
 803   char *old_url, *old_file;
 804
 805   ENSURE_TABLES_EXIST;
 806
 807   if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
 808     return;
 809
 810   hash_table_remove (dl_file_url_map, file);
 811   xfree (old_file);
 812   xfree (old_url);
 813   dissociate_urls_from_file (file);
 814 }
 815
 816 /* Register that FILE is an HTML file that has been downloaded. */
 817
 818 void
 819 register_html (const char *url, const char *file)
 820 {
 821   if (!downloaded_html_set)
 822     downloaded_html_set = make_string_hash_table (0);
 823   else if (hash_table_contains (downloaded_html_set, file))
 824     return;
 825
 826   /* The set and the list should use the same copy of FILE, but the
 827      slist interface insists on strduping the string it gets.  Oh
 828      well. */
 829   string_set_add (downloaded_html_set, file);
 830   downloaded_html_list = slist_prepend (downloaded_html_list, file);
 831 }
 832
 833 /* Cleanup the data structures associated with recursive retrieving
 834    (the variables above).  */
 835 void
 836 convert_cleanup (void)
 837 {
 838   if (dl_file_url_map)
 839     {
 840       free_keys_and_values (dl_file_url_map);
 841       hash_table_destroy (dl_file_url_map);
 842       dl_file_url_map = NULL;
 843     }
 844   if (dl_url_file_map)
 845     {
 846       free_keys_and_values (dl_url_file_map);
 847       hash_table_destroy (dl_url_file_map);
 848       dl_url_file_map = NULL;
 849     }
 850   if (downloaded_html_set)
 851     string_set_free (downloaded_html_set);
 852   slist_free (downloaded_html_list);
 853   downloaded_html_list = NULL;
 854 }
 855 \f
 856 /* Book-keeping code for downloaded files that enables extension
 857    hacks.  */
 858
/* This table should really be merged with dl_file_url_map and
   downloaded_html_list/downloaded_html_set.  This was originally a
   list, but I changed it to a hash table because it was actually
   taking a lot of time to find things in it.

   Keys are heap copies of local file names.  Values are pointers to
   downloaded_file_t; they are not freed by downloaded_files_free().  */

static struct hash_table *downloaded_files_hash;
 865
 866 /* We're storing "modes" of type downloaded_file_t in the hash table.
 867    However, our hash tables only accept pointers for keys and values.
 868    So when we need a pointer, we use the address of a
 869    downloaded_file_t variable of static storage.  */
 870
 871 static downloaded_file_t *
 872 downloaded_mode_to_ptr (downloaded_file_t mode)
 873 {
 874   static downloaded_file_t
 875     v1 = FILE_NOT_ALREADY_DOWNLOADED,
 876     v2 = FILE_DOWNLOADED_NORMALLY,
 877     v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
 878     v4 = CHECK_FOR_FILE;
 879
 880   switch (mode)
 881     {
 882     case FILE_NOT_ALREADY_DOWNLOADED:
 883       return &v1;
 884     case FILE_DOWNLOADED_NORMALLY:
 885       return &v2;
 886     case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
 887       return &v3;
 888     case CHECK_FOR_FILE:
 889       return &v4;
 890     }
 891   return NULL;
 892 }
 893
 894 /* Remembers which files have been downloaded.  In the standard case,
 895    should be called with mode == FILE_DOWNLOADED_NORMALLY for each
 896    file we actually download successfully (i.e. not for ones we have
 897    failures on or that we skip due to -N).
 898
 899    When we've downloaded a file and tacked on a ".html" extension due
 900    to -E, call this function with
 901    FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
 902    FILE_DOWNLOADED_NORMALLY.
 903
 904    If you just want to check if a file has been previously added
 905    without adding it, call with mode == CHECK_FOR_FILE.  Please be
 906    sure to call this function with local filenames, not remote
 907    URLs.  */
 908
 909 downloaded_file_t
 910 downloaded_file (downloaded_file_t mode, const char *file)
 911 {
 912   downloaded_file_t *ptr;
 913
 914   if (mode == CHECK_FOR_FILE)
 915     {
 916       if (!downloaded_files_hash)
 917         return FILE_NOT_ALREADY_DOWNLOADED;
 918       ptr = hash_table_get (downloaded_files_hash, file);
 919       if (!ptr)
 920         return FILE_NOT_ALREADY_DOWNLOADED;
 921       return *ptr;
 922     }
 923
 924   if (!downloaded_files_hash)
 925     downloaded_files_hash = make_string_hash_table (0);
 926
 927   ptr = hash_table_get (downloaded_files_hash, file);
 928   if (ptr)
 929     return *ptr;
 930
 931   ptr = downloaded_mode_to_ptr (mode);
 932   hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
 933
 934   return FILE_NOT_ALREADY_DOWNLOADED;
 935 }
 936
 937 static int
 938 df_free_mapper (void *key, void *value, void *ignored)
 939 {
 940   xfree (key);
 941   return 0;
 942 }
 943
 944 void
 945 downloaded_files_free (void)
 946 {
 947   if (downloaded_files_hash)
 948     {
 949       hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
 950       hash_table_destroy (downloaded_files_hash);
 951       downloaded_files_hash = NULL;
 952     }
 953 }