/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.

   This file is part of GNU Wget.

   GNU Wget is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   GNU Wget is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with Wget; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#ifdef HAVE_STRING_H
# include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <sys/types.h>
extern char *version_string;

static struct hash_table *dl_file_url_map;
static struct hash_table *dl_url_file_map;

/* List of HTML files downloaded in this Wget run.  Used for link
   conversion after Wget is done.  This list should only be traversed
   in order.  If you need to check whether a file has been downloaded,
   use a hash table, e.g. dl_file_url_map.  */
static slist *downloaded_html_files;

static void register_delete_file PARAMS ((const char *));
/* Functions for maintaining the URL queue.  */

struct queue_element {
  const char *url;
  const char *referer;
  int depth;
  struct queue_element *next;
};

struct url_queue {
  struct queue_element *head;
  struct queue_element *tail;
  int count, maxcount;
};
/* Create a URL queue.  */

static struct url_queue *
url_queue_new (void)
{
  struct url_queue *queue = xmalloc (sizeof (*queue));
  memset (queue, '\0', sizeof (*queue));
  return queue;
}

/* Delete a URL queue.  */

static void
url_queue_delete (struct url_queue *queue)
{
  xfree (queue);
}
/* Enqueue a URL in the queue.  The queue is FIFO: the items will be
   retrieved ("dequeued") from the queue in the order they were placed
   into it.  */

static void
url_enqueue (struct url_queue *queue,
             const char *url, const char *referer, int depth)
{
  struct queue_element *qel = xmalloc (sizeof (*qel));
  qel->url = url;
  qel->referer = referer;
  qel->depth = depth;
  qel->next = NULL;

  ++queue->count;
  if (queue->count > queue->maxcount)
    queue->maxcount = queue->count;

  DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  if (queue->tail)
    queue->tail->next = qel;
  queue->tail = qel;

  if (!queue->head)
    queue->head = queue->tail;
}
/* Take a URL out of the queue.  Return 1 if this operation succeeded,
   or 0 if the queue is empty.  */

static int
url_dequeue (struct url_queue *queue,
             const char **url, const char **referer, int *depth)
{
  struct queue_element *qel = queue->head;

  if (!qel)
    return 0;

  queue->head = queue->head->next;
  if (!queue->head)
    queue->tail = NULL;

  *url = qel->url;
  *referer = qel->referer;
  *depth = qel->depth;

  --queue->count;

  DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  xfree (qel);
  return 1;
}
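
/* Illustrative sketch (not part of the original source; compiled only
   if the hypothetical RECUR_QUEUE_EXAMPLE guard is defined).  It shows
   the intended FIFO behavior of the queue API above: URLs come back
   out of url_dequeue in the order they were passed to url_enqueue.
   The example URLs are made up.  */
#ifdef RECUR_QUEUE_EXAMPLE
static void
queue_example (void)
{
  struct url_queue *q = url_queue_new ();
  const char *url, *referer;
  int depth;

  url_enqueue (q, xstrdup ("http://example.com/a"), NULL, 0);
  url_enqueue (q, xstrdup ("http://example.com/b"), NULL, 1);

  /* Dequeues "http://example.com/a" first, then "http://example.com/b". */
  while (url_dequeue (q, &url, &referer, &depth))
    {
      DEBUGP (("got %s at depth %d\n", url, depth));
      xfree ((char *) url);
      FREE_MAYBE ((char *) referer);
    }
  url_queue_delete (q);
}
#endif /* RECUR_QUEUE_EXAMPLE */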
static int download_child_p PARAMS ((const struct urlpos *, struct url *, int,
				     struct url *, struct hash_table *));
static int descend_redirect_p PARAMS ((const char *, const char *, int,
				       struct url *, struct hash_table *));
/* Retrieve a part of the web beginning with START_URL.  This used to
   be called "recursive retrieval", because the old function was
   recursive and implemented depth-first search.  retrieve_tree on the
   other hand implements breadth-first traversal of the tree, which
   results in much nicer ordering of downloads; a short illustration of
   that ordering follows this comment.

   The algorithm this function uses is simple:

   1. put START_URL in the queue.
   2. while there are URLs in the queue:

     3. get next URL from the queue.
     4. download it.
     5. if the URL is HTML and its depth does not exceed maximum depth,
        get the list of URLs embedded therein.
     6. for each of those URLs do the following:

       7. if the URL is not one of those downloaded before, and if it
          satisfies the criteria specified by the various command-line
          options, add it to the queue.  */
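
/* Illustration (added for exposition, not from the original source):
   suppose page A links to B and C, and B links to D.  Breadth-first
   traversal downloads A, B, C, D -- everything at depth 1 before
   anything at depth 2 -- whereas the old depth-first code would have
   fetched A, B, D, C, descending completely into B before ever
   reaching C.  */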
uerr_t
retrieve_tree (const char *start_url)
{
  uerr_t status = RETROK;

  /* The queue of URLs we need to load.  */
  struct url_queue *queue = url_queue_new ();

  /* The URLs we do not wish to enqueue, because they are already in
     the queue, but haven't been downloaded yet.  */
  struct hash_table *blacklist = make_string_hash_table (0);

  /* We'll need various components of this, so better get it over with
     now.  */
  struct url *start_url_parsed = url_parse (start_url, NULL);

  url_enqueue (queue, xstrdup (start_url), NULL, 0);
  string_set_add (blacklist, start_url);
  while (1)
    {
      int descend = 0;
      char *url, *referer, *file = NULL;
      int depth;
      boolean dash_p_leaf_HTML = FALSE;

      if (downloaded_exceeds_quota ())
        break;
      if (status == FWRITEERR)
        break;

      /* Get the next URL from the queue...  */

      if (!url_dequeue (queue,
                        (const char **)&url, (const char **)&referer,
                        &depth))
        break;

      /* ...and download it.  Note that this download is in most cases
         unconditional, as download_child_p already makes sure a file
         doesn't get enqueued twice -- and yet this check is here, and
         not in download_child_p.  This is so that if you run `wget -r
         URL1 URL2', and a random URL is encountered once under URL1
         and again under URL2, but at a different (possibly smaller)
         depth, we want the URL's children to be taken into account
         the second time.  */
      if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
        {
          DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
                   url, (char *)hash_table_get (dl_url_file_map, url)));
        }
      else
        {
          int dt = 0;
          char *redirected = NULL;
          int oldrec = opt.recursive;

          opt.recursive = 0;
          status = retrieve_url (url, &file, &redirected, NULL, &dt);
          opt.recursive = oldrec;

          if (file && status == RETROK
              && (dt & RETROKF) && (dt & TEXTHTML))
            descend = 1;

          if (redirected)
            {
              /* We have been redirected, possibly to another host, or
                 different path, or wherever.  Check whether we really
                 want to follow it.  */
              if (!descend_redirect_p (redirected, url, depth,
                                       start_url_parsed, blacklist))
                descend = 0;
              else
                /* Make sure that the old pre-redirect form gets
                   blacklisted.  */
                string_set_add (blacklist, url);
            }
        }
      if (descend
          && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
        {
          if (opt.page_requisites
              && (depth == opt.reclevel || depth == opt.reclevel + 1))
            {
              /* When -p is specified, we are allowed to exceed the
                 maximum depth, but only for the "inline" links,
                 i.e. those that are needed to display the page.
                 Originally this could exceed the depth at most by
                 one, but we allow one more level so that the leaf
                 pages that contain frames can be loaded
                 correctly.  */
              dash_p_leaf_HTML = TRUE;
            }
          else
            {
              /* Either -p wasn't specified or it was and we've
                 already spent the two extra (pseudo-)levels that it
                 affords us, so we need to bail out.  */
              DEBUGP (("Not descending further; at depth %d, max. %d.\n",
                       depth, opt.reclevel));
              descend = 0;
            }
        }
      /* If the downloaded document was HTML, parse it and enqueue the
         links it contains.  */

      if (descend)
        {
          int meta_disallow_follow = 0;
          struct urlpos *children
            = get_urls_html (file, url, &meta_disallow_follow);

          if (opt.use_robots && meta_disallow_follow)
            {
              free_urlpos (children);
              children = NULL;
            }

          if (children)
            {
              struct urlpos *child = children;
              struct url *url_parsed = url_parse (url, NULL);
              assert (url_parsed != NULL);

              for (; child; child = child->next)
                {
                  if (child->ignore_when_downloading)
                    continue;
                  if (dash_p_leaf_HTML && !child->link_inline_p)
                    continue;
                  if (download_child_p (child, url_parsed, depth, start_url_parsed,
                                        blacklist))
                    {
                      url_enqueue (queue, xstrdup (child->url->url),
                                   xstrdup (url), depth + 1);
                      /* We blacklist the URL we have enqueued, because we
                         don't want to enqueue (and hence download) the
                         same URL twice.  */
                      string_set_add (blacklist, child->url->url);
                    }
                }

              url_free (url_parsed);
              free_urlpos (children);
            }
        }
      if (opt.delete_after || (file && !acceptable (file)))
        {
          /* Either --delete-after was specified, or we loaded this
             otherwise rejected (e.g. by -R) HTML file just so we
             could harvest its hyperlinks -- in either case, delete
             the local file.  */
          DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                   opt.delete_after ? "--delete-after" :
                   "recursive rejection criteria"));
          logprintf (LOG_VERBOSE,
                     (opt.delete_after
                      ? _("Removing %s.\n")
                      : _("Removing %s since it should be rejected.\n")),
                     file);
          if (unlink (file))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
          register_delete_file (file);
        }

      FREE_MAYBE (referer);
  /* If anything is left of the queue due to a premature exit, free it
     now.  */
  {
    char *d1, *d2;
    int d3;
    while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3))
      {
        xfree (d1);
        FREE_MAYBE (d2);
      }
  }
  url_queue_delete (queue);

  if (start_url_parsed)
    url_free (start_url_parsed);
  string_set_free (blacklist);

  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  else if (status == FWRITEERR)
    return FWRITEERR;
  else
    return RETROK;
}
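
/* Illustrative sketch (not part of the original source; compiled only
   if the hypothetical RECUR_TREE_EXAMPLE guard is defined): the whole
   breadth-first machinery above is driven through this single entry
   point.  The start URL is made up.  */
#ifdef RECUR_TREE_EXAMPLE
static void
retrieve_tree_example (void)
{
  uerr_t status = retrieve_tree ("http://example.com/");
  if (status != RETROK)
    DEBUGP (("Recursive retrieval did not finish cleanly.\n"));
}
#endif /* RECUR_TREE_EXAMPLE */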
/* Based on the context provided by retrieve_tree, decide whether a
   URL is to be descended to.  This is only ever called from
   retrieve_tree, but is in a separate function for clarity.

   The most expensive checks (such as those for robots) are memoized
   by storing these URLs to BLACKLIST.  This may or may not help.  It
   will help if those URLs are encountered many times.  */
static int
download_child_p (const struct urlpos *upos, struct url *parent, int depth,
                  struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *u = upos->url;
  const char *url = u->url;

  DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));

  if (string_set_contains (blacklist, url))
    {
      DEBUGP (("Already on the black list.\n"));
      goto out;
    }
  /* Several things to check for:
     1. if scheme is not http, and we don't load it
     2. check for relative links (if relative_only is set)
     3. check for domain
     4. check for no-parent
     5. check for excludes && includes
     6. check for suffix
     7. check for same host (if spanhost is unset), with possible
        gethostbyname baggage
     8. check for robots.txt

     Addendum: If the URL is FTP, and it is to be loaded, only the
     domain and suffix settings are "stronger".

     Note that .html files will get loaded regardless of suffix rules
     (but that is remedied later with unlink) unless the depth equals
     the maximum depth.

     More time- and memory- consuming tests should be put later on
     the list.  */
  /* 1. Schemes other than HTTP are normally not recursed into.  */
  if (u->scheme != SCHEME_HTTP
      && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
    {
      DEBUGP (("Not following non-HTTP schemes.\n"));
      goto out;
    }

  /* 2. If it is an absolute link and they are not followed, throw it
     out.  */
  if (u->scheme == SCHEME_HTTP)
    if (opt.relative_only && !upos->link_relative_p)
      {
        DEBUGP (("It doesn't really look like a relative link.\n"));
        goto out;
      }
  /* 3. If its domain is not to be accepted/looked-up, chuck it
     out.  */
  if (!accept_domain (u))
    {
      DEBUGP (("The domain was not accepted.\n"));
      goto out;
    }

  /* 4. Check for parent directory.

     If we descended to a different host or changed the scheme, ignore
     opt.no_parent.  Also ignore it for documents needed to display
     the parent page when in -p mode.  */
  if (opt.no_parent
      && u->scheme == start_url_parsed->scheme
      && 0 == strcasecmp (u->host, start_url_parsed->host)
      && u->port == start_url_parsed->port
      && !(opt.page_requisites && upos->link_inline_p))
    {
      if (!frontcmp (start_url_parsed->dir, u->dir))
        {
          DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
                   u->dir, start_url_parsed->dir));
          goto out;
        }
    }
  /* 5. If the file does not match the acceptance list, or is on the
     rejection list, chuck it out.  The same goes for the directory
     exclusion and inclusion lists.  */
  if (opt.includes || opt.excludes)
    {
      if (!accdir (u->dir, ALLABS))
        {
          DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
          goto out;
        }
    }
  {
    char *suf;

    /* Check for acceptance/rejection rules.  We ignore these rules
       for HTML documents because they might lead to other files which
       need to be downloaded.  Of course, we don't know which
       documents are HTML before downloading them, so we guess.

       A file is subject to acceptance/rejection rules if:

       * u->file is not "" (i.e. it is not a directory)
       and either:
         + there is no file suffix,
         + or there is a suffix, but it is not "html" or "htm",
         + both:
           - recursion is not infinite,
           - and we are at its very end.  */

    if (u->file[0] != '\0'
        && ((suf = suffix (url)) == NULL
            || (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
            || (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
      {
        if (!acceptable (u->file))
          {
            DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                     url, u->file));
            goto out;
          }
      }
  }
  if (u->scheme == parent->scheme)
    if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
      {
        DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
                 u->host, parent->host));
        goto out;
      }
  if (opt.use_robots && u->scheme == SCHEME_HTTP)
    {
      struct robot_specs *specs = res_get_specs (u->host, u->port);
      if (!specs)
        {
          char *rfile;
          if (res_retrieve_file (url, &rfile))
            {
              specs = res_parse_from_file (rfile);
              xfree (rfile);
            }
          else
            {
              /* If we cannot get real specs, at least produce
                 dummy ones so that we can register them and stop
                 trying to retrieve them.  */
              specs = res_parse ("", 0);
            }
          res_register_specs (u->host, u->port, specs);
        }

      /* Now that we have (or don't have) robots.txt specs, we can
         check what they say.  */
      if (!res_match_path (specs, u->path))
        {
          DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
          string_set_add (blacklist, url);
          goto out;
        }
    }
  /* The URL has passed all the tests.  It can be placed in the
     download queue.  */
  DEBUGP (("Decided to load it.\n"));

  return 1;

 out:
  DEBUGP (("Decided NOT to load it.\n"));

  return 0;
}
/* This function determines whether we will consider downloading the
   children of a URL whose download resulted in a redirection,
   possibly to another host, etc.  It is needed very rarely, and thus
   it is merely a simple-minded wrapper around download_child_p.  */
static int
descend_redirect_p (const char *redirected, const char *original, int depth,
                    struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *orig_parsed, *new_parsed;
  struct urlpos *upos;
  int success;

  orig_parsed = url_parse (original, NULL);
  assert (orig_parsed != NULL);

  new_parsed = url_parse (redirected, NULL);
  assert (new_parsed != NULL);

  upos = xmalloc (sizeof (struct urlpos));
  memset (upos, 0, sizeof (*upos));
  upos->url = new_parsed;

  success = download_child_p (upos, orig_parsed, depth,
                              start_url_parsed, blacklist);

  url_free (orig_parsed);
  url_free (new_parsed);
  xfree (upos);

  if (!success)
    DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));

  return success;
}
#define ENSURE_TABLES_EXIST do {			\
  if (!dl_file_url_map)					\
    dl_file_url_map = make_string_hash_table (0);	\
  if (!dl_url_file_map)					\
    dl_url_file_map = make_string_hash_table (0);	\
} while (0)
/* Return 1 if S1 and S2 are the same, except for "/index.html".  The
   cases in which it returns 1 are (substitute any substring for "foo"):

   m("foo/index.html", "foo/")  ==> 1
   m("foo/", "foo/index.html")  ==> 1
   m("foo", "foo/index.html")   ==> 1
   m("foo", "foo/")             ==> 1
   m("foo", "foo")              ==> 1  */
static int
match_except_index (const char *s1, const char *s2)
{
  int i;
  const char *lng;

  /* Skip common substring. */
  for (i = 0; *s1 && *s2 && *s1 == *s2; s1++, s2++, i++)
    ;
  if (i == 0)
    /* Strings differ at the very beginning -- bail out.  We need to
       check this explicitly to avoid `lng - 1' reading outside the
       array.  */
    return 0;

  if (!*s1 && !*s2)
    /* Both strings hit EOF -- strings are equal. */
    return 1;
  else if (*s1 && *s2)
    /* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux". */
    return 0;
  else if (*s1)
    /* S1 is the longer one. */
    lng = s1;
  else
    /* S2 is the longer one. */
    lng = s2;

  /* foo/index.html */ /* or */ /* foo/index.html */

  /* The right-hand case. */
  if (*lng == '/' && *(lng + 1) == '\0')
    return 1;

  /* The shorter string ended right after a '/', so LNG points just
     past it.  */
  if (*(lng - 1) == '/')
    return 0 == strcmp (lng, "index.html");

  return 0 == strcmp (lng, "/index.html");
}
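
/* Illustrative checks (not part of the original source; compiled only
   if the hypothetical RECUR_MATCH_EXAMPLE guard is defined), restating
   the behavior documented above match_except_index.  The URLs are made
   up.  */
#ifdef RECUR_MATCH_EXAMPLE
static void
match_except_index_example (void)
{
  /* Identical strings match. */
  assert (match_except_index ("foo", "foo"));
  /* Differing only by a trailing "/index.html" also matches. */
  assert (match_except_index ("http://host/dir/", "http://host/dir/index.html"));
  /* Genuinely different paths do not. */
  assert (!match_except_index ("http://host/a.html", "http://host/b.html"));
}
#endif /* RECUR_MATCH_EXAMPLE */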
static int
dissociate_urls_from_file_mapper (void *key, void *value, void *arg)
{
  char *mapping_url = (char *)key;
  char *mapping_file = (char *)value;
  char *file = (char *)arg;

  if (0 == strcmp (mapping_file, file))
    {
      hash_table_remove (dl_url_file_map, mapping_url);
      xfree (mapping_url);
      xfree (mapping_file);
    }

  /* Continue mapping. */
  return 0;
}

/* Remove all associations from various URLs to FILE from dl_url_file_map.  */

static void
dissociate_urls_from_file (const char *file)
{
  hash_table_map (dl_url_file_map, dissociate_urls_from_file_mapper,
                  (void *)file);
}
/* Register that URL has been successfully downloaded to FILE.  This
   is used by the link conversion code to convert references to URLs
   to references to local files.  It is also used to check whether a
   URL has already been downloaded.  */
void
register_download (const char *url, const char *file)
{
  char *old_file, *old_url;

  ENSURE_TABLES_EXIST;

  /* With some forms of retrieval it is possible, although not likely
     or particularly desirable, for two different URLs to end up in the
     same local file.  If both are downloaded, the second
     download will override the first one.  When that happens,
     dissociate the old file name from the URL.  */

  if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
    {
      if (0 == strcmp (url, old_url))
        /* We have somehow managed to download the same URL twice.
           Nothing to do.  */
        return;

      if (match_except_index (url, old_url)
          && !hash_table_contains (dl_url_file_map, url))
        /* The two URLs differ only in the "index.html" ending.  For
           example, one is "http://www.server.com/", and the other is
           "http://www.server.com/index.html".  Don't remove the old
           one, just add the new one as a non-canonical entry.  */
        goto url_only;

      hash_table_remove (dl_file_url_map, file);

      /* Remove all the URLs that point to this file.  Yes, there can
         be more than one such URL, because we store redirections as
         multiple entries in dl_url_file_map.  For example, if URL1
         redirects to URL2 which gets downloaded to FILE, we map both
         URL1 and URL2 to FILE in dl_url_file_map.  (dl_file_url_map
         only points to URL2.)  When another URL gets loaded to FILE,
         we want both URL1 and URL2 dissociated from it.

         This is a relatively expensive operation because it performs
         a linear search of the whole hash table, but it should be
         called very rarely, only when two URLs resolve to the same
         file name, *and* the "<file>.1" extensions are turned off.
         In other words, almost never.  */
      dissociate_urls_from_file (file);
    }

  hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
 url_only:
  /* A URL->FILE mapping is not possible without a FILE->URL mapping.
     If the latter were present, it should have been removed by the
     above `if'.  So we could write:

       assert (!hash_table_contains (dl_url_file_map, url));

     The above is correct when running in recursive mode where the
     same URL always resolves to the same file.  But if you do
     something like:

       wget URL URL

     then the first URL will resolve to "FILE", and the other to
     "FILE.1".  In that case, FILE.1 will not be found in
     dl_file_url_map, but URL will still point to FILE in
     dl_url_file_map.  */
  if (hash_table_get_pair (dl_url_file_map, url, &old_url, &old_file))
    {
      hash_table_remove (dl_url_file_map, url);
      xfree (old_url);
      xfree (old_file);
    }

  hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
}
/* Register that FROM has been redirected to TO.  This assumes that TO
   is successfully downloaded and already registered using
   register_download() above.  */

void
register_redirection (const char *from, const char *to)
{
  char *file;

  ENSURE_TABLES_EXIST;

  file = hash_table_get (dl_url_file_map, to);
  assert (file != NULL);
  if (!hash_table_contains (dl_url_file_map, from))
    hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
}
/* Register that the file has been deleted.  */

static void
register_delete_file (const char *file)
{
  char *old_url, *old_file;

  ENSURE_TABLES_EXIST;

  if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
    return;

  hash_table_remove (dl_file_url_map, file);
  xfree (old_url);
  xfree (old_file);
  dissociate_urls_from_file (file);
}
/* Register that FILE is an HTML file that has been downloaded.  */

void
register_html (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  downloaded_html_files = slist_prepend (downloaded_html_files, file);
}
/* This function is called when the retrieval is done to convert the
   links that have been downloaded.  It has to be called at the end of
   the retrieval, because only then does Wget know conclusively which
   URLs have been downloaded, and which not, so it can tell which
   direction to convert to.

   The "direction" means that the URLs to the files that have been
   downloaded get converted to the relative URL which will point to
   that file, while the other URLs get converted to complete URLs
   pointing back to the remote server.
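
   (Illustration added for exposition: if http://host/dir/page.html was
   saved as host/dir/page.html, a link in it to http://host/dir/pic.png
   that was also downloaded becomes the relative "pic.png", while a
   link to a page that was not downloaded is rewritten as a complete
   absolute URL so it still works when the page is browsed locally.)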
   All the downloaded HTML files are kept in downloaded_html_files, and
   the downloaded URLs in the dl_file_url_map and dl_url_file_map hash
   tables.  All the information is extracted from those structures.  */
void
convert_all_links (void)
{
  slist *html;
  long msecs;
  int file_count = 0;

  struct wget_timer *timer = wtimer_new ();
  struct hash_table *seen = make_string_hash_table (0);

  /* Destructively reverse downloaded_html_files to get it in the right order.
     recursive_retrieve() used slist_prepend() consistently.  */
  downloaded_html_files = slist_nreverse (downloaded_html_files);

  for (html = downloaded_html_files; html; html = html->next)
    {
      struct urlpos *urls, *cur_url;
      char *url;
      char *file = html->string;

      /* Guard against duplicates. */
      if (string_set_contains (seen, file))
        continue;
      string_set_add (seen, file);

      /* Determine the URL of the HTML file.  get_urls_html will need
         it.  */
      url = hash_table_get (dl_file_url_map, file);
      if (!url)
        {
          DEBUGP (("Apparently %s has been removed.\n", file));
          continue;
        }

      DEBUGP (("Scanning %s (from %s)\n", file, url));

      /* Parse the HTML file...  */
      urls = get_urls_html (file, url, NULL);
      /* We don't respect meta_disallow_follow here because, even if
         the file is not followed, we might still want to convert the
         links that have been followed from other files.  */

      for (cur_url = urls; cur_url; cur_url = cur_url->next)
        {
          char *local_name;
          struct url *u = cur_url->url;

          if (cur_url->link_base_p)
            {
              /* Base references have been resolved by our parser, so
                 we turn the base URL into an empty string.  (Perhaps
                 we should remove the tag entirely?)  */
              cur_url->convert = CO_NULLIFY_BASE;
              continue;
            }
          /* We decide the direction of conversion according to whether
             a URL was downloaded.  Downloaded URLs will be converted
             ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
          local_name = hash_table_get (dl_url_file_map, u->url);

          /* Decide on the conversion type.  */
          if (local_name)
            {
              /* We've downloaded this URL.  Convert it to relative
                 form.  We do this even if the URL already is in
                 relative form, because our directory structure may
                 not be identical to that on the server (think `-nd',
                 `--cut-dirs', etc.)  */
              cur_url->convert = CO_CONVERT_TO_RELATIVE;
              cur_url->local_name = xstrdup (local_name);
              DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
            }
          else
            {
              /* We haven't downloaded this URL.  If it's not already
                 complete (including a full host name), convert it to
                 that form, so that it can be reached while browsing
                 this HTML file locally.  */
              if (!cur_url->link_complete_p)
                cur_url->convert = CO_CONVERT_TO_COMPLETE;
              cur_url->local_name = NULL;
              DEBUGP (("will convert url %s to complete\n", u->url));
            }
        }
      /* Convert the links in the file.  */
      convert_links (file, urls);

      /* Free the data.  */
      free_urlpos (urls);

      ++file_count;
    }

  msecs = wtimer_elapsed (timer);
  wtimer_delete (timer);
  logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
             file_count, (double)msecs / 1000);

  string_set_free (seen);
}
/* Clean up the data structures associated with recursive retrieving
   (the variables above).  */

void
recursive_cleanup (void)
{
  if (dl_file_url_map)
    {
      free_keys_and_values (dl_file_url_map);
      hash_table_destroy (dl_file_url_map);
      dl_file_url_map = NULL;
    }
  if (dl_url_file_map)
    {
      free_keys_and_values (dl_url_file_map);
      hash_table_destroy (dl_url_file_map);
      dl_url_file_map = NULL;
    }
  slist_free (downloaded_html_files);
  downloaded_html_files = NULL;
}
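
/* Illustrative sketch (not part of the original source; compiled only
   if the hypothetical RECUR_REGISTER_EXAMPLE guard is defined).  It
   shows how the registration calls above cooperate: register_download
   records the URL<->file mappings, register_redirection adds an extra
   URL->file entry for the pre-redirect URL, and register_html queues
   the file for link conversion.  The URLs and file name are made up.  */
#ifdef RECUR_REGISTER_EXAMPLE
static void
register_example (void)
{
  register_download ("http://example.com/index.html", "example.com/index.html");
  /* "http://example.com/" redirected to the URL registered above.  */
  register_redirection ("http://example.com/", "http://example.com/index.html");
  /* Remember that the file is HTML so convert_all_links picks it up.  */
  register_html ("http://example.com/index.html", "example.com/index.html");

  /* After retrieval, rewrite the links in all registered HTML files
     and release the bookkeeping structures.  */
  convert_all_links ();
  recursive_cleanup ();
}
#endif /* RECUR_REGISTER_EXAMPLE */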