sjero.net Git - wget/blob - src/recur.c

   1 /* Handling of recursive HTTP retrieving.
   2    Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2 of the License, or
   9 (at your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget; if not, write to the Free Software
  18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #ifdef HAVE_STRING_H
  25 # include <string.h>
  26 #else
  27 # include <strings.h>
  28 #endif /* HAVE_STRING_H */
  29 #ifdef HAVE_UNISTD_H
  30 # include <unistd.h>
  31 #endif /* HAVE_UNISTD_H */
  32 #include <errno.h>
  33 #include <assert.h>
  34 #include <sys/types.h>
  35
  36 #include "wget.h"
  37 #include "url.h"
  38 #include "recur.h"
  39 #include "utils.h"
  40 #include "retr.h"
  41 #include "ftp.h"
  42 #include "fnmatch.h"
  43 #include "host.h"
  44 #include "hash.h"
  45 #include "res.h"
  46
  47 #ifndef errno
  48 extern int errno;
  49 #endif
  50
  51 extern char *version_string;
  52
/* Maps a local file name to the URL that was downloaded into it.
   Filled in by register_download and created lazily by
   ENSURE_TABLES_EXIST, so it may still be NULL.  */
static struct hash_table *dl_file_url_map;
/* Maps a URL to the local file name it was downloaded to.  Besides
   the entries added by register_download, register_redirection adds
   entries for URLs that redirected to an already registered URL.
   Created lazily by ENSURE_TABLES_EXIST, so it may still be NULL.  */
static struct hash_table *dl_url_file_map;
  55
  56 /* List of HTML files downloaded in this Wget run.  Used for link
  57    conversion after Wget is done.  This list should only be traversed
  58    in order.  If you need to check whether a file has been downloaded,
  59    use a hash table, e.g. dl_file_url_map.  */
  60 static slist *downloaded_html_files;
  61
  62 static void register_delete_file PARAMS ((const char *));
  63 \f
  64 /* Functions for maintaining the URL queue.  */
  65
/* One entry of the URL queue: a singly linked list node.  The queue
   functions only store and hand back the URL and REFERER pointers;
   they never copy or free the strings themselves.  */
struct queue_element {
  const char *url;              /* URL to download. */
  const char *referer;          /* URL of the referring document; may
                                   be NULL (it is for the start URL). */
  int depth;                    /* Recursion depth at which URL was
                                   found. */
  struct queue_element *next;   /* Next element, or NULL for the last
                                   one. */
};
  72
/* FIFO queue of URLs.  Elements are appended at TAIL by url_enqueue
   and removed from HEAD by url_dequeue.  */
struct url_queue {
  struct queue_element *head;   /* Next element to dequeue, or NULL
                                   when the queue is empty. */
  struct queue_element *tail;   /* Most recently enqueued element. */
  int count, maxcount;          /* Current number of elements, and the
                                   largest value COUNT has reached. */
};
  78
  79 /* Create a URL queue. */
  80
  81 static struct url_queue *
  82 url_queue_new (void)
  83 {
  84   struct url_queue *queue = xmalloc (sizeof (*queue));
  85   memset (queue, '\0', sizeof (*queue));
  86   return queue;
  87 }
  88
/* Delete a URL queue.  Only the queue structure itself is released;
   elements still linked into it are not touched here, so the caller
   should dequeue everything first (retrieve_tree does so before
   calling this).  */

static void
url_queue_delete (struct url_queue *queue)
{
  xfree (queue);
}
  96
  97 /* Enqueue a URL in the queue.  The queue is FIFO: the items will be
  98    retrieved ("dequeued") from the queue in the order they were placed
  99    into it.  */
 100
 101 static void
 102 url_enqueue (struct url_queue *queue,
 103              const char *url, const char *referer, int depth)
 104 {
 105   struct queue_element *qel = xmalloc (sizeof (*qel));
 106   qel->url = url;
 107   qel->referer = referer;
 108   qel->depth = depth;
 109   qel->next = NULL;
 110
 111   ++queue->count;
 112   if (queue->count > queue->maxcount)
 113     queue->maxcount = queue->count;
 114
 115   DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
 116   DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
 117
 118   if (queue->tail)
 119     queue->tail->next = qel;
 120   queue->tail = qel;
 121
 122   if (!queue->head)
 123     queue->head = queue->tail;
 124 }
 125
 126 /* Take a URL out of the queue.  Return 1 if this operation succeeded,
 127    or 0 if the queue is empty.  */
 128
 129 static int
 130 url_dequeue (struct url_queue *queue,
 131              const char **url, const char **referer, int *depth)
 132 {
 133   struct queue_element *qel = queue->head;
 134
 135   if (!qel)
 136     return 0;
 137
 138   queue->head = queue->head->next;
 139   if (!queue->head)
 140     queue->tail = NULL;
 141
 142   *url = qel->url;
 143   *referer = qel->referer;
 144   *depth = qel->depth;
 145
 146   --queue->count;
 147
 148   DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
 149   DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
 150
 151   xfree (qel);
 152   return 1;
 153 }
 154 \f
 155 static int download_child_p PARAMS ((const struct urlpos *, struct url *, int,
 156                                      struct url *, struct hash_table *));
 157 static int descend_redirect_p PARAMS ((const char *, const char *, int,
 158                                        struct url *, struct hash_table *));
 159
 160
 161 /* Retrieve a part of the web beginning with START_URL.  This used to
 162    be called "recursive retrieval", because the old function was
 163    recursive and implemented depth-first search.  retrieve_tree on the
 164    other hand implements breadth-search traversal of the tree, which
 165    results in much nicer ordering of downloads.
 166
 167    The algorithm this function uses is simple:
 168
 169    1. put START_URL in the queue.
 170    2. while there are URLs in the queue:
 171
 172      3. get next URL from the queue.
 173      4. download it.
 174      5. if the URL is HTML and its depth does not exceed maximum depth,
 175         get the list of URLs embedded therein.
 176      6. for each of those URLs do the following:
 177
 178        7. if the URL is not one of those downloaded before, and if it
 179           satisfies the criteria specified by the various command-line
 180           options, add it to the queue. */
 181
 182 uerr_t
 183 retrieve_tree (const char *start_url)
 184 {
 185   uerr_t status = RETROK;
 186
 187   /* The queue of URLs we need to load. */
 188   struct url_queue *queue = url_queue_new ();
 189
 190   /* The URLs we do not wish to enqueue, because they are already in
 191      the queue, but haven't been downloaded yet.  */
 192   struct hash_table *blacklist = make_string_hash_table (0);
 193
 194   /* We'll need various components of this, so better get it over with
 195      now. */
 196   struct url *start_url_parsed = url_parse (start_url, NULL);
 197
 198   url_enqueue (queue, xstrdup (start_url), NULL, 0);
 199   string_set_add (blacklist, start_url);
 200
 201   while (1)
 202     {
 203       int descend = 0;
 204       char *url, *referer, *file = NULL;
 205       int depth;
 206       boolean dash_p_leaf_HTML = FALSE;
 207
 208       if (downloaded_exceeds_quota ())
 209         break;
 210       if (status == FWRITEERR)
 211         break;
 212
 213       /* Get the next URL from the queue... */
 214
 215       if (!url_dequeue (queue,
 216                         (const char **)&url, (const char **)&referer,
 217                         &depth))
 218         break;
 219
 220       /* ...and download it.  Note that this download is in most cases
 221          unconditional, as download_child_p already makes sure a file
 222          doesn't get enqueued twice -- and yet this check is here, and
 223          not in download_child_p.  This is so that if you run `wget -r
 224          URL1 URL2', and a random URL is encountered once under URL1
 225          and again under URL2, but at a different (possibly smaller)
 226          depth, we want the URL's children to be taken into account
 227          the second time.  */
 228       if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
 229         {
 230           DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
 231                    url, (char *)hash_table_get (dl_url_file_map, url)));
 232         }
 233       else
 234         {
 235           int dt = 0;
 236           char *redirected = NULL;
 237           int oldrec = opt.recursive;
 238
 239           opt.recursive = 0;
 240           status = retrieve_url (url, &file, &redirected, NULL, &dt);
 241           opt.recursive = oldrec;
 242
 243           if (file && status == RETROK
 244               && (dt & RETROKF) && (dt & TEXTHTML))
 245             descend = 1;
 246
 247           if (redirected)
 248             {
 249               /* We have been redirected, possibly to another host, or
 250                  different path, or wherever.  Check whether we really
 251                  want to follow it.  */
 252               if (descend)
 253                 {
 254                   if (!descend_redirect_p (redirected, url, depth,
 255                                            start_url_parsed, blacklist))
 256                     descend = 0;
 257                   else
 258                     /* Make sure that the old pre-redirect form gets
 259                        blacklisted. */
 260                     string_set_add (blacklist, url);
 261                 }
 262
 263               xfree (url);
 264               url = redirected;
 265             }
 266         }
 267
 268       if (descend
 269           && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
 270         {
 271           if (opt.page_requisites
 272               && (depth == opt.reclevel || depth == opt.reclevel + 1))
 273             {
 274               /* When -p is specified, we are allowed to exceed the
 275                  maximum depth, but only for the "inline" links,
 276                  i.e. those that are needed to display the page.
 277                  Originally this could exceed the depth at most by
 278                  one, but we allow one more level so that the leaf
 279                  pages that contain frames can be loaded
 280                  correctly.  */
 281               dash_p_leaf_HTML = TRUE;
 282             }
 283           else
 284             {
 285               /* Either -p wasn't specified or it was and we've
 286                  already spent the two extra (pseudo-)levels that it
 287                  affords us, so we need to bail out. */
 288               DEBUGP (("Not descending further; at depth %d, max. %d.\n",
 289                        depth, opt.reclevel));
 290               descend = 0;
 291             }
 292         }
 293
 294       /* If the downloaded document was HTML, parse it and enqueue the
 295          links it contains. */
 296
 297       if (descend)
 298         {
 299           int meta_disallow_follow = 0;
 300           struct urlpos *children
 301             = get_urls_html (file, url, &meta_disallow_follow);
 302
 303           if (opt.use_robots && meta_disallow_follow)
 304             {
 305               free_urlpos (children);
 306               children = NULL;
 307             }
 308
 309           if (children)
 310             {
 311               struct urlpos *child = children;
 312               struct url *url_parsed = url_parsed = url_parse (url, NULL);
 313               assert (url_parsed != NULL);
 314
 315               for (; child; child = child->next)
 316                 {
 317                   if (child->ignore_when_downloading)
 318                     continue;
 319                   if (dash_p_leaf_HTML && !child->link_inline_p)
 320                     continue;
 321                   if (download_child_p (child, url_parsed, depth, start_url_parsed,
 322                                         blacklist))
 323                     {
 324                       url_enqueue (queue, xstrdup (child->url->url),
 325                                    xstrdup (url), depth + 1);
 326                       /* We blacklist the URL we have enqueued, because we
 327                          don't want to enqueue (and hence download) the
 328                          same URL twice.  */
 329                       string_set_add (blacklist, child->url->url);
 330                     }
 331                 }
 332
 333               url_free (url_parsed);
 334               free_urlpos (children);
 335             }
 336         }
 337
 338       if (opt.delete_after || (file && !acceptable (file)))
 339         {
 340           /* Either --delete-after was specified, or we loaded this
 341              otherwise rejected (e.g. by -R) HTML file just so we
 342              could harvest its hyperlinks -- in either case, delete
 343              the local file. */
 344           DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
 345                    opt.delete_after ? "--delete-after" :
 346                    "recursive rejection criteria"));
 347           logprintf (LOG_VERBOSE,
 348                      (opt.delete_after
 349                       ? _("Removing %s.\n")
 350                       : _("Removing %s since it should be rejected.\n")),
 351                      file);
 352           if (unlink (file))
 353             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
 354           register_delete_file (file);
 355         }
 356
 357       xfree (url);
 358       FREE_MAYBE (referer);
 359       FREE_MAYBE (file);
 360     }
 361
 362   /* If anything is left of the queue due to a premature exit, free it
 363      now.  */
 364   {
 365     char *d1, *d2;
 366     int d3;
 367     while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3))
 368       {
 369         xfree (d1);
 370         FREE_MAYBE (d2);
 371       }
 372   }
 373   url_queue_delete (queue);
 374
 375   if (start_url_parsed)
 376     url_free (start_url_parsed);
 377   string_set_free (blacklist);
 378
 379   if (downloaded_exceeds_quota ())
 380     return QUOTEXC;
 381   else if (status == FWRITEERR)
 382     return FWRITEERR;
 383   else
 384     return RETROK;
 385 }
 386
 387 /* Based on the context provided by retrieve_tree, decide whether a
 388    URL is to be descended to.  This is only ever called from
 389    retrieve_tree, but is in a separate function for clarity.
 390
 391    The most expensive checks (such as those for robots) are memoized
 392    by storing these URLs to BLACKLIST.  This may or may not help.  It
 393    will help if those URLs are encountered many times.  */
 394
 395 static int
 396 download_child_p (const struct urlpos *upos, struct url *parent, int depth,
 397                   struct url *start_url_parsed, struct hash_table *blacklist)
 398 {
 399   struct url *u = upos->url;
 400   const char *url = u->url;
 401
 402   DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));
 403
 404   if (string_set_contains (blacklist, url))
 405     {
 406       DEBUGP (("Already on the black list.\n"));
 407       goto out;
 408     }
 409
 410   /* Several things to check for:
 411      1. if scheme is not http, and we don't load it
 412      2. check for relative links (if relative_only is set)
 413      3. check for domain
 414      4. check for no-parent
 415      5. check for excludes && includes
 416      6. check for suffix
 417      7. check for same host (if spanhost is unset), with possible
 418      gethostbyname baggage
 419      8. check for robots.txt
 420
 421      Addendum: If the URL is FTP, and it is to be loaded, only the
 422      domain and suffix settings are "stronger".
 423
 424      Note that .html files will get loaded regardless of suffix rules
 425      (but that is remedied later with unlink) unless the depth equals
 426      the maximum depth.
 427
 428      More time- and memory- consuming tests should be put later on
 429      the list.  */
 430
 431   /* 1. Schemes other than HTTP are normally not recursed into. */
 432   if (u->scheme != SCHEME_HTTP
 433       && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
 434     {
 435       DEBUGP (("Not following non-HTTP schemes.\n"));
 436       goto out;
 437     }
 438
 439   /* 2. If it is an absolute link and they are not followed, throw it
 440      out.  */
 441   if (u->scheme == SCHEME_HTTP)
 442     if (opt.relative_only && !upos->link_relative_p)
 443       {
 444         DEBUGP (("It doesn't really look like a relative link.\n"));
 445         goto out;
 446       }
 447
 448   /* 3. If its domain is not to be accepted/looked-up, chuck it
 449      out.  */
 450   if (!accept_domain (u))
 451     {
 452       DEBUGP (("The domain was not accepted.\n"));
 453       goto out;
 454     }
 455
 456   /* 4. Check for parent directory.
 457
 458      If we descended to a different host or changed the scheme, ignore
 459      opt.no_parent.  Also ignore it for documents needed to display
 460      the parent page when in -p mode.  */
 461   if (opt.no_parent
 462       && u->scheme == start_url_parsed->scheme
 463       && 0 == strcasecmp (u->host, start_url_parsed->host)
 464       && u->port == start_url_parsed->port
 465       && !(opt.page_requisites && upos->link_inline_p))
 466     {
 467       if (!frontcmp (start_url_parsed->dir, u->dir))
 468         {
 469           DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
 470                    u->dir, start_url_parsed->dir));
 471           goto out;
 472         }
 473     }
 474
 475   /* 5. If the file does not match the acceptance list, or is on the
 476      rejection list, chuck it out.  The same goes for the directory
 477      exclusion and inclusion lists.  */
 478   if (opt.includes || opt.excludes)
 479     {
 480       if (!accdir (u->dir, ALLABS))
 481         {
 482           DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
 483           goto out;
 484         }
 485     }
 486
 487   /* 6. */
 488   {
 489     char *suf;
 490     /* Check for acceptance/rejection rules.  We ignore these rules
 491        for HTML documents because they might lead to other files which
 492        need to be downloaded.  Of course, we don't know which
 493        documents are HTML before downloading them, so we guess.
 494
 495        A file is subject to acceptance/rejection rules if:
 496
 497        * u->file is not "" (i.e. it is not a directory)
 498        and either:
 499          + there is no file suffix,
 500          + or there is a suffix, but is not "html" or "htm",
 501          + both:
 502            - recursion is not infinite,
 503            - and we are at its very end. */
 504
 505     if (u->file[0] != '\0'
 506         && ((suf = suffix (url)) == NULL
 507             || (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
 508             || (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
 509       {
 510         if (!acceptable (u->file))
 511           {
 512             DEBUGP (("%s (%s) does not match acc/rej rules.\n",
 513                      url, u->file));
 514             goto out;
 515           }
 516       }
 517   }
 518
 519   /* 7. */
 520   if (u->scheme == parent->scheme)
 521     if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
 522       {
 523         DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
 524                  u->host, parent->host));
 525         goto out;
 526       }
 527
 528   /* 8. */
 529   if (opt.use_robots && u->scheme == SCHEME_HTTP)
 530     {
 531       struct robot_specs *specs = res_get_specs (u->host, u->port);
 532       if (!specs)
 533         {
 534           char *rfile;
 535           if (res_retrieve_file (url, &rfile))
 536             {
 537               specs = res_parse_from_file (rfile);
 538               xfree (rfile);
 539             }
 540           else
 541             {
 542               /* If we cannot get real specs, at least produce
 543                  dummy ones so that we can register them and stop
 544                  trying to retrieve them.  */
 545               specs = res_parse ("", 0);
 546             }
 547           res_register_specs (u->host, u->port, specs);
 548         }
 549
 550       /* Now that we have (or don't have) robots.txt specs, we can
 551          check what they say.  */
 552       if (!res_match_path (specs, u->path))
 553         {
 554           DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
 555           string_set_add (blacklist, url);
 556           goto out;
 557         }
 558     }
 559
 560   /* The URL has passed all the tests.  It can be placed in the
 561      download queue. */
 562   DEBUGP (("Decided to load it.\n"));
 563
 564   return 1;
 565
 566  out:
 567   DEBUGP (("Decided NOT to load it.\n"));
 568
 569   return 0;
 570 }
 571
 572 /* This function determines whether we will consider downloading the
 573    children of a URL whose download resulted in a redirection,
 574    possibly to another host, etc.  It is needed very rarely, and thus
 575    it is merely a simple-minded wrapper around download_child_p.  */
 576
 577 static int
 578 descend_redirect_p (const char *redirected, const char *original, int depth,
 579                     struct url *start_url_parsed, struct hash_table *blacklist)
 580 {
 581   struct url *orig_parsed, *new_parsed;
 582   struct urlpos *upos;
 583   int success;
 584
 585   orig_parsed = url_parse (original, NULL);
 586   assert (orig_parsed != NULL);
 587
 588   new_parsed = url_parse (redirected, NULL);
 589   assert (new_parsed != NULL);
 590
 591   upos = xmalloc (sizeof (struct urlpos));
 592   memset (upos, 0, sizeof (*upos));
 593   upos->url = new_parsed;
 594
 595   success = download_child_p (upos, orig_parsed, depth,
 596                               start_url_parsed, blacklist);
 597
 598   url_free (orig_parsed);
 599   url_free (new_parsed);
 600   xfree (upos);
 601
 602   if (!success)
 603     DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));
 604
 605   return success;
 606 }
 607
 608 \f
/* Create dl_file_url_map and dl_url_file_map on first use.  Each
   table is created only if its pointer is still NULL, so using the
   macro repeatedly is harmless.  Wrapped in do { } while (0) so it
   can be used as a single statement.  */
#define ENSURE_TABLES_EXIST do {                        \
  if (!dl_file_url_map)                                 \
    dl_file_url_map = make_string_hash_table (0);       \
  if (!dl_url_file_map)                                 \
    dl_url_file_map = make_string_hash_table (0);       \
} while (0)
 615
 616 static int
 617 dissociate_urls_from_file_mapper (void *key, void *value, void *arg)
 618 {
 619   char *mapping_url = (char *)key;
 620   char *mapping_file = (char *)value;
 621   char *file = (char *)arg;
 622
 623   if (0 == strcmp (mapping_file, file))
 624     {
 625       hash_table_remove (dl_url_file_map, mapping_url);
 626       xfree (mapping_url);
 627       xfree (mapping_file);
 628     }
 629
 630   /* Continue mapping. */
 631   return 0;
 632 }
 633
 634 /* Remove all associations from various URLs to FILE from dl_url_file_map. */
 635
 636 static void
 637 dissociate_urls_from_file (const char *file)
 638 {
 639   hash_table_map (dl_url_file_map, dissociate_urls_from_file_mapper,
 640                   (char *)file);
 641 }
 642
 643 /* Register that URL has been successfully downloaded to FILE.  This
 644    is used by the link conversion code to convert references to URLs
 645    to references to local files.  It is also being used to check if a
 646    URL has already been downloaded.  */
 647
 648 void
 649 register_download (const char *url, const char *file)
 650 {
 651   char *old_file, *old_url;
 652
 653   ENSURE_TABLES_EXIST;
 654
 655   /* With some forms of retrieval, it is possible, although not
 656      likely, for different URLs to resolve to the same file name.  For
 657      example, "http://www.server.com/" and
 658      "http://www.server.com/index.html" will both resolve to the same
 659      file, "index.html".  If both are downloaded, the second download
 660      will override the first one.
 661
 662      If that happens, dissociate the old file name from the URL.  */
 663
 664   if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
 665     {
 666       if (0 == strcmp (url, old_url))
 667         /* We have somehow managed to download the same URL twice.
 668            Nothing to do.  */
 669         return;
 670
 671       hash_table_remove (dl_file_url_map, file);
 672       xfree (old_file);
 673       xfree (old_url);
 674
 675       /* Remove all the URLs that point to this file.  Yes, there can
 676          be more than one such URL, because we store redirections as
 677          multiple entries in dl_url_file_map.  For example, if URL1
 678          redirects to URL2 which gets downloaded to FILE, we map both
 679          URL1 and URL2 to FILE in dl_url_file_map.  (dl_file_url_map
 680          only points to URL2.)  When another URL gets loaded to FILE,
 681          we want both URL1 and URL2 dissociated from it.
 682
 683          This is a relatively expensive operation because it performs
 684          a linear search of the whole hash table, but it should be
 685          called very rarely, only when two URLs resolve to the same
 686          file name, *and* the "<file>.1" extensions are turned off.
 687          In other words, almost never.  */
 688       dissociate_urls_from_file (file);
 689     }
 690
 691   /* A URL->FILE mapping is not possible without a FILE->URL mapping.
 692      If the latter were present, it should have been removed by the
 693      above `if'.  */
 694   assert (!hash_table_contains (dl_url_file_map, url));
 695
 696   hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
 697   hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
 698 }
 699
 700 /* Register that FROM has been redirected to TO.  This assumes that TO
 701    is successfully downloaded and already registered using
 702    register_download() above.  */
 703
 704 void
 705 register_redirection (const char *from, const char *to)
 706 {
 707   char *file;
 708
 709   ENSURE_TABLES_EXIST;
 710
 711   file = hash_table_get (dl_url_file_map, to);
 712   assert (file != NULL);
 713   if (!hash_table_contains (dl_url_file_map, from))
 714     hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
 715 }
 716
 717 /* Register that the file has been deleted. */
 718
 719 static void
 720 register_delete_file (const char *file)
 721 {
 722   char *old_url, *old_file;
 723
 724   ENSURE_TABLES_EXIST;
 725
 726   if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
 727     return;
 728
 729   hash_table_remove (dl_file_url_map, file);
 730   xfree (old_file);
 731   xfree (old_url);
 732   dissociate_urls_from_file (file);
 733 }
 734
 735 /* Register that FILE is an HTML file that has been downloaded. */
 736
 737 void
 738 register_html (const char *url, const char *file)
 739 {
 740   if (!opt.convert_links)
 741     return;
 742   downloaded_html_files = slist_prepend (downloaded_html_files, file);
 743 }
 744
 745 /* This function is called when the retrieval is done to convert the
 746    links that have been downloaded.  It has to be called at the end of
 747    the retrieval, because only then does Wget know conclusively which
 748    URLs have been downloaded, and which not, so it can tell which
 749    direction to convert to.
 750
 751    The "direction" means that the URLs to the files that have been
 752    downloaded get converted to the relative URL which will point to
 753    that file.  And the other URLs get converted to the remote URL on
 754    the server.
 755
 756    All the downloaded HTMLs are kept in downloaded_html_files, and
 757    downloaded URLs in urls_downloaded.  All the information is
 758    extracted from these two lists.  */
 759
 760 void
 761 convert_all_links (void)
 762 {
 763   slist *html;
 764   struct wget_timer *timer;
 765   long msecs;
 766   int file_count = 0;
 767
 768   timer = wtimer_new ();
 769
 770   /* Destructively reverse downloaded_html_files to get it in the right order.
 771      recursive_retrieve() used slist_prepend() consistently.  */
 772   downloaded_html_files = slist_nreverse (downloaded_html_files);
 773
 774   for (html = downloaded_html_files; html; html = html->next)
 775     {
 776       struct urlpos *urls, *cur_url;
 777       char *url;
 778
 779       /* Determine the URL of the HTML file.  get_urls_html will need
 780          it.  */
 781       url = hash_table_get (dl_file_url_map, html->string);
 782       if (!url)
 783         {
 784           DEBUGP (("Apparently %s has been removed.\n", html->string));
 785           continue;
 786         }
 787
 788       DEBUGP (("Rescanning %s (from %s)\n", html->string, url));
 789
 790       /* Parse the HTML file...  */
 791       urls = get_urls_html (html->string, url, NULL);
 792
 793       /* We don't respect meta_disallow_follow here because, even if
 794          the file is not followed, we might still want to convert the
 795          links that have been followed from other files.  */
 796
 797       for (cur_url = urls; cur_url; cur_url = cur_url->next)
 798         {
 799           char *local_name;
 800           struct url *u = cur_url->url;
 801
 802           if (cur_url->link_base_p)
 803             {
 804               /* Base references have been resolved by our parser, so
 805                  we turn the base URL into an empty string.  (Perhaps
 806                  we should remove the tag entirely?)  */
 807               cur_url->convert = CO_NULLIFY_BASE;
 808               continue;
 809             }
 810
 811           /* We decide the direction of conversion according to whether
 812              a URL was downloaded.  Downloaded URLs will be converted
 813              ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
 814           local_name = hash_table_get (dl_url_file_map, u->url);
 815           if (local_name)
 816             DEBUGP (("%s marked for conversion, local %s\n",
 817                      u->url, local_name));
 818
 819           /* Decide on the conversion type.  */
 820           if (local_name)
 821             {
 822               /* We've downloaded this URL.  Convert it to relative
 823                  form.  We do this even if the URL already is in
 824                  relative form, because our directory structure may
 825                  not be identical to that on the server (think `-nd',
 826                  `--cut-dirs', etc.)  */
 827               cur_url->convert = CO_CONVERT_TO_RELATIVE;
 828               cur_url->local_name = xstrdup (local_name);
 829             }
 830           else
 831             {
 832               /* We haven't downloaded this URL.  If it's not already
 833                  complete (including a full host name), convert it to
 834                  that form, so it can be reached while browsing this
 835                  HTML locally.  */
 836               if (!cur_url->link_complete_p)
 837                 cur_url->convert = CO_CONVERT_TO_COMPLETE;
 838               cur_url->local_name = NULL;
 839             }
 840         }
 841
 842       /* Convert the links in the file.  */
 843       convert_links (html->string, urls);
 844       ++file_count;
 845
 846       /* Free the data.  */
 847       free_urlpos (urls);
 848     }
 849
 850   msecs = wtimer_elapsed (timer);
 851   wtimer_delete (timer);
 852   logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
 853              file_count, (double)msecs / 1000);
 854 }
 855
 856 /* Cleanup the data structures associated with recursive retrieving
 857    (the variables above).  */
 858 void
 859 recursive_cleanup (void)
 860 {
 861   if (dl_file_url_map)
 862     {
 863       free_keys_and_values (dl_file_url_map);
 864       hash_table_destroy (dl_file_url_map);
 865       dl_file_url_map = NULL;
 866     }
 867   if (dl_url_file_map)
 868     {
 869       free_keys_and_values (dl_url_file_map);
 870       hash_table_destroy (dl_url_file_map);
 871       dl_url_file_map = NULL;
 872     }
 873   slist_free (downloaded_html_files);
 874   downloaded_html_files = NULL;
 875 }