/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.

   This file is part of GNU Wget.

   GNU Wget is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   GNU Wget is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with Wget; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <sys/types.h>
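/* Note (editorial): the surviving #include lines above are only a
   fragment of the original header block.  The code below also needs
   Wget's own declarations (options, hash tables, URL parsing, robots
   handling, retrieval codes).  The following set is an assumption
   based on the identifiers used in this file, not a verbatim copy:  */
#include "wget.h"
#include "url.h"
#include "recur.h"
#include "utils.h"
#include "retr.h"
#include "host.h"
#include "hash.h"
#include "res.h"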
extern char *version_string;

static struct hash_table *dl_file_url_map;
static struct hash_table *dl_url_file_map;

/* List of HTML files downloaded in this Wget run.  Used for link
   conversion after Wget is done.  This list should only be traversed
   in order.  If you need to check whether a file has been downloaded,
   use a hash table, e.g. dl_file_url_map.  */
static slist *downloaded_html_files;
/* Functions for maintaining the URL queue.  */

struct queue_element {
  const char *url;
  const char *referer;
  int depth;
  struct queue_element *next;
};

struct url_queue {
  struct queue_element *head;
  struct queue_element *tail;
  int count, maxcount;
};

/* Create a URL queue.  */

static struct url_queue *
url_queue_new (void)
{
  struct url_queue *queue = xmalloc (sizeof (*queue));
  memset (queue, '\0', sizeof (*queue));
  return queue;
}

/* Delete a URL queue.  */

static void
url_queue_delete (struct url_queue *queue)
{
  xfree (queue);
}
/* Enqueue a URL in the queue.  The queue is FIFO: the items will be
   retrieved ("dequeued") from the queue in the order they were placed
   into it.  */

static void
url_enqueue (struct url_queue *queue,
             const char *url, const char *referer, int depth)
{
  struct queue_element *qel = xmalloc (sizeof (*qel));
  qel->url = url;
  qel->referer = referer;
  qel->depth = depth;
  qel->next = NULL;

  ++queue->count;
  if (queue->count > queue->maxcount)
    queue->maxcount = queue->count;

  DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  if (queue->tail)
    queue->tail->next = qel;
  queue->tail = qel;

  if (!queue->head)
    queue->head = queue->tail;
}
/* Take a URL out of the queue.  Return 1 if this operation succeeded,
   or 0 if the queue is empty.  */

static int
url_dequeue (struct url_queue *queue,
             const char **url, const char **referer, int *depth)
{
  struct queue_element *qel = queue->head;

  if (!qel)
    return 0;

  queue->head = queue->head->next;
  if (!queue->head)
    queue->tail = NULL;

  *url = qel->url;
  *referer = qel->referer;
  *depth = qel->depth;

  --queue->count;

  DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  xfree (qel);
  return 1;
}
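/* Ownership note (editorial addition): the queue stores the caller's
   URL and referer pointers without copying them, and url_dequeue hands
   those same pointers back while freeing only the queue node itself.
   retrieve_tree below therefore enqueues xstrdup'ed strings and frees
   them once it is done with the dequeued item.  */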
static int descend_url_p PARAMS ((const struct urlpos *, struct url *, int,
                                  struct url *, struct hash_table *));
static int descend_redirect_p PARAMS ((const char *, const char *, int,
                                       struct url *, struct hash_table *));
/* Retrieve a part of the web beginning with START_URL.  This used to
   be called "recursive retrieval", because the old function was
   recursive and implemented depth-first search.  retrieve_tree, on the
   other hand, implements breadth-first traversal of the tree, which
   results in much nicer ordering of downloads.

   The algorithm this function uses is simple:

   1. put START_URL in the queue.
   2. while there are URLs in the queue:

     3. get next URL from the queue.
     4. download it.
     5. if the URL is HTML and its depth does not exceed maximum depth,
        get the list of URLs embedded therein.
     6. for each of those URLs do the following:

       7. if the URL is not one of those downloaded before, and if it
          satisfies the criteria specified by the various command-line
          options, add it to the queue.  */
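/* Editorial note: breadth-first ordering is what makes `--level=N'
   mean "at most N link hops away from the start URL" -- every URL at
   depth D is dequeued (and thus downloaded) before any URL at depth
   D+1, because children are always appended to the tail of the FIFO
   queue while items are taken from its head.  */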
uerr_t
retrieve_tree (const char *start_url)
{
  uerr_t status = RETROK;

  /* The queue of URLs we need to load. */
  struct url_queue *queue = url_queue_new ();

  /* The URLs we do not wish to enqueue, because they are already in
     the queue, but haven't been downloaded yet.  */
  struct hash_table *blacklist = make_string_hash_table (0);

  /* We'll need various components of this, so better get it over with
     now.  */
  struct url *start_url_parsed = url_parse (start_url, NULL);

  url_enqueue (queue, xstrdup (start_url), NULL, 0);
  string_set_add (blacklist, start_url);

  while (1)
    {
      int descend = 0;
      char *url, *referer, *file = NULL;
      int depth;
      boolean dash_p_leaf_HTML = FALSE;

      if (downloaded_exceeds_quota ())
        break;
      if (status == FWRITEERR)
        break;

      /* Get the next URL from the queue. */
      if (!url_dequeue (queue,
                        (const char **)&url, (const char **)&referer,
                        &depth))
        break;

      /* And download it. */
      {
        int dt = 0;
        char *redirected = NULL;
        int oldrec = opt.recursive;

        opt.recursive = 0;
        status = retrieve_url (url, &file, &redirected, NULL, &dt);
        opt.recursive = oldrec;

        if (file && status == RETROK
            && (dt & RETROKF) && (dt & TEXTHTML))
          descend = 1;

        if (redirected)
          {
            /* We have been redirected, possibly to another host, or
               different path, or wherever.  Check whether we really
               want to follow it.  */
            if (descend)
              {
                if (!descend_redirect_p (redirected, url, depth,
                                         start_url_parsed, blacklist))
                  descend = 0;
              }

            xfree (url);
            url = redirected;
          }
      }

      if (descend
          && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
        {
          if (opt.page_requisites && depth == opt.reclevel)
            /* When -p is specified, we can do one more partial
               recursion from the "leaf nodes" on the HTML document
               tree.  The recursion is partial in that we won't
               traverse any <A> or <AREA> tags, nor any <LINK> tags
               except for <LINK REL="stylesheet">. */
            dash_p_leaf_HTML = TRUE;
          else
            {
              /* Either -p wasn't specified or it was and we've
                 already gone the one extra (pseudo-)level that it
                 affords us, so we need to bail out. */
              DEBUGP (("Not descending further; at depth %d, max. %d.\n",
                       depth, opt.reclevel));
              descend = 0;
            }
        }

      /* If the downloaded document was HTML, parse it and enqueue the
         links it contains. */
      if (descend)
        {
          int meta_disallow_follow = 0;
          struct urlpos *children = get_urls_html (file, url, dash_p_leaf_HTML,
                                                   &meta_disallow_follow);

          if (opt.use_robots && meta_disallow_follow)
            {
              free_urlpos (children);
              children = NULL;
            }

          if (children)
            {
              struct urlpos *child = children;
              struct url *url_parsed = url_parse (url, NULL);
              assert (url_parsed != NULL);

              for (; child; child = child->next)
                {
                  if (child->ignore_when_downloading)
                    continue;
                  if (descend_url_p (child, url_parsed, depth, start_url_parsed,
                                     blacklist))
                    {
                      url_enqueue (queue, xstrdup (child->url->url),
                                   xstrdup (url), depth + 1);
                      /* We blacklist the URL we have enqueued, because we
                         don't want to enqueue (and hence download) the
                         same URL twice.  */
                      string_set_add (blacklist, child->url->url);
                    }
                }

              url_free (url_parsed);
              free_urlpos (children);
            }
        }

      if (opt.delete_after || (file && !acceptable (file)))
        {
          /* Either --delete-after was specified, or we loaded this
             otherwise rejected (e.g. by -R) HTML file just so we
             could harvest its hyperlinks -- in either case, delete
             the local file. */
          DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                   opt.delete_after ? "--delete-after" :
                   "recursive rejection criteria"));
          logprintf (LOG_VERBOSE,
                     (opt.delete_after
                      ? _("Removing %s.\n")
                      : _("Removing %s since it should be rejected.\n")),
                     file);
          if (unlink (file))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
        }

      xfree (url);
      FREE_MAYBE (referer);
      FREE_MAYBE (file);
    }

  /* If anything is left of the queue due to a premature exit, free it
     now.  */
  {
    char *d1, *d2;
    int d3;
    while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3))
      {
        xfree (d1);
        FREE_MAYBE (d2);
      }
  }
  url_queue_delete (queue);

  if (start_url_parsed)
    url_free (start_url_parsed);
  string_set_free (blacklist);

  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  else if (status == FWRITEERR)
    return FWRITEERR;
  else
    return RETROK;
}
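/* Editorial sketch (an assumption, not verbatim from Wget): the
   expected caller is main()'s URL loop, which hands each command-line
   URL to this function when recursive retrieval is requested, roughly:

     uerr_t status = retrieve_tree (url[i]);
     if (status == FWRITEERR || status == QUOTEXC)
       break;    stop processing further URLs

   The exact call site and its error handling are assumptions here.  */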
/* Based on the context provided by retrieve_tree, decide whether a
   URL is to be descended to.  This is only ever called from
   retrieve_tree, but is in a separate function for clarity.

   The most expensive checks (such as those for robots) are memoized
   by storing these URLs to BLACKLIST.  This may or may not help.  It
   will help if those URLs are encountered many times.  */
static int
descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
               struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *u = upos->url;
  const char *url = u->url;

  DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));

  if (string_set_contains (blacklist, url))
    {
      DEBUGP (("Already on the black list.\n"));
      goto out;
    }

  /* Several things to check for:
     1. if scheme is not http, and we don't load it
     2. check for relative links (if relative_only is set)
     3. check for domain
     4. check for no-parent
     5. check for excludes && includes
     6. check for suffix
     7. check for same host (if spanhost is unset), with possible
        gethostbyname baggage
     8. check for robots.txt

     Addendum: If the URL is FTP, and it is to be loaded, only the
     domain and suffix settings are "stronger".

     Note that .html files will get loaded regardless of suffix rules
     (but that is remedied later with unlink) unless the depth equals
     the maximum depth.

     More time- and memory- consuming tests should be put later on
     the list.  */

  /* 1. Schemes other than HTTP are normally not recursed into. */
  if (u->scheme != SCHEME_HTTP
      && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
    {
      DEBUGP (("Not following non-HTTP schemes.\n"));
      goto out;
    }

  /* 2. If it is an absolute link and they are not followed, throw it
     out.  */
  if (u->scheme == SCHEME_HTTP)
    if (opt.relative_only && !upos->link_relative_p)
      {
        DEBUGP (("It doesn't really look like a relative link.\n"));
        goto out;
      }

  /* 3. If its domain is not to be accepted/looked-up, chuck it
     out.  */
  if (!accept_domain (u))
    {
      DEBUGP (("The domain was not accepted.\n"));
      goto out;
    }

  /* 4. Check for parent directory.

     If we descended to a different host or changed the scheme, ignore
     opt.no_parent.  Also ignore it for -p leaf retrievals. */
  if (opt.no_parent
      && u->scheme == parent->scheme
      && 0 == strcasecmp (u->host, parent->host)
      && u->port == parent->port)
    {
      if (!frontcmp (parent->dir, u->dir))
        {
          DEBUGP (("Trying to escape the root directory with no_parent in effect.\n"));
          goto out;
        }
    }

  /* 5. If the file does not match the acceptance list, or is on the
     rejection list, chuck it out.  The same goes for the directory
     exclusion and inclusion lists. */
  if (opt.includes || opt.excludes)
    {
      if (!accdir (u->dir, ALLABS))
        {
          DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
          goto out;
        }
    }

  /* 6. Check for acceptance/rejection rules.  We ignore these rules
     for HTML documents because they might lead to other files which
     need to be downloaded.  Of course, we don't know which
     documents are HTML before downloading them, so we guess.

     A file is subject to acceptance/rejection rules if:

     * u->file is not "" (i.e. it is not a directory)
     and either:
       + there is no file suffix,
       + or there is a suffix, but is not "html" or "htm",
       + both:
         - recursion is not infinite,
         - and we are at its very end. */
  {
    char *suf;

    if (u->file[0] != '\0'
        && ((suf = suffix (url)) == NULL
            || (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
            || (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
      {
        if (!acceptable (u->file))
          {
            DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                     url, u->file));
            goto out;
          }
      }
  }

  /* 7. Check for the same host, unless host spanning was requested. */
  if (u->scheme == parent->scheme)
    if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
      {
        DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
                 u->host, parent->host));
        goto out;
      }

  /* 8. Consult robots.txt for HTTP URLs. */
  if (opt.use_robots && u->scheme == SCHEME_HTTP)
    {
      struct robot_specs *specs = res_get_specs (u->host, u->port);
      if (!specs)
        {
          char *rfile;
          if (res_retrieve_file (url, &rfile))
            {
              specs = res_parse_from_file (rfile);
              xfree (rfile);
            }
          else
            {
              /* If we cannot get real specs, at least produce
                 dummy ones so that we can register them and stop
                 trying to retrieve them. */
              specs = res_parse ("", 0);
            }
          res_register_specs (u->host, u->port, specs);
        }

      /* Now that we have (or don't have) robots.txt specs, we can
         check what they say. */
      if (!res_match_path (specs, u->path))
        {
          DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
          string_set_add (blacklist, url);
          goto out;
        }
    }

  /* The URL has passed all the tests.  It can be placed in the
     download queue. */
  DEBUGP (("Decided to load it.\n"));

  return 1;

 out:
  DEBUGP (("Decided NOT to load it.\n"));

  return 0;
}
/* This function determines whether we should descend the children of
   the URL whose download resulted in a redirection, possibly to
   another host, etc.  It is needed very rarely, and thus it is merely
   a simple-minded wrapper around descend_url_p.  */
static int
descend_redirect_p (const char *redirected, const char *original, int depth,
                    struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *orig_parsed, *new_parsed;
  struct urlpos *upos;
  int success;

  orig_parsed = url_parse (original, NULL);
  assert (orig_parsed != NULL);

  new_parsed = url_parse (redirected, NULL);
  assert (new_parsed != NULL);

  upos = xmalloc (sizeof (struct urlpos));
  memset (upos, 0, sizeof (*upos));
  upos->url = new_parsed;

  success = descend_url_p (upos, orig_parsed, depth,
                           start_url_parsed, blacklist);

  url_free (orig_parsed);
  url_free (new_parsed);
  xfree (upos);

  if (!success)
    DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));

  return success;
}
/* Register that URL has been successfully downloaded to FILE. */

void
register_download (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  if (!dl_file_url_map)
    dl_file_url_map = make_string_hash_table (0);
  if (!dl_url_file_map)
    dl_url_file_map = make_string_hash_table (0);

  if (!hash_table_contains (dl_file_url_map, file))
    hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
  if (!hash_table_contains (dl_url_file_map, url))
    hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
}
/* Register that FROM has been redirected to TO.  This assumes that TO
   is successfully downloaded and already registered using
   register_download() above.  */

void
register_redirection (const char *from, const char *to)
{
  char *file;

  if (!opt.convert_links)
    return;

  file = hash_table_get (dl_url_file_map, to);
  assert (file != NULL);
  if (!hash_table_contains (dl_url_file_map, from))
    hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
}
/* Register that URL corresponds to the HTML file FILE. */

void
register_html (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  downloaded_html_files = slist_prepend (downloaded_html_files, file);
}
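/* Editorial note: how these hooks are driven is not shown in this
   file.  Presumably the retrieval code calls register_download() once
   a URL has been saved to a local file, adds register_html() when the
   document was identified as HTML, and calls register_redirection()
   for each redirected URL, so that every spelling of a URL maps to the
   same local file when the links are later converted.  */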
/* This function is called when the retrieval is done to convert the
   links that have been downloaded.  It has to be called at the end of
   the retrieval, because only then does Wget know conclusively which
   URLs have been downloaded, and which not, so it can tell which
   direction to convert to.

   The "direction" means that the URLs to the files that have been
   downloaded get converted to the relative URL which will point to
   that file.  And the other URLs get converted to the remote URL on
   the server.

   All the downloaded HTMLs are kept in downloaded_html_files, and
   the downloaded URLs (with their local files) in dl_url_file_map.
   All the information is extracted from these two lists.  */
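/* Illustrative example (editorial addition; the URL is hypothetical):
   after "wget -r -k http://example.com/", a link in the top-level
   index.html to http://example.com/a/b.html that was downloaded is
   rewritten as the relative reference "a/b.html"
   (CO_CONVERT_TO_RELATIVE), whereas a link to a page that was not
   downloaded is expanded to its full absolute URL
   (CO_CONVERT_TO_COMPLETE) so it still works when the mirror is
   browsed locally.  */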
void
convert_all_links (void)
{
  slist *html;
  struct wget_timer *timer;
  long msecs;
  int file_count = 0;

  timer = wtimer_new ();

  /* Destructively reverse downloaded_html_files to get it in the right order.
     recursive_retrieve() used slist_prepend() consistently.  */
  downloaded_html_files = slist_nreverse (downloaded_html_files);

  for (html = downloaded_html_files; html; html = html->next)
    {
      struct urlpos *urls, *cur_url;
      char *url;

      DEBUGP (("Rescanning %s\n", html->string));

      /* Determine the URL of the HTML file.  get_urls_html will need
         it.  */
      url = hash_table_get (dl_file_url_map, html->string);
      if (url)
        DEBUGP (("It should correspond to %s.\n", url));
      else
        DEBUGP (("I cannot find the corresponding URL.\n"));

      /* Parse the HTML file... */
      urls = get_urls_html (html->string, url, FALSE, NULL);

      /* We don't respect meta_disallow_follow here because, even if
         the file is not followed, we might still want to convert the
         links that have been followed from other files.  */

      for (cur_url = urls; cur_url; cur_url = cur_url->next)
        {
          char *local_name;
          struct url *u = cur_url->url;

          if (cur_url->link_base_p)
            {
              /* Base references have been resolved by our parser, so
                 we turn the base URL into an empty string.  (Perhaps
                 we should remove the tag entirely?)  */
              cur_url->convert = CO_NULLIFY_BASE;
              continue;
            }

          /* We decide the direction of conversion according to whether
             a URL was downloaded.  Downloaded URLs will be converted
             ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
          local_name = hash_table_get (dl_url_file_map, u->url);
          if (local_name)
            DEBUGP (("%s marked for conversion, local %s\n",
                     u->url, local_name));

          /* Decide on the conversion type.  */
          if (local_name)
            {
              /* We've downloaded this URL.  Convert it to relative
                 form.  We do this even if the URL already is in
                 relative form, because our directory structure may
                 not be identical to that on the server (think `-nd',
                 `--cut-dirs', etc.)  */
              cur_url->convert = CO_CONVERT_TO_RELATIVE;
              cur_url->local_name = xstrdup (local_name);
            }
          else
            {
              /* We haven't downloaded this URL.  If it's not already
                 complete (including a full host name), convert it to
                 that form, so it can be reached while browsing this
                 HTML locally.  */
              if (!cur_url->link_complete_p)
                cur_url->convert = CO_CONVERT_TO_COMPLETE;
              cur_url->local_name = NULL;
            }
        }

      /* Convert the links in the file.  */
      convert_links (html->string, urls);
      ++file_count;

      /* Free the data.  */
      free_urlpos (urls);
    }

  msecs = wtimer_elapsed (timer);
  wtimer_delete (timer);
  logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
             file_count, (double)msecs / 1000);
}
/* Cleanup the data structures associated with recursive retrieving
   (the variables above).  */

void
recursive_cleanup (void)
{
  if (dl_file_url_map)
    {
      free_keys_and_values (dl_file_url_map);
      hash_table_destroy (dl_file_url_map);
      dl_file_url_map = NULL;
    }
  if (dl_url_file_map)
    {
      free_keys_and_values (dl_url_file_map);
      hash_table_destroy (dl_url_file_map);
      dl_url_file_map = NULL;
    }
  slist_free (downloaded_html_files);
  downloaded_html_files = NULL;
}
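/* Editorial sketch of the overall life cycle (assumptions, not
   verbatim from Wget): a recursive run calls retrieve_tree() for each
   start URL, convert_all_links() once at the very end when `-k' was
   given, and recursive_cleanup() from the global cleanup path before
   exit, so the hash tables and the HTML list above live for the whole
   session.  */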