/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#ifdef HAVE_STRING_H
# include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <errno.h>
#include <assert.h>
#include <sys/types.h>
extern char *version_string;

static struct hash_table *dl_file_url_map;
static struct hash_table *dl_url_file_map;

/* List of HTML files downloaded in this Wget run.  Used for link
   conversion after Wget is done.  This list should only be traversed
   in order.  If you need to check whether a file has been downloaded,
   use a hash table, e.g. dl_file_url_map.  */
static slist *downloaded_html_files;
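
/* For example (an illustrative note, not part of the original
   source): a membership test goes through the hash table in O(1),

     if (hash_table_contains (dl_file_url_map, file))
       ...FILE was downloaded in this run...

   whereas walking the downloaded_html_files slist would be O(n).  */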

/* Functions for maintaining the URL queue.  */

struct queue_element {
  const char *url;
  const char *referer;
  int depth;
  struct queue_element *next;
};

struct url_queue {
  struct queue_element *head;
  struct queue_element *tail;
  int count, maxcount;
};

/* Create a URL queue. */

static struct url_queue *
url_queue_new (void)
{
  struct url_queue *queue = xmalloc (sizeof (*queue));
  memset (queue, '\0', sizeof (*queue));
  return queue;
}

/* Delete a URL queue. */

static void
url_queue_delete (struct url_queue *queue)
{
  xfree (queue);
}

/* Enqueue a URL in the queue.  The queue is FIFO: the items will be
   retrieved ("dequeued") from the queue in the order they were placed
   into it.  */

static void
url_enqueue (struct url_queue *queue,
             const char *url, const char *referer, int depth)
{
  struct queue_element *qel = xmalloc (sizeof (*qel));
  qel->url = url;
  qel->referer = referer;
  qel->depth = depth;
  qel->next = NULL;

  ++queue->count;
  if (queue->count > queue->maxcount)
    queue->maxcount = queue->count;

  DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  if (queue->tail)
    queue->tail->next = qel;
  queue->tail = qel;

  if (!queue->head)
    queue->head = queue->tail;
}

/* Take a URL out of the queue.  Return 1 if this operation succeeded,
   or 0 if the queue is empty.  */

static int
url_dequeue (struct url_queue *queue,
             const char **url, const char **referer, int *depth)
{
  struct queue_element *qel = queue->head;

  if (!qel)
    return 0;

  queue->head = queue->head->next;
  if (!queue->head)
    queue->tail = NULL;

  *url = qel->url;
  *referer = qel->referer;
  *depth = qel->depth;

  --queue->count;

  DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  xfree (qel);
  return 1;
}
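
/* A minimal usage sketch of the queue API above (illustrative only,
   not part of the original source).  The queue takes ownership of
   heap-allocated URL and referer strings, which the caller reclaims
   after dequeuing, just as retrieve_tree does below:

     struct url_queue *q = url_queue_new ();
     const char *url, *referer;
     int depth;

     url_enqueue (q, xstrdup ("http://example.com/"), NULL, 0);
     while (url_dequeue (q, &url, &referer, &depth))
       {
         ...process URL at DEPTH...
         xfree ((char *) url);
         FREE_MAYBE ((char *) referer);
       }
     url_queue_delete (q);  */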

static int descend_url_p PARAMS ((const struct urlpos *, struct url *, int,
                                  struct url *, struct hash_table *));
static int descend_redirect_p PARAMS ((const char *, const char *, int,
                                       struct url *, struct hash_table *));

/* Retrieve a part of the web beginning with START_URL.  This used to
   be called "recursive retrieval", because the old function was
   recursive and implemented depth-first search.  retrieve_tree, on
   the other hand, implements breadth-first traversal of the tree,
   which results in much nicer ordering of downloads.

   The algorithm this function uses is simple:

   1. put START_URL in the queue.
   2. while there are URLs in the queue:

     3. get next URL from the queue.
     4. download it.
     5. if the URL is HTML and its depth does not exceed maximum depth,
        get the list of URLs embedded therein.
     6. for each of those URLs do the following:

       7. if the URL is not one of those downloaded before, and if it
          satisfies the criteria specified by the various command-line
          options, add it to the queue. */
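
/* To illustrate the ordering claim (an example added for clarity, not
   from the original source): if page A links to B and C, and B links
   to D, breadth-first traversal downloads A, B, C, D -- all of A's
   direct links before anything deeper -- whereas the old depth-first
   code would have fetched A, B, D, C, descending into B's subtree
   before finishing A's own links.  */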

uerr_t
retrieve_tree (const char *start_url)
{
  uerr_t status = RETROK;

  /* The queue of URLs we need to load. */
  struct url_queue *queue = url_queue_new ();

  /* The URLs we do not wish to enqueue, because they are already in
     the queue, but haven't been downloaded yet.  */
  struct hash_table *blacklist = make_string_hash_table (0);

  /* We'll need various components of this, so better get it over with
     now.  */
  struct url *start_url_parsed = url_parse (start_url, NULL);

  url_enqueue (queue, xstrdup (start_url), NULL, 0);
  string_set_add (blacklist, start_url);

  while (1)
    {
      int descend = 0;
      char *url, *referer, *file = NULL;
      int depth;
      boolean dash_p_leaf_HTML = FALSE;

      if (downloaded_exceeds_quota ())
        break;
      if (status == FWRITEERR)
        break;

      /* Get the next URL from the queue. */

      if (!url_dequeue (queue,
                        (const char **)&url, (const char **)&referer,
                        &depth))
        break;

      /* And download it. */

      {
        int dt = 0;
        char *redirected = NULL;
        int oldrec = opt.recursive;

        opt.recursive = 0;
        status = retrieve_url (url, &file, &redirected, NULL, &dt);
        opt.recursive = oldrec;

        if (file && status == RETROK
            && (dt & RETROKF) && (dt & TEXTHTML))
          descend = 1;

        if (redirected)
          {
            /* We have been redirected, possibly to another host, or
               different path, or wherever.  Check whether we really
               want to follow it.  */
            if (descend)
              {
                if (!descend_redirect_p (redirected, url, depth,
                                         start_url_parsed, blacklist))
                  descend = 0;
                else
                  /* Make sure that the old pre-redirect form gets
                     blacklisted. */
                  string_set_add (blacklist, url);
              }

            xfree (url);
            url = redirected;
          }
      }

      if (descend
          && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
        {
          if (opt.page_requisites && depth == opt.reclevel)
            /* When -p is specified, we can do one more partial
               recursion from the "leaf nodes" on the HTML document
               tree.  The recursion is partial in that we won't
               traverse any <A> or <AREA> tags, nor any <LINK> tags
               except for <LINK REL="stylesheet">. */
            dash_p_leaf_HTML = TRUE;
          else
            {
              /* Either -p wasn't specified or it was and we've
                 already gone the one extra (pseudo-)level that it
                 affords us, so we need to bail out. */
              DEBUGP (("Not descending further; at depth %d, max. %d.\n",
                       depth, opt.reclevel));
              descend = 0;
            }
        }

      /* If the downloaded document was HTML, parse it and enqueue the
         links it contains. */

      if (descend)
        {
          int meta_disallow_follow = 0;
          struct urlpos *children
            = get_urls_html (file, url, &meta_disallow_follow);

          if (opt.use_robots && meta_disallow_follow)
            {
              free_urlpos (children);
              children = NULL;
            }

          if (children)
            {
              struct urlpos *child = children;
              struct url *url_parsed = url_parse (url, NULL);
              assert (url_parsed != NULL);

              for (; child; child = child->next)
                {
                  if (child->ignore_when_downloading)
                    continue;
                  if (dash_p_leaf_HTML && !child->link_inline_p)
                    continue;
                  if (descend_url_p (child, url_parsed, depth, start_url_parsed,
                                     blacklist))
                    {
                      url_enqueue (queue, xstrdup (child->url->url),
                                   xstrdup (url), depth + 1);
                      /* We blacklist the URL we have enqueued, because we
                         don't want to enqueue (and hence download) the
                         same URL twice.  */
                      string_set_add (blacklist, child->url->url);
                    }
                }

              url_free (url_parsed);
              free_urlpos (children);
            }
        }

      if (opt.delete_after || (file && !acceptable (file)))
        {
          /* Either --delete-after was specified, or we loaded this
             otherwise rejected (e.g. by -R) HTML file just so we
             could harvest its hyperlinks -- in either case, delete
             the local file. */
          DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                   opt.delete_after ? "--delete-after" :
                   "recursive rejection criteria"));
          logprintf (LOG_VERBOSE,
                     (opt.delete_after
                      ? _("Removing %s.\n")
                      : _("Removing %s since it should be rejected.\n")),
                     file);
          if (unlink (file))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
        }

      xfree (url);
      FREE_MAYBE (referer);
      FREE_MAYBE (file);
    }

  /* If anything is left of the queue due to a premature exit, free it
     now.  */
  {
    char *d1, *d2;
    int d3;
    while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3))
      {
        xfree (d1);
        FREE_MAYBE (d2);
      }
  }
  url_queue_delete (queue);

  if (start_url_parsed)
    url_free (start_url_parsed);
  string_set_free (blacklist);

  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  else if (status == FWRITEERR)
    return FWRITEERR;
  else
    return RETROK;
}

/* Based on the context provided by retrieve_tree, decide whether a
   URL is to be descended to.  This is only ever called from
   retrieve_tree, but is in a separate function for clarity.

   The most expensive checks (such as those for robots) are memoized
   by storing these URLs to BLACKLIST.  This may or may not help.  It
   will help if those URLs are encountered many times.  */

static int
descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
               struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *u = upos->url;
  const char *url = u->url;

  DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));

  if (string_set_contains (blacklist, url))
    {
      DEBUGP (("Already on the black list.\n"));
      goto out;
    }

  /* Several things to check for:
     1. if scheme is not http, and we don't load it
     2. check for relative links (if relative_only is set)
     3. check for domain
     4. check for no-parent
     5. check for excludes && includes
     6. check for suffix
     7. check for same host (if spanhost is unset), with possible
        gethostbyname baggage
     8. check for robots.txt

     Addendum: If the URL is FTP, and it is to be loaded, only the
     domain and suffix settings are "stronger".

     Note that .html files will get loaded regardless of suffix rules
     (but that is remedied later with unlink) unless the depth equals
     the maximum depth.

     More time- and memory- consuming tests should be put later on
     the list.  */

  /* 1. Schemes other than HTTP are normally not recursed into. */
  if (u->scheme != SCHEME_HTTP
      && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
    {
      DEBUGP (("Not following non-HTTP schemes.\n"));
      goto out;
    }

  /* 2. If it is an absolute link and they are not followed, throw it
     out.  */
  if (u->scheme == SCHEME_HTTP)
    if (opt.relative_only && !upos->link_relative_p)
      {
        DEBUGP (("It doesn't really look like a relative link.\n"));
        goto out;
      }

  /* 3. If its domain is not to be accepted/looked-up, chuck it
     out.  */
  if (!accept_domain (u))
    {
      DEBUGP (("The domain was not accepted.\n"));
      goto out;
    }

  /* 4. Check for parent directory.

     If we descended to a different host or changed the scheme, ignore
     opt.no_parent.  Also ignore it for documents needed to display
     the parent page when in -p mode.  */
  if (opt.no_parent
      && u->scheme == start_url_parsed->scheme
      && 0 == strcasecmp (u->host, start_url_parsed->host)
      && u->port == start_url_parsed->port
      && !(opt.page_requisites && upos->link_inline_p))
    {
      if (!frontcmp (start_url_parsed->dir, u->dir))
        {
          DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
                   u->dir, start_url_parsed->dir));
          goto out;
        }
    }

  /* 5. If the file does not match the acceptance list, or is on the
     rejection list, chuck it out.  The same goes for the directory
     exclusion and inclusion lists.  */
  if (opt.includes || opt.excludes)
    {
      if (!accdir (u->dir, ALLABS))
        {
          DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
          goto out;
        }
    }

  /* 6. */
  {
    char *suf;
    /* Check for acceptance/rejection rules.  We ignore these rules
       for HTML documents because they might lead to other files which
       need to be downloaded.  Of course, we don't know which
       documents are HTML before downloading them, so we guess.

       A file is subject to acceptance/rejection rules if:

       * u->file is not "" (i.e. it is not a directory)
       and either:
         + there is no file suffix,
         + or there is a suffix, but is not "html" or "htm",
         + both:
           - recursion is not infinite,
           - and we are at its very end. */

    if (u->file[0] != '\0'
        && ((suf = suffix (url)) == NULL
            || (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
            || (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
      {
        if (!acceptable (u->file))
          {
            DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                     url, u->file));
            goto out;
          }
      }
  }

  /* 7. */
  if (u->scheme == parent->scheme)
    if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
      {
        DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
                 u->host, parent->host));
        goto out;
      }

  /* 8. */
  if (opt.use_robots && u->scheme == SCHEME_HTTP)
    {
      struct robot_specs *specs = res_get_specs (u->host, u->port);
      if (!specs)
        {
          char *rfile;
          if (res_retrieve_file (url, &rfile))
            {
              specs = res_parse_from_file (rfile);
              xfree (rfile);
            }
          else
            {
              /* If we cannot get real specs, at least produce
                 dummy ones so that we can register them and stop
                 trying to retrieve them.  */
              specs = res_parse ("", 0);
            }
          res_register_specs (u->host, u->port, specs);
        }

      /* Now that we have (or don't have) robots.txt specs, we can
         check what they say.  */
      if (!res_match_path (specs, u->path))
        {
          DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
          string_set_add (blacklist, url);
          goto out;
        }
    }

  /* The URL has passed all the tests.  It can be placed in the
     download queue. */
  DEBUGP (("Decided to load it.\n"));

  return 1;

 out:
  DEBUGP (("Decided NOT to load it.\n"));

  return 0;
}
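
/* A worked example of the acc/rej exemption in check 6 above
   (illustrative, not from the original source): with `-r -A jpg', a
   link to "photo.jpg" passes acceptable(), a link to "notes.txt" is
   rejected here, and a link to "chapter2.html" is still downloaded --
   it may lead to other acceptable files -- and is then deleted by the
   opt.delete_after/!acceptable() branch in retrieve_tree.  */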

/* This function determines whether we should descend the children of
   the URL whose download resulted in a redirection, possibly to
   another host, etc.  It is needed very rarely, and thus it is merely
   a simple-minded wrapper around descend_url_p.  */

static int
descend_redirect_p (const char *redirected, const char *original, int depth,
                    struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *orig_parsed, *new_parsed;
  struct urlpos *upos;
  int success;

  orig_parsed = url_parse (original, NULL);
  assert (orig_parsed != NULL);

  new_parsed = url_parse (redirected, NULL);
  assert (new_parsed != NULL);

  upos = xmalloc (sizeof (struct urlpos));
  memset (upos, 0, sizeof (*upos));
  upos->url = new_parsed;

  success = descend_url_p (upos, orig_parsed, depth,
                           start_url_parsed, blacklist);

  url_free (orig_parsed);
  url_free (new_parsed);
  xfree (upos);

  if (!success)
    DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));

  return success;
}

/* Register that URL has been successfully downloaded to FILE. */

void
register_download (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  if (!dl_file_url_map)
    dl_file_url_map = make_string_hash_table (0);
  if (!dl_url_file_map)
    dl_url_file_map = make_string_hash_table (0);

  if (!hash_table_contains (dl_file_url_map, file))
    hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
  if (!hash_table_contains (dl_url_file_map, url))
    hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
}

/* Register that FROM has been redirected to TO.  This assumes that TO
   is successfully downloaded and already registered using
   register_download() above.  */

void
register_redirection (const char *from, const char *to)
{
  char *file;

  if (!opt.convert_links)
    return;

  file = hash_table_get (dl_url_file_map, to);
  assert (file != NULL);
  if (!hash_table_contains (dl_url_file_map, from))
    hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
}
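
/* Example of how the two maps evolve (illustrative; the URL and file
   names are hypothetical): after register_download
   ("http://host/a.html", "host/a.html"), dl_url_file_map maps the URL
   to the local file and dl_file_url_map maps the file back to the
   URL.  A later register_redirection ("http://host/old.html",
   "http://host/a.html") merely adds a second URL key pointing to the
   same local file, which is exactly what link conversion needs.  */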

/* Register that URL corresponds to the HTML file FILE. */

void
register_html (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  downloaded_html_files = slist_prepend (downloaded_html_files, file);
}

/* This function is called when the retrieval is done to convert the
   links that have been downloaded.  It has to be called at the end of
   the retrieval, because only then does Wget know conclusively which
   URLs have been downloaded, and which not, so it can tell which
   direction to convert to.

   The "direction" means that the URLs to the files that have been
   downloaded get converted to the relative URL which will point to
   that file.  And the other URLs get converted to the remote URL on
   the server.

   All the downloaded HTMLs are kept in downloaded_html_files, and the
   downloaded URLs in dl_url_file_map.  All the information is
   extracted from these two structures.  */
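
/* For instance (an illustrative example with hypothetical names, not
   from the original source): suppose http://host/docs/a.html was
   saved as docs/a.html and links to both "b.html" (downloaded) and
   "big.iso" (not downloaded).  After conversion, the link to b.html
   is made relative so it works when browsed locally, while the link
   to big.iso becomes the complete "http://host/docs/big.iso" so it
   still reaches the server.  */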

void
convert_all_links (void)
{
  slist *html;
  struct wget_timer *timer;
  long msecs;
  int file_count = 0;

  timer = wtimer_new ();

  /* Destructively reverse downloaded_html_files to get it in the right order.
     recursive_retrieve() used slist_prepend() consistently.  */
  downloaded_html_files = slist_nreverse (downloaded_html_files);

  for (html = downloaded_html_files; html; html = html->next)
    {
      struct urlpos *urls, *cur_url;
      char *url;

      DEBUGP (("Rescanning %s\n", html->string));

      /* Determine the URL of the HTML file.  get_urls_html will need
         it.  */
      url = hash_table_get (dl_file_url_map, html->string);
      if (url)
        DEBUGP (("It should correspond to %s.\n", url));
      else
        DEBUGP (("I cannot find the corresponding URL.\n"));

      /* Parse the HTML file... */
      urls = get_urls_html (html->string, url, NULL);

      /* We don't respect meta_disallow_follow here because, even if
         the file is not followed, we might still want to convert the
         links that have been followed from other files.  */

      for (cur_url = urls; cur_url; cur_url = cur_url->next)
        {
          char *local_name;
          struct url *u = cur_url->url;

          if (cur_url->link_base_p)
            {
              /* Base references have been resolved by our parser, so
                 we turn the base URL into an empty string.  (Perhaps
                 we should remove the tag entirely?)  */
              cur_url->convert = CO_NULLIFY_BASE;
              continue;
            }

          /* We decide the direction of conversion according to whether
             a URL was downloaded.  Downloaded URLs will be converted
             ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
          local_name = hash_table_get (dl_url_file_map, u->url);
          if (local_name)
            DEBUGP (("%s marked for conversion, local %s\n",
                     u->url, local_name));

          /* Decide on the conversion type.  */
          if (local_name)
            {
              /* We've downloaded this URL.  Convert it to relative
                 form.  We do this even if the URL already is in
                 relative form, because our directory structure may
                 not be identical to that on the server (think `-nd',
                 `--cut-dirs', etc.)  */
              cur_url->convert = CO_CONVERT_TO_RELATIVE;
              cur_url->local_name = xstrdup (local_name);
            }
          else
            {
              /* We haven't downloaded this URL.  If it's not already
                 complete (including a full host name), convert it to
                 that form, so it can be reached while browsing this
                 HTML locally.  */
              if (!cur_url->link_complete_p)
                cur_url->convert = CO_CONVERT_TO_COMPLETE;
              cur_url->local_name = NULL;
            }
        }

      /* Convert the links in the file.  */
      convert_links (html->string, urls);
      ++file_count;

      /* Free the data.  */
      free_urlpos (urls);
    }

  msecs = wtimer_elapsed (timer);
  wtimer_delete (timer);
  logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
             file_count, (double)msecs / 1000);
}

/* Cleanup the data structures associated with recursive retrieving
   (the variables above).  */

void
recursive_cleanup (void)
{
  if (dl_file_url_map)
    {
      free_keys_and_values (dl_file_url_map);
      hash_table_destroy (dl_file_url_map);
      dl_file_url_map = NULL;
    }
  if (dl_url_file_map)
    {
      free_keys_and_values (dl_url_file_map);
      hash_table_destroy (dl_url_file_map);
      dl_url_file_map = NULL;
    }
  slist_free (downloaded_html_files);
  downloaded_html_files = NULL;
}