/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#include <config.h>

#include <stdio.h>
#ifdef HAVE_STRING_H
# include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <errno.h>
#include <assert.h>
#include <sys/types.h>

/* Project-local headers providing the interfaces used below (hash
   tables, URL parsing, retrieval, robots handling). */
#include "wget.h"
#include "url.h"
#include "recur.h"
#include "utils.h"
#include "retr.h"
#include "hash.h"
#include "res.h"
extern char *version_string;
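
/* The maps below are used for link conversion: dl_file_url_map maps
   each downloaded file name to the URL it came from, and
   dl_url_file_map maps each downloaded URL to its local file.  Both
   are populated by register_download() and register_redirection()
   below. */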
static struct hash_table *dl_file_url_map;
static struct hash_table *dl_url_file_map;

/* List of HTML files downloaded in this Wget run.  Used for link
   conversion after Wget is done.  This list should only be traversed
   in order.  If you need to check whether a file has been downloaded,
   use a hash table, e.g. dl_file_url_map.  */
static slist *downloaded_html_files;
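
/* For instance, a membership test should go against the hash table,
   not the list -- a sketch, assuming FILE names a finished download:

     if (hash_table_contains (dl_file_url_map, file))
       ... FILE was downloaded in this run ...

   Walking downloaded_html_files for this would cost O(n) per
   lookup. */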

/* Functions for maintaining the URL queue.  */

struct queue_element {
  const char *url;		/* the URL to download */
  const char *referer;		/* the referring document */
  int depth;			/* the depth at which the URL was found */
  struct queue_element *next;	/* next element in the queue */
};

struct url_queue {
  struct queue_element *head;
  struct queue_element *tail;
  int count, maxcount;		/* current and peak queue length */
};

/* Create a URL queue. */

static struct url_queue *
url_queue_new (void)
{
  struct url_queue *queue = xmalloc (sizeof (*queue));
  memset (queue, '\0', sizeof (*queue));
  return queue;
}

/* Delete a URL queue. */

static void
url_queue_delete (struct url_queue *queue)
{
  xfree (queue);
}

/* Enqueue a URL in the queue.  The queue is FIFO: the items will be
   retrieved ("dequeued") from the queue in the order they were placed
   into it.  */

static void
url_enqueue (struct url_queue *queue,
             const char *url, const char *referer, int depth)
{
  struct queue_element *qel = xmalloc (sizeof (*qel));
  qel->url = url;
  qel->referer = referer;
  qel->depth = depth;
  qel->next = NULL;

  ++queue->count;
  if (queue->count > queue->maxcount)
    queue->maxcount = queue->count;

  DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  if (queue->tail)
    queue->tail->next = qel;
  queue->tail = qel;

  if (!queue->head)
    queue->head = queue->tail;
}

/* Take a URL out of the queue.  Return 1 if this operation succeeded,
   or 0 if the queue is empty.  */

static int
url_dequeue (struct url_queue *queue,
             const char **url, const char **referer, int *depth)
{
  struct queue_element *qel = queue->head;

  if (!qel)
    return 0;

  queue->head = queue->head->next;
  if (!queue->head)
    queue->tail = NULL;

  *url = qel->url;
  *referer = qel->referer;
  *depth = qel->depth;

  --queue->count;

  DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  xfree (qel);
  return 1;
}
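
/* A minimal usage sketch of the two primitives above, compiled out by
   default (the URL_QUEUE_EXAMPLE guard and the URLs are made up for
   illustration).  Ownership of the URL strings passes to the queue
   and back to the caller, which is why the dequeued string is freed
   here. */
#ifdef URL_QUEUE_EXAMPLE
static void
url_queue_example (void)
{
  struct url_queue *q = url_queue_new ();
  const char *url, *referer;
  int depth;

  url_enqueue (q, xstrdup ("http://example.com/a.html"), NULL, 0);
  url_enqueue (q, xstrdup ("http://example.com/b.html"), NULL, 1);

  /* FIFO order: "a.html" comes out before "b.html".  The referers
     are NULL here, so only the URL strings need freeing. */
  while (url_dequeue (q, &url, &referer, &depth))
    xfree ((char *) url);

  url_queue_delete (q);
}
#endif /* URL_QUEUE_EXAMPLE */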

static int descend_url_p PARAMS ((const struct urlpos *, struct url *, int,
				  struct url *, struct hash_table *));
static int descend_redirect_p PARAMS ((const char *, const char *, int,
				       struct url *, struct hash_table *));

/* Retrieve a part of the web beginning with START_URL.  This used to
   be called "recursive retrieval", because the old function was
   recursive and implemented depth-first search.  retrieve_tree on the
   other hand implements breadth-first traversal of the tree, which
   results in much nicer ordering of downloads.

   The algorithm this function uses is simple:

   1. put START_URL in the queue.
   2. while there are URLs in the queue:

   3. get next URL from the queue.
   4. download it.
   5. if the URL is HTML and its depth does not exceed maximum depth,
      get the list of URLs embedded therein.
   6. for each of those URLs do the following:

   7. if the URL is not one of those downloaded before, and if it
      satisfies the criteria specified by the various command-line
      options, add it to the queue. */

uerr_t
retrieve_tree (const char *start_url)
{
  uerr_t status = RETROK;

  /* The queue of URLs we need to load. */
  struct url_queue *queue = url_queue_new ();

  /* The URLs we do not wish to enqueue, because they are already in
     the queue, but haven't been downloaded yet.  */
  struct hash_table *blacklist = make_string_hash_table (0);

  /* We'll need various components of this, so better get it over with
     now. */
  struct url *start_url_parsed = url_parse (start_url, NULL);

  url_enqueue (queue, xstrdup (start_url), NULL, 0);
  string_set_add (blacklist, start_url);

  while (1)
    {
      int descend = 0;
      char *url, *referer, *file = NULL;
      int depth;
      boolean dash_p_leaf_HTML = FALSE;

      if (downloaded_exceeds_quota ())
        break;
      if (status == FWRITEERR)
        break;

      /* Get the next URL from the queue. */

      if (!url_dequeue (queue,
                        (const char **)&url, (const char **)&referer,
                        &depth))
        break;

      /* And download it.  retrieve_url is called with recursion
         turned off, because retrieve_tree itself is in charge of
         descending into links. */

      {
        int dt = 0;
        char *redirected = NULL;
        int oldrec = opt.recursive;

        opt.recursive = 0;
        status = retrieve_url (url, &file, &redirected, NULL, &dt);
        opt.recursive = oldrec;

        if (file && status == RETROK
            && (dt & RETROKF) && (dt & TEXTHTML))
          descend = 1;

        if (redirected)
          {
            /* We have been redirected, possibly to another host, or
               different path, or wherever.  Check whether we really
               want to follow it. */
            if (descend)
              {
                if (!descend_redirect_p (redirected, url, depth,
                                         start_url_parsed, blacklist))
                  descend = 0;
                else
                  /* Make sure that the old pre-redirect form gets
                     blacklisted. */
                  string_set_add (blacklist, url);
              }

            xfree (url);
            url = redirected;
          }
      }

      if (descend
          && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
        {
          if (opt.page_requisites
              && (depth == opt.reclevel || depth == opt.reclevel + 1))
            {
              /* When -p is specified, we are allowed to exceed the
                 maximum depth, but only for the "inline" links,
                 i.e. those that are needed to display the page.
                 Originally this could exceed the depth at most by
                 one, but we allow one more level so that the leaf
                 pages that contain frames can be loaded
                 correctly. */
              dash_p_leaf_HTML = TRUE;
            }
          else
            {
              /* Either -p wasn't specified or it was and we've
                 already spent the two extra (pseudo-)levels that it
                 affords us, so we need to bail out. */
              DEBUGP (("Not descending further; at depth %d, max. %d.\n",
                       depth, opt.reclevel));
              descend = 0;
            }
        }
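
      /* For example, with `-l 2 -p' a page found at depth 2 is a
         leaf, but its inline requisites (images, stylesheets) are
         still fetched at depth 3, and a frame page among them may
         fetch its own requisites at depth 4; only then does the
         cutoff above apply.  (The option names are real; the depth
         values are just an illustration.) */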

      /* If the downloaded document was HTML, parse it and enqueue the
         links it contains. */

      if (descend)
        {
          int meta_disallow_follow = 0;
          struct urlpos *children
            = get_urls_html (file, url, &meta_disallow_follow);

          if (opt.use_robots && meta_disallow_follow)
            {
              /* The "robots" META tag forbids following this page's
                 links, so drop them. */
              free_urlpos (children);
              children = NULL;
            }

          if (children)
            {
              struct urlpos *child = children;
              struct url *url_parsed = url_parse (url, NULL);
              assert (url_parsed != NULL);

              for (; child; child = child->next)
                {
                  if (child->ignore_when_downloading)
                    continue;
                  if (dash_p_leaf_HTML && !child->link_inline_p)
                    continue;
                  if (descend_url_p (child, url_parsed, depth, start_url_parsed,
                                     blacklist))
                    {
                      url_enqueue (queue, xstrdup (child->url->url),
                                   xstrdup (url), depth + 1);
                      /* We blacklist the URL we have enqueued, because we
                         don't want to enqueue (and hence download) the
                         same URL twice. */
                      string_set_add (blacklist, child->url->url);
                    }
                }

              url_free (url_parsed);
              free_urlpos (children);
            }
        }

      if (opt.delete_after || (file && !acceptable (file)))
        {
          /* Either --delete-after was specified, or we loaded this
             otherwise rejected (e.g. by -R) HTML file just so we
             could harvest its hyperlinks -- in either case, delete
             the local file. */
          DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                   opt.delete_after ? "--delete-after" :
                   "recursive rejection criteria"));
          logprintf (LOG_VERBOSE,
                     (opt.delete_after
                      ? _("Removing %s.\n")
                      : _("Removing %s since it should be rejected.\n")),
                     file);
          if (unlink (file))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
        }

      xfree (url);
      FREE_MAYBE (referer);
      FREE_MAYBE (file);
    }

  /* If anything is left of the queue due to a premature exit, free it
     now. */
  {
    char *d1, *d2;
    int d3;
    while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3))
      {
        xfree (d1);
        FREE_MAYBE (d2);
      }
  }
  url_queue_delete (queue);

  if (start_url_parsed)
    url_free (start_url_parsed);
  string_set_free (blacklist);

  if (downloaded_exceeds_quota ())
    return QUOTEXC;
  else if (status == FWRITEERR)
    return FWRITEERR;
  else
    return RETROK;
}

/* Based on the context provided by retrieve_tree, decide whether a
   URL is to be descended to.  This is only ever called from
   retrieve_tree, but is in a separate function for clarity.

   The most expensive checks (such as those for robots) are memoized
   by storing these URLs to BLACKLIST.  This may or may not help.  It
   will help if those URLs are encountered many times.  */

static int
descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
               struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *u = upos->url;
  const char *url = u->url;

  DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));

  if (string_set_contains (blacklist, url))
    {
      DEBUGP (("Already on the black list.\n"));
      goto out;
    }

  /* Several things to check for:
     1. if scheme is not http, and we don't load it
     2. check for relative links (if relative_only is set)
     3. check for domain
     4. check for no-parent
     5. check for excludes && includes
     6. check for suffix
     7. check for same host (if spanhost is unset), with possible
        gethostbyname baggage
     8. check for robots.txt

     Addendum: If the URL is FTP, and it is to be loaded, only the
     domain and suffix settings are "stronger".

     Note that .html files will get loaded regardless of suffix rules
     (but that is remedied later with unlink) unless the depth equals
     the maximum depth.

     More time- and memory- consuming tests should be put later on
     the list.  */

  /* 1. Schemes other than HTTP are normally not recursed into. */
  if (u->scheme != SCHEME_HTTP
      && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
    {
      DEBUGP (("Not following non-HTTP schemes.\n"));
      goto out;
    }

  /* 2. If it is an absolute link and they are not followed, throw it
     out. */
  if (u->scheme == SCHEME_HTTP)
    if (opt.relative_only && !upos->link_relative_p)
      {
        DEBUGP (("It doesn't really look like a relative link.\n"));
        goto out;
      }

  /* 3. If its domain is not to be accepted/looked-up, chuck it
     out. */
  if (!accept_domain (u))
    {
      DEBUGP (("The domain was not accepted.\n"));
      goto out;
    }

  /* 4. Check for parent directory.

     If we descended to a different host or changed the scheme, ignore
     opt.no_parent.  Also ignore it for documents needed to display
     the parent page when in -p mode. */
  if (opt.no_parent
      && u->scheme == start_url_parsed->scheme
      && 0 == strcasecmp (u->host, start_url_parsed->host)
      && u->port == start_url_parsed->port
      && !(opt.page_requisites && upos->link_inline_p))
    {
      if (!frontcmp (start_url_parsed->dir, u->dir))
        {
          DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
                   u->dir, start_url_parsed->dir));
          goto out;
        }
    }
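
  /* E.g. with --no-parent and a start URL of
     http://example.com/a/b/index.html (a made-up address), a link to
     http://example.com/a/b/c/x.gif stays inside "a/b" and passes,
     while http://example.com/a/d/x.gif escapes it and is rejected. */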

  /* 5. If the file does not match the acceptance list, or is on the
     rejection list, chuck it out.  The same goes for the directory
     exclusion and inclusion lists. */
  if (opt.includes || opt.excludes)
    {
      if (!accdir (u->dir, ALLABS))
        {
          DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
          goto out;
        }
    }

  /* 6. Check for acceptance/rejection rules.  We ignore these rules
     for HTML documents because they might lead to other files which
     need to be downloaded.  Of course, we don't know which
     documents are HTML before downloading them, so we guess.

     A file is subject to acceptance/rejection rules if:

     * u->file is not "" (i.e. it is not a directory)
     and either:
       + there is no file suffix,
       + or there is a suffix, but is not "html" or "htm",
       + both:
         - recursion is not infinite,
         - and we are at its very end. */
  {
    const char *suf;
    if (u->file[0] != '\0'
        && ((suf = suffix (url)) == NULL
            || (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
            || (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
      {
        if (!acceptable (u->file))
          {
            DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                     url, u->file));
            goto out;
          }
      }
  }
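
  /* E.g. with `-A "*.pdf"', a link to report.pdf passes the test and
     a link to notes.txt fails it, but chapter.html is still fetched
     (and deleted later if rejected) because it may contain further
     links; the file names here are invented for illustration. */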

  /* 7. If spanning hosts is disabled, reject URLs on a different
     host. */
  if (u->scheme == parent->scheme)
    if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
      {
        DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
                 u->host, parent->host));
        goto out;
      }

  /* 8. Consult robots.txt, fetching and caching its specs per
     host/port on first use. */
  if (opt.use_robots && u->scheme == SCHEME_HTTP)
    {
      struct robot_specs *specs = res_get_specs (u->host, u->port);
      if (!specs)
        {
          char *rfile;
          if (res_retrieve_file (url, &rfile))
            {
              specs = res_parse_from_file (rfile);
              xfree (rfile);
            }
          else
            {
              /* If we cannot get real specs, at least produce
                 dummy ones so that we can register them and stop
                 trying to retrieve them. */
              specs = res_parse ("", 0);
            }
          res_register_specs (u->host, u->port, specs);
        }

      /* Now that we have (or don't have) robots.txt specs, we can
         check what they say. */
      if (!res_match_path (specs, u->path))
        {
          DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
          string_set_add (blacklist, url);
          goto out;
        }
    }

  /* The URL has passed all the tests.  It can be placed in the
     download queue. */
  DEBUGP (("Decided to load it.\n"));

  return 1;

 out:
  DEBUGP (("Decided NOT to load it.\n"));

  return 0;
}

/* This function determines whether we should descend the children of
   the URL whose download resulted in a redirection, possibly to
   another host, etc.  It is needed very rarely, and thus it is merely
   a simple-minded wrapper around descend_url_p.  */

static int
descend_redirect_p (const char *redirected, const char *original, int depth,
                    struct url *start_url_parsed, struct hash_table *blacklist)
{
  struct url *orig_parsed, *new_parsed;
  struct urlpos *upos;
  int success;

  orig_parsed = url_parse (original, NULL);
  assert (orig_parsed != NULL);

  new_parsed = url_parse (redirected, NULL);
  assert (new_parsed != NULL);

  upos = xmalloc (sizeof (struct urlpos));
  memset (upos, 0, sizeof (*upos));
  upos->url = new_parsed;

  success = descend_url_p (upos, orig_parsed, depth,
                           start_url_parsed, blacklist);

  url_free (orig_parsed);
  url_free (new_parsed);
  xfree (upos);

  if (!success)
    DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));

  return success;
}

/* Register that URL has been successfully downloaded to FILE. */

void
register_download (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  if (!dl_file_url_map)
    dl_file_url_map = make_string_hash_table (0);
  if (!dl_url_file_map)
    dl_url_file_map = make_string_hash_table (0);

  if (!hash_table_contains (dl_file_url_map, file))
    hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
  if (!hash_table_contains (dl_url_file_map, url))
    hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
}
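
/* E.g. after register_download ("http://example.com/", "index.html")
   (a made-up pair), the maps hold:

     dl_file_url_map:  "index.html" -> "http://example.com/"
     dl_url_file_map:  "http://example.com/" -> "index.html"  */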

/* Register that FROM has been redirected to TO.  This assumes that TO
   is successfully downloaded and already registered using
   register_download() above. */

void
register_redirection (const char *from, const char *to)
{
  char *file;

  if (!opt.convert_links)
    return;

  file = hash_table_get (dl_url_file_map, to);
  assert (file != NULL);
  if (!hash_table_contains (dl_url_file_map, from))
    hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
}
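
/* Continuing the example above: if http://example.com/old redirected
   to http://example.com/, both URLs now map to "index.html", so links
   written against either form get converted to the same local file. */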

/* Register that URL corresponds to the HTML file FILE. */

void
register_html (const char *url, const char *file)
{
  if (!opt.convert_links)
    return;
  downloaded_html_files = slist_prepend (downloaded_html_files, file);
}

/* This function is called when the retrieval is done to convert the
   links that have been downloaded.  It has to be called at the end of
   the retrieval, because only then does Wget know conclusively which
   URLs have been downloaded, and which not, so it can tell which
   direction to convert to.

   The "direction" means that the URLs to the files that have been
   downloaded get converted to the relative URL which will point to
   that file.  And the other URLs get converted to the remote URL on
   the server.

   All the downloaded HTMLs are kept in downloaded_html_files, and
   downloaded URLs in urls_downloaded.  All the information is
   extracted from these two lists. */

void
convert_all_links (void)
{
  slist *html;
  struct wget_timer *timer;
  long msecs;
  int file_count = 0;

  timer = wtimer_new ();

  /* Destructively reverse downloaded_html_files to get it in the right order.
     register_html() above used slist_prepend() consistently. */
  downloaded_html_files = slist_nreverse (downloaded_html_files);

  for (html = downloaded_html_files; html; html = html->next)
    {
      struct urlpos *urls, *cur_url;
      char *url;

      DEBUGP (("Rescanning %s\n", html->string));

      /* Determine the URL of the HTML file.  get_urls_html will need
         it. */
      url = hash_table_get (dl_file_url_map, html->string);
      if (url)
        DEBUGP (("It should correspond to %s.\n", url));
      else
        DEBUGP (("I cannot find the corresponding URL.\n"));

      /* Parse the HTML file... */
      urls = get_urls_html (html->string, url, NULL);

      /* We don't respect meta_disallow_follow here because, even if
         the file is not followed, we might still want to convert the
         links that have been followed from other files. */

      for (cur_url = urls; cur_url; cur_url = cur_url->next)
        {
          char *local_name;
          struct url *u = cur_url->url;

          if (cur_url->link_base_p)
            {
              /* Base references have been resolved by our parser, so
                 we turn the base URL into an empty string.  (Perhaps
                 we should remove the tag entirely?)  */
              cur_url->convert = CO_NULLIFY_BASE;
              continue;
            }

          /* We decide the direction of conversion according to whether
             a URL was downloaded.  Downloaded URLs will be converted
             ABS2REL, whereas non-downloaded will be converted REL2ABS. */
          local_name = hash_table_get (dl_url_file_map, u->url);
          if (local_name)
            DEBUGP (("%s marked for conversion, local %s\n",
                     u->url, local_name));

          /* Decide on the conversion type. */
          if (local_name)
            {
              /* We've downloaded this URL.  Convert it to relative
                 form.  We do this even if the URL already is in
                 relative form, because our directory structure may
                 not be identical to that on the server (think `-nd',
                 `--cut-dirs', etc.)  */
              cur_url->convert = CO_CONVERT_TO_RELATIVE;
              cur_url->local_name = xstrdup (local_name);
            }
          else
            {
              /* We haven't downloaded this URL.  If it's not already
                 complete (including a full host name), convert it to
                 that form, so it can be reached while browsing this
                 HTML locally. */
              if (!cur_url->link_complete_p)
                cur_url->convert = CO_CONVERT_TO_COMPLETE;
              cur_url->local_name = NULL;
            }
        }
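
      /* E.g. a link to http://example.com/pics/a.gif (an invented
         URL) becomes something like "pics/a.gif" if that file was
         downloaded in this run, and is left as, or expanded to, the
         full absolute URL if it was not. */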

      /* Convert the links in the file. */
      convert_links (html->string, urls);
      ++file_count;

      free_urlpos (urls);
    }

  msecs = wtimer_elapsed (timer);
  wtimer_delete (timer);
  logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
             file_count, (double)msecs / 1000);
}

/* Cleanup the data structures associated with recursive retrieving
   (the variables above). */

void
recursive_cleanup (void)
{
  if (dl_file_url_map)
    {
      free_keys_and_values (dl_file_url_map);
      hash_table_destroy (dl_file_url_map);
      dl_file_url_map = NULL;
    }
  if (dl_url_file_map)
    {
      free_keys_and_values (dl_url_file_map);
      hash_table_destroy (dl_url_file_map);
      dl_url_file_map = NULL;
    }
  slist_free (downloaded_html_files);
  downloaded_html_files = NULL;
}