1 /* Handling of recursive HTTP retrieving.
2 Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
28 #endif /* HAVE_STRING_H */
31 #endif /* HAVE_UNISTD_H */
34 #include <sys/types.h>
51 extern char *version_string;
/* NOTE(review): this listing is elided -- the embedded numbers are the
   original file's line numbers and gaps mark missing lines.  Code below
   is kept byte-identical; only comments are added.  */
/* Maps a local file name -> the URL it was saved from.  Filled by
   register_download() below and read by convert_all_links().  */
53 static struct hash_table *dl_file_url_map;
/* The reverse map: URL -> local file name.  Also filled by
   register_download(); convert_all_links() uses it to decide the
   conversion direction for each link.  */
54 static struct hash_table *dl_url_file_map;
56 /* List of HTML files downloaded in this Wget run. Used for link
57 conversion after Wget is done. */
58 static slist *downloaded_html_files;
60 /* List of undesirable-to-load URLs. */
61 static struct hash_table *undesirable_urls;
63 /* Current recursion depth. */
/* NOTE(review): the declaration this comment describes (presumably
   `static int depth;`, used throughout recursive_retrieve()) is elided
   from this listing -- confirm against the full file.  */
66 /* Base directory we're recursing from (used by no_parent). */
67 static char *base_dir;
/* Nonzero until the first call of recursive_retrieve() in a run; the
   once-per-run initialization there is keyed on it, and
   recursive_reset() sets it back to 1 (see its comment below).  */
69 static int first_time = 1;
72 /* Cleanup the data structures associated with recursive retrieving
73 (the variables above). */
/* NOTE(review): the listing elides this function's return type, braces
   and the NULL-guards around each free block (line-number gaps at
   74/76-78, 81-83, 87-89, 93, 98) -- verify against the full file.
   Each structure is freed and its pointer reset to NULL so a later
   recursive run starts from a clean state.  */
75 recursive_cleanup (void)
79 string_set_free (undesirable_urls);
80 undesirable_urls = NULL;
84 free_keys_and_values (dl_file_url_map);
85 hash_table_destroy (dl_file_url_map);
86 dl_file_url_map = NULL;
90 free_keys_and_values (dl_url_file_map);
91 hash_table_destroy (dl_url_file_map);
92 dl_url_file_map = NULL;
/* NOTE(review): undesirable_urls is NULLed a second time here in the
   original; redundant with line 80 but harmless.  */
94 undesirable_urls = NULL;
95 slist_free (downloaded_html_files);
96 downloaded_html_files = NULL;
97 FREE_MAYBE (base_dir);
101 /* Reset FIRST_TIME to 1, so that some action can be taken in
102 recursive_retrieve(). */
104 recursive_reset (void)
/* NOTE(review): the body is elided here; per the comment above it
   presumably just does `first_time = 1;` -- confirm in the full file.  */
109 /* The core of recursive retrieving. Endless recursion is avoided by
110 having all URLs stored to a linked list of URLs, which is checked
111 before loading any URL. That way no URL can get loaded twice.
113 The function also supports specification of maximum recursion depth
114 and a number of other goodies. */
/* The main recursive-retrieval driver: parses FILE (an already
   downloaded HTML document fetched from THIS_URL), extracts its links,
   filters each one through the accept/reject rules, downloads the
   survivors, and recurses into any that turn out to be text/html.
   NOTE(review): this listing is heavily elided (gaps in the embedded
   line numbers hide braces, `return` statements and whole statements);
   comments below are hedged where the supporting code is not visible.  */
116 recursive_retrieve (const char *file, const char *this_url)
118 char *constr, *filename, *newloc;
119 char *canon_this_url = NULL;
120 int dt, inl, dash_p_leaf_HTML = FALSE;
121 int meta_disallow_follow;
122 int this_url_ftp; /* See below the explanation */
124 urlpos *url_list, *cur_url;
127 assert (this_url != NULL);
128 assert (file != NULL);
129 /* If quota was exceeded earlier, bail out. */
130 if (downloaded_exceeds_quota ())
/* NOTE(review): the early-return body of the quota check is elided.  */
132 /* Cache the current URL in the list. */
135 /* These three operations need to be done only once per Wget
136 run. They should probably be at a different location. */
/* NOTE(review): this init section is presumably guarded by the
   file-scope `first_time` flag (guard elided) -- confirm.  */
137 if (!undesirable_urls)
138 undesirable_urls = make_string_hash_table (0);
140 hash_table_clear (undesirable_urls);
141 string_set_add (undesirable_urls, this_url);
142 /* Enter this_url to the hash table, in original and "enhanced" form. */
144 err = parseurl (this_url, u, 0);
147 string_set_add (undesirable_urls, u->url);
149 base_dir = xstrdup (u->dir); /* Set the base dir. */
150 /* Set the canonical this_url to be sent as referer. This
151 problem exists only when running the first time. */
152 canon_this_url = xstrdup (u->url);
156 DEBUGP (("Double yuck! The *base* URL is broken.\n"));
/* Depth limiting: `depth` is file-scope state (declaration elided
   above); it is incremented/decremented around each recursion level.  */
166 if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel)
167 /* We've exceeded the maximum recursion depth specified by the user. */
169 if (opt.page_requisites && depth <= opt.reclevel + 1)
170 /* When -p is specified, we can do one more partial recursion from the
171 "leaf nodes" on the HTML document tree. The recursion is partial in
172 that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags
173 except for <LINK REL="stylesheet">. */
174 dash_p_leaf_HTML = TRUE;
176 /* Either -p wasn't specified or it was and we've already gone the one
177 extra (pseudo-)level that it affords us, so we need to bail out. */
179 DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
180 depth, opt.reclevel));
186 /* Determine whether this_url is an FTP URL. If it is, it means
187 that the retrieval is done through proxy. In that case, FTP
188 links will be followed by default and recursion will not be
189 turned off when following them. */
190 this_url_ftp = (urlproto (this_url) == URLFTP);
192 /* Get the URL-s from an HTML file: */
193 url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
194 dash_p_leaf_HTML, &meta_disallow_follow);
196 if (opt.use_robots && meta_disallow_follow)
198 /* The META tag says we are not to follow this file. Respect
200 free_urlpos (url_list);
204 /* Decide what to do with each of the URLs. A URL will be loaded if
205 it meets several requirements, discussed later. */
206 for (cur_url = url_list; cur_url; cur_url = cur_url->next)
208 /* If quota was exceeded earlier, bail out. */
209 if (downloaded_exceeds_quota ())
211 /* Parse the URL for convenient use in other functions, as well
212 as to get the optimized form. It also checks URL integrity. */
214 if (parseurl (cur_url->url, u, 0) != URLOK)
216 DEBUGP (("Yuck! A bad URL.\n"));
220 if (u->proto == URLFILE)
222 DEBUGP (("Nothing to do with file:// around here.\n"));
226 assert (u->url != NULL);
227 constr = xstrdup (u->url);
229 /* Several checkings whether a file is acceptable to load:
230 1. check if URL is ftp, and we don't load it
231 2. check for relative links (if relative_only is set)
233 4. check for no-parent
234 5. check for excludes && includes
236 7. check for same host (if spanhost is unset), with possible
237 gethostbyname baggage
238 8. check for robots.txt
240 Addendum: If the URL is FTP, and it is to be loaded, only the
241 domain and suffix settings are "stronger".
243 Note that .html and (yuck) .htm will get loaded regardless of
244 suffix rules (but that is remedied later with unlink) unless
245 the depth equals the maximum depth.
247 More time- and memory- consuming tests should be put later on
250 /* inl is set if the URL we are working on (constr) is stored in
251 undesirable_urls. Using it is crucial to avoid unnecessary
252 repeated continuous hits to the hash table. */
253 inl = string_set_contains (undesirable_urls, constr);
255 /* If it is FTP, and FTP is not followed, chuck it out. */
/* Rejected URLs are added to undesirable_urls so later occurrences
   are filtered by the cheap `inl` lookup above.  */
257 if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
259 DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
260 string_set_add (undesirable_urls, constr);
263 /* If it is absolute link and they are not followed, chuck it
265 if (!inl && u->proto != URLFTP)
266 if (opt.relative_only && !cur_url->link_relative_p)
268 DEBUGP (("It doesn't really look like a relative link.\n"));
269 string_set_add (undesirable_urls, constr);
272 /* If its domain is not to be accepted/looked-up, chuck it out. */
274 if (!accept_domain (u))
276 DEBUGP (("I don't like the smell of that domain.\n"));
277 string_set_add (undesirable_urls, constr);
280 /* Check for parent directory. */
281 if (!inl && opt.no_parent
282 /* If the new URL is FTP and the old was not, ignore
284 && !(!this_url_ftp && u->proto == URLFTP))
286 /* Check for base_dir first. */
287 if (!(base_dir && frontcmp (base_dir, u->dir)))
289 /* Failing that, check for parent dir. */
290 struct urlinfo *ut = newurl ();
291 if (parseurl (this_url, ut, 0) != URLOK)
292 DEBUGP (("Double yuck! The *base* URL is broken.\n"));
293 else if (!frontcmp (ut->dir, u->dir))
295 /* Failing that too, kill the URL. */
296 DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
297 string_set_add (undesirable_urls, constr);
/* NOTE(review): freeing of `ut` is elided here -- confirm there is a
   freeurl()/delurl() call in the full file to avoid a leak.  */
303 /* If the file does not match the acceptance list, or is on the
304 rejection list, chuck it out. The same goes for the
305 directory exclude- and include- lists. */
306 if (!inl && (opt.includes || opt.excludes))
308 if (!accdir (u->dir, ALLABS))
310 DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
311 string_set_add (undesirable_urls, constr);
318 /* We check for acceptance/rejection rules only for non-HTML
319 documents. Since we don't know whether they really are
320 HTML, it will be deduced from (an OR-ed list):
322 1) u->file is "" (meaning it is a directory)
323 2) suffix exists, AND:
327 If the file *is* supposed to be HTML, it will *not* be
328 subject to acc/rej rules, unless a finite maximum depth has
329 been specified and the current depth is the maximum depth. */
/* NOTE(review): the opening of this condition (lines 330-331) is
   elided; only the OR-ed suffix clause survives below.  */
332 || (((suf = suffix (constr)) != NULL)
333 && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
334 && ((opt.reclevel != INFINITE_RECURSION) &&
335 (depth != opt.reclevel))))))
337 if (!acceptable (u->file))
339 DEBUGP (("%s (%s) does not match acc/rej rules.\n",
341 string_set_add (undesirable_urls, constr);
347 /* Optimize the URL (which includes possible DNS lookup) only
348 after all other possibilities have been exhausted. */
351 if (!opt.simple_check)
/* NOTE(review): the "real" host lookup branch is elided; only the
   simple-check fallback (lowercasing the host) is visible.  */
356 /* Just lowercase the hostname. */
357 for (p = u->host; *p; p++)
360 u->url = str_url (u, 0);
/* constr is rebuilt from the canonicalized URL; the free of the old
   constr (line 362) is elided.  */
363 constr = xstrdup (u->url);
364 /* After we have canonicalized the URL, check if we have it
365 on the black list. */
366 if (string_set_contains (undesirable_urls, constr))
368 /* This line is bogus. */
369 /*string_set_add (undesirable_urls, constr);*/
371 if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
372 if (!opt.spanhost && this_url && !same_host (this_url, constr))
374 DEBUGP (("This is not the same hostname as the parent's.\n"));
375 string_set_add (undesirable_urls, constr);
379 /* What about robots.txt? */
/* robots.txt specs are fetched once per host:port and cached via
   res_register_specs(); failures get dummy specs so we never re-fetch.  */
380 if (!inl && opt.use_robots && u->proto == URLHTTP)
382 struct robot_specs *specs = res_get_specs (u->host, u->port);
386 if (res_retrieve_file (constr, &rfile))
388 specs = res_parse_from_file (rfile);
393 /* If we cannot get real specs, at least produce
394 dummy ones so that we can register them and stop
395 trying to retrieve them. */
396 specs = res_parse ("", 0);
398 res_register_specs (u->host, u->port, specs);
401 /* Now that we have (or don't have) robots.txt specs, we can
402 check what they say. */
403 if (!res_match_path (specs, u->path))
405 DEBUGP (("Not following %s because robots.txt forbids it.\n",
407 string_set_add (undesirable_urls, constr);
413 /* If it wasn't chucked out, do something with it. */
416 DEBUGP (("I've decided to load it -> "));
417 /* Add it to the list of already-loaded URL-s. */
418 string_set_add (undesirable_urls, constr);
419 /* Automatically followed FTPs will *not* be downloaded
421 if (u->proto == URLFTP)
423 /* Don't you adore side-effects? */
426 /* Reset its type. */
/* Download the URL; FILENAME/NEWLOC come back from retrieve_url and
   DT carries the result flags (RETROKF, TEXTHTML...).  */
429 retrieve_url (constr, &filename, &newloc,
430 canon_this_url ? canon_this_url : this_url, &dt);
431 if (u->proto == URLFTP)
441 /* If there was no error, and the type is text/html, parse
/* Recurse into the freshly downloaded HTML document.  */
446 recursive_retrieve (filename, constr);
449 DEBUGP (("%s is not text/html so we don't chase.\n",
450 filename ? filename: "(null)"));
452 if (opt.delete_after || (filename && !acceptable (filename)))
453 /* Either --delete-after was specified, or we loaded this otherwise
454 rejected (e.g. by -R) HTML file just so we could harvest its
455 hyperlinks -- in either case, delete the local file. */
457 DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
458 opt.delete_after ? "--delete-after" :
459 "recursive rejection criteria"));
460 logprintf (LOG_VERBOSE,
461 (opt.delete_after ? _("Removing %s.\n")
462 : _("Removing %s since it should be rejected.\n")),
464 if (unlink (filename))
465 logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
469 /* If everything was OK, and links are to be converted, let's
470 store the local filename. */
471 if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
473 cur_url->convert = CO_CONVERT_TO_RELATIVE;
474 cur_url->local_name = xstrdup (filename);
478 DEBUGP (("%s already in list, so we don't load.\n", constr));
479 /* Free filename and constr. */
480 FREE_MAYBE (filename);
/* NOTE(review): the matching free of `constr` (line ~481) is elided.  */
483 /* Increment the pbuf for the appropriate size. */
/* First conversion pass over this document; convert_all_links() does
   the final pass once the whole retrieval is finished.  */
485 if (opt.convert_links && !opt.delete_after)
486 /* This is merely the first pass: the links that have been
487 successfully downloaded are converted. In the second pass,
488 convert_all_links() will also convert those links that have NOT
489 been downloaded to their canonical form. */
490 convert_links (file, url_list);
491 /* Free the linked list of URL-s. */
492 free_urlpos (url_list);
493 /* Free the canonical this_url. */
494 FREE_MAYBE (canon_this_url);
495 /* Decrement the recursion depth. */
497 if (downloaded_exceeds_quota ())
/* Record that URL was saved to local FILE, in both directions
   (file -> url and url -> file), for the link-conversion pass in
   convert_all_links().  Both hash tables own strdup'ed copies of
   their keys and values.
   NOTE(review): the early-return body of the !opt.convert_links guard
   (line 507) is elided -- without -k nothing is recorded.  */
504 register_download (const char *url, const char *file)
506 if (!opt.convert_links)
508 if (!dl_file_url_map)
509 dl_file_url_map = make_string_hash_table (0);
510 hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
511 if (!dl_url_file_map)
512 dl_url_file_map = make_string_hash_table (0);
513 hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
/* Remember FILE as a downloaded HTML document so convert_all_links()
   can rescan it after the whole retrieval is done.  Only active with
   -k (early return for !opt.convert_links elided, line 520).
   NOTE(review): the URL parameter is unused in the visible lines;
   downloaded_html_files stores file names, prepended (so the list is
   reversed before use in convert_all_links()).  */
517 register_html (const char *url, const char *file)
519 if (!opt.convert_links)
521 downloaded_html_files = slist_prepend (downloaded_html_files, file);
524 /* convert_links() is called from recursive_retrieve() after we're
525 done with an HTML file. This call to convert_links is not complete
526 because it converts only the downloaded files, and Wget cannot know
527 which files will be downloaded afterwards. So, if we have file
530 <a href="/c/something.gif">
532 and /c/something.gif was not downloaded because it exceeded the
533 recursion depth, the reference will *not* be changed.
535 However, later we can encounter /c/something.gif from an "upper"
536 level HTML (let's call it filetwo.html), and it gets downloaded.
538 But now we have a problem because /c/something.gif will be
539 correctly transformed in filetwo.html, but not in fileone.html,
540 since Wget could not have known that /c/something.gif will be
541 downloaded in the future.
543 This is why Wget must, after the whole retrieval, call
544 convert_all_links to go once more through the entire list of
545 retrieved HTMLs, and re-convert them.
547 All the downloaded HTMLs are kept in downloaded_html_files, and downloaded URLs
548 in urls_downloaded. From these two lists information is
551 convert_all_links (void)
555 /* Destructively reverse downloaded_html_files to get it in the right order.
556 recursive_retrieve() used slist_prepend() consistently. */
557 downloaded_html_files = slist_nreverse (downloaded_html_files);
559 for (html = downloaded_html_files; html; html = html->next)
561 urlpos *urls, *cur_url;
564 DEBUGP (("Rescanning %s\n", html->string));
565 /* Determine the URL of the HTML file. get_urls_html will need
567 url = hash_table_get (dl_file_url_map, html->string);
569 DEBUGP (("It should correspond to %s.\n", url));
571 DEBUGP (("I cannot find the corresponding URL.\n"));
572 /* Parse the HTML file... */
573 urls = get_urls_html (html->string, url, FALSE, NULL);
574 /* We don't respect meta_disallow_follow here because, even if
575 the file is not followed, we might still want to convert the
576 links that have been followed from other files. */
577 for (cur_url = urls; cur_url; cur_url = cur_url->next)
581 /* The URL must be in canonical form to be compared. */
582 struct urlinfo *u = newurl ();
583 uerr_t res = parseurl (cur_url->url, u, 0);
589 /* We decide the direction of conversion according to whether
590 a URL was downloaded. Downloaded URLs will be converted
591 ABS2REL, whereas non-downloaded will be converted REL2ABS. */
592 local_name = hash_table_get (dl_url_file_map, u->url);
594 DEBUGP (("%s marked for conversion, local %s\n",
595 u->url, local_name));
596 /* Decide on the conversion direction. */
599 /* We've downloaded this URL. Convert it to relative
600 form. We do this even if the URL already is in
601 relative form, because our directory structure may
602 not be identical to that on the server (think `-nd',
603 `--cut-dirs', etc.) */
604 cur_url->convert = CO_CONVERT_TO_RELATIVE;
605 cur_url->local_name = xstrdup (local_name);
609 /* We haven't downloaded this URL. If it's not already
610 complete (including a full host name), convert it to
611 that form, so it can be reached while browsing this
613 if (!cur_url->link_complete_p)
614 cur_url->convert = CO_CONVERT_TO_COMPLETE;
615 cur_url->local_name = NULL;
619 /* Convert the links in the file. */
620 convert_links (html->string, urls);