/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.

   This file is part of Wget.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#ifdef HAVE_STRING_H
# include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <sys/types.h>
extern char *version_string;

#define ROBOTS_FILENAME "robots.txt"
/* #### Many of these lists should really be hashtables!  */

/* List of downloaded URLs.  */
static urlpos *urls_downloaded;

/* List of HTML URLs.  */
static slist *urls_html;

/* List of undesirable-to-load URLs.  */
static slist *ulist;

/* List of forbidden locations.  */
static char **forbidden = NULL;

/* Current recursion depth.  */
static int depth;

/* Base directory we're recursing from (used by no_parent).  */
static char *base_dir;

/* The host name for which we last checked robots.  */
static char *robots_host;
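/* Nonzero until recursive_retrieve() has been called once; that first
   call uses it to do one-time setup such as seeding the URL lists and
   recording the base directory.  */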
static int first_time = 1;
/* Construct the robots URL.  */
static struct urlinfo *robots_url PARAMS ((const char *, const char *));
static uerr_t retrieve_robots PARAMS ((const char *, const char *));
static char **parse_robots PARAMS ((const char *));
static int robots_match PARAMS ((struct urlinfo *, char **));
/* Cleanup the data structures associated with recursive retrieving
   (the variables above).  */
void
recursive_cleanup (void)
{
  free_slist (ulist);
  ulist = NULL;
  free_slist (urls_html);
  urls_html = NULL;
  free_urlpos (urls_downloaded);
  urls_downloaded = NULL;
  FREE_MAYBE (base_dir);
  FREE_MAYBE (robots_host);
  first_time = 1;
}
/* Reset FIRST_TIME to 1, so that some action can be taken in
   recursive_retrieve().  */
void
recursive_reset (void)
{
  first_time = 1;
}
/* The core of recursive retrieving.  Endless recursion is avoided by
   having all URLs stored in a linked list of URLs, which is checked
   before loading any URL.  That way no URL can get loaded twice.

   The function also supports specification of maximum recursion depth
   and a number of other goodies.  */
uerr_t
recursive_retrieve (const char *file, const char *this_url)
  char *constr, *filename, *newloc;
  char *canon_this_url = NULL;
  int dt, inl, dash_p_leaf_HTML = FALSE;
  int this_url_ftp;		/* See the explanation below.  */
  struct urlinfo *rurl;
  urlpos *url_list, *cur_url;
  char *rfile;			/* For robots.  */
  assert (this_url != NULL);
  assert (file != NULL);
  /* If quota was exceeded earlier, bail out.  */
  if (downloaded_exceeds_quota ())
  /* Cache the current URL in the list.  */
      ulist = add_slist (ulist, this_url, 0);
      urls_downloaded = NULL;
      /* Enter this_url to the slist, in original and "enhanced" form.  */
      err = parseurl (this_url, u, 0);
          ulist = add_slist (ulist, u->url, 0);
          urls_downloaded = add_url (urls_downloaded, u->url, file);
          urls_html = add_slist (urls_html, file, NOSORT);
          base_dir = xstrdup (u->dir);	/* Set the base dir.  */
          /* Set the canonical this_url to be sent as referer.  This
             problem exists only when running the first time.  */
          canon_this_url = xstrdup (u->url);
          DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
  if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel)
    /* We've exceeded the maximum recursion depth specified by the user.  */
      if (opt.page_requisites && depth <= opt.reclevel + 1)
        /* When -p is specified, we can do one more partial recursion from the
           "leaf nodes" on the HTML document tree.  The recursion is partial in
           that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags
           except for <LINK REL="stylesheet">.  */
        dash_p_leaf_HTML = TRUE;
      else
        /* Either -p wasn't specified or it was and we've already gone the one
           extra (pseudo-)level that it affords us, so we need to bail out.  */
          DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
                   depth, opt.reclevel));
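  /* Illustration of the check above: with `-l 2 -p', a page reached at
     depth 3 (one past the -l limit) is still scanned, but only for page
     requisites such as inlined images and <LINK REL="stylesheet">
     targets; anything deeper than that is abandoned here.  */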
  /* Determine whether this_url is an FTP URL.  If it is, it means
     that the retrieval is done through proxy.  In that case, FTP
     links will be followed by default and recursion will not be
     turned off when following them.  */
  this_url_ftp = (urlproto (this_url) == URLFTP);

  /* Get the URLs from an HTML file: */
  url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
                            0, dash_p_leaf_HTML);
  /* Decide what to do with each of the URLs.  A URL will be loaded if
     it meets several requirements, discussed later.  */
  for (cur_url = url_list; cur_url; cur_url = cur_url->next)
      /* If quota was exceeded earlier, bail out.  */
      if (downloaded_exceeds_quota ())
      /* Parse the URL for convenient use in other functions, as well
         as to get the optimized form.  It also checks URL integrity.  */
      if (parseurl (cur_url->url, u, 0) != URLOK)
          DEBUGP (("Yuck!  A bad URL.\n"));
      if (u->proto == URLFILE)
          DEBUGP (("Nothing to do with file:// around here.\n"));
      assert (u->url != NULL);
      constr = xstrdup (u->url);

      /* Several checks to determine whether a file is acceptable to
         load:
         1. check if URL is ftp, and we don't load it
         2. check for relative links (if relative_only is set)
         4. check for no-parent
         5. check for excludes && includes
         7. check for same host (if spanhost is unset), with possible
            gethostbyname baggage
         8. check for robots.txt

         Addendum: If the URL is FTP, and it is to be loaded, only the
         domain and suffix settings are "stronger".

         Note that .html and (yuck) .htm will get loaded regardless of
         suffix rules (but that is remedied later with unlink) unless
         the depth equals the maximum depth.

         More time- and memory-consuming tests should be put later on
         the list.  */
      /* inl is set if the URL we are working on (constr) is stored in
         ulist.  Using it is crucial to avoid the incessant calls to
         in_slist, which is quite slow.  */
      inl = in_slist (ulist, constr);
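      /* Note that each URL rejected by the checks below is also added
         to ulist, so the (potentially expensive) checks need not be
         repeated if the same link turns up in another document.  */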
      /* If it is FTP, and FTP is not to be followed, chuck it out.  */
      if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
          DEBUGP (("Uh, it is FTP but I'm not in the mood to follow FTP.\n"));
          ulist = add_slist (ulist, constr, 0);

      /* If it is an absolute link and absolute links are not to be
         followed, chuck it out.  */
      if (!inl && u->proto != URLFTP)
        if (opt.relative_only && !(cur_url->flags & URELATIVE))
            DEBUGP (("It doesn't really look like a relative link.\n"));
            ulist = add_slist (ulist, constr, 0);

      /* If its domain is not to be accepted/looked-up, chuck it out.  */
      if (!accept_domain (u))
          DEBUGP (("I don't like the smell of that domain.\n"));
          ulist = add_slist (ulist, constr, 0);
      /* Check for parent directory.  */
      if (!inl && opt.no_parent
          /* If the new URL is FTP and the old was not, ignore
             the no-parent restriction.  */
          && !(!this_url_ftp && u->proto == URLFTP))
          /* Check for base_dir first.  */
          if (!(base_dir && frontcmp (base_dir, u->dir)))
              /* Failing that, check for parent dir.  */
              struct urlinfo *ut = newurl ();
              if (parseurl (this_url, ut, 0) != URLOK)
                DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
              else if (!frontcmp (ut->dir, u->dir))
                  /* Failing that too, kill the URL.  */
                  DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
                  ulist = add_slist (ulist, constr, 0);
      /* If the file does not match the acceptance list, or is on the
         rejection list, chuck it out.  The same goes for the
         directory exclude- and include- lists.  */
      if (!inl && (opt.includes || opt.excludes))
          if (!accdir (u->dir, ALLABS))
              DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
              ulist = add_slist (ulist, constr, 0);
      /* We check for acceptance/rejection rules only for non-HTML
         documents.  Since we don't know whether they really are
         HTML, it will be deduced from (an OR-ed list):

         1) u->file is "" (meaning it is a directory)
         2) suffix exists, AND:

         If the file *is* supposed to be HTML, it will *not* be
         subject to acc/rej rules, unless a finite maximum depth has
         been specified and the current depth is the maximum depth.  */
               || (((suf = suffix (constr)) != NULL)
                   && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
                       && ((opt.reclevel != INFINITE_RECURSION) &&
                           (depth != opt.reclevel))))))
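          /* For example, "something.gif" (no HTML suffix) is checked
             against the -A/-R rules here, while "index.html" is spared
             unless we are exactly at the maximum recursion depth.  */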
          if (!acceptable (u->file))
              DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                       constr, u->file));
              ulist = add_slist (ulist, constr, 0);
      /* Optimize the URL (which includes possible DNS lookup) only
         after all other possibilities have been exhausted.  */
      if (!opt.simple_check)
      /* Just lowercase the hostname.  */
      for (p = u->host; *p; p++)
        *p = TOLOWER (*p);
      u->url = str_url (u, 0);
      constr = xstrdup (u->url);
      inl = in_slist (ulist, constr);
      if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
        if (!opt.spanhost && this_url && !same_host (this_url, constr))
            DEBUGP (("This is not the same hostname as the parent's.\n"));
            ulist = add_slist (ulist, constr, 0);
      /* What about robots.txt?  */
      if (!inl && opt.use_robots && u->proto == URLHTTP)
          /* Since Wget knows about only one set of robot rules at a
             time, /robots.txt must be reloaded whenever a new host is
             accessed.

             robots_host holds the host the current `forbid' variable
             applies to.  */
          if (!robots_host || !same_host (robots_host, u->host))
              FREE_MAYBE (robots_host);
              /* Now make robots_host the new host, no matter what the
                 result will be.  So if there is no /robots.txt on the
                 site, Wget will not retry getting robots all the
                 time.  */
              robots_host = xstrdup (u->host);
              free_vec (forbidden);
              err = retrieve_robots (constr, ROBOTS_FILENAME);
                  rurl = robots_url (constr, ROBOTS_FILENAME);
                  rfile = url_filename (rurl);
                  forbidden = parse_robots (rfile);

          /* Now that we have (or don't have) robots, we can check for
             them.  */
          if (!robots_match (u, forbidden))
              DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
                       ROBOTS_FILENAME));
              ulist = add_slist (ulist, constr, 0);
      /* If it wasn't chucked out, do something with it.  */
          DEBUGP (("I've decided to load it -> "));
          /* Add it to the list of already-loaded URLs.  */
          ulist = add_slist (ulist, constr, 0);
          /* Automatically followed FTPs will *not* be downloaded
             recursively.  */
          if (u->proto == URLFTP)
              /* Don't you adore side-effects?  */
          /* Reset its type.  */
          retrieve_url (constr, &filename, &newloc,
                        canon_this_url ? canon_this_url : this_url, &dt);
          if (u->proto == URLFTP)
          /* In case of convert_links: If there was no error, add it to
             the list of downloaded URLs.  We might need it for
             conversion.  */
          if (opt.convert_links && filename)
              urls_downloaded = add_url (urls_downloaded, constr, filename);
              /* If the URL is HTML, note it.  */
                  urls_html = add_slist (urls_html, filename, NOSORT);
          /* If there was no error, and the type is text/html, parse
             it recursively.  */
              recursive_retrieve (filename, constr);
          DEBUGP (("%s is not text/html so we don't chase.\n",
                   filename ? filename : "(null)"));

      if (opt.delete_after || (filename && !acceptable (filename)))
        /* Either --delete-after was specified, or we loaded this otherwise
           rejected (e.g. by -R) HTML file just so we could harvest its
           hyperlinks -- in either case, delete the local file.  */
          DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                   opt.delete_after ? "--delete-after" :
                   "recursive rejection criteria"));
          logprintf (LOG_VERBOSE,
                     (opt.delete_after ? _("Removing %s.\n")
                      : _("Removing %s since it should be rejected.\n")),
                     filename);
          if (unlink (filename))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));

      /* If everything was OK, and links are to be converted, let's
         store the local filename.  */
      if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
          cur_url->flags |= UABS2REL;
          cur_url->local_name = xstrdup (filename);
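          /* convert_links() relies on local_name together with the
             UABS2REL flag to rewrite this reference in the referring
             document later on.  */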
        DEBUGP (("%s already in list, so we don't load.\n", constr));
      /* Free filename and constr.  */
      FREE_MAYBE (filename);

  /* Increment the pbuf for the appropriate size.  */
  if (opt.convert_links && !opt.delete_after)
    convert_links (file, url_list);
  /* Free the linked list of URLs.  */
  free_urlpos (url_list);
  /* Free the canonical this_url.  */
  FREE_MAYBE (canon_this_url);
  /* Decrement the recursion depth.  */
  --depth;
  if (downloaded_exceeds_quota ())
/* Simple calls to convert_links will often fail because only the
   downloaded files are converted, and Wget cannot know which files
   will be converted in the future.  So, if we have file fileone.html
   with:

   <a href=/c/something.gif>

   and /c/something.gif was not downloaded because it exceeded the
   recursion depth, the reference will *not* be changed.

   However, later we can encounter /c/something.gif from an "upper"
   level HTML (let's call it filetwo.html), and it gets downloaded.

   But now we have a problem because /c/something.gif will be
   correctly transformed in filetwo.html, but not in fileone.html,
   since Wget could not have known that /c/something.gif will be
   downloaded in the future.

   This is why Wget must, after the whole retrieval, call
   convert_all_links to go once more through the entire list of
   retrieved HTMLs, and re-convert them.

   All the downloaded HTMLs are kept in urls_html, and downloaded URLs
   in urls_downloaded.  From these two lists the information is
   extracted.  */
void
convert_all_links (void)
  urlpos *l1, *l2, *urls;

  for (html = urls_html; html; html = html->next)
      DEBUGP (("Rescanning %s\n", html->string));
      /* Determine the URL of the HTML file.  get_urls_html will need
         it.  */
      for (urlhtml = urls_downloaded; urlhtml; urlhtml = urlhtml->next)
        if (!strcmp (urlhtml->local_name, html->string))
        DEBUGP (("It should correspond to %s.\n", urlhtml->url));
        DEBUGP (("I cannot find the corresponding URL.\n"));
      /* Parse the HTML file...  */
      urls = get_urls_html (html->string, urlhtml ? urlhtml->url : NULL, 1,
                            FALSE);
      for (l1 = urls; l1; l1 = l1->next)
          /* The URL must be in canonical form to be compared.  */
          res = parseurl (l1->url, u, 0);
          /* We decide the direction of conversion according to whether
             a URL was downloaded.  Downloaded URLs will be converted
             ABS2REL, whereas non-downloaded will be converted REL2ABS.
             Note: not yet implemented; only ABS2REL works.  */
          for (l2 = urls_downloaded; l2; l2 = l2->next)
            if (!strcmp (l2->url, u->url))
                DEBUGP (("%s flagged for conversion, local %s\n",
                         l2->url, l2->local_name));
          /* Clear the flags.  */
          l1->flags &= ~ (UABS2REL | UREL2ABS);
          /* Decide on the conversion direction.  */
              l1->flags |= UABS2REL;
              l1->local_name = xstrdup (l2->local_name);
              l1->flags |= UREL2ABS;
              l1->local_name = NULL;
      /* Convert the links in the file.  */
      convert_links (html->string, urls);
/* Robots support.  */

/* Construct the robots URL.  */
static struct urlinfo *
robots_url (const char *url, const char *robots_filename)
  struct urlinfo *u = newurl ();

  err = parseurl (url, u, 0);
  assert (err == URLOK && u->proto == URLHTTP);
  u->dir = xstrdup ("");
  u->file = xstrdup (robots_filename);
  u->url = str_url (u, 0);
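  /* E.g. the robots URL for "http://host/some/dir/page.html" is
     "http://host/robots.txt": the directory and file components of the
     parsed URL are simply replaced before rebuilding the URL string.  */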
/* Retrieves the robots_filename from the root server directory, if
   possible.  Returns ROBOTSOK if robots were retrieved OK, and
   NOROBOTS if robots could not be retrieved for any reason.  */
static uerr_t
retrieve_robots (const char *url, const char *robots_filename)
  u = robots_url (url, robots_filename);
  logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
  err = retrieve_url (u->url, NULL, NULL, NULL, &dt);
/* Parse the robots_filename and return the disallowed path components
   in a malloc-ed vector of character pointers.

   It should be fully compliant with the syntax as described in the
   file norobots.txt, adopted by the robots mailing list
   (robots@webcrawler.com).  */
static char **
parse_robots (const char *robots_filename)
  char *line, *cmd, *str, *p;
  char *base_version, *version;
  int wget_matched;		/* Is the part meant for Wget?  */

  fp = fopen (robots_filename, "rb");
  /* Kill version number.  */
      STRDUP_ALLOCA (base_version, opt.useragent);
      STRDUP_ALLOCA (version, opt.useragent);
      int len = 10 + strlen (version_string);
      base_version = (char *)alloca (len);
      sprintf (base_version, "Wget/%s", version_string);
      version = (char *)alloca (len);
      sprintf (version, "Wget/%s", version_string);
  for (p = version; *p; p++)
    *p = TOLOWER (*p);
  for (p = base_version; *p && *p != '/'; p++)
    *p = TOLOWER (*p);
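  /* At this point VERSION holds the full, lowercased agent string
     (e.g. "wget/" followed by the version number when built from
     version_string), while BASE_VERSION covers just the part before
     any '/'; the User-agent lines below are matched against these.  */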
  /* Setting this to 1 means that Wget considers itself under
     restrictions by default, even if the User-Agent field is not
     present.  However, if it finds the user-agent set to anything
     other than Wget, the rest will be ignored (up to the following
     User-Agent field).  Thus you may have something like:

     Disallow: 1
     Disallow: 2
     User-Agent: stupid-robot
     Disallow: 3
     Disallow: 4
     User-Agent: Wget
     Disallow: 5
     Disallow: 6
     User-Agent: *
     Disallow: 7

     In this case the 1, 2, 5, 6 and 7 disallow lines will be
     stored.  */
  wget_matched = 1;
  while ((line = read_whole_line (fp)))
      len = strlen (line);
      /* Destroy <CR><LF> if present.  */
      if (len && line[len - 1] == '\n')
        line[--len] = '\0';
      if (len && line[len - 1] == '\r')
        line[--len] = '\0';
      /* According to specifications, optional space may be at the
         ...  */
      DEBUGP (("Line: %s\n", line));
      for (cmd = line; *cmd && ISSPACE (*cmd); cmd++);
          DEBUGP (("(chucked out)\n"));
      for (str = cmd; *str && *str != ':'; str++);
          DEBUGP (("(chucked out)\n"));
      /* Zero-terminate the command.  */
      /* Look for the string beginning...  */
      for (; *str && ISSPACE (*str); str++);
      /* Look for comments or trailing spaces and kill them off.  */
      for (p = str; *p; p++)
        if (*p && ISSPACE (*p) && ((*(p + 1) == '#') || (*(p + 1) == '\0')))
            /* We have found either a shell-style comment `<sp>+#' or some
               trailing spaces.  Now rewind to the beginning of the spaces
               and place '\0' there.  */
            while (p > str && ISSPACE (*p))
      if (!strcasecmp (cmd, "User-agent"))
          /* Lowercase the agent string.  */
          for (p = str; *p; p++)
            *p = TOLOWER (*p);
          /* If the string is `*', it matches.  */
          if (*str == '*' && !*(str + 1))
            match = 1;
          /* If the string contains wildcards, we'll run it through
             fnmatch().  */
          if (has_wildcards_p (str))
              /* If the string contains '/', compare with the full
                 version.  Else, compare it to base_version.  */
              if (strchr (str, '/'))
                match = !fnmatch (str, version, 0);
              else
                match = !fnmatch (str, base_version, 0);
          else			/* Substring search */
              if (strstr (version, str))
                match = 1;
          /* If Wget is not matched, skip all the entries up to the
             next User-agent field.  */
          wget_matched = match;
      else if (!wget_matched)
          DEBUGP (("(chucking out since it is not applicable for Wget)\n"));
      else if (!strcasecmp (cmd, "Disallow"))
          /* If "Disallow" is empty, the robot is welcome.  */
              entries = (char **)xmalloc (sizeof (char *));
          entries = (char **)xrealloc (entries, (num + 2) * sizeof (char *));
          entries[num] = xstrdup (str);
          entries[++num] = NULL;
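          /* The vector is kept NULL-terminated, so callers such as
             robots_match() can walk it without a separate element
             count.  */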
          /* Strip trailing spaces, according to specifications.  */
          for (i = strlen (str); i >= 0 && ISSPACE (str[i]); i--)
            if (ISSPACE (str[i]))
              str[i] = '\0';
      else
          /* Unknown command.  */
          DEBUGP (("(chucked out)\n"));
/* May the URL url be loaded according to the disallowing rules stored
   in FORBIDDEN?  */
static int
robots_match (struct urlinfo *u, char **forbidden)
  DEBUGP (("Matching %s against: ", u->path));
  for (; *forbidden; forbidden++)
      DEBUGP (("%s ", *forbidden));
      l = strlen (*forbidden);
      /* If dir is forbidden, we may not load the file.  */
      if (strncmp (u->path, *forbidden, l) == 0)
          DEBUGP (("matched.\n"));
          return 0;		/* Matches, i.e. does not load...  */
  DEBUGP (("not matched.\n"));
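  /* Example: with forbidden = { "/cgi-bin/", "/tmp/", NULL }, a URL
     whose path is "/cgi-bin/search" matches the first prefix and is
     refused, while "/index.html" matches nothing and may be loaded.  */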