/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.

   This file is part of Wget.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
#ifdef HAVE_STRING_H
# include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <sys/types.h>
extern char *version_string;
#define ROBOTS_FILENAME "robots.txt"

/* #### Many of these lists should really be hashtables!  */

/* List of downloaded URLs.  */
static urlpos *urls_downloaded;

/* List of HTML URLs.  */
static slist *urls_html;

/* List of undesirable-to-load URLs.  */
static slist *ulist;

/* List of forbidden locations.  */
static char **forbidden = NULL;

/* Current recursion depth.  */
static int depth;

/* Base directory we're recursing from (used by no_parent).  */
static char *base_dir;

/* The host name for which we last checked robots.  */
static char *robots_host;

static int first_time = 1;
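
/* A note on FIRST_TIME above: it guards the one-time setup in
   recursive_retrieve(), which caches the starting URL and records
   base_dir and the canonical referer, and recursive_reset() simply puts
   it back to 1 so that a later top-level retrieval starts from scratch.  */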
/* Construct the robots URL.  */
static struct urlinfo *robots_url PARAMS ((const char *, const char *));
static uerr_t retrieve_robots PARAMS ((const char *, const char *));
static char **parse_robots PARAMS ((const char *));
static int robots_match PARAMS ((struct urlinfo *, char **));
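
/* How the robots helpers above fit together: robots_url() builds the
   http://host/robots.txt URL for a page, retrieve_robots() fetches it,
   parse_robots() turns the local copy into a NULL-terminated vector of
   disallowed path prefixes, and robots_match() checks a parsed URL
   against that vector before the URL is scheduled for download.  */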
/* Clean up the data structures associated with recursive retrieving
   (the variables above).  */
recursive_cleanup (void)
  free_slist (urls_html);
  free_urlpos (urls_downloaded);
  urls_downloaded = NULL;
  FREE_MAYBE (base_dir);
  FREE_MAYBE (robots_host);
/* Reset FIRST_TIME to 1, so that recursive_retrieve() performs its
   one-time initialization again.  */
recursive_reset (void)
/* The core of recursive retrieving.  Endless recursion is avoided by
   storing all downloaded URLs in a linked list, which is checked before
   any URL is loaded; that way no URL can get loaded twice.

   The function also supports specification of maximum recursion depth
   and a number of other goodies.  */
recursive_retrieve (const char *file, const char *this_url)
  char *constr, *filename, *newloc;
  char *canon_this_url = NULL;
  int this_url_ftp;             /* See the explanation below.  */
  struct urlinfo *rurl;
  urlpos *url_list, *cur_url;
  char *rfile;                  /* For robots.  */

  assert (this_url != NULL);
  assert (file != NULL);
  /* If quota was exceeded earlier, bail out.  */
  if (opt.quota && (opt.downloaded > opt.quota))
  /* Cache the current URL in the list.  */
  ulist = add_slist (ulist, this_url, 0);
  urls_downloaded = NULL;
  /* Enter this_url into the slist, in original and "enhanced" form.  */
  err = parseurl (this_url, u, 0);
  ulist = add_slist (ulist, u->url, 0);
  urls_downloaded = add_url (urls_downloaded, u->url, file);
  urls_html = add_slist (urls_html, file, NOSORT);
  base_dir = xstrdup (u->dir);  /* Set the base dir.  */
  /* Set the canonical this_url to be sent as the referer.  This
     problem exists only when running for the first time.  */
  canon_this_url = xstrdup (u->url);
  DEBUGP (("Double yuck! The *base* URL is broken.\n"));
  /* Bail out if opt.reclevel is exceeded.  */
  if ((opt.reclevel != 0) && (depth > opt.reclevel))
      DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
               depth, opt.reclevel));
  /* Determine whether this_url is an FTP URL.  If it is, it means
     that the retrieval is done through a proxy.  In that case, FTP
     links will be followed by default and recursion will not be
     turned off when following them.  */
  this_url_ftp = (urlproto (this_url) == URLFTP);

  /* Get the URLs from an HTML file: */
  url_list = get_urls_html (file,
                            canon_this_url ? canon_this_url : this_url, 0);
  /* Decide what to do with each of the URLs.  A URL will be loaded if
     it meets several requirements, discussed later.  */
  for (cur_url = url_list; cur_url; cur_url = cur_url->next)
      /* If quota was exceeded earlier, bail out.  */
      if (opt.quota && (opt.downloaded > opt.quota))
      /* Parse the URL for convenient use in other functions, as well
         as to get the optimized form.  It also checks URL integrity.  */
      if (parseurl (cur_url->url, u, 0) != URLOK)
          DEBUGP (("Yuck! A bad URL.\n"));
      if (u->proto == URLFILE)
          DEBUGP (("Nothing to do with file:// around here.\n"));
      assert (u->url != NULL);
      constr = xstrdup (u->url);
      /* Several checks whether a file is acceptable to load:
         1. check if URL is ftp, and we don't load it
         2. check for relative links (if relative_only is set)
         3. check for domain
         4. check for no-parent
         5. check for excludes && includes
         6. check for suffix
         7. check for same host (if spanhost is unset), with possible
            gethostbyname baggage
         8. check for robots.txt

         Addendum: If the URL is FTP, and it is to be loaded, only the
         domain and suffix settings are "stronger".

         Note that .html and (yuck) .htm will get loaded regardless of
         suffix rules (but that is remedied later with unlink).

         More time- and memory-consuming tests should be put later on
         the list.  */

      /* inl is set if the URL we are working on (constr) is stored in
         ulist.  Using it is crucial to avoid the incessant calls to
         in_slist, which is quite slow.  */
      inl = in_slist (ulist, constr);
      /* If it is FTP, and FTP is not followed, chuck it out.  */
      if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
          DEBUGP (("Uh, it is FTP but I'm not in the mood to follow FTP.\n"));
          ulist = add_slist (ulist, constr, 0);
      /* If it is an absolute link and absolute links are not followed,
         chuck it out.  */
      if (!inl && u->proto != URLFTP)
        if (opt.relative_only && !(cur_url->flags & URELATIVE))
            DEBUGP (("It doesn't really look like a relative link.\n"));
            ulist = add_slist (ulist, constr, 0);
      /* If its domain is not to be accepted/looked-up, chuck it out.  */
      if (!accept_domain (u))
          DEBUGP (("I don't like the smell of that domain.\n"));
          ulist = add_slist (ulist, constr, 0);
      /* Check for parent directory.  */
      if (!inl && opt.no_parent
          /* If the new URL is FTP and the old was not, ignore
             opt.no_parent.  */
          && !(!this_url_ftp && u->proto == URLFTP))
          /* Check for base_dir first.  */
          if (!(base_dir && frontcmp (base_dir, u->dir)))
              /* Failing that, check for parent dir.  */
              struct urlinfo *ut = newurl ();
              if (parseurl (this_url, ut, 0) != URLOK)
                DEBUGP (("Double yuck! The *base* URL is broken.\n"));
              else if (!frontcmp (ut->dir, u->dir))
                  /* Failing that too, kill the URL.  */
                  DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
                  ulist = add_slist (ulist, constr, 0);
      /* If the file does not match the acceptance list, or is on the
         rejection list, chuck it out.  The same goes for the directory
         exclude- and include-lists.  */
      if (!inl && (opt.includes || opt.excludes))
        if (!accdir (u->dir, ALLABS))
            DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
            ulist = add_slist (ulist, constr, 0);
      /* We check for acceptance/rejection rules only for non-HTML
         documents.  Since we don't know whether they really are HTML,
         it will be deduced from (an OR-ed list):

         1) u->file is "" (meaning it is a directory)
         2) suffix exists, AND:
            a) it is "html", or
            b) it is "htm"

         If the file *is* supposed to be HTML, it will *not* be
         subject to acc/rej rules.  That's why the `!'.  */
      if (!inl && !(!*u->file
                    || (((suf = suffix (constr)) != NULL)
                        && (!strcmp (suf, "html") || !strcmp (suf, "htm")))))
        if (!acceptable (u->file))
            DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                     constr, u->file));
            ulist = add_slist (ulist, constr, 0);
      /* Optimize the URL (which includes possible DNS lookup) only
         after all other possibilities have been exhausted.  */
      if (!opt.simple_check)
          /* Just lowercase the hostname.  */
          for (p = u->host; *p; p++)
          u->url = str_url (u, 0);
      constr = xstrdup (u->url);
      inl = in_slist (ulist, constr);
      if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
        if (!opt.spanhost && this_url && !same_host (this_url, constr))
            DEBUGP (("This is not the same hostname as the parent's.\n"));
            ulist = add_slist (ulist, constr, 0);
      /* What about robots.txt?  */
      if (!inl && opt.use_robots && u->proto == URLHTTP)
          /* Since Wget knows about only one set of robot rules at a
             time, /robots.txt must be reloaded whenever a new host is
             accessed.

             robots_host holds the host for which the current
             `forbidden' rules apply.  */
          if (!robots_host || !same_host (robots_host, u->host))
              FREE_MAYBE (robots_host);
              /* Now make robots_host the new host, no matter what the
                 result will be.  So if there is no /robots.txt on the
                 site, Wget will not retry getting robots all the
                 time.  */
              robots_host = xstrdup (u->host);
              free_vec (forbidden);
              err = retrieve_robots (constr, ROBOTS_FILENAME);
              rurl = robots_url (constr, ROBOTS_FILENAME);
              rfile = url_filename (rurl);
              forbidden = parse_robots (rfile);
          /* Now that we have (or don't have) robots, we can check for
             them.  */
          if (!robots_match (u, forbidden))
              DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
                       ROBOTS_FILENAME));
              ulist = add_slist (ulist, constr, 0);
      /* If it wasn't chucked out, do something with it.  */
      if (!inl)
          DEBUGP (("I've decided to load it -> "));
          /* Add it to the list of already-loaded URLs.  */
          ulist = add_slist (ulist, constr, 0);
          /* Automatically followed FTPs will *not* be downloaded
             recursively.  */
          if (u->proto == URLFTP)
              /* Don't you adore side-effects?  */
          /* Reset its type.  */
          retrieve_url (constr, &filename, &newloc,
                        canon_this_url ? canon_this_url : this_url, &dt);
          if (u->proto == URLFTP)
          /* In case of convert_links: If there was no error, add it to
             the list of downloaded URLs.  We might need it for the
             conversion.  */
          if (opt.convert_links && filename)
              urls_downloaded = add_url (urls_downloaded, constr, filename);
              /* If the URL is HTML, note it.  */
              urls_html = add_slist (urls_html, filename, NOSORT);
          /* If there was no error, and the type is text/html, parse
             it recursively.  */
              recursive_retrieve (filename, constr);
              DEBUGP (("%s is not text/html so we don't chase.\n",
                       filename ? filename : "(null)"));
          /* If a suffix-rejected file was loaded only because it was
             HTML, undo the error now.  */
          if (opt.delete_after || (filename && !acceptable (filename)))
              logprintf (LOG_VERBOSE,
                         (opt.delete_after ? _("Removing %s.\n")
                          : _("Removing %s since it should be rejected.\n")),
                         filename);
              if (unlink (filename))
                logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
          /* If everything was OK, and links are to be converted, let's
             store the local filename.  */
          if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
              cur_url->flags |= UABS2REL;
              cur_url->local_name = xstrdup (filename);
          DEBUGP (("%s already in list, so we don't load.\n", constr));
      /* Free filename and constr.  */
      FREE_MAYBE (filename);

  /* Increment the pbuf for the appropriate size.  */
  if (opt.convert_links)
    convert_links (file, url_list);
  /* Free the linked list of URLs.  */
  free_urlpos (url_list);
  /* Free the canonical this_url.  */
  FREE_MAYBE (canon_this_url);
  /* Decrement the recursion depth.  */
  --depth;
  if (opt.quota && (opt.downloaded > opt.quota))
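
#if 0
/* Illustrative sketch, not part of Wget: a minimal, self-contained
   rendering of the duplicate-avoidance idea used by recursive_retrieve()
   above.  Every URL that has been seen goes into a string list, and the
   list is consulted before anything is loaded, so no URL can be
   retrieved twice.  Wget's own slist, add_slist() and in_slist() play
   this role; the seen_url type and helpers below are hypothetical.  */
#include <stdlib.h>
#include <string.h>

struct seen_url
{
  char *url;
  struct seen_url *next;
};

/* Return non-zero if URL is already in LIST.  */
static int
seen_contains (const struct seen_url *list, const char *url)
{
  for (; list; list = list->next)
    if (!strcmp (list->url, url))
      return 1;
  return 0;
}

/* Prepend URL to *LIST unless it is already there.  */
static void
seen_add (struct seen_url **list, const char *url)
{
  struct seen_url *p;

  if (seen_contains (*list, url))
    return;
  p = malloc (sizeof *p);
  if (!p)
    return;
  p->url = strdup (url);
  p->next = *list;
  *list = p;
}
#endif /* illustrative sketch */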
/* Simple calls to convert_links will often fail because only the
   downloaded files are converted, and Wget cannot know which files
   will be converted in the future.  So, if we have file fileone.html
   with:

       <a href=/c/something.gif>

   and /c/something.gif was not downloaded because it exceeded the
   recursion depth, the reference will *not* be changed.

   However, later we can encounter /c/something.gif from an "upper"
   level HTML (let's call it filetwo.html), and it gets downloaded.

   But now we have a problem because /c/something.gif will be
   correctly transformed in filetwo.html, but not in fileone.html,
   since Wget could not have known that /c/something.gif would be
   downloaded in the future.

   This is why Wget must, after the whole retrieval, call
   convert_all_links to go once more through the entire list of
   retrieved HTMLs, and re-convert them.

   All the downloaded HTMLs are kept in urls_html, and downloaded URLs
   in urls_downloaded.  From these two lists, the information needed
   for the conversion is extracted.  */
convert_all_links (void)
  urlpos *l1, *l2, *urls;

  for (html = urls_html; html; html = html->next)
      DEBUGP (("Rescanning %s\n", html->string));
      /* Determine the URL of the HTML file.  get_urls_html will need
         it.  */
      for (urlhtml = urls_downloaded; urlhtml; urlhtml = urlhtml->next)
        if (!strcmp (urlhtml->local_name, html->string))
          DEBUGP (("It should correspond to %s.\n", urlhtml->url));
          DEBUGP (("I cannot find the corresponding URL.\n"));
      /* Parse the HTML file...  */
      urls = get_urls_html (html->string, urlhtml ? urlhtml->url : NULL, 1);
      for (l1 = urls; l1; l1 = l1->next)
          /* The URL must be in canonical form to be compared.  */
          res = parseurl (l1->url, u, 0);
          /* We decide the direction of conversion according to whether
             a URL was downloaded.  Downloaded URLs will be converted
             ABS2REL, whereas non-downloaded will be converted REL2ABS.
             Note: not yet implemented; only ABS2REL works.  */
          for (l2 = urls_downloaded; l2; l2 = l2->next)
            if (!strcmp (l2->url, u->url))
                DEBUGP (("%s flagged for conversion, local %s\n",
                         l2->url, l2->local_name));
          /* Clear the flags.  */
          l1->flags &= ~(UABS2REL | UREL2ABS);
          /* Decide on the conversion direction.  */
              l1->flags |= UABS2REL;
              l1->local_name = xstrdup (l2->local_name);
              l1->flags |= UREL2ABS;
              l1->local_name = NULL;
      /* Convert the links in the file.  */
      convert_links (html->string, urls);
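
#if 0
/* Illustrative sketch, not part of Wget: it condenses the decision that
   convert_all_links() makes for every link found in a re-scanned HTML
   file.  Links whose targets were downloaded are marked for
   absolute-to-relative conversion and given the local file name; all
   others are marked for relative-to-absolute conversion.  The types
   below are hypothetical stand-ins for Wget's urlpos records.  */
#include <string.h>

struct downloaded_url           /* what urls_downloaded records */
{
  const char *url;
  const char *local_name;
  struct downloaded_url *next;
};

enum convert_direction { TO_RELATIVE, TO_ABSOLUTE };

/* Decide how LINK_URL should be rewritten; if its target was
   downloaded, return TO_RELATIVE and report the local file name
   through *LOCAL.  */
static enum convert_direction
conversion_for (const struct downloaded_url *dl, const char *link_url,
                const char **local)
{
  for (; dl; dl = dl->next)
    if (!strcmp (dl->url, link_url))
      {
        *local = dl->local_name;
        return TO_RELATIVE;
      }
  *local = NULL;
  return TO_ABSOLUTE;
}
#endif /* illustrative sketch */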
/* Robots support.  */

/* Construct the robots URL.  */
static struct urlinfo *
robots_url (const char *url, const char *robots_filename)
  struct urlinfo *u = newurl ();

  err = parseurl (url, u, 0);
  assert (err == URLOK && u->proto == URLHTTP);
  u->dir = xstrdup ("");
  u->file = xstrdup (robots_filename);
  u->url = str_url (u, 0);
/* Retrieves the robots_filename from the server's root directory, if
   possible.  Returns ROBOTSOK if robots were retrieved OK, and
   NOROBOTS if robots could not be retrieved for any reason.  */
retrieve_robots (const char *url, const char *robots_filename)
  u = robots_url (url, robots_filename);
  logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
  err = retrieve_url (u->url, NULL, NULL, NULL, &dt);
/* Parse the robots_filename and return the disallowed path components
   in a NULL-terminated, malloc-ed vector of character pointers.

   It should be fully compliant with the syntax described in the
   file norobots.txt, adopted by the robots mailing list
   (robots@webcrawler.com).  */
parse_robots (const char *robots_filename)
  char *line, *cmd, *str, *p;
  char *base_version, *version;
  int wget_matched;             /* Is this part meant for Wget?  */

  fp = fopen (robots_filename, "rb");

  /* Kill version number.  */
      STRDUP_ALLOCA (base_version, opt.useragent);
      STRDUP_ALLOCA (version, opt.useragent);
      int len = 10 + strlen (version_string);
      base_version = (char *)alloca (len);
      sprintf (base_version, "Wget/%s", version_string);
      version = (char *)alloca (len);
      sprintf (version, "Wget/%s", version_string);
  for (p = version; *p; p++)
  for (p = base_version; *p && *p != '/'; p++)
  /* Setting this to 1 means that Wget considers itself under
     restrictions by default, even if the User-Agent field is not
     present.  However, if it finds the user-agent set to anything
     other than Wget, the rest will be ignored (up to the following
     User-Agent field).  Thus you may have something like:

         Disallow: 1
         Disallow: 2
         User-Agent: stupid-robot
         Disallow: 3
         Disallow: 4
         User-Agent: Wget
         Disallow: 5
         Disallow: 6
         User-Agent: *
         Disallow: 7

     In this case the 1, 2, 5, 6 and 7 disallow lines will be
     stored.  */
  while ((line = read_whole_line (fp)))
      /* Destroy <CR> if there is one.  */
      if (len && line[len - 1] == '\r')
        line[len - 1] = '\0';
      /* According to specifications, optional space may be at the
         end of the line.  */
      DEBUGP (("Line: %s\n", line));
      for (cmd = line; *cmd && ISSPACE (*cmd); cmd++);
          DEBUGP (("(chucked out)\n"));
      for (str = cmd; *str && *str != ':'; str++);
          DEBUGP (("(chucked out)\n"));
      /* Zero-terminate the command.  */
      /* Look for the string beginning...  */
      for (; *str && ISSPACE (*str); str++);
      /* Look for comments and kill them off.  */
      for (p = str; *p; p++)
        if (*p && ISSPACE (*p) && *(p + 1) == '#')
            /* We have found a shell-style comment `<sp>+ #'.  Now
               rewind to the beginning of the spaces and place '\0'
               there.  */
            while (p > str && ISSPACE (*p))
      if (!strcasecmp (cmd, "User-agent"))
          /* Lowercase the agent string.  */
          for (p = str; *p; p++)
          /* If the string is `*', it matches.  */
          if (*str == '*' && !*(str + 1))
          /* If the string contains wildcards, we'll run it through
             fnmatch().  */
          if (has_wildcards_p (str))
              /* If the string contains '/', compare with the full
                 version.  Else, compare it to base_version.  */
              if (strchr (str, '/'))
                match = !fnmatch (str, version, 0);
                match = !fnmatch (str, base_version, 0);
          else                  /* Substring search */
              if (strstr (version, str))
          /* If Wget is not matched, skip all the entries up to the
             next User-agent field.  */
          wget_matched = match;
      else if (!wget_matched)
          DEBUGP (("(chucking out since it is not applicable for Wget)\n"));
      else if (!strcasecmp (cmd, "Disallow"))
          /* If "Disallow" is empty, the robot is welcome.  */
              entries = (char **)xmalloc (sizeof (char *));
              entries = (char **)xrealloc (entries, (num + 2) * sizeof (char *));
              entries[num] = xstrdup (str);
              entries[++num] = NULL;
              /* Strip trailing spaces, according to specifications.  */
              for (i = strlen (str); i >= 0 && ISSPACE (str[i]); i--)
                if (ISSPACE (str[i]))
          /* unknown command */
          DEBUGP (("(chucked out)\n"));
/* May the URL U be loaded according to the disallowing rules stored
   in FORBIDDEN?  */
robots_match (struct urlinfo *u, char **forbidden)
  DEBUGP (("Matching %s against: ", u->path));
  for (; *forbidden; forbidden++)
      DEBUGP (("%s ", *forbidden));
      l = strlen (*forbidden);
      /* If dir is forbidden, we may not load the file.  */
      if (strncmp (u->path, *forbidden, l) == 0)
          DEBUGP (("matched.\n"));
          return 0;             /* Matches, i.e. does not load...  */
  DEBUGP (("not matched.\n"));
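
#if 0
/* Illustrative sketch, not part of Wget: it shows the data format the
   robots functions above agree on.  parse_robots() yields a
   NULL-terminated vector of disallowed path prefixes; for a robots.txt
   that disallows /cgi-bin/ and /tmp/ that would be
   { "/cgi-bin/", "/tmp/", NULL }, and robots_match() refuses any URL
   whose path begins with one of them.  url_allowed() below is a
   hypothetical, self-contained equivalent of that final check.  */
#include <string.h>

static int
url_allowed (const char *path, char *const *disallowed)
{
  if (!disallowed)
    return 1;                   /* no rules known, everything is welcome */
  for (; *disallowed; disallowed++)
    if (strncmp (path, *disallowed, strlen (*disallowed)) == 0)
      return 0;                 /* a forbidden prefix applies */
  return 1;
}
#endif /* illustrative sketch */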