X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Frecur.c;h=11c30a2157e00ba2f433cba0e98045b3bb9bb2a4;hb=d5be8ecca466601bda9b81c28a79077fbda6ccde;hp=98c5597035f9d6d29fa0c8f4bf51495db68b2b35;hpb=2ffb47eabf9fe89d513dc79bdc535e4092e1d6ee;p=wget diff --git a/src/recur.c b/src/recur.c index 98c55970..11c30a21 100644 --- a/src/recur.c +++ b/src/recur.c @@ -1,20 +1,20 @@ /* Handling of recursive HTTP retrieving. Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc. -This file is part of Wget. +This file is part of GNU Wget. -This program is free software; you can redistribute it and/or modify +GNU Wget is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. -This program is distributed in the hope that it will be useful, +GNU Wget is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software +along with Wget; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include @@ -31,7 +31,6 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #endif /* HAVE_UNISTD_H */ #include #include -#include #include #include "wget.h" @@ -43,40 +42,32 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include "fnmatch.h" #include "host.h" #include "hash.h" +#include "res.h" -extern char *version_string; +#ifndef errno +extern int errno; +#endif -#define ROBOTS_FILENAME "robots.txt" +extern char *version_string; static struct hash_table *dl_file_url_map; static struct hash_table *dl_url_file_map; -/* List of HTML URLs. */ -static slist *urls_html; +/* List of HTML files downloaded in this Wget run. Used for link + conversion after Wget is done. */ +static slist *downloaded_html_files; /* List of undesirable-to-load URLs. */ static struct hash_table *undesirable_urls; -/* List of forbidden locations. */ -static char **forbidden = NULL; - /* Current recursion depth. */ static int depth; /* Base directory we're recursing from (used by no_parent). */ static char *base_dir; -/* The host name for which we last checked robots. */ -static char *robots_host; - static int first_time = 1; -/* Construct the robots URL. */ -static struct urlinfo *robots_url PARAMS ((const char *, const char *)); -static uerr_t retrieve_robots PARAMS ((const char *, const char *)); -static char **parse_robots PARAMS ((const char *)); -static int robots_match PARAMS ((struct urlinfo *, char **)); - /* Cleanup the data structures associated with recursive retrieving (the variables above). 
*/ @@ -101,12 +92,9 @@ recursive_cleanup (void) dl_url_file_map = NULL; } undesirable_urls = NULL; - free_vec (forbidden); - forbidden = NULL; - slist_free (urls_html); - urls_html = NULL; + slist_free (downloaded_html_files); + downloaded_html_files = NULL; FREE_MAYBE (base_dir); - FREE_MAYBE (robots_host); first_time = 1; } @@ -132,11 +120,8 @@ recursive_retrieve (const char *file, const char *this_url) int dt, inl, dash_p_leaf_HTML = FALSE; int meta_disallow_follow; int this_url_ftp; /* See below the explanation */ - uerr_t err; - struct urlinfo *rurl; urlpos *url_list, *cur_url; - char *rfile; /* For robots */ - struct urlinfo *u; + struct url *u; assert (this_url != NULL); assert (file != NULL); @@ -150,25 +135,14 @@ recursive_retrieve (const char *file, const char *this_url) run. They should probably be at a different location. */ if (!undesirable_urls) undesirable_urls = make_string_hash_table (0); - if (!dl_file_url_map) - dl_file_url_map = make_string_hash_table (0); - if (!dl_url_file_map) - dl_url_file_map = make_string_hash_table (0); hash_table_clear (undesirable_urls); string_set_add (undesirable_urls, this_url); - hash_table_clear (dl_file_url_map); - hash_table_clear (dl_url_file_map); - urls_html = NULL; /* Enter this_url to the hash table, in original and "enhanced" form. */ - u = newurl (); - err = parseurl (this_url, u, 0); - if (err == URLOK) + u = url_parse (this_url, NULL); + if (u) { string_set_add (undesirable_urls, u->url); - hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (u->url)); - hash_table_put (dl_url_file_map, xstrdup (u->url), xstrdup (file)); - urls_html = slist_prepend (urls_html, file); if (opt.no_parent) base_dir = xstrdup (u->dir); /* Set the base dir. */ /* Set the canonical this_url to be sent as referer. This @@ -180,10 +154,8 @@ recursive_retrieve (const char *file, const char *this_url) DEBUGP (("Double yuck! The *base* URL is broken.\n")); base_dir = NULL; } - freeurl (u, 1); + url_free (u); depth = 1; - robots_host = NULL; - forbidden = NULL; first_time = 0; } else @@ -213,7 +185,7 @@ recursive_retrieve (const char *file, const char *this_url) that the retrieval is done through proxy. In that case, FTP links will be followed by default and recursion will not be turned off when following them. */ - this_url_ftp = (urlproto (this_url) == URLFTP); + this_url_ftp = (url_scheme (this_url) == SCHEME_FTP); /* Get the URL-s from an HTML file: */ url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url, @@ -236,17 +208,10 @@ recursive_retrieve (const char *file, const char *this_url) break; /* Parse the URL for convenient use in other functions, as well as to get the optimized form. It also checks URL integrity. */ - u = newurl (); - if (parseurl (cur_url->url, u, 0) != URLOK) + u = url_parse (cur_url->url, NULL); + if (!u) { DEBUGP (("Yuck! A bad URL.\n")); - freeurl (u, 1); - continue; - } - if (u->proto == URLFILE) - { - DEBUGP (("Nothing to do with file:// around here.\n")); - freeurl (u, 1); continue; } assert (u->url != NULL); @@ -276,11 +241,11 @@ recursive_retrieve (const char *file, const char *this_url) /* inl is set if the URL we are working on (constr) is stored in undesirable_urls. Using it is crucial to avoid unnecessary repeated continuous hits to the hash table. */ - inl = string_set_exists (undesirable_urls, constr); + inl = string_set_contains (undesirable_urls, constr); /* If it is FTP, and FTP is not followed, chuck it out. 
*/
 	  if (!inl)
-	    if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
+	    if (u->scheme == SCHEME_FTP && !opt.follow_ftp && !this_url_ftp)
 	      {
 		DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
 		string_set_add (undesirable_urls, constr);
@@ -288,7 +253,7 @@ recursive_retrieve (const char *file, const char *this_url)
 	      }
 	  /* If it is absolute link and they are not followed, chuck it
 	     out. */
-	  if (!inl && u->proto != URLFTP)
+	  if (!inl && u->scheme != SCHEME_FTP)
 	    if (opt.relative_only && !cur_url->link_relative_p)
 	      {
 		DEBUGP (("It doesn't really look like a relative link.\n"));
@@ -307,14 +272,14 @@ recursive_retrieve (const char *file, const char *this_url)
 	  if (!inl && opt.no_parent
 	      /* If the new URL is FTP and the old was not, ignore
 		 opt.no_parent. */
-	      && !(!this_url_ftp && u->proto == URLFTP))
+	      && !(!this_url_ftp && u->scheme == SCHEME_FTP))
 	    {
 	      /* Check for base_dir first. */
 	      if (!(base_dir && frontcmp (base_dir, u->dir)))
 		{
 		  /* Failing that, check for parent dir. */
-		  struct urlinfo *ut = newurl ();
-		  if (parseurl (this_url, ut, 0) != URLOK)
+		  struct url *ut = url_parse (this_url, NULL);
+		  if (!ut)
 		    DEBUGP (("Double yuck! The *base* URL is broken.\n"));
 		  else if (!frontcmp (ut->dir, u->dir))
 		    {
@@ -323,7 +288,7 @@ recursive_retrieve (const char *file, const char *this_url)
 		      string_set_add (undesirable_urls, constr);
 		      inl = 1;
 		    }
-		  freeurl (ut, 1);
+		  url_free (ut);
 		}
 	    }
 	  /* If the file does not match the acceptance list, or is on the
@@ -375,7 +340,16 @@ recursive_retrieve (const char *file, const char *this_url)
 	  if (!inl)
 	    {
 	      if (!opt.simple_check)
-		opt_url (u);
+		{
+		  /* Find the "true" host. */
+		  char *host = realhost (u->host);
+		  xfree (u->host);
+		  u->host = host;
+
+		  /* Refresh the printed representation of the URL. */
+		  xfree (u->url);
+		  u->url = url_string (u, 0);
+		}
	      else
		{
		  char *p;
@@ -383,12 +357,18 @@ recursive_retrieve (const char *file, const char *this_url)
		  for (p = u->host; *p; p++)
		    *p = TOLOWER (*p);
		  xfree (u->url);
-		  u->url = str_url (u, 0);
+		  u->url = url_string (u, 0);
		}
	      xfree (constr);
	      constr = xstrdup (u->url);
-	      string_set_add (undesirable_urls, constr);
-	      if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
+	      /* After we have canonicalized the URL, check if we have it
+		 on the black list. */
+	      if (string_set_contains (undesirable_urls, constr))
+		inl = 1;
+	      /* This line is bogus. */
+	      /*string_set_add (undesirable_urls, constr);*/
+
+	      if (!inl && !((u->scheme == SCHEME_FTP) && !this_url_ftp))
		if (!opt.spanhost && this_url && !same_host (this_url, constr))
		  {
		    DEBUGP (("This is not the same hostname as the parent's.\n"));
@@ -397,41 +377,33 @@ recursive_retrieve (const char *file, const char *this_url)
		  }
	    }
	  /* What about robots.txt? */
-	  if (!inl && opt.use_robots && u->proto == URLHTTP)
+	  if (!inl && opt.use_robots && u->scheme == SCHEME_HTTP)
	    {
-	      /* Since Wget knows about only one set of robot rules at a
-		 time, /robots.txt must be reloaded whenever a new host is
-		 accessed.
-
-		 robots_host holds the host the current `forbid' variable
-		 is assigned to.  */
-	      if (!robots_host || !same_host (robots_host, u->host))
+	      struct robot_specs *specs = res_get_specs (u->host, u->port);
+	      if (!specs)
		{
-		  FREE_MAYBE (robots_host);
-		  /* Now make robots_host the new host, no matter what the
-		     result will be.  So if there is no /robots.txt on the
-		     site, Wget will not retry getting robots all the
-		     time. 
*/ - robots_host = xstrdup (u->host); - free_vec (forbidden); - forbidden = NULL; - err = retrieve_robots (constr, ROBOTS_FILENAME); - if (err == ROBOTSOK) + char *rfile; + if (res_retrieve_file (constr, &rfile)) { - rurl = robots_url (constr, ROBOTS_FILENAME); - rfile = url_filename (rurl); - forbidden = parse_robots (rfile); - freeurl (rurl, 1); + specs = res_parse_from_file (rfile); xfree (rfile); } + else + { + /* If we cannot get real specs, at least produce + dummy ones so that we can register them and stop + trying to retrieve them. */ + specs = res_parse ("", 0); + } + res_register_specs (u->host, u->port, specs); } - /* Now that we have (or don't have) robots, we can check for - them. */ - if (!robots_match (u, forbidden)) + /* Now that we have (or don't have) robots.txt specs, we can + check what they say. */ + if (!res_match_path (specs, u->path)) { - DEBUGP (("Stuffing %s because %s forbids it.\n", this_url, - ROBOTS_FILENAME)); + DEBUGP (("Not following %s because robots.txt forbids it.\n", + constr)); string_set_add (undesirable_urls, constr); inl = 1; } @@ -446,7 +418,7 @@ recursive_retrieve (const char *file, const char *this_url) string_set_add (undesirable_urls, constr); /* Automatically followed FTPs will *not* be downloaded recursively. */ - if (u->proto == URLFTP) + if (u->scheme == SCHEME_FTP) { /* Don't you adore side-effects? */ opt.recursive = 0; @@ -456,7 +428,7 @@ recursive_retrieve (const char *file, const char *this_url) /* Retrieve it. */ retrieve_url (constr, &filename, &newloc, canon_this_url ? canon_this_url : this_url, &dt); - if (u->proto == URLFTP) + if (u->scheme == SCHEME_FTP) { /* Restore... */ opt.recursive = 1; @@ -466,22 +438,6 @@ recursive_retrieve (const char *file, const char *this_url) xfree (constr); constr = newloc; } - /* In case of convert_links: If there was no error, add it to - the list of downloaded URLs. We might need it for - conversion. */ - if (opt.convert_links && filename) - { - if (dt & RETROKF) - { - hash_table_put (dl_file_url_map, - xstrdup (filename), xstrdup (constr)); - hash_table_put (dl_url_file_map, - xstrdup (constr), xstrdup (filename)); - /* If the URL is HTML, note it. */ - if (dt & TEXTHTML) - urls_html = slist_prepend (urls_html, filename); - } - } /* If there was no error, and the type is text/html, parse it recursively. */ if (dt & TEXTHTML) @@ -523,7 +479,7 @@ recursive_retrieve (const char *file, const char *this_url) /* Free filename and constr. */ FREE_MAYBE (filename); FREE_MAYBE (constr); - freeurl (u, 1); + url_free (u); /* Increment the pbuf for the appropriate size. */ } if (opt.convert_links && !opt.delete_after) @@ -544,6 +500,27 @@ recursive_retrieve (const char *file, const char *this_url) return RETROK; } +void +register_download (const char *url, const char *file) +{ + if (!opt.convert_links) + return; + if (!dl_file_url_map) + dl_file_url_map = make_string_hash_table (0); + hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url)); + if (!dl_url_file_map) + dl_url_file_map = make_string_hash_table (0); + hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file)); +} + +void +register_html (const char *url, const char *file) +{ + if (!opt.convert_links) + return; + downloaded_html_files = slist_prepend (downloaded_html_files, file); +} + /* convert_links() is called from recursive_retrieve() after we're done with an HTML file. 
This call to convert_links is not complete because it converts only the downloaded files, and Wget cannot know @@ -567,7 +544,7 @@ recursive_retrieve (const char *file, const char *this_url) convert_all_links to go once more through the entire list of retrieved HTMLs, and re-convert them. - All the downloaded HTMLs are kept in urls_html, and downloaded URLs + All the downloaded HTMLs are kept in downloaded_html_files, and downloaded URLs in urls_downloaded. From these two lists information is extracted. */ void @@ -575,11 +552,11 @@ convert_all_links (void) { slist *html; - /* Destructively reverse urls_html to get it in the right order. + /* Destructively reverse downloaded_html_files to get it in the right order. recursive_retrieve() used slist_prepend() consistently. */ - urls_html = slist_nreverse (urls_html); + downloaded_html_files = slist_nreverse (downloaded_html_files); - for (html = urls_html; html; html = html->next) + for (html = downloaded_html_files; html; html = html->next) { urlpos *urls, *cur_url; char *url; @@ -602,13 +579,9 @@ convert_all_links (void) char *local_name; /* The URL must be in canonical form to be compared. */ - struct urlinfo *u = newurl (); - uerr_t res = parseurl (cur_url->url, u, 0); - if (res != URLOK) - { - freeurl (u, 1); - continue; - } + struct url *u = url_parse (cur_url->url, NULL); + if (!u) + continue; /* We decide the direction of conversion according to whether a URL was downloaded. Downloaded URLs will be converted ABS2REL, whereas non-downloaded will be converted REL2ABS. */ @@ -637,7 +610,7 @@ convert_all_links (void) cur_url->convert = CO_CONVERT_TO_COMPLETE; cur_url->local_name = NULL; } - freeurl (u, 1); + url_free (u); } /* Convert the links in the file. */ convert_links (html->string, urls); @@ -645,249 +618,3 @@ convert_all_links (void) free_urlpos (urls); } } - -/* Robots support. */ - -/* Construct the robots URL. */ -static struct urlinfo * -robots_url (const char *url, const char *robots_filename) -{ - struct urlinfo *u = newurl (); - uerr_t err; - - err = parseurl (url, u, 0); - assert (err == URLOK && u->proto == URLHTTP); - xfree (u->file); - xfree (u->dir); - xfree (u->url); - u->dir = xstrdup (""); - u->file = xstrdup (robots_filename); - u->url = str_url (u, 0); - return u; -} - -/* Retrieves the robots_filename from the root server directory, if - possible. Returns ROBOTSOK if robots were retrieved OK, and - NOROBOTS if robots could not be retrieved for any reason. */ -static uerr_t -retrieve_robots (const char *url, const char *robots_filename) -{ - int dt; - uerr_t err; - struct urlinfo *u; - - u = robots_url (url, robots_filename); - logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n")); - err = retrieve_url (u->url, NULL, NULL, NULL, &dt); - freeurl (u, 1); - if (err == RETROK) - return ROBOTSOK; - else - return NOROBOTS; -} - -/* Parse the robots_filename and return the disallowed path components - in a malloc-ed vector of character pointers. - - It should be fully compliant with the syntax as described in the - file norobots.txt, adopted by the robots mailing list - (robots@webcrawler.com). */ -static char ** -parse_robots (const char *robots_filename) -{ - FILE *fp; - char **entries; - char *line, *cmd, *str, *p; - char *base_version, *version; - int len, num, i; - int wget_matched; /* is the part meant for Wget? */ - - entries = NULL; - - num = 0; - fp = fopen (robots_filename, "rb"); - if (!fp) - return NULL; - - /* Kill version number. 
*/ - if (opt.useragent) - { - STRDUP_ALLOCA (base_version, opt.useragent); - STRDUP_ALLOCA (version, opt.useragent); - } - else - { - int len = 10 + strlen (version_string); - base_version = (char *)alloca (len); - sprintf (base_version, "Wget/%s", version_string); - version = (char *)alloca (len); - sprintf (version, "Wget/%s", version_string); - } - for (p = version; *p; p++) - *p = TOLOWER (*p); - for (p = base_version; *p && *p != '/'; p++) - *p = TOLOWER (*p); - *p = '\0'; - - /* Setting this to 1 means that Wget considers itself under - restrictions by default, even if the User-Agent field is not - present. However, if it finds the user-agent set to anything - other than Wget, the rest will be ignored (up to the following - User-Agent field). Thus you may have something like: - - Disallow: 1 - Disallow: 2 - User-Agent: stupid-robot - Disallow: 3 - Disallow: 4 - User-Agent: Wget* - Disallow: 5 - Disallow: 6 - User-Agent: * - Disallow: 7 - - In this case the 1, 2, 5, 6 and 7 disallow lines will be - stored. */ - wget_matched = 1; - while ((line = read_whole_line (fp))) - { - len = strlen (line); - /* Destroy if present. */ - if (len && line[len - 1] == '\n') - line[--len] = '\0'; - if (len && line[len - 1] == '\r') - line[--len] = '\0'; - /* According to specifications, optional space may be at the - end... */ - DEBUGP (("Line: %s\n", line)); - /* Skip spaces. */ - for (cmd = line; *cmd && ISSPACE (*cmd); cmd++); - if (!*cmd) - { - xfree (line); - DEBUGP (("(chucked out)\n")); - continue; - } - /* Look for ':'. */ - for (str = cmd; *str && *str != ':'; str++); - if (!*str) - { - xfree (line); - DEBUGP (("(chucked out)\n")); - continue; - } - /* Zero-terminate the command. */ - *str++ = '\0'; - /* Look for the string beginning... */ - for (; *str && ISSPACE (*str); str++); - /* Look for comments or trailing spaces and kill them off. */ - for (p = str; *p; p++) - if (*p && ISSPACE (*p) && ((*(p + 1) == '#') || (*(p + 1) == '\0'))) - { - /* We have found either a shell-style comment `+#' or some - trailing spaces. Now rewind to the beginning of the spaces - and place '\0' there. */ - while (p > str && ISSPACE (*p)) - --p; - if (p == str) - *p = '\0'; - else - *(p + 1) = '\0'; - break; - } - if (!strcasecmp (cmd, "User-agent")) - { - int match = 0; - /* Lowercase the agent string. */ - for (p = str; *p; p++) - *p = TOLOWER (*p); - /* If the string is `*', it matches. */ - if (*str == '*' && !*(str + 1)) - match = 1; - else - { - /* If the string contains wildcards, we'll run it through - fnmatch(). */ - if (has_wildcards_p (str)) - { - /* If the string contains '/', compare with the full - version. Else, compare it to base_version. */ - if (strchr (str, '/')) - match = !fnmatch (str, version, 0); - else - match = !fnmatch (str, base_version, 0); - } - else /* Substring search */ - { - if (strstr (version, str)) - match = 1; - else - match = 0; - } - } - /* If Wget is not matched, skip all the entries up to the - next User-agent field. */ - wget_matched = match; - } - else if (!wget_matched) - { - xfree (line); - DEBUGP (("(chucking out since it is not applicable for Wget)\n")); - continue; - } - else if (!strcasecmp (cmd, "Disallow")) - { - /* If "Disallow" is empty, the robot is welcome. 
*/ - if (!*str) - { - free_vec (entries); - entries = (char **)xmalloc (sizeof (char *)); - *entries = NULL; - num = 0; - } - else - { - entries = (char **)xrealloc (entries, (num + 2)* sizeof (char *)); - entries[num] = xstrdup (str); - entries[++num] = NULL; - /* Strip trailing spaces, according to specifications. */ - for (i = strlen (str); i >= 0 && ISSPACE (str[i]); i--) - if (ISSPACE (str[i])) - str[i] = '\0'; - } - } - else - { - /* unknown command */ - DEBUGP (("(chucked out)\n")); - } - xfree (line); - } - fclose (fp); - return entries; -} - -/* May the URL url be loaded according to disallowing rules stored in - forbidden? */ -static int -robots_match (struct urlinfo *u, char **forbidden) -{ - int l; - - if (!forbidden) - return 1; - DEBUGP (("Matching %s against: ", u->path)); - for (; *forbidden; forbidden++) - { - DEBUGP (("%s ", *forbidden)); - l = strlen (*forbidden); - /* If dir is forbidden, we may not load the file. */ - if (strncmp (u->path, *forbidden, l) == 0) - { - DEBUGP (("matched.\n")); - return 0; /* Matches, i.e. does not load... */ - } - } - DEBUGP (("not matched.\n")); - return 1; -}
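
Taken together, the hunks above replace Wget's old one-host-at-a-time robots bookkeeping (robots_host, forbidden, retrieve_robots, parse_robots, robots_match) with the res.* module, which caches parsed robot_specs per host and port so that /robots.txt is fetched at most once per server. Below is a minimal standalone sketch of that lookup-or-fetch-and-register flow, assembled only from the calls visible in the diff (res_get_specs, res_retrieve_file, res_parse_from_file, res_parse, res_register_specs, res_match_path); the wrapper name robots_allow_p and the set of included headers are illustrative, not part of the patch, and the authoritative signatures live in res.h.

#include "wget.h"
#include "utils.h"
#include "url.h"
#include "res.h"

/* Sketch: return non-zero if U may be followed under the robots.txt
   rules of its host.  CONSTR is the canonical URL string, as in
   recursive_retrieve().  */
static int
robots_allow_p (struct url *u, const char *constr)
{
  /* Consult the per-host cache first.  */
  struct robot_specs *specs = res_get_specs (u->host, u->port);
  if (!specs)
    {
      char *rfile;
      if (res_retrieve_file (constr, &rfile))
	{
	  /* Got /robots.txt; parse the downloaded copy.  */
	  specs = res_parse_from_file (rfile);
	  xfree (rfile);
	}
      else
	/* Retrieval failed: register empty (allow-all) specs so the
	   host is only asked once instead of on every URL.  */
	specs = res_parse ("", 0);
      res_register_specs (u->host, u->port, specs);
    }
  return res_match_path (specs, u->path);
}

Registering dummy specs on failure is what lets the new code drop the old robots_host trick, whose only purpose was to avoid re-fetching robots.txt from hosts that do not serve one.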