X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Frecur.c;h=11c30a2157e00ba2f433cba0e98045b3bb9bb2a4;hb=d5be8ecca466601bda9b81c28a79077fbda6ccde;hp=4183fec991b7320cf1058d95270cf5b0341cf52e;hpb=eef4a668b754cf3a7bbee58a65c654df5d414ec6;p=wget diff --git a/src/recur.c b/src/recur.c index 4183fec9..11c30a21 100644 --- a/src/recur.c +++ b/src/recur.c @@ -1,20 +1,20 @@ /* Handling of recursive HTTP retrieving. Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc. -This file is part of Wget. +This file is part of GNU Wget. -This program is free software; you can redistribute it and/or modify +GNU Wget is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. -This program is distributed in the hope that it will be useful, +GNU Wget is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software +along with Wget; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include @@ -31,7 +31,6 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #endif /* HAVE_UNISTD_H */ #include #include -#include #include #include "wget.h" @@ -42,24 +41,24 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include "ftp.h" #include "fnmatch.h" #include "host.h" +#include "hash.h" +#include "res.h" -extern char *version_string; - -#define ROBOTS_FILENAME "robots.txt" +#ifndef errno +extern int errno; +#endif -/* #### Many of these lists should really be hashtables! */ +extern char *version_string; -/* List of downloaded URLs. */ -static urlpos *urls_downloaded; +static struct hash_table *dl_file_url_map; +static struct hash_table *dl_url_file_map; -/* List of HTML URLs. */ -static slist *urls_html; +/* List of HTML files downloaded in this Wget run. Used for link + conversion after Wget is done. */ +static slist *downloaded_html_files; /* List of undesirable-to-load URLs. */ -static slist *ulist; - -/* List of forbidden locations. */ -static char **forbidden = NULL; +static struct hash_table *undesirable_urls; /* Current recursion depth. */ static int depth; @@ -67,33 +66,35 @@ static int depth; /* Base directory we're recursing from (used by no_parent). */ static char *base_dir; -/* The host name for which we last checked robots. */ -static char *robots_host; - static int first_time = 1; -/* Construct the robots URL. */ -static struct urlinfo *robots_url PARAMS ((const char *, const char *)); -static uerr_t retrieve_robots PARAMS ((const char *, const char *)); -static char **parse_robots PARAMS ((const char *)); -static int robots_match PARAMS ((struct urlinfo *, char **)); - /* Cleanup the data structures associated with recursive retrieving (the variables above). 
*/ void recursive_cleanup (void) { - free_slist (ulist); - ulist = NULL; - free_vec (forbidden); - forbidden = NULL; - free_slist (urls_html); - urls_html = NULL; - free_urlpos (urls_downloaded); - urls_downloaded = NULL; + if (undesirable_urls) + { + string_set_free (undesirable_urls); + undesirable_urls = NULL; + } + if (dl_file_url_map) + { + free_keys_and_values (dl_file_url_map); + hash_table_destroy (dl_file_url_map); + dl_file_url_map = NULL; + } + if (dl_url_file_map) + { + free_keys_and_values (dl_url_file_map); + hash_table_destroy (dl_url_file_map); + dl_url_file_map = NULL; + } + undesirable_urls = NULL; + slist_free (downloaded_html_files); + downloaded_html_files = NULL; FREE_MAYBE (base_dir); - FREE_MAYBE (robots_host); first_time = 1; } @@ -117,12 +118,10 @@ recursive_retrieve (const char *file, const char *this_url) char *constr, *filename, *newloc; char *canon_this_url = NULL; int dt, inl, dash_p_leaf_HTML = FALSE; + int meta_disallow_follow; int this_url_ftp; /* See below the explanation */ - uerr_t err; - struct urlinfo *rurl; urlpos *url_list, *cur_url; - char *rfile; /* For robots */ - struct urlinfo *u; + struct url *u; assert (this_url != NULL); assert (file != NULL); @@ -132,17 +131,18 @@ recursive_retrieve (const char *file, const char *this_url) /* Cache the current URL in the list. */ if (first_time) { - ulist = add_slist (ulist, this_url, 0); - urls_downloaded = NULL; - urls_html = NULL; - /* Enter this_url to the slist, in original and "enhanced" form. */ - u = newurl (); - err = parseurl (this_url, u, 0); - if (err == URLOK) + /* These three operations need to be done only once per Wget + run. They should probably be at a different location. */ + if (!undesirable_urls) + undesirable_urls = make_string_hash_table (0); + + hash_table_clear (undesirable_urls); + string_set_add (undesirable_urls, this_url); + /* Enter this_url to the hash table, in original and "enhanced" form. */ + u = url_parse (this_url, NULL); + if (u) { - ulist = add_slist (ulist, u->url, 0); - urls_downloaded = add_url (urls_downloaded, u->url, file); - urls_html = add_slist (urls_html, file, NOSORT); + string_set_add (undesirable_urls, u->url); if (opt.no_parent) base_dir = xstrdup (u->dir); /* Set the base dir. */ /* Set the canonical this_url to be sent as referer. This @@ -154,10 +154,8 @@ recursive_retrieve (const char *file, const char *this_url) DEBUGP (("Double yuck! The *base* URL is broken.\n")); base_dir = NULL; } - freeurl (u, 1); + url_free (u); depth = 1; - robots_host = NULL; - forbidden = NULL; first_time = 0; } else @@ -187,11 +185,19 @@ recursive_retrieve (const char *file, const char *this_url) that the retrieval is done through proxy. In that case, FTP links will be followed by default and recursion will not be turned off when following them. */ - this_url_ftp = (urlproto (this_url) == URLFTP); + this_url_ftp = (url_scheme (this_url) == SCHEME_FTP); /* Get the URL-s from an HTML file: */ url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url, - 0, dash_p_leaf_HTML); + dash_p_leaf_HTML, &meta_disallow_follow); + + if (opt.use_robots && meta_disallow_follow) + { + /* The META tag says we are not to follow this file. Respect + that. */ + free_urlpos (url_list); + url_list = NULL; + } /* Decide what to do with each of the URLs. A URL will be loaded if it meets several requirements, discussed later. 
*/ @@ -202,17 +208,10 @@ recursive_retrieve (const char *file, const char *this_url) break; /* Parse the URL for convenient use in other functions, as well as to get the optimized form. It also checks URL integrity. */ - u = newurl (); - if (parseurl (cur_url->url, u, 0) != URLOK) + u = url_parse (cur_url->url, NULL); + if (!u) { DEBUGP (("Yuck! A bad URL.\n")); - freeurl (u, 1); - continue; - } - if (u->proto == URLFILE) - { - DEBUGP (("Nothing to do with file:// around here.\n")); - freeurl (u, 1); continue; } assert (u->url != NULL); @@ -240,25 +239,25 @@ recursive_retrieve (const char *file, const char *this_url) the list. */ /* inl is set if the URL we are working on (constr) is stored in - ulist. Using it is crucial to avoid the incessant calls to - in_slist, which is quite slow. */ - inl = in_slist (ulist, constr); + undesirable_urls. Using it is crucial to avoid unnecessary + repeated continuous hits to the hash table. */ + inl = string_set_contains (undesirable_urls, constr); /* If it is FTP, and FTP is not followed, chuck it out. */ if (!inl) - if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp) + if (u->scheme == SCHEME_FTP && !opt.follow_ftp && !this_url_ftp) { DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n")); - ulist = add_slist (ulist, constr, 0); + string_set_add (undesirable_urls, constr); inl = 1; } /* If it is absolute link and they are not followed, chuck it out. */ - if (!inl && u->proto != URLFTP) - if (opt.relative_only && !(cur_url->flags & URELATIVE)) + if (!inl && u->scheme != SCHEME_FTP) + if (opt.relative_only && !cur_url->link_relative_p) { DEBUGP (("It doesn't really look like a relative link.\n")); - ulist = add_slist (ulist, constr, 0); + string_set_add (undesirable_urls, constr); inl = 1; } /* If its domain is not to be accepted/looked-up, chuck it out. */ @@ -266,30 +265,30 @@ recursive_retrieve (const char *file, const char *this_url) if (!accept_domain (u)) { DEBUGP (("I don't like the smell of that domain.\n")); - ulist = add_slist (ulist, constr, 0); + string_set_add (undesirable_urls, constr); inl = 1; } /* Check for parent directory. */ if (!inl && opt.no_parent /* If the new URL is FTP and the old was not, ignore opt.no_parent. */ - && !(!this_url_ftp && u->proto == URLFTP)) + && !(!this_url_ftp && u->scheme == SCHEME_FTP)) { /* Check for base_dir first. */ if (!(base_dir && frontcmp (base_dir, u->dir))) { /* Failing that, check for parent dir. */ - struct urlinfo *ut = newurl (); - if (parseurl (this_url, ut, 0) != URLOK) + struct url *ut = url_parse (this_url, NULL); + if (!ut) DEBUGP (("Double yuck! The *base* URL is broken.\n")); else if (!frontcmp (ut->dir, u->dir)) { /* Failing that too, kill the URL. 
*/
 	      DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
-	      ulist = add_slist (ulist, constr, 0);
+	      string_set_add (undesirable_urls, constr);
 	      inl = 1;
 	    }
-	  freeurl (ut, 1);
+	  url_free (ut);
 	}
     }
   /* If the file does not match the acceptance list, or is on the
@@ -300,7 +299,7 @@ recursive_retrieve (const char *file, const char *this_url)
 	  if (!accdir (u->dir, ALLABS))
 	    {
 	      DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
-	      ulist = add_slist (ulist, constr, 0);
+	      string_set_add (undesirable_urls, constr);
 	      inl = 1;
 	    }
 	}
@@ -330,7 +329,7 @@ recursive_retrieve (const char *file, const char *this_url)
 	    {
 	      DEBUGP (("%s (%s) does not match acc/rej rules.\n",
 		       constr, u->file));
-	      ulist = add_slist (ulist, constr, 0);
+	      string_set_add (undesirable_urls, constr);
 	      inl = 1;
 	    }
 	}
@@ -341,64 +340,71 @@ recursive_retrieve (const char *file, const char *this_url)
       if (!inl)
 	{
 	  if (!opt.simple_check)
-	    opt_url (u);
+	    {
+	      /* Find the "true" host. */
+	      char *host = realhost (u->host);
+	      xfree (u->host);
+	      u->host = host;
+
+	      /* Refresh the printed representation of the URL.  */
+	      xfree (u->url);
+	      u->url = url_string (u, 0);
+	    }
 	  else
 	    {
 	      char *p;
 	      /* Just lowercase the hostname.  */
 	      for (p = u->host; *p; p++)
 		*p = TOLOWER (*p);
-	      free (u->url);
-	      u->url = str_url (u, 0);
+	      xfree (u->url);
+	      u->url = url_string (u, 0);
 	    }
-	  free (constr);
+	  xfree (constr);
 	  constr = xstrdup (u->url);
-	  inl = in_slist (ulist, constr);
-	  if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
+	  /* After we have canonicalized the URL, check if we have it
+             on the black list. */
+	  if (string_set_contains (undesirable_urls, constr))
+	    inl = 1;
+	  /* This line is bogus. */
+	  /*string_set_add (undesirable_urls, constr);*/
+
+	  if (!inl && !((u->scheme == SCHEME_FTP) && !this_url_ftp))
 	    if (!opt.spanhost && this_url && !same_host (this_url, constr))
 	      {
 		DEBUGP (("This is not the same hostname as the parent's.\n"));
-		ulist = add_slist (ulist, constr, 0);
+		string_set_add (undesirable_urls, constr);
 		inl = 1;
 	      }
 	}
       /* What about robots.txt? */
-      if (!inl && opt.use_robots && u->proto == URLHTTP)
+      if (!inl && opt.use_robots && u->scheme == SCHEME_HTTP)
 	{
-	  /* Since Wget knows about only one set of robot rules at a
-	     time, /robots.txt must be reloaded whenever a new host is
-	     accessed.
-
-	     robots_host holds the host the current `forbid' variable
-	     is assigned to.  */
-	  if (!robots_host || !same_host (robots_host, u->host))
+	  struct robot_specs *specs = res_get_specs (u->host, u->port);
+	  if (!specs)
 	    {
-	      FREE_MAYBE (robots_host);
-	      /* Now make robots_host the new host, no matter what the
-		 result will be.  So if there is no /robots.txt on the
-		 site, Wget will not retry getting robots all the
-		 time.  */
-	      robots_host = xstrdup (u->host);
-	      free_vec (forbidden);
-	      forbidden = NULL;
-	      err = retrieve_robots (constr, ROBOTS_FILENAME);
-	      if (err == ROBOTSOK)
+	      char *rfile;
+	      if (res_retrieve_file (constr, &rfile))
+		{
+		  specs = res_parse_from_file (rfile);
+		  xfree (rfile);
+		}
+	      else
 		{
-		  rurl = robots_url (constr, ROBOTS_FILENAME);
-		  rfile = url_filename (rurl);
-		  forbidden = parse_robots (rfile);
-		  freeurl (rurl, 1);
-		  free (rfile);
+		  /* If we cannot get real specs, at least produce
		     dummy ones so that we can register them and stop
		     trying to retrieve them.  */
+		  specs = res_parse ("", 0);
 		}
+	      res_register_specs (u->host, u->port, specs);
 	    }
-	  /* Now that we have (or don't have) robots, we can check for
-	     them.  */
-	  if (!robots_match (u, forbidden))
+	  /* Now that we have (or don't have) robots.txt specs, we can
+             check what they say.
*/ + if (!res_match_path (specs, u->path)) { - DEBUGP (("Stuffing %s because %s forbids it.\n", this_url, - ROBOTS_FILENAME)); - ulist = add_slist (ulist, constr, 0); + DEBUGP (("Not following %s because robots.txt forbids it.\n", + constr)); + string_set_add (undesirable_urls, constr); inl = 1; } } @@ -409,10 +415,10 @@ recursive_retrieve (const char *file, const char *this_url) { DEBUGP (("I've decided to load it -> ")); /* Add it to the list of already-loaded URL-s. */ - ulist = add_slist (ulist, constr, 0); + string_set_add (undesirable_urls, constr); /* Automatically followed FTPs will *not* be downloaded recursively. */ - if (u->proto == URLFTP) + if (u->scheme == SCHEME_FTP) { /* Don't you adore side-effects? */ opt.recursive = 0; @@ -422,29 +428,16 @@ recursive_retrieve (const char *file, const char *this_url) /* Retrieve it. */ retrieve_url (constr, &filename, &newloc, canon_this_url ? canon_this_url : this_url, &dt); - if (u->proto == URLFTP) + if (u->scheme == SCHEME_FTP) { /* Restore... */ opt.recursive = 1; } if (newloc) { - free (constr); + xfree (constr); constr = newloc; } - /* In case of convert_links: If there was no error, add it to - the list of downloaded URLs. We might need it for - conversion. */ - if (opt.convert_links && filename) - { - if (dt & RETROKF) - { - urls_downloaded = add_url (urls_downloaded, constr, filename); - /* If the URL is HTML, note it. */ - if (dt & TEXTHTML) - urls_html = add_slist (urls_html, filename, NOSORT); - } - } /* If there was no error, and the type is text/html, parse it recursively. */ if (dt & TEXTHTML) @@ -477,18 +470,23 @@ recursive_retrieve (const char *file, const char *this_url) store the local filename. */ if (opt.convert_links && (dt & RETROKF) && (filename != NULL)) { - cur_url->flags |= UABS2REL; + cur_url->convert = CO_CONVERT_TO_RELATIVE; cur_url->local_name = xstrdup (filename); } } - DEBUGP (("%s already in list, so we don't load.\n", constr)); + else + DEBUGP (("%s already in list, so we don't load.\n", constr)); /* Free filename and constr. */ FREE_MAYBE (filename); FREE_MAYBE (constr); - freeurl (u, 1); + url_free (u); /* Increment the pbuf for the appropriate size. */ } if (opt.convert_links && !opt.delete_after) + /* This is merely the first pass: the links that have been + successfully downloaded are converted. In the second pass, + convert_all_links() will also convert those links that have NOT + been downloaded to their canonical form. */ convert_links (file, url_list); /* Free the linked list of URL-s. */ free_urlpos (url_list); @@ -502,12 +500,34 @@ recursive_retrieve (const char *file, const char *this_url) return RETROK; } -/* Simple calls to convert_links will often fail because only the - downloaded files are converted, and Wget cannot know which files - will be converted in the future. So, if we have file fileone.html - with: +void +register_download (const char *url, const char *file) +{ + if (!opt.convert_links) + return; + if (!dl_file_url_map) + dl_file_url_map = make_string_hash_table (0); + hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url)); + if (!dl_url_file_map) + dl_url_file_map = make_string_hash_table (0); + hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file)); +} + +void +register_html (const char *url, const char *file) +{ + if (!opt.convert_links) + return; + downloaded_html_files = slist_prepend (downloaded_html_files, file); +} + +/* convert_links() is called from recursive_retrieve() after we're + done with an HTML file. 
This call to convert_links is not complete + because it converts only the downloaded files, and Wget cannot know + which files will be downloaded afterwards. So, if we have file + fileone.html with: - + and /c/something.gif was not downloaded because it exceeded the recursion depth, the reference will *not* be changed. @@ -524,70 +544,73 @@ recursive_retrieve (const char *file, const char *this_url) convert_all_links to go once more through the entire list of retrieved HTMLs, and re-convert them. - All the downloaded HTMLs are kept in urls_html, and downloaded URLs + All the downloaded HTMLs are kept in downloaded_html_files, and downloaded URLs in urls_downloaded. From these two lists information is extracted. */ void convert_all_links (void) { - uerr_t res; - urlpos *l1, *l2, *urls; - struct urlinfo *u; slist *html; - urlpos *urlhtml; - for (html = urls_html; html; html = html->next) + /* Destructively reverse downloaded_html_files to get it in the right order. + recursive_retrieve() used slist_prepend() consistently. */ + downloaded_html_files = slist_nreverse (downloaded_html_files); + + for (html = downloaded_html_files; html; html = html->next) { + urlpos *urls, *cur_url; + char *url; + DEBUGP (("Rescanning %s\n", html->string)); /* Determine the URL of the HTML file. get_urls_html will need it. */ - for (urlhtml = urls_downloaded; urlhtml; urlhtml = urlhtml->next) - if (!strcmp (urlhtml->local_name, html->string)) - break; - if (urlhtml) - DEBUGP (("It should correspond to %s.\n", urlhtml->url)); + url = hash_table_get (dl_file_url_map, html->string); + if (url) + DEBUGP (("It should correspond to %s.\n", url)); else DEBUGP (("I cannot find the corresponding URL.\n")); /* Parse the HTML file... */ - urls = get_urls_html (html->string, urlhtml ? urlhtml->url : NULL, 1, - FALSE); - if (!urls) - continue; - for (l1 = urls; l1; l1 = l1->next) + urls = get_urls_html (html->string, url, FALSE, NULL); + /* We don't respect meta_disallow_follow here because, even if + the file is not followed, we might still want to convert the + links that have been followed from other files. */ + for (cur_url = urls; cur_url; cur_url = cur_url->next) { + char *local_name; + /* The URL must be in canonical form to be compared. */ - u = newurl (); - res = parseurl (l1->url, u, 0); - if (res != URLOK) - { - freeurl (u, 1); - continue; - } + struct url *u = url_parse (cur_url->url, NULL); + if (!u) + continue; /* We decide the direction of conversion according to whether a URL was downloaded. Downloaded URLs will be converted - ABS2REL, whereas non-downloaded will be converted REL2ABS. - Note: not yet implemented; only ABS2REL works. */ - for (l2 = urls_downloaded; l2; l2 = l2->next) - if (!strcmp (l2->url, u->url)) - { - DEBUGP (("%s flagged for conversion, local %s\n", - l2->url, l2->local_name)); - break; - } - /* Clear the flags. */ - l1->flags &= ~ (UABS2REL | UREL2ABS); + ABS2REL, whereas non-downloaded will be converted REL2ABS. */ + local_name = hash_table_get (dl_url_file_map, u->url); + if (local_name) + DEBUGP (("%s marked for conversion, local %s\n", + u->url, local_name)); /* Decide on the conversion direction. */ - if (l2) + if (local_name) { - l1->flags |= UABS2REL; - l1->local_name = xstrdup (l2->local_name); + /* We've downloaded this URL. Convert it to relative + form. We do this even if the URL already is in + relative form, because our directory structure may + not be identical to that on the server (think `-nd', + `--cut-dirs', etc.) 
*/ + cur_url->convert = CO_CONVERT_TO_RELATIVE; + cur_url->local_name = xstrdup (local_name); } else { - l1->flags |= UREL2ABS; - l1->local_name = NULL; + /* We haven't downloaded this URL. If it's not already + complete (including a full host name), convert it to + that form, so it can be reached while browsing this + HTML locally. */ + if (!cur_url->link_complete_p) + cur_url->convert = CO_CONVERT_TO_COMPLETE; + cur_url->local_name = NULL; } - freeurl (u, 1); + url_free (u); } /* Convert the links in the file. */ convert_links (html->string, urls); @@ -595,247 +618,3 @@ convert_all_links (void) free_urlpos (urls); } } - -/* Robots support. */ - -/* Construct the robots URL. */ -static struct urlinfo * -robots_url (const char *url, const char *robots_filename) -{ - struct urlinfo *u = newurl (); - uerr_t err; - - err = parseurl (url, u, 0); - assert (err == URLOK && u->proto == URLHTTP); - free (u->file); - free (u->dir); - free (u->url); - u->dir = xstrdup (""); - u->file = xstrdup (robots_filename); - u->url = str_url (u, 0); - return u; -} - -/* Retrieves the robots_filename from the root server directory, if - possible. Returns ROBOTSOK if robots were retrieved OK, and - NOROBOTS if robots could not be retrieved for any reason. */ -static uerr_t -retrieve_robots (const char *url, const char *robots_filename) -{ - int dt; - uerr_t err; - struct urlinfo *u; - - u = robots_url (url, robots_filename); - logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n")); - err = retrieve_url (u->url, NULL, NULL, NULL, &dt); - freeurl (u, 1); - if (err == RETROK) - return ROBOTSOK; - else - return NOROBOTS; -} - -/* Parse the robots_filename and return the disallowed path components - in a malloc-ed vector of character pointers. - - It should be fully compliant with the syntax as described in the - file norobots.txt, adopted by the robots mailing list - (robots@webcrawler.com). */ -static char ** -parse_robots (const char *robots_filename) -{ - FILE *fp; - char **entries; - char *line, *cmd, *str, *p; - char *base_version, *version; - int len, num, i; - int wget_matched; /* is the part meant for Wget? */ - - entries = NULL; - - num = 0; - fp = fopen (robots_filename, "rb"); - if (!fp) - return NULL; - - /* Kill version number. */ - if (opt.useragent) - { - STRDUP_ALLOCA (base_version, opt.useragent); - STRDUP_ALLOCA (version, opt.useragent); - } - else - { - int len = 10 + strlen (version_string); - base_version = (char *)alloca (len); - sprintf (base_version, "Wget/%s", version_string); - version = (char *)alloca (len); - sprintf (version, "Wget/%s", version_string); - } - for (p = version; *p; p++) - *p = TOLOWER (*p); - for (p = base_version; *p && *p != '/'; p++) - *p = TOLOWER (*p); - *p = '\0'; - - /* Setting this to 1 means that Wget considers itself under - restrictions by default, even if the User-Agent field is not - present. However, if it finds the user-agent set to anything - other than Wget, the rest will be ignored (up to the following - User-Agent field). Thus you may have something like: - - Disallow: 1 - Disallow: 2 - User-Agent: stupid-robot - Disallow: 3 - Disallow: 4 - User-Agent: Wget* - Disallow: 5 - Disallow: 6 - User-Agent: * - Disallow: 7 - - In this case the 1, 2, 5, 6 and 7 disallow lines will be - stored. */ - wget_matched = 1; - while ((line = read_whole_line (fp))) - { - len = strlen (line); - /* Destroy if there is one. */ - if (len && line[len - 1] == '\r') - line[len - 1] = '\0'; - /* According to specifications, optional space may be at the - end... 
*/ - DEBUGP (("Line: %s\n", line)); - /* Skip spaces. */ - for (cmd = line; *cmd && ISSPACE (*cmd); cmd++); - if (!*cmd) - { - free (line); - DEBUGP (("(chucked out)\n")); - continue; - } - /* Look for ':'. */ - for (str = cmd; *str && *str != ':'; str++); - if (!*str) - { - free (line); - DEBUGP (("(chucked out)\n")); - continue; - } - /* Zero-terminate the command. */ - *str++ = '\0'; - /* Look for the string beginning... */ - for (; *str && ISSPACE (*str); str++); - /* Look for comments or trailing spaces and kill them off. */ - for (p = str; *p; p++) - if (*p && ISSPACE (*p) && ((*(p + 1) == '#') || (*(p + 1) == '\0'))) - { - /* We have found either a shell-style comment `+#' or some - trailing spaces. Now rewind to the beginning of the spaces - and place '\0' there. */ - while (p > str && ISSPACE (*p)) - --p; - if (p == str) - *p = '\0'; - else - *(p + 1) = '\0'; - break; - } - if (!strcasecmp (cmd, "User-agent")) - { - int match = 0; - /* Lowercase the agent string. */ - for (p = str; *p; p++) - *p = TOLOWER (*p); - /* If the string is `*', it matches. */ - if (*str == '*' && !*(str + 1)) - match = 1; - else - { - /* If the string contains wildcards, we'll run it through - fnmatch(). */ - if (has_wildcards_p (str)) - { - /* If the string contains '/', compare with the full - version. Else, compare it to base_version. */ - if (strchr (str, '/')) - match = !fnmatch (str, version, 0); - else - match = !fnmatch (str, base_version, 0); - } - else /* Substring search */ - { - if (strstr (version, str)) - match = 1; - else - match = 0; - } - } - /* If Wget is not matched, skip all the entries up to the - next User-agent field. */ - wget_matched = match; - } - else if (!wget_matched) - { - free (line); - DEBUGP (("(chucking out since it is not applicable for Wget)\n")); - continue; - } - else if (!strcasecmp (cmd, "Disallow")) - { - /* If "Disallow" is empty, the robot is welcome. */ - if (!*str) - { - free_vec (entries); - entries = (char **)xmalloc (sizeof (char *)); - *entries = NULL; - num = 0; - } - else - { - entries = (char **)xrealloc (entries, (num + 2)* sizeof (char *)); - entries[num] = xstrdup (str); - entries[++num] = NULL; - /* Strip trailing spaces, according to specifications. */ - for (i = strlen (str); i >= 0 && ISSPACE (str[i]); i--) - if (ISSPACE (str[i])) - str[i] = '\0'; - } - } - else - { - /* unknown command */ - DEBUGP (("(chucked out)\n")); - } - free (line); - } - fclose (fp); - return entries; -} - -/* May the URL url be loaded according to disallowing rules stored in - forbidden? */ -static int -robots_match (struct urlinfo *u, char **forbidden) -{ - int l; - - if (!forbidden) - return 1; - DEBUGP (("Matching %s against: ", u->path)); - for (; *forbidden; forbidden++) - { - DEBUGP (("%s ", *forbidden)); - l = strlen (*forbidden); - /* If dir is forbidden, we may not load the file. */ - if (strncmp (u->path, *forbidden, l) == 0) - { - DEBUGP (("matched.\n")); - return 0; /* Matches, i.e. does not load... */ - } - } - DEBUGP (("not matched.\n")); - return 1; -}
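
The heart of this change is the switch from linear slists to hash tables: undesirable_urls becomes a string set used as a blacklist, while dl_file_url_map and dl_url_file_map record the mapping between local files and URLs for later link conversion. The following is a minimal sketch of that pattern, written against the same hash.h/utils.h helpers the new code calls (make_string_hash_table, string_set_add, string_set_contains, hash_table_put, hash_table_get); it assumes the Wget source tree is on the include path, and the names mark_url_seen, remember_download and local_name_of are illustrative, not part of Wget.

/* Sketch only: the de-duplication and URL->file bookkeeping pattern
   used by the new recur.c, assuming Wget's wget.h/utils.h/hash.h. */
#include "wget.h"
#include "utils.h"
#include "hash.h"

static struct hash_table *seen_urls;     /* plays the role of undesirable_urls */
static struct hash_table *url_to_file;   /* plays the role of dl_url_file_map */

/* Return non-zero the first time URL is seen, zero on every later call. */
static int
mark_url_seen (const char *url)
{
  if (!seen_urls)
    seen_urls = make_string_hash_table (0);
  if (string_set_contains (seen_urls, url))
    return 0;                   /* already on the blacklist */
  string_set_add (seen_urls, url);
  return 1;
}

/* Record where URL was saved, as register_download() does for
   dl_url_file_map. */
static void
remember_download (const char *url, const char *file)
{
  if (!url_to_file)
    url_to_file = make_string_hash_table (0);
  hash_table_put (url_to_file, xstrdup (url), xstrdup (file));
}

/* Link conversion later asks: was this URL downloaded, and to where? */
static char *
local_name_of (const char *url)
{
  return url_to_file ? hash_table_get (url_to_file, url) : NULL;
}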
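
The robots.txt hunk replaces the per-host forbidden vector with the res.c interface: specs are cached per host/port via res_register_specs, fetched and parsed only on a cache miss, and consulted with res_match_path. The sketch below strings those calls together the same way the diff does; res.h is assumed to come from the same Wget tree, and check_robots is a hypothetical wrapper name used only for illustration.

/* Sketch of the robots.txt flow, using only the res_* calls visible
   in the diff above.  check_robots() is not a Wget function. */
#include "wget.h"
#include "utils.h"
#include "res.h"

/* Return non-zero if PATH on HOST:PORT may be fetched according to
   robots.txt; URL is a full URL on that host, used to locate
   /robots.txt. */
static int
check_robots (const char *url, const char *host, int port, const char *path)
{
  struct robot_specs *specs = res_get_specs (host, port);

  if (!specs)
    {
      char *rfile;
      if (res_retrieve_file (url, &rfile))
        {
          /* robots.txt was saved to RFILE; parse it. */
          specs = res_parse_from_file (rfile);
          xfree (rfile);
        }
      else
        {
          /* Could not fetch robots.txt: register empty specs so the
             same host is not retried on every URL. */
          specs = res_parse ("", 0);
        }
      /* Cache the result per host/port. */
      res_register_specs (host, port, specs);
    }

  return res_match_path (specs, path);
}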
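
convert_all_links() now decides the conversion direction per link by looking the canonical URL up in dl_url_file_map: links whose targets were downloaded are rewritten to point at the local file, everything else is completed to absolute form so it still resolves when the saved page is browsed offline. Below is a condensed version of that decision, using the same urlpos fields and CO_CONVERT_* values as the code above; decide_conversion is an illustrative name, not a function in Wget.

/* Sketch of the per-link conversion decision.  urlpos, its
   convert/local_name/link_complete_p fields and the CO_CONVERT_*
   values come from the Wget headers; dl_url_file_map maps canonical
   URLs to local file names. */
#include "wget.h"
#include "utils.h"
#include "url.h"
#include "hash.h"

static void
decide_conversion (urlpos *cur_url, struct hash_table *dl_url_file_map,
                   const char *canonical_url)
{
  char *local_name = hash_table_get (dl_url_file_map, canonical_url);

  if (local_name)
    {
      /* Downloaded: point the link at the local copy. */
      cur_url->convert = CO_CONVERT_TO_RELATIVE;
      cur_url->local_name = xstrdup (local_name);
    }
  else
    {
      /* Not downloaded: complete the link so it still works when the
         saved page is browsed locally. */
      if (!cur_url->link_complete_p)
        cur_url->convert = CO_CONVERT_TO_COMPLETE;
      cur_url->local_name = NULL;
    }
}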
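
Finally, downloaded_html_files is built with slist_prepend() as each HTML file is finished, which is O(1) per file but leaves the list back-to-front; convert_all_links() therefore calls slist_nreverse() once before the second pass so files are revisited in download order. A small sketch of that idiom with the same slist helpers; the two function names are illustrative only.

/* Sketch of the prepend-then-reverse list idiom, assuming Wget's
   slist helpers from utils.h. */
#include <stdio.h>

#include "wget.h"
#include "utils.h"

static slist *html_files;

static void
note_html_file (const char *file)
{
  html_files = slist_prepend (html_files, file);
}

static void
second_pass (void)
{
  slist *html;

  html_files = slist_nreverse (html_files);
  for (html = html_files; html; html = html->next)
    printf ("would rescan %s\n", html->string);
}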