- /* If quota was exceeded earlier, bail out. */
- if (downloaded_exceeds_quota ())
- break;
- /* Parse the URL for convenient use in other functions, as well
- as to get the optimized form. It also checks URL integrity. */
- u = newurl ();
- if (parseurl (cur_url->url, u, 0) != URLOK)
- {
- DEBUGP (("Yuck! A bad URL.\n"));
- freeurl (u, 1);
- continue;
- }
- if (u->proto == URLFILE)
- {
- DEBUGP (("Nothing to do with file:// around here.\n"));
- freeurl (u, 1);
- continue;
- }
- assert (u->url != NULL);
- constr = xstrdup (u->url);
-
- /* Several checkings whether a file is acceptable to load:
- 1. check if URL is ftp, and we don't load it
- 2. check for relative links (if relative_only is set)
- 3. check for domain
- 4. check for no-parent
- 5. check for excludes && includes
- 6. check for suffix
- 7. check for same host (if spanhost is unset), with possible
- gethostbyname baggage
- 8. check for robots.txt
-
- Addendum: If the URL is FTP, and it is to be loaded, only the
- domain and suffix settings are "stronger".
-
- Note that .html and (yuck) .htm will get loaded regardless of
- suffix rules (but that is remedied later with unlink) unless
- the depth equals the maximum depth.
-
- More time- and memory- consuming tests should be put later on
- the list. */
-
- /* inl is set if the URL we are working on (constr) is stored in
- undesirable_urls. Using it is crucial to avoid unnecessary
- repeated continuous hits to the hash table. */
- inl = string_set_exists (undesirable_urls, constr);
-
- /* If it is FTP, and FTP is not followed, chuck it out. */
- if (!inl)
- if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
- {
- DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
- string_set_add (undesirable_urls, constr);
- inl = 1;
- }
- /* If it is absolute link and they are not followed, chuck it
- out. */
- if (!inl && u->proto != URLFTP)
- if (opt.relative_only && !cur_url->link_relative_p)
- {
- DEBUGP (("It doesn't really look like a relative link.\n"));
- string_set_add (undesirable_urls, constr);
- inl = 1;
- }
- /* If its domain is not to be accepted/looked-up, chuck it out. */
- if (!inl)
- if (!accept_domain (u))
- {
- DEBUGP (("I don't like the smell of that domain.\n"));
- string_set_add (undesirable_urls, constr);
- inl = 1;
- }
- /* Check for parent directory. */
- if (!inl && opt.no_parent
- /* If the new URL is FTP and the old was not, ignore
- opt.no_parent. */
- && !(!this_url_ftp && u->proto == URLFTP))
- {
- /* Check for base_dir first. */
- if (!(base_dir && frontcmp (base_dir, u->dir)))
- {
- /* Failing that, check for parent dir. */
- struct urlinfo *ut = newurl ();
- if (parseurl (this_url, ut, 0) != URLOK)
- DEBUGP (("Double yuck! The *base* URL is broken.\n"));
- else if (!frontcmp (ut->dir, u->dir))
- {
- /* Failing that too, kill the URL. */
- DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
- string_set_add (undesirable_urls, constr);
- inl = 1;
- }
- freeurl (ut, 1);
- }
- }
- /* If the file does not match the acceptance list, or is on the
- rejection list, chuck it out. The same goes for the
- directory exclude- and include- lists. */
- if (!inl && (opt.includes || opt.excludes))
- {
- if (!accdir (u->dir, ALLABS))
- {
- DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
- string_set_add (undesirable_urls, constr);
- inl = 1;
- }
- }
- if (!inl)
- {
- char *suf = NULL;
- /* We check for acceptance/rejection rules only for non-HTML
- documents. Since we don't know whether they really are
- HTML, it will be deduced from (an OR-ed list):
-
- 1) u->file is "" (meaning it is a directory)
- 2) suffix exists, AND:
- a) it is "html", OR
- b) it is "htm"
-
- If the file *is* supposed to be HTML, it will *not* be
- subject to acc/rej rules, unless a finite maximum depth has
- been specified and the current depth is the maximum depth. */
- if (!
- (!*u->file
- || (((suf = suffix (constr)) != NULL)
- && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
- && ((opt.reclevel != INFINITE_RECURSION) &&
- (depth != opt.reclevel))))))
- {
- if (!acceptable (u->file))
- {
- DEBUGP (("%s (%s) does not match acc/rej rules.\n",
- constr, u->file));
- string_set_add (undesirable_urls, constr);
- inl = 1;
- }
- }
- FREE_MAYBE (suf);
- }
- /* Optimize the URL (which includes possible DNS lookup) only
- after all other possibilities have been exhausted. */
- if (!inl)
- {
- if (!opt.simple_check)
- opt_url (u);
- else
- {
- char *p;
- /* Just lowercase the hostname. */
- for (p = u->host; *p; p++)
- *p = TOLOWER (*p);
- free (u->url);
- u->url = str_url (u, 0);
- }
- free (constr);
- constr = xstrdup (u->url);
- string_set_add (undesirable_urls, constr);
- if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
- if (!opt.spanhost && this_url && !same_host (this_url, constr))
- {
- DEBUGP (("This is not the same hostname as the parent's.\n"));
- string_set_add (undesirable_urls, constr);
- inl = 1;
- }
- }
- /* What about robots.txt? */
- if (!inl && opt.use_robots && u->proto == URLHTTP)
- {
- /* Since Wget knows about only one set of robot rules at a
- time, /robots.txt must be reloaded whenever a new host is
- accessed.
-
- robots_host holds the host the current `forbid' variable
- is assigned to. */
- if (!robots_host || !same_host (robots_host, u->host))
- {
- FREE_MAYBE (robots_host);
- /* Now make robots_host the new host, no matter what the
- result will be. So if there is no /robots.txt on the
- site, Wget will not retry getting robots all the
- time. */
- robots_host = xstrdup (u->host);
- free_vec (forbidden);
- forbidden = NULL;
- err = retrieve_robots (constr, ROBOTS_FILENAME);
- if (err == ROBOTSOK)
- {
- rurl = robots_url (constr, ROBOTS_FILENAME);
- rfile = url_filename (rurl);
- forbidden = parse_robots (rfile);
- freeurl (rurl, 1);
- free (rfile);
- }
- }
-
- /* Now that we have (or don't have) robots, we can check for
- them. */
- if (!robots_match (u, forbidden))
- {
- DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
- ROBOTS_FILENAME));
- string_set_add (undesirable_urls, constr);
- inl = 1;
- }
- }
-
- filename = NULL;
- /* If it wasn't chucked out, do something with it. */
- if (!inl)
- {
- DEBUGP (("I've decided to load it -> "));
- /* Add it to the list of already-loaded URL-s. */
- string_set_add (undesirable_urls, constr);
- /* Automatically followed FTPs will *not* be downloaded
- recursively. */
- if (u->proto == URLFTP)
- {
- /* Don't you adore side-effects? */
- opt.recursive = 0;
- }
- /* Reset its type. */
- dt = 0;
- /* Retrieve it. */
- retrieve_url (constr, &filename, &newloc,
- canon_this_url ? canon_this_url : this_url, &dt);
- if (u->proto == URLFTP)
- {
- /* Restore... */
- opt.recursive = 1;
- }
- if (newloc)
- {
- free (constr);
- constr = newloc;
- }
- /* In case of convert_links: If there was no error, add it to
- the list of downloaded URLs. We might need it for
- conversion. */
- if (opt.convert_links && filename)
- {
- if (dt & RETROKF)
- {
- hash_table_put (dl_file_url_map,
- xstrdup (filename), xstrdup (constr));
- hash_table_put (dl_url_file_map,
- xstrdup (constr), xstrdup (filename));
- /* If the URL is HTML, note it. */
- if (dt & TEXTHTML)
- urls_html = slist_prepend (urls_html, filename);
- }
- }
- /* If there was no error, and the type is text/html, parse
- it recursively. */
- if (dt & TEXTHTML)
- {
- if (dt & RETROKF)
- recursive_retrieve (filename, constr);
- }
- else
- DEBUGP (("%s is not text/html so we don't chase.\n",
- filename ? filename: "(null)"));
-
- if (opt.delete_after || (filename && !acceptable (filename)))
- /* Either --delete-after was specified, or we loaded this otherwise
- rejected (e.g. by -R) HTML file just so we could harvest its
- hyperlinks -- in either case, delete the local file. */
- {
- DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
- opt.delete_after ? "--delete-after" :
- "recursive rejection criteria"));
- logprintf (LOG_VERBOSE,
- (opt.delete_after ? _("Removing %s.\n")
- : _("Removing %s since it should be rejected.\n")),
- filename);
- if (unlink (filename))
- logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
- dt &= ~RETROKF;
- }
-
- /* If everything was OK, and links are to be converted, let's
- store the local filename. */
- if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
- {
- cur_url->convert = CO_CONVERT_TO_RELATIVE;
- cur_url->local_name = xstrdup (filename);
- }
- }