- assert (u->url != NULL);
- constr = xstrdup (u->url);
-
- /* Several checkings whether a file is acceptable to load:
- 1. check if URL is ftp, and we don't load it
- 2. check for relative links (if relative_only is set)
- 3. check for domain
- 4. check for no-parent
- 5. check for excludes && includes
- 6. check for suffix
- 7. check for same host (if spanhost is unset), with possible
- gethostbyname baggage
- 8. check for robots.txt
-
- Addendum: If the URL is FTP, and it is to be loaded, only the
- domain and suffix settings are "stronger".
-
- Note that .html and (yuck) .htm will get loaded regardless of
- suffix rules (but that is remedied later with unlink) unless
- the depth equals the maximum depth.
-
- More time- and memory- consuming tests should be put later on
- the list. */
-
- /* inl is set if the URL we are working on (constr) is stored in
- undesirable_urls. Using it is crucial to avoid unnecessary
- repeated continuous hits to the hash table. */
- inl = string_set_contains (undesirable_urls, constr);
-
- /* If it is FTP, and FTP is not followed, chuck it out. */
- if (!inl)
- if (u->scheme == SCHEME_FTP && !opt.follow_ftp && !this_url_ftp)
- {
- DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
- string_set_add (undesirable_urls, constr);
- inl = 1;
- }
- /* If it is absolute link and they are not followed, chuck it
- out. */
- if (!inl && u->scheme != SCHEME_FTP)
- if (opt.relative_only && !cur_url->link_relative_p)
- {
- DEBUGP (("It doesn't really look like a relative link.\n"));
- string_set_add (undesirable_urls, constr);
- inl = 1;
- }
- /* If its domain is not to be accepted/looked-up, chuck it out. */
- if (!inl)
- if (!accept_domain (u))
- {
- DEBUGP (("I don't like the smell of that domain.\n"));
- string_set_add (undesirable_urls, constr);
- inl = 1;
- }
- /* Check for parent directory. */
- if (!inl && opt.no_parent
- /* If the new URL is FTP and the old was not, ignore
- opt.no_parent. */
- && !(!this_url_ftp && u->scheme == SCHEME_FTP))