- /* If quota was exceeded earlier, bail out. */
- if (downloaded_exceeds_quota ())
- break;
- /* Parse the URL for convenient use in other functions, as well
- as to get the optimized form. It also checks URL integrity. */
- u = newurl ();
- if (parseurl (cur_url->url, u, 0) != URLOK)
- {
- DEBUGP (("Yuck! A bad URL.\n"));
- freeurl (u, 1);
- continue;
- }
- if (u->proto == URLFILE)
- {
- DEBUGP (("Nothing to do with file:// around here.\n"));
- freeurl (u, 1);
- continue;
- }
- assert (u->url != NULL);
- constr = xstrdup (u->url);
-
- /* Several checks to decide whether a file is acceptable to load:
- 1. check if URL is ftp, and we don't load it
- 2. check for relative links (if relative_only is set)
- 3. check for domain
- 4. check for no-parent
- 5. check for excludes && includes
- 6. check for suffix
- 7. check for same host (if spanhost is unset), with possible
- gethostbyname baggage
- 8. check for robots.txt
-
- Addendum: If the URL is FTP, and it is to be loaded, only the
- domain and suffix settings are "stronger".
-
- Note that .html and (yuck) .htm will get loaded regardless of
- suffix rules (but that is remedied later with unlink) unless
- the depth equals the maximum depth.
-
- More time- and memory-consuming tests should be put later in
- the list. */
-
- /* inl is set if the URL we are working on (constr) is stored in
- undesirable_urls. Using it is crucial to avoid unnecessary
- repeated hits to the hash table. */
- inl = string_set_exists (undesirable_urls, constr);
-
- /* If it is FTP, and FTP is not followed, chuck it out. */
- if (!inl)
- if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
- {
- DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
- string_set_add (undesirable_urls, constr);
- inl = 1;
- }
- /* If it is an absolute link and absolute links are not followed,
- chuck it out. */
- if (!inl && u->proto != URLFTP)
- if (opt.relative_only && !(cur_url->flags & URELATIVE))
- {
- DEBUGP (("It doesn't really look like a relative link.\n"));
- string_set_add (undesirable_urls, constr);
- inl = 1;
- }
- /* If its domain is not to be accepted/looked-up, chuck it out. */
- if (!inl)
- if (!accept_domain (u))
- {
- DEBUGP (("I don't like the smell of that domain.\n"));
- string_set_add (undesirable_urls, constr);
- inl = 1;
- }
- /* Check for parent directory. */
- if (!inl && opt.no_parent
- /* If the new URL is FTP and the old was not, ignore
- opt.no_parent. */
- && !(!this_url_ftp && u->proto == URLFTP))
- {
- /* Check for base_dir first. */
- if (!(base_dir && frontcmp (base_dir, u->dir)))
- {
- /* Failing that, check for parent dir. */
- struct urlinfo *ut = newurl ();
- if (parseurl (this_url, ut, 0) != URLOK)
- DEBUGP (("Double yuck! The *base* URL is broken.\n"));
- else if (!frontcmp (ut->dir, u->dir))
- {
- /* Failing that too, kill the URL. */
- DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
- string_set_add (undesirable_urls, constr);
- inl = 1;
- }
- freeurl (ut, 1);
- }
- }
- /* If the file does not match the acceptance list, or is on the
- rejection list, chuck it out. The same goes for the directory
- exclude- and include-lists. */
- if (!inl && (opt.includes || opt.excludes))
- {
- if (!accdir (u->dir, ALLABS))
- {
- DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
- string_set_add (undesirable_urls, constr);
- inl = 1;
- }
- }
- if (!inl)
- {
- char *suf = NULL;
- /* We check for acceptance/rejection rules only for non-HTML
- documents. Since we don't know whether they really are
- HTML, it will be deduced from (an OR-ed list):
-
- 1) u->file is "" (meaning it is a directory)
- 2) suffix exists, AND:
- a) it is "html", OR
- b) it is "htm"
-
- If the file *is* supposed to be HTML, it will *not* be
- subject to acc/rej rules, unless a finite maximum depth has
- been specified and the current depth is the maximum depth. */
- if (!
- (!*u->file
- || (((suf = suffix (constr)) != NULL)
- && ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
- && ((opt.reclevel != INFINITE_RECURSION) &&
- (depth != opt.reclevel))))))
- {
- if (!acceptable (u->file))
- {
- DEBUGP (("%s (%s) does not match acc/rej rules.\n",
- constr, u->file));
- string_set_add (undesirable_urls, constr);
- inl = 1;
- }
- }
- FREE_MAYBE (suf);
- }
- /* Optimize the URL (which includes possible DNS lookup) only
- after all other possibilities have been exhausted. */
- if (!inl)
- {
- if (!opt.simple_check)
- opt_url (u);
- else
- {
- char *p;
- /* Just lowercase the hostname. */
- for (p = u->host; *p; p++)
- *p = TOLOWER (*p);
- free (u->url);
- u->url = str_url (u, 0);
- }
- free (constr);
- constr = xstrdup (u->url);
- string_set_add (undesirable_urls, constr);
- if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
- if (!opt.spanhost && this_url && !same_host (this_url, constr))
- {
- DEBUGP (("This is not the same hostname as the parent's.\n"));
- string_set_add (undesirable_urls, constr);
- inl = 1;
- }
- }
- /* What about robots.txt? */
- if (!inl && opt.use_robots && u->proto == URLHTTP)
- {
- /* Since Wget knows about only one set of robot rules at a
- time, /robots.txt must be reloaded whenever a new host is
- accessed.
-
- robots_host holds the host the current `forbid' variable
- is assigned to. */
- if (!robots_host || !same_host (robots_host, u->host))
- {
- FREE_MAYBE (robots_host);
- /* Now make robots_host the new host, no matter what the
- result will be. So if there is no /robots.txt on the
- site, Wget will not retry getting robots all the
- time. */
- robots_host = xstrdup (u->host);
- free_vec (forbidden);
- forbidden = NULL;
- err = retrieve_robots (constr, ROBOTS_FILENAME);
- if (err == ROBOTSOK)
- {
- rurl = robots_url (constr, ROBOTS_FILENAME);
- rfile = url_filename (rurl);
- forbidden = parse_robots (rfile);
- freeurl (rurl, 1);
- free (rfile);
- }
- }
-
- /* Now that we have (or don't have) robots, we can check for
- them. */
- if (!robots_match (u, forbidden))
- {
- DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
- ROBOTS_FILENAME));
- string_set_add (undesirable_urls, constr);
- inl = 1;
- }
- }
-
- filename = NULL;
- /* If it wasn't chucked out, do something with it. */
- if (!inl)
- {
- DEBUGP (("I've decided to load it -> "));
- /* Add it to the list of already-loaded URL-s. */
- string_set_add (undesirable_urls, constr);
- /* Automatically followed FTPs will *not* be downloaded
- recursively. */
- if (u->proto == URLFTP)
- {
- /* Don't you adore side-effects? */
- opt.recursive = 0;
- }
- /* Reset its type. */
- dt = 0;
- /* Retrieve it. */
- retrieve_url (constr, &filename, &newloc,
- canon_this_url ? canon_this_url : this_url, &dt);
- if (u->proto == URLFTP)
- {
- /* Restore... */
- opt.recursive = 1;
- }
- if (newloc)
- {
- free (constr);
- constr = newloc;
- }
- /* In case of convert_links: If there was no error, add it to
- the list of downloaded URLs. We might need it for
- conversion. */
- if (opt.convert_links && filename)
- {
- if (dt & RETROKF)
- {
- hash_table_put (dl_file_url_map,
- xstrdup (filename), xstrdup (constr));
- hash_table_put (dl_url_file_map,
- xstrdup (constr), xstrdup (filename));
- /* If the URL is HTML, note it. */
- if (dt & TEXTHTML)
- urls_html = slist_append (urls_html, filename);
- }
- }
- /* If there was no error, and the type is text/html, parse
- it recursively. */
- if (dt & TEXTHTML)
- {
- if (dt & RETROKF)
- recursive_retrieve (filename, constr);
- }
- else
- DEBUGP (("%s is not text/html so we don't chase.\n",
- filename ? filename: "(null)"));
-
- if (opt.delete_after || (filename && !acceptable (filename)))
- /* Either --delete-after was specified, or we loaded this otherwise
- rejected (e.g. by -R) HTML file just so we could harvest its
- hyperlinks -- in either case, delete the local file. */
- {
- DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
- opt.delete_after ? "--delete-after" :
- "recursive rejection criteria"));
- logprintf (LOG_VERBOSE,
- (opt.delete_after ? _("Removing %s.\n")
- : _("Removing %s since it should be rejected.\n")),
- filename);
- if (unlink (filename))
- logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
- dt &= ~RETROKF;
- }
-
- /* If everything was OK, and links are to be converted, let's
- store the local filename. */
- if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
- {
- cur_url->flags |= UABS2REL;
- cur_url->local_name = xstrdup (filename);
- }
- }
- DEBUGP (("%s already in list, so we don't load.\n", constr));
- /* Free filename and constr. */
- FREE_MAYBE (filename);
- FREE_MAYBE (constr);
- freeurl (u, 1);
- /* Increment the pbuf for the appropriate size. */
+ bool descend = false;
+ char *url, *referer, *file = NULL;
+ int depth;
+ bool html_allowed, css_allowed;
+ bool is_css = false;
+ bool dash_p_leaf_HTML = false;
+
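+ /* Bail out if the download quota has been exceeded or a fatal
+ write error has occurred. */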
+ if (opt.quota && total_downloaded_bytes > opt.quota)
+ break;
+ if (status == FWRITEERR)
+ break;
+
+ /* Get the next URL from the queue... */
+
+ if (!url_dequeue (queue,
+ (const char **)&url, (const char **)&referer,
+ &depth, &html_allowed, &css_allowed))
+ break;
+
+ /* ...and download it. Note that this download is in most cases
+ unconditional, as download_child_p already makes sure a file
+ doesn't get enqueued twice -- and yet this check is here, and
+ not in download_child_p. This is so that if you run `wget -r
+ URL1 URL2' and some URL is encountered once under URL1 and
+ again under URL2 at a different (possibly smaller) depth, its
+ children are taken into account the second time. */
+ if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
+ {
+ file = xstrdup (hash_table_get (dl_url_file_map, url));
+
+ DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
+ url, file));
+
+ /* FIXME: this near-duplicated HTML/CSS logic should be combined. */
+ if (html_allowed
+ && downloaded_html_set
+ && string_set_contains (downloaded_html_set, file))
+ {
+ descend = true;
+ is_css = false;
+ }
+ if (css_allowed
+ && downloaded_css_set
+ && string_set_contains (downloaded_css_set, file))
+ {
+ descend = true;
+ is_css = true;
+ }
+ }
+ else
+ {
+ int dt = 0;
+ char *redirected = NULL;
+
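+ /* retrieve_url fills dt with status and document-type bits
+ (RETROKF, TEXTHTML, TEXTCSS) that are tested below. */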
+ status = retrieve_url (url, &file, &redirected, referer, &dt, false);
+
+ if (html_allowed && file && status == RETROK
+ && (dt & RETROKF) && (dt & TEXTHTML))
+ {
+ descend = true;
+ is_css = false;
+ }
+
+ /* CSS is a little different: css_allowed can override the content
+ type, since many web servers serve CSS with an incorrect content
+ type. */
+ if (file && status == RETROK
+ && (dt & RETROKF) &&
+ ((dt & TEXTCSS) || css_allowed))
+ {
+ descend = true;
+ is_css = true;
+ }
+
+ if (redirected)
+ {
+ /* We have been redirected, possibly to another host, or
+ different path, or wherever. Check whether we really
+ want to follow it. */
+ if (descend)
+ {
+ if (!descend_redirect_p (redirected, url, depth,
+ start_url_parsed, blacklist))
+ descend = false;
+ else
+ /* Make sure that the old pre-redirect form gets
+ blacklisted. */
+ string_set_add (blacklist, url);
+ }
+
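+ /* Adopt the redirect target as the URL we work with from
+ here on. */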
+ xfree (url);
+ url = redirected;
+ }
+ }
+
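+ /* In spider mode, record the visit so broken links can be
+ reported along with their referrers. */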
+ if (opt.spider)
+ {
+ visited_url (url, referer);
+ }
+
+ if (descend
+ && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
+ {
+ if (opt.page_requisites
+ && (depth == opt.reclevel || depth == opt.reclevel + 1))
+ {
+ /* When -p is specified, we are allowed to exceed the
+ maximum depth, but only for the "inline" links,
+ i.e. those that are needed to display the page.
+ Originally this could exceed the depth at most by
+ one, but we allow one more level so that the leaf
+ pages that contain frames can be loaded
+ correctly. */
+ dash_p_leaf_HTML = true;
+ }
+ else
+ {
+ /* Either -p wasn't specified or it was and we've
+ already spent the two extra (pseudo-)levels that it
+ affords us, so we need to bail out. */
+ DEBUGP (("Not descending further; at depth %d, max. %d.\n",
+ depth, opt.reclevel));
+ descend = false;
+ }
+ }
+
+ /* If the downloaded document was HTML or CSS, parse it and enqueue the
+ links it contains. */
+
+ if (descend)
+ {
+ bool meta_disallow_follow = false;
+ struct urlpos *children
+ = is_css ? get_urls_css_file (file, url) :
+ get_urls_html (file, url, &meta_disallow_follow);
+
+ if (opt.use_robots && meta_disallow_follow)
+ {
+ free_urlpos (children);
+ children = NULL;
+ }
+
+ if (children)
+ {
+ struct urlpos *child = children;
+ struct url *url_parsed = url_parse (url, NULL);
+ char *referer_url = url;
+ bool strip_auth = (url_parsed != NULL
+ && url_parsed->user != NULL);
+ assert (url_parsed != NULL);
+
+ /* Strip auth info if present */
+ if (strip_auth)
+ referer_url = url_string (url_parsed, URL_AUTH_HIDE);
+
+ for (; child; child = child->next)
+ {
+ if (child->ignore_when_downloading)
+ continue;
+ if (dash_p_leaf_HTML && !child->link_inline_p)
+ continue;
+ if (download_child_p (child, url_parsed, depth, start_url_parsed,
+ blacklist))
+ {
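+ /* Enqueue the child one level deeper, remembering whether
+ we expect it to be HTML or CSS. */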
+ url_enqueue (queue, xstrdup (child->url->url),
+ xstrdup (referer_url), depth + 1,
+ child->link_expect_html,
+ child->link_expect_css);
+ /* We blacklist the URL we have enqueued, because we
+ don't want to enqueue (and hence download) the
+ same URL twice. */
+ string_set_add (blacklist, child->url->url);
+ }
+ }
+
+ if (strip_auth)
+ xfree (referer_url);
+ url_free (url_parsed);
+ free_urlpos (children);
+ }
+ }
+
+ if (file
+ && (opt.delete_after
+ || opt.spider /* opt.recursive is implicitly true */
+ || !acceptable (file)))
+ {
+ /* Either --delete-after was specified, or we loaded this
+ (otherwise unneeded because of --spider or rejected by -R)
+ HTML file just to harvest its hyperlinks -- in either case,
+ delete the local file. */
+ DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
+ opt.delete_after ? "--delete-after" :
+ (opt.spider ? "--spider" :
+ "recursive rejection criteria")));
+ logprintf (LOG_VERBOSE,
+ (opt.delete_after || opt.spider
+ ? _("Removing %s.\n")
+ : _("Removing %s since it should be rejected.\n")),
+ file);
+ if (unlink (file))
+ logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
+ logputs (LOG_VERBOSE, "\n");
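+ /* Record the deletion so that later link conversion does not
+ reference the removed file. */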
+ register_delete_file (file);
+ }
+
+ xfree (url);
+ xfree_null (referer);
+ xfree_null (file);