/* Handling of recursive HTTP retrieving.
- Copyright (C) 1996-2006 Free Software Foundation, Inc.
+ Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
+ 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
This file is part of GNU Wget.
You should have received a copy of the GNU General Public License
along with Wget. If not, see <http://www.gnu.org/licenses/>.
-In addition, as a special exception, the Free Software Foundation
-gives permission to link the code of its release of Wget with the
-OpenSSL project's "OpenSSL" library (or with modified versions of it
-that use the same license as the "OpenSSL" library), and distribute
-the linked executables. You must obey the GNU General Public License
-in all respects for all of the code used other than "OpenSSL". If you
-modify this file, you may extend this exception to your version of the
-file, but you are not obligated to do so. If you do not wish to do
-so, delete this exception statement from your version. */
+Additional permission under GNU GPL version 3 section 7
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work. */
+
+#include "wget.h"
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
-#include "wget.h"
#include "url.h"
#include "recur.h"
#include "utils.h"
#include "res.h"
#include "convert.h"
#include "spider.h"
+#include "iri.h"
\f
/* Functions for maintaining the URL queue. */
int depth; /* the depth */
bool html_allowed; /* whether the document is allowed to
be treated as HTML. */
-
+ char *remote_encoding;
struct queue_element *next; /* next element in queue */
};
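
/* For orientation, a sketch of the queue types this change extends.  The
   fields not visible in the hunk above (url, referer, and the url_queue
   container) are assumed from the surrounding file and may differ in
   detail; only remote_encoding is new.  It records the charset current at
   enqueue time so it can be restored at dequeue time:

     struct queue_element {
       const char *url;              // URL to retrieve
       const char *referer;          // referring document
       int depth;                    // recursion depth
       bool html_allowed;            // may be treated as HTML
       char *remote_encoding;        // charset captured when enqueued
       struct queue_element *next;   // next element in queue
     };

     struct url_queue {
       struct queue_element *head, *tail;
       int count, maxcount;          // current and peak queue length
     };
*/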
const char *url, const char *referer, int depth, bool html_allowed)
{
struct queue_element *qel = xnew (struct queue_element);
+ char *charset = get_current_charset ();
qel->url = url;
qel->referer = referer;
qel->depth = depth;
qel->html_allowed = html_allowed;
qel->next = NULL;
+ if (charset)
+ qel->remote_encoding = xstrdup (charset);
+ else
+ qel->remote_encoding = NULL;
+
++queue->count;
if (queue->count > queue->maxcount)
queue->maxcount = queue->count;
DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
+
if (queue->tail)
queue->tail->next = qel;
queue->tail = qel;
if (!queue->head)
queue->tail = NULL;
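/* This hunk appears to belong to url_dequeue: the out-parameter stores
   below (*url, *referer, *depth) only happen there.  The charset saved
   when the element was enqueued is handed back to the IRI layer before
   the element reaches the caller; freeing the string right away assumes
   set_remote_charset () keeps its own copy rather than the pointer
   itself.  */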
+ set_remote_charset (qel->remote_encoding);
+ if (qel->remote_encoding)
+ xfree (qel->remote_encoding);
+
*url = qel->url;
*referer = qel->referer;
*depth = qel->depth;
retrieve_tree (const char *start_url)
{
uerr_t status = RETROK;
+ bool utf8_encode = false;
/* The queue of URLs we need to load. */
struct url_queue *queue;
struct hash_table *blacklist;
int up_error_code;
- struct url *start_url_parsed = url_parse (start_url, &up_error_code);
+ struct url *start_url_parsed = url_parse (start_url, &up_error_code, &utf8_encode);
if (!start_url_parsed)
{
if (children)
{
struct urlpos *child = children;
- struct url *url_parsed = url_parsed = url_parse (url, NULL);
+ struct url *url_parsed = url_parse (url, NULL, &utf8_encode);
char *referer_url = url;
- bool strip_auth = (bool)url_parsed->user;
+ bool strip_auth = (url_parsed != NULL
+ && url_parsed->user != NULL);
assert (url_parsed != NULL);
/* Strip auth info if present */
}
}
- if (file
- && (opt.delete_after
+ if (file
+ && (opt.delete_after
|| opt.spider /* opt.recursive is implicitly true */
|| !acceptable (file)))
{
/* Either --delete-after was specified, or we loaded this
- (otherwise unneeded because of --spider or rejected by -R)
- HTML file just to harvest its hyperlinks -- in either case,
+ (otherwise unneeded because of --spider or rejected by -R)
+ HTML file just to harvest its hyperlinks -- in either case,
delete the local file. */
DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
opt.delete_after ? "--delete-after" :
- (opt.spider ? "--spider" :
+ (opt.spider ? "--spider" :
"recursive rejection criteria")));
logprintf (LOG_VERBOSE,
(opt.delete_after || opt.spider
if (opt.spider)
{
char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
- DEBUGP (("download_child_p: parent->url is: `%s'\n", parent->url));
+ DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url)));
visited_url (url, referrer);
xfree (referrer);
}
if (res_retrieve_file (url, &rfile))
{
specs = res_parse_from_file (rfile);
+
+ /* Delete the robots.txt file if we chose to either delete the
+ files after downloading or we're just running a spider. */
+ if (opt.delete_after || opt.spider)
+ {
+ logprintf (LOG_VERBOSE, "Removing %s.\n", rfile);
+ if (unlink (rfile))
+ logprintf (LOG_NOTQUIET, "unlink: %s\n",
+ strerror (errno));
+ }
+
xfree (rfile);
}
else
struct url *orig_parsed, *new_parsed;
struct urlpos *upos;
bool success;
+ bool utf8_encode = false;
- orig_parsed = url_parse (original, NULL);
+ orig_parsed = url_parse (original, NULL, &utf8_encode);
assert (orig_parsed != NULL);
- new_parsed = url_parse (redirected, NULL);
+ new_parsed = url_parse (redirected, NULL, &utf8_encode);
assert (new_parsed != NULL);
upos = xnew0 (struct urlpos);