/* File retrieval.
Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
- 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
+ 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
This file is part of GNU Wget.
shall include the source code for the parts of OpenSSL used as well
as that of the covered work. */
-#include <config.h>
+#include "wget.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
-#include "wget.h"
#include "utils.h"
#include "retr.h"
#include "progress.h"
#include "hash.h"
#include "convert.h"
#include "ptimer.h"
+#include "html-url.h"
/* Total size of downloaded files. Used to enforce quota. */
SUM_SIZE_INT total_downloaded_bytes;
/* If we're skipping STARTPOS bytes, pass 0 as the INITIAL
argument to progress_create because the indicator doesn't
(yet) know about "skipping" data. */
- progress = progress_create (skip ? 0 : startpos, startpos + toread);
+ wgint start = skip ? 0 : startpos;
+ progress = progress_create (start, start + toread);
progress_interactive = progress_interactive_p (progress);
}
char *hunk = xmalloc (bufsize);
int tail = 0; /* tail position in HUNK */
- assert (maxsize >= bufsize);
+ assert (!maxsize || maxsize >= bufsize);
while (1)
{
multiple points. */
uerr_t
-retrieve_url (const char *origurl, char **file, char **newloc,
- const char *refurl, int *dt, bool recursive)
+retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
+ char **newloc, const char *refurl, int *dt, bool recursive,
+ struct iri *iri)
{
uerr_t result;
char *url;
bool location_changed;
int dummy;
char *mynewloc, *proxy;
- struct url *u, *proxy_url;
+ struct url *u = orig_parsed, *proxy_url;
int up_error_code; /* url parse error code */
char *local_file;
int redirection_count = 0;
if (file)
*file = NULL;
- u = url_parse (url, &up_error_code);
- if (!u)
- {
- logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
- xfree (url);
- return URLERROR;
- }
+ second_try:
+ DEBUGP (("[IRI Retrieving %s with %s (UTF-8=%d)\n", quote_n (0, url),
+ iri->uri_encoding ? quote_n (1, iri->uri_encoding) : "None",
+ iri->utf8_encode));
if (!refurl)
refurl = opt.referer;
proxy = getproxy (u);
if (proxy)
{
+ struct iri *pi = iri_new ();
+ set_uri_encoding (pi, opt.locale, true);
+ pi->utf8_encode = false;
+
/* Parse the proxy URL. */
- proxy_url = url_parse (proxy, &up_error_code);
+ proxy_url = url_parse (proxy, &up_error_code, NULL, true);
if (!proxy_url)
{
+ char *error = url_error (proxy, up_error_code);
logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
- proxy, url_error (up_error_code));
+ proxy, error);
xfree (url);
+ xfree (error);
RESTORE_POST_DATA;
return PROXERR;
}
#endif
|| (proxy_url && proxy_url->scheme == SCHEME_HTTP))
{
- result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
+ result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri);
}
else if (u->scheme == SCHEME_FTP)
{
xfree (mynewloc);
mynewloc = construced_newloc;
+ /* Reset UTF-8 encoding state, keep the URI encoding and reset
+ the content encoding. */
+ iri->utf8_encode = opt.enable_iri;
+ set_content_encoding (iri, NULL);
+ xfree_null (iri->orig_url);
+
/* Now, see if this new location makes sense. */
- newloc_parsed = url_parse (mynewloc, &up_error_code);
+ newloc_parsed = url_parse (mynewloc, &up_error_code, iri, true);
if (!newloc_parsed)
{
+ char *error = url_error (mynewloc, up_error_code);
logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
- url_error (up_error_code));
- url_free (u);
+ error);
+ if (orig_parsed != u)
+ {
+ url_free (u);
+ }
xfree (url);
xfree (mynewloc);
+ xfree (error);
RESTORE_POST_DATA;
return result;
}
logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
opt.max_redirect);
url_free (newloc_parsed);
- url_free (u);
+ if (orig_parsed != u)
+ {
+ url_free (u);
+ }
xfree (url);
xfree (mynewloc);
RESTORE_POST_DATA;
xfree (url);
url = mynewloc;
- url_free (u);
+ if (orig_parsed != u)
+ {
+ url_free (u);
+ }
u = newloc_parsed;
/* If we're being redirected from POST, we don't want to POST
goto redirected;
}
- if (local_file)
+ /* Try to not encode in UTF-8 if fetching failed */
+ if (!(*dt & RETROKF) && iri->utf8_encode)
+ {
+ iri->utf8_encode = false;
+ if (orig_parsed != u)
+ {
+ url_free (u);
+ }
+ u = url_parse (origurl, NULL, iri, true);
+ if (u)
+ {
+ DEBUGP (("[IRI fallbacking to non-utf8 for %s\n", quote (url)));
+ goto second_try;
+ }
+ else
+ DEBUGP (("[Couldn't fallback to non-utf8 for %s\n", quote (url)));
+ }
+
+ if (local_file && *dt & RETROKF)
{
+ register_download (u->url, local_file);
+ if (redirection_count && 0 != strcmp (origurl, u->url))
+ register_redirection (origurl, u->url);
+ if (*dt & TEXTHTML)
+ register_html (u->url, local_file);
if (*dt & RETROKF)
{
register_download (u->url, local_file);
register_redirection (origurl, u->url);
if (*dt & TEXTHTML)
register_html (u->url, local_file);
+ if (*dt & TEXTCSS)
+ register_css (u->url, local_file);
}
}
else
xfree_null (local_file);
- url_free (u);
+ if (orig_parsed != u)
+ {
+ url_free (u);
+ }
if (redirection_count)
{
{
uerr_t status;
struct urlpos *url_list, *cur_url;
+ struct iri *iri = iri_new();
+
+ char *input_file = NULL;
+ const char *url = file;
- url_list = (html ? get_urls_html (file, NULL, NULL)
- : get_urls_file (file));
status = RETROK; /* Suppose everything is OK. */
*count = 0; /* Reset the URL count. */
+ /* sXXXav : Assume filename and links in the file are in the locale */
+ set_uri_encoding (iri, opt.locale, true);
+ set_content_encoding (iri, opt.locale);
+
+ if (url_has_scheme (url))
+ {
+ int dt,url_err;
+ uerr_t status;
+ struct url * url_parsed = url_parse(url, &url_err, NULL, true);
+
+ if (!url_parsed)
+ {
+ char *error = url_error (url, url_err);
+ logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
+ xfree (error);
+ return URLERROR;
+ }
+
+ if (!opt.base_href)
+ opt.base_href = xstrdup (url);
+
+ status = retrieve_url (url_parsed, url, &input_file, NULL, NULL, &dt,
+ false, iri);
+ if (status != RETROK)
+ return status;
+
+ if (dt & TEXTHTML)
+ html = true;
+
+ /* If we have a found a content encoding, use it */
+ if (iri->content_encoding)
+ set_uri_encoding (iri, iri->content_encoding, false);
+ }
+ else
+ input_file = (char *) file;
+
+ url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
+ : get_urls_file (input_file));
+
for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
{
char *filename = NULL, *new_file = NULL;
status = QUOTEXC;
break;
}
+
+ /* Reset UTF-8 encode status */
+ iri->utf8_encode = opt.enable_iri;
+ xfree_null (iri->orig_url);
+ iri->orig_url = NULL;
+
if ((opt.recursive || opt.page_requisites)
&& (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
{
int old_follow_ftp = opt.follow_ftp;
/* Turn opt.follow_ftp on in case of recursive FTP retrieval */
- if (cur_url->url->scheme == SCHEME_FTP)
+ if (cur_url->url->scheme == SCHEME_FTP)
opt.follow_ftp = 1;
-
- status = retrieve_tree (cur_url->url->url);
+
+ status = retrieve_tree (cur_url->url, iri);
opt.follow_ftp = old_follow_ftp;
}
else
- status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive);
+ status = retrieve_url (cur_url->url, cur_url->url->url, &filename,
+ &new_file, NULL, &dt, opt.recursive, iri);
if (filename && opt.delete_after && file_exists_p (filename))
{
/* Free the linked list of URL-s. */
free_urlpos (url_list);
+ iri_free (iri);
+
return status;
}
/* Returns true if URL would be downloaded through a proxy. */
bool
-url_uses_proxy (const char *url)
+url_uses_proxy (struct url * u)
{
bool ret;
- struct url *u = url_parse (url, NULL);
if (!u)
return false;
ret = getproxy (u) != NULL;
- url_free (u);
return ret;
}
else
return sufmatch (no_proxy, host);
}
+
+/* Set the file parameter to point to the local file string. */
+void
+set_local_file (const char **file, const char *default_file)
+{
+ if (opt.output_document)
+ {
+ if (output_stream_regular)
+ *file = opt.output_document;
+ }
+ else
+ *file = default_file;
+}