{
char *str = NULL, *name;
- if (opt.enable_iri && (name = idn_decode (host)) != NULL)
+ if (opt.enable_iri && (name = idn_decode ((char *) host)) != NULL)
{
int len = strlen (host) + strlen (name) + 4;
str = xmalloc (len);
xfree (name);
}
- logprintf (LOG_VERBOSE, _("Resolving %s... "),
+ logprintf (LOG_VERBOSE, _("Resolving %s... "),
quotearg_style (escape_quoting_style, str ? str : host));
if (str)
struct urlpos *newel;
const char *base = ctx->base ? ctx->base : ctx->parent_base;
struct url *url;
+ bool utf8_encode = false;
if (!base)
{
return NULL;
}
- url = url_parse (link_uri, NULL);
+ url = url_parse (link_uri, NULL, &utf8_encode);
if (!url)
{
DEBUGP (("%s: link \"%s\" doesn't parse.\n",
DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
ctx->document_file, base, link_uri, complete_uri));
- url = url_parse (complete_uri, NULL);
+ url = url_parse (complete_uri, NULL, &utf8_encode);
if (!url)
{
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
if (!mcharset)
return;
- logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));
+ /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/
- /* sXXXav: Not used yet */
+ set_current_charset (mcharset);
xfree (mcharset);
}
else if (name && 0 == strcasecmp (name, "robots"))
struct file_memory *fm;
struct urlpos *head, *tail;
const char *text, *text_end;
+ bool utf8_encode = false;
/* Load the file. */
fm = read_file (file);
url_text = merged;
}
- url = url_parse (url_text, &up_error_code);
+ url = url_parse (url_text, &up_error_code, &utf8_encode);
if (!url)
{
logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
hs->local_file = url_file_name (u);
}
}
-
+
/* TODO: perform this check only once. */
if (!hs->existence_checked && file_exists_p (hs->local_file))
{
local_dot_orig_file_exists = true;
local_filename = filename_plus_orig_suffix;
}
- }
+ }
if (!local_dot_orig_file_exists)
/* Couldn't stat() <file>.orig, so try to stat() <file>. */
/* Try to get remote encoding if needed */
if (opt.enable_iri && !opt.encoding_remote)
- /* xxx = */ parse_charset (tmp2);
+ set_current_charset (parse_charset (tmp2));
}
}
hs->newloc = resp_header_strdup (resp, "Location");
#include "utils.h"
#include "iri.h"
+char *remote;
+char *current;
static iconv_t locale2utf8;
return NULL;
}
- logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset));
+ /*logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset));*/
return charset;
}
(*out)++;
outlen--;
}
- else if (errno == E2BIG) /* Output buffer full */
+ else if (errno == E2BIG) /* Output buffer full */
{
char *new;
/* Try to ASCII encode UTF-8 host. Return the new domain on success or NULL
on error. */
-char *idn_encode (char *host)
+char *
+idn_encode (char *host, bool utf8_encoded)
{
char *new;
int ret;
+ /* Encode to UTF-8 if not done using current remote */
+ if (!utf8_encoded)
+ {
+ if (!remote_to_utf8 ((const char *) host, (const char **) &new))
+ {
+ /* Nothing to encode or an error occured */
+ return NULL;
+ }
+
+ host = new;
+ }
+
/* toASCII UTF-8 NULL terminated string */
ret = idna_to_ascii_8z (host, &new, 0);
if (ret != IDNA_SUCCESS)
{
+ /* sXXXav : free new when needed ! */
logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret,
quote (idna_strerror (ret)));
return NULL;
/* Try to decode an ASCII encoded host. Return the new domain in the locale on
success or NULL on error. */
-char *idn_decode (char *host)
+char *
+idn_decode (char *host)
{
char *new;
int ret;
return new;
}
+/* Return a new string */
+bool
+remote_to_utf8 (const char *str, const char **new)
+{
+ char *remote;
+ iconv_t cd;
+ bool ret = false;
+
+ if (opt.encoding_remote)
+ remote = opt.encoding_remote;
+ else if (current)
+ remote = current;
+ else
+ return false;
+
+ cd = iconv_open ("UTF-8", remote);
+ if (cd == (iconv_t)(-1))
+ return false;
+
+ if (do_conversion (cd, (char *) str, strlen ((char *) str), (char **) new))
+ ret = true;
+
+ iconv_close (cd);
+
+ /* Test if something was converted */
+ if (!strcmp (str, *new))
+ {
+ xfree ((char *) *new);
+ return false;
+ }
+
+ return ret;
+}
+
+char *get_remote_charset (void)
+{
+ return remote;
+}
+
+char *get_current_charset (void)
+{
+ return current;
+}
+
+void set_current_charset (char *charset)
+{
+ /*printf("[ current = `%s'\n", charset);*/
+
+ if (current)
+ xfree (current);
+
+ current = charset ? xstrdup (charset) : NULL;
+}
+
+void set_current_as_locale (void)
+{
+ /*printf("[ current = locale = `%s'\n", opt.locale);*/
+ if (current)
+ xfree (current);
+
+ /* sXXXav : assert opt.locale NULL ? */
+ current = xstrdup (opt.locale);
+}
+
+void
+set_remote_charset (char *charset)
+{
+ /*printf("[ remote = `%s'\n", charset);*/
+ if (remote)
+ xfree (remote);
+
+ remote = charset ? xstrdup (charset) : NULL;
+}
+
+void
+set_remote_as_current (void)
+{
+ /*printf("[ remote = current = `%s'\n", current);*/
+ if (remote)
+ xfree (remote);
+
+ remote = current ? xstrdup (current) : NULL;
+}
char *find_locale (void);
bool check_encoding_name (char *encoding);
const char *locale_to_utf8 (const char *str);
-char *idn_encode (char *host);
+char *idn_encode (char *host, bool utf8_encoded);
char *idn_decode (char *host);
+char *get_remote_charset (void);
+char *get_current_charset (void);
+void set_current_charset (char *charset);
+void set_current_as_locale (void);
+void set_current_charset (char *charset);
+void set_remote_charset (char *charset);
+void set_remote_as_current (void);
+bool remote_to_utf8 (const char *str, const char **new);
#else /* ENABLE_IRI */
#define find_locale() NULL
#define check_encoding_name(str) false
#define locale_to_utf8(str) (str)
-#define idn_encode(str) NULL
+#define idn_encode(str,encoded) NULL
#define idn_decode(str) NULL
+#define get_remote_charset() NULL
+#define get_current_charset() NULL
+#define set_current_charset(str)
+#define set_current_as_locale()
+#define set_current_charset(str)
+#define set_remote_charset(str)
+#define set_remote_as_current()
+#define remote_to_utf8(a,b) false
#endif /* ENABLE_IRI */
#endif /* IRI_H */
#ifdef ENABLE_IRI
if (opt.enable_iri)
{
- if (opt.locale && !check_encoding_name(opt.locale))
+ if (opt.locale && !check_encoding_name (opt.locale))
opt.locale = NULL;
if (!opt.locale)
opt.locale = find_locale ();
- if (opt.encoding_remote && !check_encoding_name(opt.encoding_remote))
+ if (opt.encoding_remote && !check_encoding_name (opt.encoding_remote))
opt.encoding_remote = NULL;
- logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale));
+ /*logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale));*/
}
#else
if (opt.enable_iri || opt.locale || opt.encoding_remote)
char *filename = NULL, *redirected_URL = NULL;
int dt;
+ set_current_as_locale ();
+
if ((opt.recursive || opt.page_requisites)
&& (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t)))
{
int old_follow_ftp = opt.follow_ftp;
/* Turn opt.follow_ftp on in case of recursive FTP retrieval */
- if (url_scheme (*t) == SCHEME_FTP)
+ if (url_scheme (*t) == SCHEME_FTP)
opt.follow_ftp = 1;
-
+
status = retrieve_tree (*t);
opt.follow_ftp = old_follow_ftp;
}
else
- status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
+ {
+ set_remote_as_current ();
+ status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
+ }
if (opt.delete_after && file_exists_p(filename))
{
#include "res.h"
#include "convert.h"
#include "spider.h"
+#include "iri.h"
\f
/* Functions for maintaining the URL queue. */
int depth; /* the depth */
bool html_allowed; /* whether the document is allowed to
be treated as HTML. */
-
+ char *remote_encoding;
struct queue_element *next; /* next element in queue */
};
const char *url, const char *referer, int depth, bool html_allowed)
{
struct queue_element *qel = xnew (struct queue_element);
+ char *charset = get_current_charset ();
qel->url = url;
qel->referer = referer;
qel->depth = depth;
qel->html_allowed = html_allowed;
qel->next = NULL;
+ if (charset)
+ qel->remote_encoding = xstrdup (charset);
+ else
+ qel->remote_encoding = NULL;
+
++queue->count;
if (queue->count > queue->maxcount)
queue->maxcount = queue->count;
DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
+ /*printf ("[Enqueuing %s with %s\n", url, qel->remote_encoding);*/
+
if (queue->tail)
queue->tail->next = qel;
queue->tail = qel;
if (!queue->head)
queue->tail = NULL;
+ set_remote_charset (qel->remote_encoding);
+ if (qel->remote_encoding)
+ xfree (qel->remote_encoding);
+
*url = qel->url;
*referer = qel->referer;
*depth = qel->depth;
retrieve_tree (const char *start_url)
{
uerr_t status = RETROK;
+ bool utf8_encode = false;
/* The queue of URLs we need to load. */
struct url_queue *queue;
struct hash_table *blacklist;
int up_error_code;
- struct url *start_url_parsed = url_parse (start_url, &up_error_code);
+ struct url *start_url_parsed = url_parse (start_url, &up_error_code, &utf8_encode);
if (!start_url_parsed)
{
if (children)
{
struct urlpos *child = children;
- struct url *url_parsed = url_parsed = url_parse (url, NULL);
+ struct url *url_parsed = url_parsed = url_parse (url, NULL, &utf8_encode);
char *referer_url = url;
bool strip_auth = (url_parsed != NULL
&& url_parsed->user != NULL);
}
}
- if (file
- && (opt.delete_after
+ if (file
+ && (opt.delete_after
|| opt.spider /* opt.recursive is implicitely true */
|| !acceptable (file)))
{
/* Either --delete-after was specified, or we loaded this
- (otherwise unneeded because of --spider or rejected by -R)
- HTML file just to harvest its hyperlinks -- in either case,
+ (otherwise unneeded because of --spider or rejected by -R)
+ HTML file just to harvest its hyperlinks -- in either case,
delete the local file. */
DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
opt.delete_after ? "--delete-after" :
- (opt.spider ? "--spider" :
+ (opt.spider ? "--spider" :
"recursive rejection criteria")));
logprintf (LOG_VERBOSE,
(opt.delete_after || opt.spider
struct url *orig_parsed, *new_parsed;
struct urlpos *upos;
bool success;
+ bool utf8_encode = false;
- orig_parsed = url_parse (original, NULL);
+ orig_parsed = url_parse (original, NULL, &utf8_encode);
assert (orig_parsed != NULL);
- new_parsed = url_parse (redirected, NULL);
+ new_parsed = url_parse (redirected, NULL, &utf8_encode);
assert (new_parsed != NULL);
upos = xnew0 (struct urlpos);
#include "hash.h"
#include "convert.h"
#include "ptimer.h"
+#include "iri.h"
/* Total size of downloaded files. Used to enforce quota. */
SUM_SIZE_INT total_downloaded_bytes;
char *saved_post_data = NULL;
char *saved_post_file_name = NULL;
+ bool utf8_encoded = opt.enable_iri;
+
/* If dt is NULL, use local storage. */
if (!dt)
{
if (file)
*file = NULL;
- u = url_parse (url, &up_error_code);
+ second_try:
+ u = url_parse (url, &up_error_code, &utf8_encoded);
if (!u)
{
logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
return URLERROR;
}
+ /*printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, get_remote_charset (), utf8_encoded);*/
+
if (!refurl)
refurl = opt.referer;
proxy = getproxy (u);
if (proxy)
{
+ /* sXXXav : support IRI for proxy */
+ bool proxy_utf8_encode = false;
/* Parse the proxy URL. */
- proxy_url = url_parse (proxy, &up_error_code);
+ proxy_url = url_parse (proxy, &up_error_code, &proxy_utf8_encode);
if (!proxy_url)
{
logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
xfree (mynewloc);
mynewloc = construced_newloc;
+ utf8_encoded = opt.enable_iri;
+
/* Now, see if this new location makes sense. */
- newloc_parsed = url_parse (mynewloc, &up_error_code);
+ newloc_parsed = url_parse (mynewloc, &up_error_code, &utf8_encoded);
if (!newloc_parsed)
{
logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
goto redirected;
}
- if (local_file)
+ /* Try to not encode in UTF-8 if fetching failed */
+ if (result != RETROK && utf8_encoded)
{
- if (*dt & RETROKF)
- {
- register_download (u->url, local_file);
- if (redirection_count && 0 != strcmp (origurl, u->url))
- register_redirection (origurl, u->url);
- if (*dt & TEXTHTML)
- register_html (u->url, local_file);
- }
+ utf8_encoded = false;
+ /*printf ("[Fallbacking to non-utf8 for `%s'\n", url);*/
+ goto second_try;
+ }
+
+ if (local_file && *dt & RETROKF)
+ {
+ register_download (u->url, local_file);
+ if (redirection_count && 0 != strcmp (origurl, u->url))
+ register_redirection (origurl, u->url);
+ if (*dt & TEXTHTML)
+ register_html (u->url, local_file);
}
if (file)
int old_follow_ftp = opt.follow_ftp;
/* Turn opt.follow_ftp on in case of recursive FTP retrieval */
- if (cur_url->url->scheme == SCHEME_FTP)
+ if (cur_url->url->scheme == SCHEME_FTP)
opt.follow_ftp = 1;
-
+
status = retrieve_tree (cur_url->url->url);
opt.follow_ftp = old_follow_ftp;
bool
url_uses_proxy (const char *url)
{
- bool ret;
- struct url *u = url_parse (url, NULL);
+ bool ret, utf8_encode = false;
+ struct url *u = url_parse (url, NULL, &utf8_encode);
if (!u)
return false;
ret = getproxy (u) != NULL;
error, and if ERROR is not NULL, also set *ERROR to the appropriate
error code. */
struct url *
-url_parse (const char *url, int *error)
+url_parse (const char *url, int *error, bool *utf8_encode)
{
struct url *u;
const char *p;
goto error;
}
- if (opt.enable_iri)
+ if (opt.enable_iri && *utf8_encode)
{
+ const char *new;
url_unescape ((char *) url);
- url = locale_to_utf8(url);
+ *utf8_encode = remote_to_utf8 (url, &new);
+ if (*utf8_encode)
+ url = new;
}
url_encoded = reencode_escapes (url);
if (opt.enable_iri)
{
- char *new = idn_encode (u->host);
+ char *new = idn_encode (u->host, *utf8_encode);
if (new)
{
xfree (u->host);
char *url_escape (const char *);
-struct url *url_parse (const char *, int *);
+struct url *url_parse (const char *, int *, bool *);
const char *url_error (int);
char *url_full_path (const struct url *);
void url_set_dir (struct url *, const char *);