/* Parse the file... */
urls = is_css ? get_urls_css_file (file, url) :
- get_urls_html (file, url, NULL);
+ get_urls_html (file, url, NULL, NULL);
/* We don't respect meta_disallow_follow here because, even if
the file is not followed, we might still want to convert the
#include "recur.h"
#include "html-url.h"
#include "css-url.h"
-#include "iri.h"
typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
static struct hash_table *interesting_tags;
static struct hash_table *interesting_attributes;
+/* Will contains the (last) charset found in 'http-equiv=content-type'
+ meta tags */
+static char *meta_charset;
+
static void
init_interesting (void)
{
return NULL;
}
- set_ugly_no_encode (true);
- url = url_parse (link_uri, NULL);
- set_ugly_no_encode (false);
+ url = url_parse (link_uri, NULL, NULL);
if (!url)
{
DEBUGP (("%s: link \"%s\" doesn't parse.\n",
DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
ctx->document_file, base, link_uri, complete_uri));
- set_ugly_no_encode (true);
- url = url_parse (complete_uri, NULL);
- set_ugly_no_encode (false);
+ url = url_parse (complete_uri, NULL, NULL);
if (!url)
{
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
return;
/*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/
-
- set_current_charset (mcharset);
- xfree (mcharset);
+ xfree_null (meta_charset);
+ meta_charset = mcharset;
}
else if (name && 0 == strcasecmp (name, "robots"))
{
<base href=...> and does the right thing. */
struct urlpos *
-get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
+get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
+ struct iri *iri)
{
struct file_memory *fm;
struct map_context ctx;
map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
NULL, interesting_attributes);
+ /* If meta charset isn't null, override content encoding */
+ if (iri && meta_charset)
+ set_content_encoding (iri, meta_charset);
+
DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
if (meta_disallow_follow)
*meta_disallow_follow = ctx.nofollow;
url_text = merged;
}
- set_ugly_no_encode (true);
- url = url_parse (url_text, &up_error_code);
- set_ugly_no_encode (false);
+ url = url_parse (url_text, &up_error_code, NULL);
if (!url)
{
logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
};
struct urlpos *get_urls_file (const char *);
-struct urlpos *get_urls_html (const char *, const char *, bool *);
+struct urlpos *get_urls_html (const char *, const char *, bool *, struct iri *);
struct urlpos *append_url (const char *, int, int, struct map_context *);
void free_urlpos (struct urlpos *);
#include "retr.h"
#include "connect.h"
#include "netrc.h"
-#include "iri.h"
#ifdef HAVE_SSL
# include "ssl.h"
#endif
If PROXY is non-NULL, the connection will be made to the proxy
server, and u->url will be requested. */
static uerr_t
-gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy)
+gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
+ struct iri *iri)
{
struct request *req;
/* Try to get remote encoding if needed */
if (opt.enable_iri && !opt.encoding_remote)
- set_current_charset (parse_charset (tmp2));
+ {
+ tmp = parse_charset (tmp2);
+ if (tmp)
+ set_content_encoding (iri, tmp);
+ }
}
}
hs->newloc = resp_header_strdup (resp, "Location");
retried, and retried, and retried, and... */
uerr_t
http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
- int *dt, struct url *proxy)
+ int *dt, struct url *proxy, struct iri *iri)
{
int count;
bool got_head = false; /* used for time-stamping and filename detection */
*dt &= ~SEND_NOCACHE;
/* Try fetching the document, or at least its head. */
- err = gethttp (u, &hstat, dt, proxy);
+ err = gethttp (u, &hstat, dt, proxy, iri);
/* Time? */
tms = datetime_str (time (NULL));
}
/* Maybe we should always keep track of broken links, not just in
* spider mode.
- * Don't log error if it was utf8 encoded because we will try
- * one unencoded. */
- else if (opt.spider && !get_utf8_encode ())
+ * Don't log error if it was UTF-8 encoded because we will try
+ * once unencoded. */
+ else if (opt.spider && !iri->utf8_encode)
{
/* #### Again: ugly ugly ugly! */
if (!hurl)
struct url;
uerr_t http_loop (struct url *, char **, char **, const char *, int *,
- struct url *);
+ struct url *, struct iri *);
void save_cookies (void);
void http_cleanup (void);
time_t http_atotm (const char *);
/* Note: locale encoding is kept in options struct (opt.locale) */
-/* Hold the encoding used for the current fetch */
-char *remote;
-
-/* Hold the encoding for the future found links */
-char *current;
-
-/* Will/Is the current URL encoded in utf8 ? */
-bool utf8_encode;
-
-/* Force no utf8 encoding for url_parse () */
-bool ugly_no_encode;
-
static iconv_t locale2utf8;
static bool open_locale_to_utf8 (void);
/* Try to "ASCII encode" UTF-8 host. Return the new domain on success or NULL
on error. */
char *
-idn_encode (char *host, bool utf8_encoded)
+idn_encode (struct iri *i, char *host)
{
char *new;
int ret;
- /* Encode to UTF-8 if not done using current remote */
- if (!utf8_encoded)
+ /* Encode to UTF-8 if not done */
+ if (!i->utf8_encode)
{
- if (!remote_to_utf8 ((const char *) host, (const char **) &new))
+ if (!remote_to_utf8 (i, (const char *) host, (const char **) &new))
{
/* Nothing to encode or an error occured */
return NULL;
/* Try to transcode string str from remote encoding to UTF-8. On success, *new
contains the transcoded string. *new content is unspecified otherwise. */
bool
-remote_to_utf8 (const char *str, const char **new)
+remote_to_utf8 (struct iri *i, const char *str, const char **new)
{
char *r;
iconv_t cd;
if (opt.encoding_remote)
r = opt.encoding_remote;
- else if (current)
- r = current;
+ else if (i->uri_encoding)
+ r = i->uri_encoding;
else
return false;
return ret;
}
-char *get_remote_charset (void)
+struct iri *
+iri_new (void)
{
- return remote;
+ struct iri *i = xmalloc (sizeof (struct iri));
+ i->uri_encoding = opt.encoding_remote ? xstrdup (opt.encoding_remote) : NULL;
+ i->content_encoding = NULL;
+ i->utf8_encode = opt.enable_iri;
}
-char *get_current_charset (void)
-{
- return current;
-}
-
-void set_current_charset (char *charset)
-{
- /*printf("[ current = `%s'\n", charset);*/
- if (current)
- {
- /* Do nothing if already equal */
- if (!strcasecmp (current, charset))
- return;
- xfree (current);
- }
-
- current = charset ? xstrdup (charset) : NULL;
-}
-
-void set_current_as_locale (void)
+void
+iri_free (struct iri *i)
{
- /* sXXXav : assert opt.locale NULL ? */
- /*printf("[ current = locale = `%s'\n", opt.locale);*/
- if (current)
- {
- if (!strcasecmp (current, opt.locale))
- return;
- xfree (current);
- }
-
- current = xstrdup (opt.locale);
+ xfree_null (i->uri_encoding);
+ xfree_null (i->content_encoding);
+ xfree (i);
}
void
-set_remote_charset (char *charset)
+set_uri_encoding (struct iri *i, char *charset)
{
- /*printf("[ remote = `%s'\n", charset);*/
- if (remote)
+ logprintf (LOG_VERBOSE, "[ uri = `%s'\n", charset);
+ if (opt.encoding_remote)
+ return;
+ if (i->uri_encoding)
{
- /* Do nothing if already equal */
- if (!strcasecmp (remote, charset))
+ if (!strcasecmp (i->uri_encoding, charset))
return;
- xfree (remote);
+ xfree (i->uri_encoding);
}
- remote = charset ? xstrdup (charset) : NULL;
+
+ i->uri_encoding = charset ? xstrdup (charset) : NULL;
}
void
-set_remote_as_current (void)
+set_content_encoding (struct iri *i, char *charset)
{
- /*printf("[ remote = current = `%s'\n", current);*/
- if (remote)
+ logprintf (LOG_VERBOSE, "[ content = `%s'\n", charset);
+ if (opt.encoding_remote)
+ return;
+ if (i->content_encoding)
{
- /* Do nothing if already equal */
- if (current && !strcasecmp (remote, current))
+ if (!strcasecmp (i->content_encoding, charset))
return;
- xfree (remote);
+ xfree (i->content_encoding);
}
- remote = current ? xstrdup (current) : NULL;
-}
-
-void reset_utf8_encode (void)
-{
- set_utf8_encode (opt.enable_iri);
-}
-
-void set_utf8_encode (bool encode)
-{
- utf8_encode = encode;
-}
-
-bool get_utf8_encode (void)
-{
- return (!ugly_no_encode && utf8_encode);
-}
-
-void set_ugly_no_encode (bool ugly)
-{
- ugly_no_encode = ugly;
+ i->content_encoding = charset ? xstrdup (charset) : NULL;
}
#ifndef IRI_H
#define IRI_H
+struct iri {
+ char *uri_encoding; /* Encoding of the uri to fetch */
+ char *content_encoding; /* Encoding of links inside the fetched file */
+ bool utf8_encode; /* Will/Is the current url encoded in utf8 */
+};
+
#ifdef ENABLE_IRI
char *parse_charset (char *str);
char *find_locale (void);
bool check_encoding_name (char *encoding);
const char *locale_to_utf8 (const char *str);
-char *idn_encode (char *host, bool utf8_encoded);
+char *idn_encode (struct iri *i, char *host);
char *idn_decode (char *host);
-char *get_remote_charset (void);
-char *get_current_charset (void);
-void set_current_charset (char *charset);
-void set_current_as_locale (void);
-void set_current_charset (char *charset);
-void set_remote_charset (char *charset);
-void set_remote_as_current (void);
-bool remote_to_utf8 (const char *str, const char **new);
-void reset_utf8_encode (void);
-void set_utf8_encode (bool encode);
-bool get_utf8_encode (void);
-
-/* ugly ugly ugly */
-void set_ugly_no_encode (bool ugly);
+bool remote_to_utf8 (struct iri *i, const char *str, const char **new);
+struct iri *iri_new (void);
+void iri_free (struct iri *i);
+void set_uri_encoding (struct iri *i, char *charset);
+void set_content_encoding (struct iri *i, char *charset);
#else /* ENABLE_IRI */
+struct iri dummy_iri;
+
#define parse_charset(str) NULL
#define find_locale() NULL
#define check_encoding_name(str) false
#define locale_to_utf8(str) (str)
-#define idn_encode(str,encoded) NULL
+#define idn_encode(a,b,c) NULL
#define idn_decode(str) NULL
-#define get_remote_charset() NULL
-#define get_current_charset() NULL
-#define set_current_charset(str)
-#define set_current_as_locale()
-#define set_current_charset(str)
-#define set_remote_charset(str)
-#define set_remote_as_current()
-#define remote_to_utf8(a,b) false
-#define reset_utf8_encode()
-#define set_utf8_encode(a)
-#define get_utf8_encode() false
-#define set_ugly_no_encode(a)
+#define remote_to_utf8(a,b,c) false
+#define iri_new() (&dummy_iri)
+#define iri_free(a)
+#define set_uri_encoding(a,b)
+#define set_content_encoding(a,b)
#endif /* ENABLE_IRI */
#endif /* IRI_H */
#include "convert.h"
#include "spider.h"
#include "http.h" /* for save_cookies */
-#include "iri.h"
#include <getopt.h>
#include <getpass.h>
char *filename = NULL, *redirected_URL = NULL;
int dt;
- set_current_as_locale ();
- set_ugly_no_encode (false);
-
if ((opt.recursive || opt.page_requisites)
&& (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t)))
{
}
else
{
- set_remote_as_current ();
- status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
+ struct iri *i = iri_new ();
+ set_uri_encoding (i, opt.locale);
+ status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt,
+ opt.recursive, i);
+ iri_free (i);
}
if (opt.delete_after && file_exists_p(filename))
int depth; /* the depth */
bool html_allowed; /* whether the document is allowed to
be treated as HTML. */
- char *remote_encoding;
+ struct iri *iri; /* sXXXav */
bool css_allowed; /* whether the document is allowed to
be treated as CSS. */
struct queue_element *next; /* next element in queue */
into it. */
static void
-url_enqueue (struct url_queue *queue,
+url_enqueue (struct url_queue *queue, struct iri *i,
const char *url, const char *referer, int depth,
bool html_allowed, bool css_allowed)
{
struct queue_element *qel = xnew (struct queue_element);
- char *charset = get_current_charset ();
+ qel->iri = i;
qel->url = url;
qel->referer = referer;
qel->depth = depth;
qel->css_allowed = css_allowed;
qel->next = NULL;
- if (charset)
- qel->remote_encoding = xstrdup (charset);
- else
- qel->remote_encoding = NULL;
-
++queue->count;
if (queue->count > queue->maxcount)
queue->maxcount = queue->count;
DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
- /*printf ("[Enqueuing %s with %s\n", url, qel->remote_encoding);*/
+ if (i)
+ printf ("[Enqueuing %s with %s\n", url, i->uri_encoding);
if (queue->tail)
queue->tail->next = qel;
succeeded, or false if the queue is empty. */
static bool
-url_dequeue (struct url_queue *queue,
+url_dequeue (struct url_queue *queue, struct iri **i,
const char **url, const char **referer, int *depth,
bool *html_allowed, bool *css_allowed)
{
if (!queue->head)
queue->tail = NULL;
- set_remote_charset (qel->remote_encoding);
- if (qel->remote_encoding)
- xfree (qel->remote_encoding);
-
+ *i = qel->iri;
*url = qel->url;
*referer = qel->referer;
*depth = qel->depth;
}
\f
static bool download_child_p (const struct urlpos *, struct url *, int,
- struct url *, struct hash_table *);
+ struct url *, struct hash_table *, struct iri *);
static bool descend_redirect_p (const char *, const char *, int,
- struct url *, struct hash_table *);
+ struct url *, struct hash_table *, struct iri *);
/* Retrieve a part of the web beginning with START_URL. This used to
int up_error_code;
struct url *start_url_parsed;
+ struct iri *i = iri_new ();
+ set_uri_encoding (i, opt.locale);
- set_ugly_no_encode (true);
- start_url_parsed= url_parse (start_url, &up_error_code);
- set_ugly_no_encode (false);
+ start_url_parsed = url_parse (start_url, &up_error_code, i);
if (!start_url_parsed)
{
logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
/* Enqueue the starting URL. Use start_url_parsed->url rather than
just URL so we enqueue the canonical form of the URL. */
- url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false);
+ url_enqueue (queue, i, xstrdup (start_url_parsed->url), NULL, 0, true,
+ false);
string_set_add (blacklist, start_url_parsed->url);
while (1)
/* Get the next URL from the queue... */
- if (!url_dequeue (queue,
+ if (!url_dequeue (queue, (struct iri **) &i,
(const char **)&url, (const char **)&referer,
&depth, &html_allowed, &css_allowed))
break;
int dt = 0;
char *redirected = NULL;
- status = retrieve_url (url, &file, &redirected, referer, &dt, false);
+ status = retrieve_url (url, &file, &redirected, referer, &dt,
+ false, i);
if (html_allowed && file && status == RETROK
&& (dt & RETROKF) && (dt & TEXTHTML))
if (descend)
{
if (!descend_redirect_p (redirected, url, depth,
- start_url_parsed, blacklist))
+ start_url_parsed, blacklist, i))
descend = false;
else
/* Make sure that the old pre-redirect form gets
bool meta_disallow_follow = false;
struct urlpos *children
= is_css ? get_urls_css_file (file, url) :
- get_urls_html (file, url, &meta_disallow_follow);
+ get_urls_html (file, url, &meta_disallow_follow, i);
if (opt.use_robots && meta_disallow_follow)
{
if (children)
{
struct urlpos *child = children;
- set_ugly_no_encode (true);
- struct url *url_parsed = url_parse (url, NULL);
- set_ugly_no_encode (false);
+ struct url *url_parsed = url_parse (url, NULL, i);
+ struct iri *ci;
char *referer_url = url;
bool strip_auth = (url_parsed != NULL
&& url_parsed->user != NULL);
if (dash_p_leaf_HTML && !child->link_inline_p)
continue;
if (download_child_p (child, url_parsed, depth, start_url_parsed,
- blacklist))
+ blacklist, i))
{
- url_enqueue (queue, xstrdup (child->url->url),
+ ci = iri_new ();
+ set_uri_encoding (ci, i->content_encoding);
+ url_enqueue (queue, ci, xstrdup (child->url->url),
xstrdup (referer_url), depth + 1,
child->link_expect_html,
child->link_expect_css);
xfree (url);
xfree_null (referer);
xfree_null (file);
+ iri_free (i);
}
/* If anything is left of the queue due to a premature exit, free it
char *d1, *d2;
int d3;
bool d4, d5;
- while (url_dequeue (queue,
+ struct iri *d6;
+ while (url_dequeue (queue, (struct iri **)&d6,
(const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
{
+ iri_free (d6);
xfree (d1);
xfree_null (d2);
}
static bool
download_child_p (const struct urlpos *upos, struct url *parent, int depth,
- struct url *start_url_parsed, struct hash_table *blacklist)
+ struct url *start_url_parsed, struct hash_table *blacklist,
+ struct iri *iri)
{
struct url *u = upos->url;
const char *url = u->url;
if (!specs)
{
char *rfile;
- if (res_retrieve_file (url, &rfile))
+ if (res_retrieve_file (url, &rfile, iri))
{
specs = res_parse_from_file (rfile);
static bool
descend_redirect_p (const char *redirected, const char *original, int depth,
- struct url *start_url_parsed, struct hash_table *blacklist)
+ struct url *start_url_parsed, struct hash_table *blacklist,
+ struct iri *iri)
{
struct url *orig_parsed, *new_parsed;
struct urlpos *upos;
bool success;
- set_ugly_no_encode (true);
- orig_parsed = url_parse (original, NULL);
+ orig_parsed = url_parse (original, NULL, NULL);
assert (orig_parsed != NULL);
- new_parsed = url_parse (redirected, NULL);
+ new_parsed = url_parse (redirected, NULL, NULL);
assert (new_parsed != NULL);
- set_ugly_no_encode (false);
upos = xnew0 (struct urlpos);
upos->url = new_parsed;
success = download_child_p (upos, orig_parsed, depth,
- start_url_parsed, blacklist);
+ start_url_parsed, blacklist, iri);
url_free (orig_parsed);
url_free (new_parsed);
Return true if robots were retrieved OK, false otherwise. */
bool
-res_retrieve_file (const char *url, char **file)
+res_retrieve_file (const char *url, char **file, struct iri *iri)
{
+ struct iri *i = iri_new ();
uerr_t err;
char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
int saved_ts_val = opt.timestamping;
int saved_sp_val = opt.spider;
+ /* Copy server URI encoding for a possible IDNA transformation, no need to
+ encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
+ set_uri_encoding (i, iri->uri_encoding);
+ i->utf8_encode = false;
+
logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
*file = NULL;
opt.timestamping = false;
opt.spider = false;
- err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
+ err = retrieve_url (robots_url, file, NULL, NULL, NULL, false, i);
opt.timestamping = saved_ts_val;
- opt.spider = saved_sp_val;
+ opt.spider = saved_sp_val;
xfree (robots_url);
+ iri_free (i);
if (err != RETROK && *file != NULL)
{
void res_register_specs (const char *, int, struct robot_specs *);
struct robot_specs *res_get_specs (const char *, int);
-bool res_retrieve_file (const char *, char **);
+bool res_retrieve_file (const char *, char **, struct iri *);
bool is_robots_txt_url (const char *);
uerr_t
retrieve_url (const char *origurl, char **file, char **newloc,
- const char *refurl, int *dt, bool recursive)
+ const char *refurl, int *dt, bool recursive, struct iri *iri)
{
uerr_t result;
char *url;
if (file)
*file = NULL;
- reset_utf8_encode ();
-
second_try:
- u = url_parse (url, &up_error_code);
+ u = url_parse (url, &up_error_code, iri);
if (!u)
{
logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
return URLERROR;
}
- /*printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, get_remote_charset (), utf8_encoded);*/
+ printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, iri->uri_encoding, iri->utf8_encode);
if (!refurl)
refurl = opt.referer;
proxy = getproxy (u);
if (proxy)
{
- /* sXXXav : support IRI for proxy */
+ /* sXXXav : could a proxy include a path ??? */
+ struct iri *pi = iri_new ();
+ set_uri_encoding (pi, opt.locale);
+ pi->utf8_encode = false;
+
/* Parse the proxy URL. */
- set_ugly_no_encode (true);
- proxy_url = url_parse (proxy, &up_error_code);
- set_ugly_no_encode (false);
+ proxy_url = url_parse (proxy, &up_error_code, NULL);
if (!proxy_url)
{
logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
#endif
|| (proxy_url && proxy_url->scheme == SCHEME_HTTP))
{
- result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
+ result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri);
}
else if (u->scheme == SCHEME_FTP)
{
xfree (mynewloc);
mynewloc = construced_newloc;
- reset_utf8_encode ();
+ /* Reset UTF-8 encoding state, keep the URI encoding and reset
+ the content encoding. */
+ iri->utf8_encode = opt.enable_iri;
+ set_content_encoding (iri, NULL);
/* Now, see if this new location makes sense. */
- newloc_parsed = url_parse (mynewloc, &up_error_code);
+ newloc_parsed = url_parse (mynewloc, &up_error_code, iri);
if (!newloc_parsed)
{
logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
}
/* Try to not encode in UTF-8 if fetching failed */
- if (!(*dt & RETROKF) && get_utf8_encode ())
+ if (!(*dt & RETROKF) && iri->utf8_encode)
{
- set_utf8_encode (false);
- /*printf ("[Fallbacking to non-utf8 for `%s'\n", url);*/
+ iri->utf8_encode = false;
+ printf ("[Fallbacking to non-utf8 for `%s'\n", url);
goto second_try;
}
{
uerr_t status;
struct urlpos *url_list, *cur_url;
+ struct iri *iri = iri_new();
char *input_file = NULL;
const char *url = file;
status = RETROK; /* Suppose everything is OK. */
*count = 0; /* Reset the URL count. */
-
+
+ /* sXXXav : Assume filename and links in the file are in the locale */
+ set_content_encoding (iri, opt.locale);
+
if (url_has_scheme (url))
{
uerr_t status;
- status = retrieve_url (url, &input_file, NULL, NULL, NULL, false);
+ status = retrieve_url (url, &input_file, NULL, NULL, NULL, false, iri);
if (status != RETROK)
return status;
}
else
input_file = (char *) file;
- url_list = (html ? get_urls_html (input_file, NULL, NULL)
+ url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
: get_urls_file (input_file));
for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
opt.follow_ftp = old_follow_ftp;
}
else
- status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive);
+ status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL,
+ &dt, opt.recursive, iri);
if (filename && opt.delete_after && file_exists_p (filename))
{
{
bool ret;
struct url *u;
- set_ugly_no_encode(true);
- u= url_parse (url, NULL);
- set_ugly_no_encode(false);
+ struct iri *i = iri_new();
+ /* url was given in the command line, so use locale as encoding */
+ set_uri_encoding (i, opt.locale);
+ u= url_parse (url, NULL, i);
if (!u)
return false;
ret = getproxy (u) != NULL;
char *fd_read_hunk (int, hunk_terminator_t, long, long);
char *fd_read_line (int);
-uerr_t retrieve_url (const char *, char **, char **, const char *, int *, bool);
+uerr_t retrieve_url (const char *, char **, char **, const char *, int *,
+ bool, struct iri *);
uerr_t retrieve_from_file (const char *, bool, int *);
const char *retr_rate (wgint, double);
error, and if ERROR is not NULL, also set *ERROR to the appropriate
error code. */
struct url *
-url_parse (const char *url, int *error)
+url_parse (const char *url, int *error, struct iri *iri)
{
struct url *u;
const char *p;
int port;
char *user = NULL, *passwd = NULL;
- char *url_encoded = NULL;
+ char *url_encoded = NULL, *new_url = NULL;
int error_code;
goto error;
}
- if (opt.enable_iri && get_utf8_encode ())
+ if (iri && iri->utf8_encode)
{
- const char *new;
- bool utf8_encode;
url_unescape ((char *) url);
- utf8_encode = remote_to_utf8 (url, &new);
- set_utf8_encode (utf8_encode);
- if (utf8_encode)
- url = new;
+ iri->utf8_encode = remote_to_utf8 (iri, url, (const char **) &new_url);
+ if (!iri->utf8_encode)
+ new_url = NULL;
}
- url_encoded = reencode_escapes (url);
+ url_encoded = reencode_escapes (new_url ? new_url : url);
p = url_encoded;
+ if (new_url && url_encoded != new_url)
+ xfree (new_url);
+
p += strlen (supported_schemes[scheme].leading_string);
uname_b = p;
p = url_skip_credentials (p);
{
url_unescape (u->host);
host_modified = true;
- }
- if (opt.enable_iri)
- {
- char *new = idn_encode (u->host, get_utf8_encode ());
- if (new)
+ /* Apply IDNA regardless of iri->utf8_encode status */
+ if (opt.enable_iri && iri)
{
- xfree (u->host);
- u->host = new;
- host_modified = true;
+ char *new = idn_encode (iri, u->host);
+ if (new)
+ {
+ xfree (u->host);
+ u->host = new;
+ host_modified = true;
+ }
}
}
char *url_escape (const char *);
-struct url *url_parse (const char *, int *);
+struct url *url_parse (const char *, int *, struct iri *iri);
const char *url_error (int);
char *url_full_path (const struct url *);
void url_set_dir (struct url *, const char *);
#include "quote.h"
#include "quotearg.h"
+/* Likewise for struct iri definition */
+#include "iri.h"
+
/* Useful macros used across the code: */
/* The number of elements in an array. For example: