From d82f80ecab9bfef857d780f894cca7e890780ce0 Mon Sep 17 00:00:00 2001 From: Saint Xavier Date: Thu, 24 Jul 2008 00:56:29 +0200 Subject: [PATCH] Change global variable model for state-object --- src/convert.c | 2 +- src/html-url.c | 29 ++++++------ src/html-url.h | 2 +- src/http.c | 20 +++++---- src/http.h | 2 +- src/iri.c | 120 +++++++++++++++---------------------------------- src/iri.h | 48 +++++++++----------- src/main.c | 11 +++-- src/recur.c | 75 +++++++++++++++---------------- src/res.c | 13 ++++-- src/res.h | 2 +- src/retr.c | 53 +++++++++++++--------- src/retr.h | 3 +- src/url.c | 37 +++++++-------- src/url.h | 2 +- src/wget.h | 3 ++ 16 files changed, 194 insertions(+), 228 deletions(-) diff --git a/src/convert.c b/src/convert.c index e72a4b0f..54004ad0 100644 --- a/src/convert.c +++ b/src/convert.c @@ -96,7 +96,7 @@ convert_links_in_hashtable (struct hash_table *downloaded_set, /* Parse the file... */ urls = is_css ? get_urls_css_file (file, url) : - get_urls_html (file, url, NULL); + get_urls_html (file, url, NULL, NULL); /* We don't respect meta_disallow_follow here because, even if the file is not followed, we might still want to convert the diff --git a/src/html-url.c b/src/html-url.c index ef93a7e4..6e886083 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -44,7 +44,6 @@ as that of the covered work. */ #include "recur.h" #include "html-url.h" #include "css-url.h" -#include "iri.h" typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *); @@ -175,6 +174,10 @@ static const char *additional_attributes[] = { static struct hash_table *interesting_tags; static struct hash_table *interesting_attributes; +/* Will contains the (last) charset found in 'http-equiv=content-type' + meta tags */ +static char *meta_charset; + static void init_interesting (void) { @@ -285,9 +288,7 @@ append_url (const char *link_uri, int position, int size, return NULL; } - set_ugly_no_encode (true); - url = url_parse (link_uri, NULL); - set_ugly_no_encode (false); + url = url_parse (link_uri, NULL, NULL); if (!url) { DEBUGP (("%s: link \"%s\" doesn't parse.\n", @@ -306,9 +307,7 @@ append_url (const char *link_uri, int position, int size, DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n", ctx->document_file, base, link_uri, complete_uri)); - set_ugly_no_encode (true); - url = url_parse (complete_uri, NULL); - set_ugly_no_encode (false); + url = url_parse (complete_uri, NULL, NULL); if (!url) { DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", @@ -573,9 +572,8 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) return; /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/ - - set_current_charset (mcharset); - xfree (mcharset); + xfree_null (meta_charset); + meta_charset = mcharset; } else if (name && 0 == strcasecmp (name, "robots")) { @@ -641,7 +639,8 @@ collect_tags_mapper (struct taginfo *tag, void *arg) and does the right thing. */ struct urlpos * -get_urls_html (const char *file, const char *url, bool *meta_disallow_follow) +get_urls_html (const char *file, const char *url, bool *meta_disallow_follow, + struct iri *iri) { struct file_memory *fm; struct map_context ctx; @@ -681,6 +680,10 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow) map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags, NULL, interesting_attributes); + /* If meta charset isn't null, override content encoding */ + if (iri && meta_charset) + set_content_encoding (iri, meta_charset); + DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow)); if (meta_disallow_follow) *meta_disallow_follow = ctx.nofollow; @@ -750,9 +753,7 @@ get_urls_file (const char *file) url_text = merged; } - set_ugly_no_encode (true); - url = url_parse (url_text, &up_error_code); - set_ugly_no_encode (false); + url = url_parse (url_text, &up_error_code, NULL); if (!url) { logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"), diff --git a/src/html-url.h b/src/html-url.h index a94f0db6..2e9ec820 100644 --- a/src/html-url.h +++ b/src/html-url.h @@ -44,7 +44,7 @@ struct map_context { }; struct urlpos *get_urls_file (const char *); -struct urlpos *get_urls_html (const char *, const char *, bool *); +struct urlpos *get_urls_html (const char *, const char *, bool *, struct iri *); struct urlpos *append_url (const char *, int, int, struct map_context *); void free_urlpos (struct urlpos *); diff --git a/src/http.c b/src/http.c index 5ec70d27..589e18ee 100644 --- a/src/http.c +++ b/src/http.c @@ -49,7 +49,6 @@ as that of the covered work. */ #include "retr.h" #include "connect.h" #include "netrc.h" -#include "iri.h" #ifdef HAVE_SSL # include "ssl.h" #endif @@ -1365,7 +1364,8 @@ free_hstat (struct http_stat *hs) If PROXY is non-NULL, the connection will be made to the proxy server, and u->url will be requested. */ static uerr_t -gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy) +gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, + struct iri *iri) { struct request *req; @@ -2058,7 +2058,11 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); /* Try to get remote encoding if needed */ if (opt.enable_iri && !opt.encoding_remote) - set_current_charset (parse_charset (tmp2)); + { + tmp = parse_charset (tmp2); + if (tmp) + set_content_encoding (iri, tmp); + } } } hs->newloc = resp_header_strdup (resp, "Location"); @@ -2333,7 +2337,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); retried, and retried, and retried, and... */ uerr_t http_loop (struct url *u, char **newloc, char **local_file, const char *referer, - int *dt, struct url *proxy) + int *dt, struct url *proxy, struct iri *iri) { int count; bool got_head = false; /* used for time-stamping and filename detection */ @@ -2497,7 +2501,7 @@ Spider mode enabled. Check if remote file exists.\n")); *dt &= ~SEND_NOCACHE; /* Try fetching the document, or at least its head. */ - err = gethttp (u, &hstat, dt, proxy); + err = gethttp (u, &hstat, dt, proxy, iri); /* Time? */ tms = datetime_str (time (NULL)); @@ -2576,9 +2580,9 @@ Spider mode enabled. Check if remote file exists.\n")); } /* Maybe we should always keep track of broken links, not just in * spider mode. - * Don't log error if it was utf8 encoded because we will try - * one unencoded. */ - else if (opt.spider && !get_utf8_encode ()) + * Don't log error if it was UTF-8 encoded because we will try + * once unencoded. */ + else if (opt.spider && !iri->utf8_encode) { /* #### Again: ugly ugly ugly! */ if (!hurl) diff --git a/src/http.h b/src/http.h index e0e66cea..4769e9d3 100644 --- a/src/http.h +++ b/src/http.h @@ -33,7 +33,7 @@ as that of the covered work. */ struct url; uerr_t http_loop (struct url *, char **, char **, const char *, int *, - struct url *); + struct url *, struct iri *); void save_cookies (void); void http_cleanup (void); time_t http_atotm (const char *); diff --git a/src/iri.c b/src/iri.c index d23615ae..783aa331 100644 --- a/src/iri.c +++ b/src/iri.c @@ -46,18 +46,6 @@ as that of the covered work. */ /* Note: locale encoding is kept in options struct (opt.locale) */ -/* Hold the encoding used for the current fetch */ -char *remote; - -/* Hold the encoding for the future found links */ -char *current; - -/* Will/Is the current URL encoded in utf8 ? */ -bool utf8_encode; - -/* Force no utf8 encoding for url_parse () */ -bool ugly_no_encode; - static iconv_t locale2utf8; static bool open_locale_to_utf8 (void); @@ -239,15 +227,15 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out) /* Try to "ASCII encode" UTF-8 host. Return the new domain on success or NULL on error. */ char * -idn_encode (char *host, bool utf8_encoded) +idn_encode (struct iri *i, char *host) { char *new; int ret; - /* Encode to UTF-8 if not done using current remote */ - if (!utf8_encoded) + /* Encode to UTF-8 if not done */ + if (!i->utf8_encode) { - if (!remote_to_utf8 ((const char *) host, (const char **) &new)) + if (!remote_to_utf8 (i, (const char *) host, (const char **) &new)) { /* Nothing to encode or an error occured */ return NULL; @@ -291,7 +279,7 @@ idn_decode (char *host) /* Try to transcode string str from remote encoding to UTF-8. On success, *new contains the transcoded string. *new content is unspecified otherwise. */ bool -remote_to_utf8 (const char *str, const char **new) +remote_to_utf8 (struct iri *i, const char *str, const char **new) { char *r; iconv_t cd; @@ -299,8 +287,8 @@ remote_to_utf8 (const char *str, const char **new) if (opt.encoding_remote) r = opt.encoding_remote; - else if (current) - r = current; + else if (i->uri_encoding) + r = i->uri_encoding; else return false; @@ -323,90 +311,52 @@ remote_to_utf8 (const char *str, const char **new) return ret; } -char *get_remote_charset (void) +struct iri * +iri_new (void) { - return remote; + struct iri *i = xmalloc (sizeof (struct iri)); + i->uri_encoding = opt.encoding_remote ? xstrdup (opt.encoding_remote) : NULL; + i->content_encoding = NULL; + i->utf8_encode = opt.enable_iri; } -char *get_current_charset (void) -{ - return current; -} - -void set_current_charset (char *charset) -{ - /*printf("[ current = `%s'\n", charset);*/ - if (current) - { - /* Do nothing if already equal */ - if (!strcasecmp (current, charset)) - return; - xfree (current); - } - - current = charset ? xstrdup (charset) : NULL; -} - -void set_current_as_locale (void) +void +iri_free (struct iri *i) { - /* sXXXav : assert opt.locale NULL ? */ - /*printf("[ current = locale = `%s'\n", opt.locale);*/ - if (current) - { - if (!strcasecmp (current, opt.locale)) - return; - xfree (current); - } - - current = xstrdup (opt.locale); + xfree_null (i->uri_encoding); + xfree_null (i->content_encoding); + xfree (i); } void -set_remote_charset (char *charset) +set_uri_encoding (struct iri *i, char *charset) { - /*printf("[ remote = `%s'\n", charset);*/ - if (remote) + logprintf (LOG_VERBOSE, "[ uri = `%s'\n", charset); + if (opt.encoding_remote) + return; + if (i->uri_encoding) { - /* Do nothing if already equal */ - if (!strcasecmp (remote, charset)) + if (!strcasecmp (i->uri_encoding, charset)) return; - xfree (remote); + xfree (i->uri_encoding); } - remote = charset ? xstrdup (charset) : NULL; + + i->uri_encoding = charset ? xstrdup (charset) : NULL; } void -set_remote_as_current (void) +set_content_encoding (struct iri *i, char *charset) { - /*printf("[ remote = current = `%s'\n", current);*/ - if (remote) + logprintf (LOG_VERBOSE, "[ content = `%s'\n", charset); + if (opt.encoding_remote) + return; + if (i->content_encoding) { - /* Do nothing if already equal */ - if (current && !strcasecmp (remote, current)) + if (!strcasecmp (i->content_encoding, charset)) return; - xfree (remote); + xfree (i->content_encoding); } - remote = current ? xstrdup (current) : NULL; -} - -void reset_utf8_encode (void) -{ - set_utf8_encode (opt.enable_iri); -} - -void set_utf8_encode (bool encode) -{ - utf8_encode = encode; -} - -bool get_utf8_encode (void) -{ - return (!ugly_no_encode && utf8_encode); -} - -void set_ugly_no_encode (bool ugly) -{ - ugly_no_encode = ugly; + i->content_encoding = charset ? xstrdup (charset) : NULL; } diff --git a/src/iri.h b/src/iri.h index 50102df4..173d0656 100644 --- a/src/iri.h +++ b/src/iri.h @@ -30,49 +30,41 @@ as that of the covered work. */ #ifndef IRI_H #define IRI_H +struct iri { + char *uri_encoding; /* Encoding of the uri to fetch */ + char *content_encoding; /* Encoding of links inside the fetched file */ + bool utf8_encode; /* Will/Is the current url encoded in utf8 */ +}; + #ifdef ENABLE_IRI char *parse_charset (char *str); char *find_locale (void); bool check_encoding_name (char *encoding); const char *locale_to_utf8 (const char *str); -char *idn_encode (char *host, bool utf8_encoded); +char *idn_encode (struct iri *i, char *host); char *idn_decode (char *host); -char *get_remote_charset (void); -char *get_current_charset (void); -void set_current_charset (char *charset); -void set_current_as_locale (void); -void set_current_charset (char *charset); -void set_remote_charset (char *charset); -void set_remote_as_current (void); -bool remote_to_utf8 (const char *str, const char **new); -void reset_utf8_encode (void); -void set_utf8_encode (bool encode); -bool get_utf8_encode (void); - -/* ugly ugly ugly */ -void set_ugly_no_encode (bool ugly); +bool remote_to_utf8 (struct iri *i, const char *str, const char **new); +struct iri *iri_new (void); +void iri_free (struct iri *i); +void set_uri_encoding (struct iri *i, char *charset); +void set_content_encoding (struct iri *i, char *charset); #else /* ENABLE_IRI */ +struct iri dummy_iri; + #define parse_charset(str) NULL #define find_locale() NULL #define check_encoding_name(str) false #define locale_to_utf8(str) (str) -#define idn_encode(str,encoded) NULL +#define idn_encode(a,b,c) NULL #define idn_decode(str) NULL -#define get_remote_charset() NULL -#define get_current_charset() NULL -#define set_current_charset(str) -#define set_current_as_locale() -#define set_current_charset(str) -#define set_remote_charset(str) -#define set_remote_as_current() -#define remote_to_utf8(a,b) false -#define reset_utf8_encode() -#define set_utf8_encode(a) -#define get_utf8_encode() false -#define set_ugly_no_encode(a) +#define remote_to_utf8(a,b,c) false +#define iri_new() (&dummy_iri) +#define iri_free(a) +#define set_uri_encoding(a,b) +#define set_content_encoding(a,b) #endif /* ENABLE_IRI */ #endif /* IRI_H */ diff --git a/src/main.c b/src/main.c index 6135a67d..8cee194c 100644 --- a/src/main.c +++ b/src/main.c @@ -57,7 +57,6 @@ as that of the covered work. */ #include "convert.h" #include "spider.h" #include "http.h" /* for save_cookies */ -#include "iri.h" #include #include @@ -1191,9 +1190,6 @@ WARNING: Can't reopen standard output in binary mode;\n\ char *filename = NULL, *redirected_URL = NULL; int dt; - set_current_as_locale (); - set_ugly_no_encode (false); - if ((opt.recursive || opt.page_requisites) && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t))) { @@ -1209,8 +1205,11 @@ WARNING: Can't reopen standard output in binary mode;\n\ } else { - set_remote_as_current (); - status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive); + struct iri *i = iri_new (); + set_uri_encoding (i, opt.locale); + status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, + opt.recursive, i); + iri_free (i); } if (opt.delete_after && file_exists_p(filename)) diff --git a/src/recur.c b/src/recur.c index 24b80ad4..e2f58d1c 100644 --- a/src/recur.c +++ b/src/recur.c @@ -61,7 +61,7 @@ struct queue_element { int depth; /* the depth */ bool html_allowed; /* whether the document is allowed to be treated as HTML. */ - char *remote_encoding; + struct iri *iri; /* sXXXav */ bool css_allowed; /* whether the document is allowed to be treated as CSS. */ struct queue_element *next; /* next element in queue */ @@ -95,12 +95,12 @@ url_queue_delete (struct url_queue *queue) into it. */ static void -url_enqueue (struct url_queue *queue, +url_enqueue (struct url_queue *queue, struct iri *i, const char *url, const char *referer, int depth, bool html_allowed, bool css_allowed) { struct queue_element *qel = xnew (struct queue_element); - char *charset = get_current_charset (); + qel->iri = i; qel->url = url; qel->referer = referer; qel->depth = depth; @@ -108,11 +108,6 @@ url_enqueue (struct url_queue *queue, qel->css_allowed = css_allowed; qel->next = NULL; - if (charset) - qel->remote_encoding = xstrdup (charset); - else - qel->remote_encoding = NULL; - ++queue->count; if (queue->count > queue->maxcount) queue->maxcount = queue->count; @@ -120,7 +115,8 @@ url_enqueue (struct url_queue *queue, DEBUGP (("Enqueuing %s at depth %d\n", url, depth)); DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount)); - /*printf ("[Enqueuing %s with %s\n", url, qel->remote_encoding);*/ + if (i) + printf ("[Enqueuing %s with %s\n", url, i->uri_encoding); if (queue->tail) queue->tail->next = qel; @@ -134,7 +130,7 @@ url_enqueue (struct url_queue *queue, succeeded, or false if the queue is empty. */ static bool -url_dequeue (struct url_queue *queue, +url_dequeue (struct url_queue *queue, struct iri **i, const char **url, const char **referer, int *depth, bool *html_allowed, bool *css_allowed) { @@ -147,10 +143,7 @@ url_dequeue (struct url_queue *queue, if (!queue->head) queue->tail = NULL; - set_remote_charset (qel->remote_encoding); - if (qel->remote_encoding) - xfree (qel->remote_encoding); - + *i = qel->iri; *url = qel->url; *referer = qel->referer; *depth = qel->depth; @@ -167,9 +160,9 @@ url_dequeue (struct url_queue *queue, } static bool download_child_p (const struct urlpos *, struct url *, int, - struct url *, struct hash_table *); + struct url *, struct hash_table *, struct iri *); static bool descend_redirect_p (const char *, const char *, int, - struct url *, struct hash_table *); + struct url *, struct hash_table *, struct iri *); /* Retrieve a part of the web beginning with START_URL. This used to @@ -207,10 +200,10 @@ retrieve_tree (const char *start_url) int up_error_code; struct url *start_url_parsed; + struct iri *i = iri_new (); + set_uri_encoding (i, opt.locale); - set_ugly_no_encode (true); - start_url_parsed= url_parse (start_url, &up_error_code); - set_ugly_no_encode (false); + start_url_parsed = url_parse (start_url, &up_error_code, i); if (!start_url_parsed) { logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url, @@ -223,7 +216,8 @@ retrieve_tree (const char *start_url) /* Enqueue the starting URL. Use start_url_parsed->url rather than just URL so we enqueue the canonical form of the URL. */ - url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false); + url_enqueue (queue, i, xstrdup (start_url_parsed->url), NULL, 0, true, + false); string_set_add (blacklist, start_url_parsed->url); while (1) @@ -242,7 +236,7 @@ retrieve_tree (const char *start_url) /* Get the next URL from the queue... */ - if (!url_dequeue (queue, + if (!url_dequeue (queue, (struct iri **) &i, (const char **)&url, (const char **)&referer, &depth, &html_allowed, &css_allowed)) break; @@ -283,7 +277,8 @@ retrieve_tree (const char *start_url) int dt = 0; char *redirected = NULL; - status = retrieve_url (url, &file, &redirected, referer, &dt, false); + status = retrieve_url (url, &file, &redirected, referer, &dt, + false, i); if (html_allowed && file && status == RETROK && (dt & RETROKF) && (dt & TEXTHTML)) @@ -311,7 +306,7 @@ retrieve_tree (const char *start_url) if (descend) { if (!descend_redirect_p (redirected, url, depth, - start_url_parsed, blacklist)) + start_url_parsed, blacklist, i)) descend = false; else /* Make sure that the old pre-redirect form gets @@ -363,7 +358,7 @@ retrieve_tree (const char *start_url) bool meta_disallow_follow = false; struct urlpos *children = is_css ? get_urls_css_file (file, url) : - get_urls_html (file, url, &meta_disallow_follow); + get_urls_html (file, url, &meta_disallow_follow, i); if (opt.use_robots && meta_disallow_follow) { @@ -374,9 +369,8 @@ retrieve_tree (const char *start_url) if (children) { struct urlpos *child = children; - set_ugly_no_encode (true); - struct url *url_parsed = url_parse (url, NULL); - set_ugly_no_encode (false); + struct url *url_parsed = url_parse (url, NULL, i); + struct iri *ci; char *referer_url = url; bool strip_auth = (url_parsed != NULL && url_parsed->user != NULL); @@ -393,9 +387,11 @@ retrieve_tree (const char *start_url) if (dash_p_leaf_HTML && !child->link_inline_p) continue; if (download_child_p (child, url_parsed, depth, start_url_parsed, - blacklist)) + blacklist, i)) { - url_enqueue (queue, xstrdup (child->url->url), + ci = iri_new (); + set_uri_encoding (ci, i->content_encoding); + url_enqueue (queue, ci, xstrdup (child->url->url), xstrdup (referer_url), depth + 1, child->link_expect_html, child->link_expect_css); @@ -440,6 +436,7 @@ retrieve_tree (const char *start_url) xfree (url); xfree_null (referer); xfree_null (file); + iri_free (i); } /* If anything is left of the queue due to a premature exit, free it @@ -448,9 +445,11 @@ retrieve_tree (const char *start_url) char *d1, *d2; int d3; bool d4, d5; - while (url_dequeue (queue, + struct iri *d6; + while (url_dequeue (queue, (struct iri **)&d6, (const char **)&d1, (const char **)&d2, &d3, &d4, &d5)) { + iri_free (d6); xfree (d1); xfree_null (d2); } @@ -479,7 +478,8 @@ retrieve_tree (const char *start_url) static bool download_child_p (const struct urlpos *upos, struct url *parent, int depth, - struct url *start_url_parsed, struct hash_table *blacklist) + struct url *start_url_parsed, struct hash_table *blacklist, + struct iri *iri) { struct url *u = upos->url; const char *url = u->url; @@ -620,7 +620,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, if (!specs) { char *rfile; - if (res_retrieve_file (url, &rfile)) + if (res_retrieve_file (url, &rfile, iri)) { specs = res_parse_from_file (rfile); @@ -675,25 +675,24 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, static bool descend_redirect_p (const char *redirected, const char *original, int depth, - struct url *start_url_parsed, struct hash_table *blacklist) + struct url *start_url_parsed, struct hash_table *blacklist, + struct iri *iri) { struct url *orig_parsed, *new_parsed; struct urlpos *upos; bool success; - set_ugly_no_encode (true); - orig_parsed = url_parse (original, NULL); + orig_parsed = url_parse (original, NULL, NULL); assert (orig_parsed != NULL); - new_parsed = url_parse (redirected, NULL); + new_parsed = url_parse (redirected, NULL, NULL); assert (new_parsed != NULL); - set_ugly_no_encode (false); upos = xnew0 (struct urlpos); upos->url = new_parsed; success = download_child_p (upos, orig_parsed, depth, - start_url_parsed, blacklist); + start_url_parsed, blacklist, iri); url_free (orig_parsed); url_free (new_parsed); diff --git a/src/res.c b/src/res.c index 8c35f0e1..69abd12d 100644 --- a/src/res.c +++ b/src/res.c @@ -532,21 +532,28 @@ res_get_specs (const char *host, int port) Return true if robots were retrieved OK, false otherwise. */ bool -res_retrieve_file (const char *url, char **file) +res_retrieve_file (const char *url, char **file, struct iri *iri) { + struct iri *i = iri_new (); uerr_t err; char *robots_url = uri_merge (url, RES_SPECS_LOCATION); int saved_ts_val = opt.timestamping; int saved_sp_val = opt.spider; + /* Copy server URI encoding for a possible IDNA transformation, no need to + encode the full URI in UTF-8 because "robots.txt" is plain ASCII */ + set_uri_encoding (i, iri->uri_encoding); + i->utf8_encode = false; + logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n")); *file = NULL; opt.timestamping = false; opt.spider = false; - err = retrieve_url (robots_url, file, NULL, NULL, NULL, false); + err = retrieve_url (robots_url, file, NULL, NULL, NULL, false, i); opt.timestamping = saved_ts_val; - opt.spider = saved_sp_val; + opt.spider = saved_sp_val; xfree (robots_url); + iri_free (i); if (err != RETROK && *file != NULL) { diff --git a/src/res.h b/src/res.h index 94a57750..5439eaf9 100644 --- a/src/res.h +++ b/src/res.h @@ -40,7 +40,7 @@ bool res_match_path (const struct robot_specs *, const char *); void res_register_specs (const char *, int, struct robot_specs *); struct robot_specs *res_get_specs (const char *, int); -bool res_retrieve_file (const char *, char **); +bool res_retrieve_file (const char *, char **, struct iri *); bool is_robots_txt_url (const char *); diff --git a/src/retr.c b/src/retr.c index 7a28ea32..e70f6e6e 100644 --- a/src/retr.c +++ b/src/retr.c @@ -598,7 +598,7 @@ static char *getproxy (struct url *); uerr_t retrieve_url (const char *origurl, char **file, char **newloc, - const char *refurl, int *dt, bool recursive) + const char *refurl, int *dt, bool recursive, struct iri *iri) { uerr_t result; char *url; @@ -626,10 +626,8 @@ retrieve_url (const char *origurl, char **file, char **newloc, if (file) *file = NULL; - reset_utf8_encode (); - second_try: - u = url_parse (url, &up_error_code); + u = url_parse (url, &up_error_code, iri); if (!u) { logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code)); @@ -637,7 +635,7 @@ retrieve_url (const char *origurl, char **file, char **newloc, return URLERROR; } - /*printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, get_remote_charset (), utf8_encoded);*/ + printf ("[Retrieving %s with %s (UTF-8=%d)\n", url, iri->uri_encoding, iri->utf8_encode); if (!refurl) refurl = opt.referer; @@ -652,11 +650,13 @@ retrieve_url (const char *origurl, char **file, char **newloc, proxy = getproxy (u); if (proxy) { - /* sXXXav : support IRI for proxy */ + /* sXXXav : could a proxy include a path ??? */ + struct iri *pi = iri_new (); + set_uri_encoding (pi, opt.locale); + pi->utf8_encode = false; + /* Parse the proxy URL. */ - set_ugly_no_encode (true); - proxy_url = url_parse (proxy, &up_error_code); - set_ugly_no_encode (false); + proxy_url = url_parse (proxy, &up_error_code, NULL); if (!proxy_url) { logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"), @@ -681,7 +681,7 @@ retrieve_url (const char *origurl, char **file, char **newloc, #endif || (proxy_url && proxy_url->scheme == SCHEME_HTTP)) { - result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url); + result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri); } else if (u->scheme == SCHEME_FTP) { @@ -731,10 +731,13 @@ retrieve_url (const char *origurl, char **file, char **newloc, xfree (mynewloc); mynewloc = construced_newloc; - reset_utf8_encode (); + /* Reset UTF-8 encoding state, keep the URI encoding and reset + the content encoding. */ + iri->utf8_encode = opt.enable_iri; + set_content_encoding (iri, NULL); /* Now, see if this new location makes sense. */ - newloc_parsed = url_parse (mynewloc, &up_error_code); + newloc_parsed = url_parse (mynewloc, &up_error_code, iri); if (!newloc_parsed) { logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc), @@ -782,10 +785,10 @@ retrieve_url (const char *origurl, char **file, char **newloc, } /* Try to not encode in UTF-8 if fetching failed */ - if (!(*dt & RETROKF) && get_utf8_encode ()) + if (!(*dt & RETROKF) && iri->utf8_encode) { - set_utf8_encode (false); - /*printf ("[Fallbacking to non-utf8 for `%s'\n", url);*/ + iri->utf8_encode = false; + printf ("[Fallbacking to non-utf8 for `%s'\n", url); goto second_try; } @@ -845,24 +848,28 @@ retrieve_from_file (const char *file, bool html, int *count) { uerr_t status; struct urlpos *url_list, *cur_url; + struct iri *iri = iri_new(); char *input_file = NULL; const char *url = file; status = RETROK; /* Suppose everything is OK. */ *count = 0; /* Reset the URL count. */ - + + /* sXXXav : Assume filename and links in the file are in the locale */ + set_content_encoding (iri, opt.locale); + if (url_has_scheme (url)) { uerr_t status; - status = retrieve_url (url, &input_file, NULL, NULL, NULL, false); + status = retrieve_url (url, &input_file, NULL, NULL, NULL, false, iri); if (status != RETROK) return status; } else input_file = (char *) file; - url_list = (html ? get_urls_html (input_file, NULL, NULL) + url_list = (html ? get_urls_html (input_file, NULL, NULL, iri) : get_urls_file (input_file)); for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count) @@ -892,7 +899,8 @@ retrieve_from_file (const char *file, bool html, int *count) opt.follow_ftp = old_follow_ftp; } else - status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive); + status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, + &dt, opt.recursive, iri); if (filename && opt.delete_after && file_exists_p (filename)) { @@ -1064,9 +1072,10 @@ url_uses_proxy (const char *url) { bool ret; struct url *u; - set_ugly_no_encode(true); - u= url_parse (url, NULL); - set_ugly_no_encode(false); + struct iri *i = iri_new(); + /* url was given in the command line, so use locale as encoding */ + set_uri_encoding (i, opt.locale); + u= url_parse (url, NULL, i); if (!u) return false; ret = getproxy (u) != NULL; diff --git a/src/retr.h b/src/retr.h index ec55cfda..bb2e66d3 100644 --- a/src/retr.h +++ b/src/retr.h @@ -51,7 +51,8 @@ typedef const char *(*hunk_terminator_t) (const char *, const char *, int); char *fd_read_hunk (int, hunk_terminator_t, long, long); char *fd_read_line (int); -uerr_t retrieve_url (const char *, char **, char **, const char *, int *, bool); +uerr_t retrieve_url (const char *, char **, char **, const char *, int *, + bool, struct iri *); uerr_t retrieve_from_file (const char *, bool, int *); const char *retr_rate (wgint, double); diff --git a/src/url.c b/src/url.c index beaf0fb2..c7a3a721 100644 --- a/src/url.c +++ b/src/url.c @@ -641,7 +641,7 @@ static const char *parse_errors[] = { error, and if ERROR is not NULL, also set *ERROR to the appropriate error code. */ struct url * -url_parse (const char *url, int *error) +url_parse (const char *url, int *error, struct iri *iri) { struct url *u; const char *p; @@ -660,7 +660,7 @@ url_parse (const char *url, int *error) int port; char *user = NULL, *passwd = NULL; - char *url_encoded = NULL; + char *url_encoded = NULL, *new_url = NULL; int error_code; @@ -671,20 +671,20 @@ url_parse (const char *url, int *error) goto error; } - if (opt.enable_iri && get_utf8_encode ()) + if (iri && iri->utf8_encode) { - const char *new; - bool utf8_encode; url_unescape ((char *) url); - utf8_encode = remote_to_utf8 (url, &new); - set_utf8_encode (utf8_encode); - if (utf8_encode) - url = new; + iri->utf8_encode = remote_to_utf8 (iri, url, (const char **) &new_url); + if (!iri->utf8_encode) + new_url = NULL; } - url_encoded = reencode_escapes (url); + url_encoded = reencode_escapes (new_url ? new_url : url); p = url_encoded; + if (new_url && url_encoded != new_url) + xfree (new_url); + p += strlen (supported_schemes[scheme].leading_string); uname_b = p; p = url_skip_credentials (p); @@ -854,16 +854,17 @@ url_parse (const char *url, int *error) { url_unescape (u->host); host_modified = true; - } - if (opt.enable_iri) - { - char *new = idn_encode (u->host, get_utf8_encode ()); - if (new) + /* Apply IDNA regardless of iri->utf8_encode status */ + if (opt.enable_iri && iri) { - xfree (u->host); - u->host = new; - host_modified = true; + char *new = idn_encode (iri, u->host); + if (new) + { + xfree (u->host); + u->host = new; + host_modified = true; + } } } diff --git a/src/url.h b/src/url.h index 7c8bcfed..9c49c0b5 100644 --- a/src/url.h +++ b/src/url.h @@ -84,7 +84,7 @@ struct url char *url_escape (const char *); -struct url *url_parse (const char *, int *); +struct url *url_parse (const char *, int *, struct iri *iri); const char *url_error (int); char *url_full_path (const struct url *); void url_set_dir (struct url *, const char *); diff --git a/src/wget.h b/src/wget.h index d87dfcac..b17b6709 100644 --- a/src/wget.h +++ b/src/wget.h @@ -218,6 +218,9 @@ typedef double SUM_SIZE_INT; #include "quote.h" #include "quotearg.h" +/* Likewise for struct iri definition */ +#include "iri.h" + /* Useful macros used across the code: */ /* The number of elements in an array. For example: -- 2.39.2