From: Saint Xavier Date: Sat, 27 Sep 2008 09:13:21 +0000 (+0200) Subject: IRI requirement: do not percent-encode already percent-encoded values (try1) X-Git-Tag: v1.13~338^2~3 X-Git-Url: http://sjero.net/git/?p=wget;a=commitdiff_plain;h=66dd4bda74bb78915b92cac4e7bfd32a3fe9d957 IRI requirement: do not percent-encode already percent-encoded values (try1) --- diff --git a/src/html-url.c b/src/html-url.c index c954cb97..e6ab2324 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -288,7 +288,7 @@ append_url (const char *link_uri, int position, int size, return NULL; } - url = url_parse (link_uri, NULL, NULL); + url = url_parse (link_uri, NULL, NULL, false); if (!url) { DEBUGP (("%s: link \"%s\" doesn't parse.\n", @@ -307,7 +307,7 @@ append_url (const char *link_uri, int position, int size, DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n", ctx->document_file, base, link_uri, complete_uri)); - url = url_parse (complete_uri, NULL, NULL); + url = url_parse (complete_uri, NULL, NULL, false); if (!url) { DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", @@ -752,7 +752,7 @@ get_urls_file (const char *file) url_text = merged; } - url = url_parse (url_text, &up_error_code, NULL); + url = url_parse (url_text, &up_error_code, NULL, false); if (!url) { char *error = url_error (url_text, up_error_code); diff --git a/src/iri.c b/src/iri.c index e3909d50..b1e0bf89 100644 --- a/src/iri.c +++ b/src/iri.c @@ -298,6 +298,7 @@ iri_new (void) struct iri *i = xmalloc (sizeof (struct iri)); i->uri_encoding = opt.encoding_remote ? xstrdup (opt.encoding_remote) : NULL; i->content_encoding = NULL; + i->orig_url = NULL; i->utf8_encode = opt.enable_iri; return i; } @@ -308,6 +309,7 @@ iri_free (struct iri *i) { xfree_null (i->uri_encoding); xfree_null (i->content_encoding); + xfree_null (i->orig_url); xfree (i); } diff --git a/src/iri.h b/src/iri.h index c024de72..6ad2becf 100644 --- a/src/iri.h +++ b/src/iri.h @@ -33,6 +33,7 @@ as that of the covered work. 
*/ struct iri { char *uri_encoding; /* Encoding of the uri to fetch */ char *content_encoding; /* Encoding of links inside the fetched file */ + char *orig_url; /* URL as given to url_parse before UTF-8 conversion */ bool utf8_encode; /* Will/Is the current url encoded in utf8 */ }; diff --git a/src/recur.c b/src/recur.c index 78682458..95581486 100644 --- a/src/recur.c +++ b/src/recur.c @@ -214,7 +214,7 @@ retrieve_tree (const char *start_url, struct iri *pi) set_uri_encoding (i, opt.locale, true); #undef COPYSTR - start_url_parsed = url_parse (start_url, &up_error_code, i); + start_url_parsed = url_parse (start_url, &up_error_code, i, true); if (!start_url_parsed) { char *error = url_error (start_url, up_error_code); @@ -381,7 +381,7 @@ retrieve_tree (const char *start_url, struct iri *pi) if (children) { struct urlpos *child = children; - struct url *url_parsed = url_parse (url, NULL, i); + struct url *url_parsed = url_parse (url, NULL, i, false); struct iri *ci; char *referer_url = url; bool strip_auth = (url_parsed != NULL @@ -694,10 +694,10 @@ descend_redirect_p (const char *redirected, const char *original, int depth, struct urlpos *upos; bool success; - orig_parsed = url_parse (original, NULL, NULL); + orig_parsed = url_parse (original, NULL, NULL, false); assert (orig_parsed != NULL); - new_parsed = url_parse (redirected, NULL, NULL); + new_parsed = url_parse (redirected, NULL, NULL, false); assert (new_parsed != NULL); upos = xnew0 (struct urlpos); diff --git a/src/retr.c b/src/retr.c index 28a6d874..fe4e3e76 100644 --- a/src/retr.c +++ b/src/retr.c @@ -626,7 +626,7 @@ retrieve_url (const char *origurl, char **file, char **newloc, *file = NULL; second_try: - u = url_parse (url, &up_error_code, iri); + u = url_parse (url, &up_error_code, iri, true); if (!u) { char *error = url_error (url, up_error_code); @@ -658,7 +658,7 @@ retrieve_url (const char *origurl, char **file, char **newloc, pi->utf8_encode = false; /* Parse the proxy URL. 
*/ - proxy_url = url_parse (proxy, &up_error_code, NULL); + proxy_url = url_parse (proxy, &up_error_code, NULL, true); if (!proxy_url) { char *error = url_error (proxy, up_error_code); @@ -739,9 +739,10 @@ retrieve_url (const char *origurl, char **file, char **newloc, the content encoding. */ iri->utf8_encode = opt.enable_iri; set_content_encoding (iri, NULL); + xfree_null (iri->orig_url); /* Now, see if this new location makes sense. */ - newloc_parsed = url_parse (mynewloc, &up_error_code, iri); + newloc_parsed = url_parse (mynewloc, &up_error_code, iri, true); if (!newloc_parsed) { char *error = url_error (mynewloc, up_error_code); @@ -794,7 +795,7 @@ retrieve_url (const char *origurl, char **file, char **newloc, if (!(*dt & RETROKF) && iri->utf8_encode) { iri->utf8_encode = false; - DEBUGP (("[IRI Fallbacking to non-utf8 for %s\n", quote (url))); + DEBUGP (("[IRI fallbacking to non-utf8 for %s\n", quote (url))); goto second_try; } @@ -907,6 +908,8 @@ retrieve_from_file (const char *file, bool html, int *count) /* Reset UTF-8 encode status */ iri->utf8_encode = opt.enable_iri; + xfree_null (iri->orig_url); + iri->orig_url = NULL; if ((opt.recursive || opt.page_requisites) && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url))) @@ -1100,7 +1103,7 @@ url_uses_proxy (const char *url) struct iri *i = iri_new(); /* url was given in the command line, so use locale as encoding */ set_uri_encoding (i, opt.locale, true); - u= url_parse (url, NULL, i); + u= url_parse (url, NULL, i, false); if (!u) return false; ret = getproxy (u) != NULL; diff --git a/src/url.c b/src/url.c index c937d056..8f067250 100644 --- a/src/url.c +++ b/src/url.c @@ -640,7 +640,7 @@ static const char *parse_errors[] = { error, and if ERROR is not NULL, also set *ERROR to the appropriate error code. 
*/ struct url * -url_parse (const char *url, int *error, struct iri *iri) +url_parse (const char *url, int *error, struct iri *iri, bool percent_encode) { struct url *u; const char *p; @@ -672,13 +672,19 @@ url_parse (const char *url, int *error, struct iri *iri) if (iri && iri->utf8_encode) { - url_unescape ((char *) url); - iri->utf8_encode = remote_to_utf8 (iri, url, (const char **) &new_url); + iri->utf8_encode = remote_to_utf8 (iri, iri->orig_url ? iri->orig_url : url, (const char **) &new_url); if (!iri->utf8_encode) new_url = NULL; + else + iri->orig_url = xstrdup (url); } - url_encoded = reencode_escapes (new_url ? new_url : url); + /* XXX: when percent_encode is false the URL is used without reencode_escapes; could that introduce (security) bugs? */ + if (percent_encode) + url_encoded = reencode_escapes (new_url ? new_url : url); + else + url_encoded = new_url ? new_url : url; + p = url_encoded; if (new_url && url_encoded != new_url) @@ -1992,12 +1998,12 @@ schemes_are_similar_p (enum url_scheme a, enum url_scheme b) static int getchar_from_escaped_string (const char *str, char *c) -{ +{ const char *p = str; assert (str && *str); assert (c); - + if (p[0] == '%') { if (!c_isxdigit(p[1]) || !c_isxdigit(p[2])) @@ -2047,7 +2053,7 @@ are_urls_equal (const char *u1, const char *u2) p += pp; q += qq; } - + return (*p == 0 && *q == 0 ? 
true : false); } @@ -2156,7 +2162,7 @@ test_append_uri_pathel() } test_array[] = { { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" }, }; - + for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i) { struct growable dest; diff --git a/src/url.h b/src/url.h index 0748e214..2fa8d51c 100644 --- a/src/url.h +++ b/src/url.h @@ -84,7 +84,7 @@ struct url char *url_escape (const char *); -struct url *url_parse (const char *, int *, struct iri *iri); +struct url *url_parse (const char *, int *, struct iri *iri, bool percent_encode); char *url_error (const char *, int); char *url_full_path (const struct url *); void url_set_dir (struct url *, const char *); diff --git a/tests/Test-iri.px b/tests/Test-iri.px index d228721c..ca6feddf 100755 --- a/tests/Test-iri.px +++ b/tests/Test-iri.px @@ -214,9 +214,9 @@ my %expected_downloaded_files = ( ############################################################################### my $the_test = HTTPTest->new (name => "Test-iri", - input => \%urls, - cmdline => $cmdline, - errcode => $expected_error_code, + input => \%urls, + cmdline => $cmdline, + errcode => $expected_error_code, output => \%expected_downloaded_files); exit $the_test->run();