From 5bb11da009c2f3bc4381bc8009c57007fd86534e Mon Sep 17 00:00:00 2001
From: Saint Xavier
Date: Thu, 26 Jun 2008 17:59:07 +0200
Subject: [PATCH] Basic support of IRIs.

---
Review notes (text between "---" and the first "diff --git" is not part of
the commit):

 * locale_to_utf8 () no longer dereferences a NULL string or a NULL
   opt.locale, and its comment now states when the result is a new string.
 * do_conversion () keeps track of the write position and of the free space
   itself.  The previous version overstated the free space after growing
   the buffer, could copy a rejected byte into a full buffer, computed a
   wrong terminator position once the buffer had grown, and leaked the
   buffer on failure.  The shared descriptor is now reset before each use.
 * This text was recovered from a copy that had lost its line breaks, the
   <...> part of the #include lines and the e-mail addresses.  Blank context
   lines were put back according to the hunk counts; the names on the three
   added #include lines were inferred from the identifiers the new code
   uses; context #include lines are left as found and must be completed
   before the patch can apply.
 * Not addressed (url_parse is only partly visible in the hunks below):
   url_unescape () is called on a "const char *" argument through a cast,
   and nothing visible here frees the string returned by locale_to_utf8 ()
   when it is a new allocation.  Both need a look at the whole function.

 src/ChangeLog |   9 ++++
 src/iri.c     | 158 ++++++++++++++++++++++++++++++++++++++++++++++++--
 src/iri.h     |  10 ++--
 src/url.c     |  20 +++++++-
 4 files changed, 190 insertions(+), 7 deletions(-)

diff --git a/src/ChangeLog b/src/ChangeLog
index 6dcaa279..288ec11d 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,12 @@
+2008-06-26  Xavier Saint
+
+	* iri.c, iri.h : New functions locale_to_utf8() and
+	idn_encode() adding basic capabilities of IRI/IDN.
+
+	* url.c : Convert URLs from locale to UTF-8 allowing a basic
+	support of IRI/IDN
+
+
 2008-06-19  Xavier Saint
 
 	* iri.c, iri.h : New function check_encoding_name() as
diff --git a/src/iri.c b/src/iri.c
index fea7b150..5fb06d09 100644
--- a/src/iri.c
+++ b/src/iri.c
@@ -34,13 +34,22 @@ as that of the covered work. */
 #include
 #include
 #include
-
+#include <iconv.h>
 #include
+#include <idna.h>
+#include <errno.h>
 
 #include "utils.h"
 #include "iri.h"
 
+static iconv_t locale2utf8;
+
+
+static bool open_locale_to_utf8 (void);
+static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out);
+
+
 
 /* Given a string containing "charset=XXX", return the encoding if found,
    or NULL otherwise */
 char *
@@ -77,7 +86,6 @@ parse_charset (char *str)
   return charset;
 }
 
-
 /* Find the locale used, or fall back on a default value */
 char *
 find_locale (void)
@@ -86,7 +94,6 @@ find_locale (void)
   return (char *) stringprep_locale_charset ();
 }
 
-
 /* Basic check of an encoding name. */
 bool
 check_encoding_name (char *encoding)
@@ -107,4 +114,149 @@ check_encoding_name (char *encoding)
   return true;
 }
 
+/* Try opening an iconv_t descriptor for conversion from locale to UTF-8.
+   The descriptor is kept in locale2utf8 and reused by later calls.
+   Return true if a descriptor is available, false otherwise. */
+static bool
+open_locale_to_utf8 (void)
+{
+  if (locale2utf8)
+    return true;
+
+  /* sXXXav : That shouldn't happen, just in case */
+  if (!opt.locale)
+    {
+      logprintf (LOG_VERBOSE, "open_locale_to_utf8: locale is unset\n");
+      opt.locale = find_locale ();
+    }
+
+  if (!opt.locale)
+    return false;
+
+  locale2utf8 = iconv_open ("UTF-8", opt.locale);
+  if (locale2utf8 != (iconv_t)(-1))
+    return true;
+
+  logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n",
+             quote (opt.locale), quote("UTF-8"));
+  locale2utf8 = NULL;
+  return false;
+}
+
+/* Convert STR from the locale encoding to UTF-8.  Return a newly
+   allocated string if a conversion took place.  Return STR itself when it
+   is NULL, when the locale is already UTF-8 or when the conversion could
+   not be done: the result may only be freed if it differs from STR. */
+const char *
+locale_to_utf8 (const char *str)
+{
+  char *new;
+
+  if (!str)
+    return str;
+
+  /* opt.locale may still be unset here, open_locale_to_utf8 () deals with
+     that case: only compare the name when there is one. */
+  if (opt.locale && !strcasecmp (opt.locale, "utf-8"))
+    return str;
+
+  if (!open_locale_to_utf8 ())
+    return str;
+
+  if (do_conversion (locale2utf8, (char *) str, strlen ((char *) str), &new))
+    return (const char *) new;
+
+  return str;
+}
+
+/* Convert the INLEN bytes at IN using the descriptor CD.  On success store
+   a newly allocated, NUL terminated string in *OUT and return true.  On
+   failure release the work buffer, set *OUT to NULL and return false.
+   Bytes that iconv rejects as an invalid or incomplete sequence are copied
+   to the output unchanged. */
+static bool
+do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
+{
+  /* sXXXav : hummm hard to guess... */
+  size_t size = inlen * 2;      /* usable bytes in buf, NUL not counted */
+  char *buf = xmalloc (size + 1);
+  char *pos = buf;              /* next byte to write */
+  size_t left = size;           /* free bytes starting at pos */
+
+  /* CD is shared between calls: put it back in its initial state */
+  iconv (cd, NULL, NULL, NULL, NULL);
+
+  /* Every pass either ends the loop, consumes input or enlarges the
+     buffer, so no looping limit is needed. */
+  for (;;)
+    {
+      int err;
+      bool rejected;
+
+      if (iconv (cd, &in, &inlen, &pos, &left) != (size_t)(-1))
+        {
+          *pos = '\0';
+          *out = buf;
+          return true;
+        }
+
+      /* Save errno, the calls below may change it */
+      err = errno;
+      rejected = (err == EINVAL || err == EILSEQ) && inlen > 0;
+
+      if (err != E2BIG && !rejected)
+        {
+          /* Weird, we got an unspecified error */
+          logprintf (LOG_VERBOSE, "Unhandled errno %d\n", err);
+          break;
+        }
+
+      /* Output buffer full, or no room to copy a rejected byte: move what
+         has been written so far to a bigger buffer */
+      if (err == E2BIG || left == 0)
+        {
+          size_t used = pos - buf;
+          size_t extra = inlen * 2 + 1;
+          char *bigger = xmalloc (size + extra + 1);
+
+          memcpy (bigger, buf, used);
+          xfree (buf);
+          buf = bigger;
+          pos = buf + used;
+          size += extra;
+          left += extra;
+        }
+
+      /* Incomplete or invalid multibyte sequence: copy one byte as is */
+      if (rejected)
+        {
+          *pos++ = *in++;
+          inlen--;
+          left--;
+        }
+    }
+
+  xfree (buf);
+  *out = NULL;
+  return false;
+}
+
+/* Try to encode UTF-8 host to ASCII. Return the new domain on success or NULL
+   on error. */
+char *idn_encode (char *host)
+{
+  char *new;
+  int ret;
+
+  /* toASCII UTF-8 NULL terminated string */
+  ret = idna_to_ascii_8z (host, &new, 0);
+  if (ret != IDNA_SUCCESS)
+    {
+      logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret,
+                 quote (idna_strerror (ret)));
+      return NULL;
+    }
+
+  return new;
+}
 
diff --git a/src/iri.h b/src/iri.h
index 85a7fb7f..4488501d 100644
--- a/src/iri.h
+++ b/src/iri.h
@@ -35,12 +35,16 @@ as that of the covered work. */
 char *parse_charset (char *str);
 char *find_locale (void);
 bool check_encoding_name (char *encoding);
+const char *locale_to_utf8 (const char *str);
+char *idn_encode (char *host);
 
 #else /* ENABLE_IRI */
 
-#define parse_charset(str)	NULL
-#define find_locale()	NULL
-#define check_encoding_name(str)	false
+#define parse_charset(str)          NULL
+#define find_locale()               NULL
+#define check_encoding_name(str)    false
+#define locale_to_utf8(str)         (str)
+#define idn_encode(str)             NULL
 
 #endif /* ENABLE_IRI */
 #endif /* IRI_H */
diff --git a/src/url.c b/src/url.c
index f5d621f9..48b23d6c 100644
--- a/src/url.c
+++ b/src/url.c
@@ -42,6 +42,7 @@ as that of the covered work. */
 #include "utils.h"
 #include "url.h"
 #include "host.h" /* for is_valid_ipv6_address */
+#include "iri.h"
 
 #ifdef TESTING
 #include "test.h"
@@ -670,6 +671,12 @@ url_parse (const char *url, int *error)
       goto error;
     }
 
+  if (opt.enable_iri)
+    {
+      url_unescape ((char *) url);
+      url = locale_to_utf8(url);
+    }
+
   url_encoded = reencode_escapes (url);
   p = url_encoded;
 
@@ -844,6 +851,17 @@ url_parse (const char *url, int *error)
       host_modified = true;
     }
 
+  if (opt.enable_iri)
+    {
+      char *new = idn_encode (u->host);
+      if (new)
+        {
+          xfree (u->host);
+          u->host = new;
+          host_modified = true;
+        }
+    }
+
   if (params_b)
     u->params = strdupdelim (params_b, params_e);
   if (query_b)
@@ -851,7 +869,7 @@ url_parse (const char *url, int *error)
   if (fragment_b)
     u->fragment = strdupdelim (fragment_b, fragment_e);
 
-  if (path_modified || u->fragment || host_modified || path_b == path_e)
+  if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
     {
       /* If we suspect that a transformation has rendered what
          url_string might return different from URL_ENCODED, rebuild
-- 
2.39.2