X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Firi.c;h=d6067497c86a4755e929547edf27d087a60eb0e6;hp=5fb06d0992bde05c5dda9768e68e2b9b8d5ad454;hb=a9a2b34b052cfa903462124f59fbfeed7eaf374b;hpb=d3007f1b3a5d033babe40bc4c56a899eb3b10bfa diff --git a/src/iri.c b/src/iri.c index 5fb06d09..d6067497 100644 --- a/src/iri.c +++ b/src/iri.c @@ -1,6 +1,5 @@ /* IRI related functions. - Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, - 2008 Free Software Foundation, Inc. + Copyright (C) 2008 Free Software Foundation, Inc. This file is part of GNU Wget. @@ -40,13 +39,12 @@ as that of the covered work. */ #include #include "utils.h" -#include "iri.h" +/* RFC3987 section 3.1 mandates STD3 ASCII RULES */ +#define IDNA_FLAGS IDNA_USE_STD3_ASCII_RULES -static iconv_t locale2utf8; +/* Note: locale encoding is kept in options struct (opt.locale) */ - -static bool open_locale_to_utf8 (void); static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out); @@ -81,7 +79,7 @@ parse_charset (char *str) return NULL; } - logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset)); + /*logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset));*/ return charset; } @@ -90,7 +88,6 @@ parse_charset (char *str) char * find_locale (void) { - /* sXXXav, made our own function or use libidn one ?! */ return (char *) stringprep_locale_charset (); } @@ -102,9 +99,9 @@ check_encoding_name (char *encoding) while (*s) { - if (!c_isascii(*s) || c_isspace(*s)) + if (!c_isascii (*s) || c_isspace (*s)) { - logprintf (LOG_VERBOSE, "Encoding %s isn't valid\n", quote(encoding)); + logprintf (LOG_VERBOSE, _("Encoding %s isn't valid\n"), quote (encoding)); return false; } @@ -118,48 +115,44 @@ check_encoding_name (char *encoding) static bool open_locale_to_utf8 (void) { - if (locale2utf8) - return true; - - /* sXXXav : That shouldn't happen, just in case */ - if (!opt.locale) - { - logprintf (LOG_VERBOSE, "open_locale_to_utf8: locale is unset\n"); - opt.locale = find_locale (); - } - - if (!opt.locale) - return false; - locale2utf8 = iconv_open ("UTF-8", opt.locale); - if (locale2utf8 != (iconv_t)(-1)) - return true; - - logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n", - quote (opt.locale), quote("UTF-8")); - locale2utf8 = NULL; - return false; } -/* Return a new string */ +/* Try converting string str from locale to UTF-8. Return a new string + on success, or str on error or if conversion isn't needed. */ const char * locale_to_utf8 (const char *str) { + iconv_t l2u; char *new; - if (!strcasecmp (opt.locale, "utf-8")) - return str; + /* That shouldn't happen, just in case */ + if (!opt.locale) + { + logprintf (LOG_VERBOSE, _("locale_to_utf8: locale is unset\n")); + opt.locale = find_locale (); + } - if (!open_locale_to_utf8 ()) + if (!opt.locale || !strcasecmp (opt.locale, "utf-8")) return str; - if (do_conversion (locale2utf8, (char *) str, strlen ((char *) str), &new)) + l2u = iconv_open ("UTF-8", opt.locale); + if (l2u != (iconv_t)(-1)) + { + logprintf (LOG_VERBOSE, _("Conversion from %s to %s isn't supported\n"), + quote (opt.locale), quote ("UTF-8")); + return str; + } + + if (do_conversion (l2u, (char *) str, strlen ((char *) str), &new)) return (const char *) new; return str; } -/* */ +/* Do the conversion according to the passed conversion descriptor cd. *out + will contain the transcoded string on success. *out content is + unspecified otherwise. */ static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out) { @@ -173,7 +166,6 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out) len = outlen; done = 0; - /* sXXXav : put a maximum looping factor ??? */ for (;;) { if (iconv (cd, &in, &inlen, out, &outlen) != (size_t)(-1)) @@ -186,6 +178,10 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out) /* Incomplete or invalid multibyte sequence */ if (errno == EINVAL || errno == EILSEQ) { + if (!invalid) + logprintf (LOG_VERBOSE, + _("Incomplete or invalid multibyte sequence encountered\n")); + invalid++; **out = *in; in++; @@ -193,7 +189,7 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out) (*out)++; outlen--; } - else if (errno == E2BIG) /* Output buffer full */ + else if (errno == E2BIG) /* Output buffer full */ { char *new; @@ -209,7 +205,7 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out) } else /* Weird, we got an unspecified error */ { - logprintf (LOG_VERBOSE, "Unhandled errno %d\n", errno); + logprintf (LOG_VERBOSE, _("Unhandled errno %d\n"), errno); break; } } @@ -217,18 +213,47 @@ do_conversion (iconv_t cd, char *in, size_t inlen, char **out) return false; } -/* Try to encode UTF-8 host to ASCII. Return the new domain on success or NULL +/* Try to "ASCII encode" UTF-8 host. Return the new domain on success or NULL on error. */ -char *idn_encode (char *host) +char * +idn_encode (struct iri *i, char *host) { char *new; int ret; + /* Encode to UTF-8 if not done */ + if (!i->utf8_encode) + { + if (!remote_to_utf8 (i, (const char *) host, (const char **) &new)) + return NULL; /* Nothing to encode or an error occured */ + host = new; + } + /* toASCII UTF-8 NULL terminated string */ - ret = idna_to_ascii_8z (host, &new, 0); + ret = idna_to_ascii_8z (host, &new, IDNA_FLAGS); + if (ret != IDNA_SUCCESS) + { + /* sXXXav : free new when needed ! */ + logprintf (LOG_VERBOSE, _("idn_encode failed (%d): %s\n"), ret, + quote (idna_strerror (ret))); + return NULL; + } + + return new; +} + +/* Try to decode an "ASCII encoded" host. Return the new domain in the locale + on success or NULL on error. */ +char * +idn_decode (char *host) +{ + char *new; + int ret; + + ret = idna_to_unicode_8zlz (host, &new, IDNA_FLAGS); if (ret != IDNA_SUCCESS) { - logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret, + logprintf (LOG_VERBOSE, _("idn_decode failed (%d): %s\n"), ret, quote (idna_strerror (ret))); return NULL; } @@ -236,3 +261,101 @@ char *idn_encode (char *host) return new; } +/* Try to transcode string str from remote encoding to UTF-8. On success, *new + contains the transcoded string. *new content is unspecified otherwise. */ +bool +remote_to_utf8 (struct iri *i, const char *str, const char **new) +{ + iconv_t cd; + bool ret = false; + + if (!i->uri_encoding) + return false; + + cd = iconv_open ("UTF-8", i->uri_encoding); + if (cd == (iconv_t)(-1)) + return false; + + if (do_conversion (cd, (char *) str, strlen ((char *) str), (char **) new)) + ret = true; + + iconv_close (cd); + + /* Test if something was converted */ + if (!strcmp (str, *new)) + { + xfree ((char *) *new); + return false; + } + + return ret; +} + +/* Allocate a new iri structure and return a pointer to it. */ +struct iri * +iri_new (void) +{ + struct iri *i = xmalloc (sizeof *i); + i->uri_encoding = opt.encoding_remote ? xstrdup (opt.encoding_remote) : NULL; + i->content_encoding = NULL; + i->orig_url = NULL; + i->utf8_encode = opt.enable_iri; + return i; +} + +struct iri *iri_dup (const struct iri *src) +{ + struct iri *i = xmalloc (sizeof *i); + i->uri_encoding = src->uri_encoding ? xstrdup (src->uri_encoding) : NULL; + i->content_encoding = (src->content_encoding ? + xstrdup (src->content_encoding) : NULL); + i->orig_url = src->orig_url ? xstrdup (src->orig_url) : NULL; + i->utf8_encode = src->utf8_encode; + return i; +} + +/* Completely free an iri structure. */ +void +iri_free (struct iri *i) +{ + xfree_null (i->uri_encoding); + xfree_null (i->content_encoding); + xfree_null (i->orig_url); + xfree (i); +} + +/* Set uri_encoding of struct iri i. If a remote encoding was specified, use + it unless force is true. */ +void +set_uri_encoding (struct iri *i, char *charset, bool force) +{ + DEBUGP (("URI encoding = %s\n", charset ? quote (charset) : "None")); + if (!force && opt.encoding_remote) + return; + if (i->uri_encoding) + { + if (charset && !strcasecmp (i->uri_encoding, charset)) + return; + xfree (i->uri_encoding); + } + + i->uri_encoding = charset ? xstrdup (charset) : NULL; +} + +/* Set content_encoding of struct iri i. */ +void +set_content_encoding (struct iri *i, char *charset) +{ + DEBUGP (("URI content encoding = %s\n", charset ? quote (charset) : "None")); + if (opt.encoding_remote) + return; + if (i->content_encoding) + { + if (charset && !strcasecmp (i->content_encoding, charset)) + return; + xfree (i->content_encoding); + } + + i->content_encoding = charset ? xstrdup (charset) : NULL; +} +