1 /* IRI related functions.
2 Copyright (C) 2008 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget. If not, see <http://www.gnu.org/licenses/>.
19 Additional permission under GNU GPL version 3 section 7
21 If you modify this program, or any covered work, by linking or
22 combining it with the OpenSSL project's OpenSSL library (or a
23 modified version of that library), containing parts covered by the
24 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
25 grants you additional permission to convey the resulting work.
26 Corresponding Source for a non-source form of such a combination
27 shall include the source code for the parts of OpenSSL used as well
28 as that of the covered work. */
37 #include <stringprep.h>
44 /* RFC3987 section 3.1 mandates STD3 ASCII RULES */
45 #define IDNA_FLAGS IDNA_USE_STD3_ASCII_RULES
47 /* Note: locale encoding is kept in options struct (opt.locale) */
49 static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out);
52 /* Given a string containing "charset=XXX", return the encoding if found. */
55 parse_charset (char *str)
62 str = strcasestr (str, "charset=");
69 /* sXXXav: which chars should be banned ??? */
70 while (*charset && !c_isspace (*charset))
73 /* sXXXav: could strdupdelim return NULL ? */
74 charset = strdupdelim (str, charset);
76 /* Do a minimum check on the charset value */
77 if (!check_encoding_name (charset))
83 /*logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset));*/
88 /* Find the locale used, or fall back on a default value */
92 return (char *) stringprep_locale_charset ();
95 /* Basic check of an encoding name. */
97 check_encoding_name (char *encoding)
103 if (!c_isascii (*s) || c_isspace (*s))
105 logprintf (LOG_VERBOSE, "Encoding %s isn't valid\n", quote (encoding));
115 /* Try opening an iconv_t descriptor for conversion from locale to UTF-8 */
117 open_locale_to_utf8 (void)
122 /* Try converting string str from locale to UTF-8. Return a new string
123 on success, or str on error or if conversion isn't needed. */
125 locale_to_utf8 (const char *str)
130 /* That shouldn't happen, just in case */
133 logprintf (LOG_VERBOSE, "open_locale_to_utf8: locale is unset\n");
134 opt.locale = find_locale ();
137 if (!opt.locale || !strcasecmp (opt.locale, "utf-8"))
140 l2u = iconv_open ("UTF-8", opt.locale);
141 if (l2u == (iconv_t)(-1)) /* iconv_open returns (iconv_t)(-1) on failure; the test was inverted */
143 logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n",
144 quote (opt.locale), quote ("UTF-8"));
148 if (do_conversion (l2u, (char *) str, strlen ((char *) str), &new))
149 return (const char *) new;
154 /* Do the conversion according to the passed conversion descriptor cd. *out
155 will contain the transcoded string on success. *out content is
156 unspecified otherwise. */
158 do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
160 /* sXXXav : hummm hard to guess... */
161 size_t len, done, outlen = inlen * 2;
162 int invalid = 0, tooshort = 0;
165 s = xmalloc (outlen + 1);
172 if (iconv (cd, &in, &inlen, out, &outlen) != (size_t)(-1))
175 *(s + len - outlen - done) = '\0';
179 /* Incomplete or invalid multibyte sequence */
180 if (errno == EINVAL || errno == EILSEQ)
183 logprintf (LOG_VERBOSE,
184 "Incomplete or invalide multibyte sequence encountered\n");
193 else if (errno == E2BIG) /* Output buffer full */
199 outlen = done + inlen * 2;
200 new = xmalloc (outlen + 1);
201 memcpy (new, s, done);
207 else /* Weird, we got an unspecified error */
209 logprintf (LOG_VERBOSE, "Unhandled errno %d\n", errno);
217 /* Try to "ASCII encode" UTF-8 host. Return the new domain on success or NULL on error. */
220 idn_encode (struct iri *i, char *host)
225 /* Encode to UTF-8 if not done */
228 if (!remote_to_utf8 (i, (const char *) host, (const char **) &new))
229 return NULL; /* Nothing to encode or an error occurred */
233 /* toASCII UTF-8 NULL terminated string */
234 ret = idna_to_ascii_8z (host, &new, IDNA_FLAGS);
235 if (ret != IDNA_SUCCESS)
237 /* sXXXav : free new when needed ! */
238 logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret,
239 quote (idna_strerror (ret)));
246 /* Try to decode an "ASCII encoded" host. Return the new domain in the locale
247 on success or NULL on error. */
249 idn_decode (char *host)
254 ret = idna_to_unicode_8zlz (host, &new, IDNA_FLAGS);
255 if (ret != IDNA_SUCCESS)
257 logprintf (LOG_VERBOSE, "idn_decode failed (%d): %s\n", ret,
258 quote (idna_strerror (ret)));
265 /* Try to transcode string str from remote encoding to UTF-8. On success, *new
266 contains the transcoded string. *new content is unspecified otherwise. */
268 remote_to_utf8 (struct iri *i, const char *str, const char **new)
273 if (!i->uri_encoding)
276 cd = iconv_open ("UTF-8", i->uri_encoding);
277 if (cd == (iconv_t)(-1))
280 if (do_conversion (cd, (char *) str, strlen ((char *) str), (char **) new))
285 /* Test if something was converted */
286 if (!strcmp (str, *new))
288 xfree ((char *) *new);
295 /* Allocate a new iri structure and return a pointer to it. */
299 struct iri *i = xmalloc (sizeof (struct iri));
300 i->uri_encoding = opt.encoding_remote ? xstrdup (opt.encoding_remote) : NULL;
301 i->content_encoding = NULL;
302 i->utf8_encode = opt.enable_iri;
306 /* Completely free an iri structure. */
308 iri_free (struct iri *i)
310 xfree_null (i->uri_encoding);
311 xfree_null (i->content_encoding);
315 /* Set uri_encoding of struct iri i. If a remote encoding was specified, use
316 it unless force is true. */
318 set_uri_encoding (struct iri *i, char *charset, bool force)
320 DEBUGP (("URI encoding = `%s'\n", charset ? quote (charset) : "None"));
321 if (!force && opt.encoding_remote)
325 if (charset && !strcasecmp (i->uri_encoding, charset))
327 xfree (i->uri_encoding);
330 i->uri_encoding = charset ? xstrdup (charset) : NULL;
333 /* Set content_encoding of struct iri i. */
335 set_content_encoding (struct iri *i, char *charset)
337 DEBUGP (("URI content encoding = %s\n", charset ? quote (charset) : "None"));
338 if (opt.encoding_remote)
340 if (i->content_encoding)
342 if (charset && !strcasecmp (i->content_encoding, charset))
344 xfree (i->content_encoding);
347 i->content_encoding = charset ? xstrdup (charset) : NULL;