1 /* IRI related functions.
2 Copyright (C) 2008 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget. If not, see <http://www.gnu.org/licenses/>.
19 Additional permission under GNU GPL version 3 section 7
21 If you modify this program, or any covered work, by linking or
22 combining it with the OpenSSL project's OpenSSL library (or a
23 modified version of that library), containing parts covered by the
24 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
25 grants you additional permission to convey the resulting work.
26 Corresponding Source for a non-source form of such a combination
27 shall include the source code for the parts of OpenSSL used as well
28 as that of the covered work. */
37 #include <stringprep.h>
44 /* RFC3987 section 3.1 mandates STD3 ASCII RULES */
45 #define IDNA_FLAGS IDNA_USE_STD3_ASCII_RULES
47 /* Note: locale encoding is kept in options struct (opt.locale) */
49 /* Hold the encoding used for the current fetch */
52 /* Hold the encoding for the future found links */
55 /* Will/Is the current URL encoded in utf8 ? */
58 /* Force no utf8 encoding for url_parse () */
61 static iconv_t locale2utf8;
63 static bool open_locale_to_utf8 (void);
64 static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out);
67 /* Given a string containing "charset=XXX", return the encoding if found,
70 parse_charset (char *str)
77 str = strcasestr (str, "charset=");
84 /* sXXXav: which chars should be banned ??? */
85 while (*charset && !c_isspace (*charset))
88 /* sXXXav: could strdupdelim return NULL ? */
89 charset = strdupdelim (str, charset);
91 /* Do a minimum check on the charset value */
92 if (!check_encoding_name (charset))
98 /*logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset));*/
103 /* Find the locale used, or fall back on a default value */
107 return (char *) stringprep_locale_charset ();
110 /* Basic check of an encoding name. */
112 check_encoding_name (char *encoding)
118 if (!c_isascii (*s) || c_isspace (*s))
120 logprintf (LOG_VERBOSE, "Encoding %s isn't valid\n", quote (encoding));
130 /* Try opening an iconv_t descriptor for conversion from locale to UTF-8 */
132 open_locale_to_utf8 (void)
137 /* sXXXav : That shouldn't happen, just in case */
140 logprintf (LOG_VERBOSE, "open_locale_to_utf8: locale is unset\n");
141 opt.locale = find_locale ();
147 locale2utf8 = iconv_open ("UTF-8", opt.locale);
148 if (locale2utf8 != (iconv_t)(-1))
151 logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n",
152 quote (opt.locale), quote ("UTF-8"));
157 /* Try converting string str from locale to UTF-8. Return a new string
158 on success, or str on error or if conversion isn't needed. */
160 locale_to_utf8 (const char *str)
164 if (!strcasecmp (opt.locale, "utf-8"))
167 if (!open_locale_to_utf8 ())
170 if (do_conversion (locale2utf8, (char *) str, strlen ((char *) str), &new))
171 return (const char *) new;
176 /* Do the conversion according to the passed conversion descriptor cd. *out
177 will contain the transcoded string on success. *out content is
178 unspecified otherwise. */
180 do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
182 /* sXXXav : initial output size is a guess (twice the input); grown on E2BIG */
183 size_t len, done, outlen = inlen * 2;
184 int invalid = 0, tooshort = 0;
187 s = xmalloc (outlen + 1);
194 if (iconv (cd, &in, &inlen, out, &outlen) != (size_t)(-1))
197 *(s + len - outlen - done) = '\0';
201 /* Incomplete or invalid multibyte sequence */
202 if (errno == EINVAL || errno == EILSEQ)
205 logprintf (LOG_VERBOSE,
206 "Incomplete or invalide multibyte sequence encountered\n");
215 else if (errno == E2BIG) /* Output buffer full */
221 outlen = done + inlen * 2;
222 new = xmalloc (outlen + 1);
223 memcpy (new, s, done);
229 else /* Weird, we got an unspecified error */
231 logprintf (LOG_VERBOSE, "Unhandled errno %d\n", errno);
239 /* Try to "ASCII encode" UTF-8 host. Return the new domain on success or NULL
242 idn_encode (char *host, bool utf8_encoded)
247 /* Encode to UTF-8 using the current remote encoding, if not already done */
250 if (!remote_to_utf8 ((const char *) host, (const char **) &new))
252 /* Nothing to encode or an error occurred */
259 /* toASCII UTF-8 NUL-terminated string */
260 ret = idna_to_ascii_8z (host, &new, IDNA_FLAGS);
261 if (ret != IDNA_SUCCESS)
263 /* sXXXav : free new when needed ! */
264 logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret,
265 quote (idna_strerror (ret)));
272 /* Try to decode an "ASCII encoded" host. Return the new domain in the locale
273 on success or NULL on error. */
275 idn_decode (char *host)
280 ret = idna_to_unicode_8zlz (host, &new, IDNA_FLAGS);
281 if (ret != IDNA_SUCCESS)
283 logprintf (LOG_VERBOSE, "idn_decode failed (%d): %s\n", ret,
284 quote (idna_strerror (ret)));
291 /* Try to transcode string str from remote encoding to UTF-8. On success, *new
292 contains the transcoded string. *new content is unspecified otherwise. */
294 remote_to_utf8 (const char *str, const char **new)
300 if (opt.encoding_remote)
301 r = opt.encoding_remote;
307 cd = iconv_open ("UTF-8", r);
308 if (cd == (iconv_t)(-1))
311 if (do_conversion (cd, (char *) str, strlen ((char *) str), (char **) new))
316 /* Test if something was converted */
317 if (!strcmp (str, *new))
319 xfree ((char *) *new);
326 char *get_remote_charset (void)
331 char *get_current_charset (void)
336 void set_current_charset (char *charset)
338 /*printf("[ current = `%s'\n", charset);*/
341 /* Do nothing if already equal */
342 if (!strcasecmp (current, charset))
347 current = charset ? xstrdup (charset) : NULL;
350 void set_current_as_locale (void)
352 /* sXXXav : assert opt.locale NULL ? */
353 /*printf("[ current = locale = `%s'\n", opt.locale);*/
356 if (!strcasecmp (current, opt.locale))
361 current = xstrdup (opt.locale);
365 set_remote_charset (char *charset)
367 /*printf("[ remote = `%s'\n", charset);*/
370 /* Do nothing if already equal */
371 if (!strcasecmp (remote, charset))
375 remote = charset ? xstrdup (charset) : NULL;
379 set_remote_as_current (void)
381 /*printf("[ remote = current = `%s'\n", current);*/
384 /* Do nothing if already equal */
385 if (current && !strcasecmp (remote, current))
390 remote = current ? xstrdup (current) : NULL;
393 void reset_utf8_encode (void)
395 set_utf8_encode (opt.enable_iri);
398 void set_utf8_encode (bool encode)
400 utf8_encode = encode;
403 bool get_utf8_encode (void)
405 return (!ugly_no_encode && utf8_encode);
408 void set_ugly_no_encode (bool ugly)
410 ugly_no_encode = ugly;