1 /* IRI related functions.
2 Copyright (C) 2008 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget. If not, see <http://www.gnu.org/licenses/>.
19 Additional permission under GNU GPL version 3 section 7
21 If you modify this program, or any covered work, by linking or
22 combining it with the OpenSSL project's OpenSSL library (or a
23 modified version of that library), containing parts covered by the
24 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
25 grants you additional permission to convey the resulting work.
26 Corresponding Source for a non-source form of such a combination
27 shall include the source code for the parts of OpenSSL used as well
28 as that of the covered work. */
37 #include <stringprep.h>
44 /* Note: locale encoding is kept in options struct (opt.locale) */
46 /* Hold the encoding used for the current fetch */
49 /* Hold the encoding for the future found links */
52 /* Will/Is the current URL encoded in utf8 ? */
55 /* Force no utf8 encoding for url_parse () */
/* Conversion descriptor for locale -> UTF-8.  It is assigned in
   open_locale_to_utf8 () from iconv_open ("UTF-8", opt.locale) and passed
   to do_conversion () by locale_to_utf8 (). */
58 static iconv_t locale2utf8;
/* Forward declarations of the file-local helpers used further down. */
60 static bool open_locale_to_utf8 (void);
61 static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out);
64 /* Given a string containing "charset=XXX", return the encoding if found. */
67 parse_charset (char *str)
74 str = strcasestr (str, "charset=");
81 /* sXXXav: which chars should be banned ??? */
82 while (*charset && !c_isspace (*charset))
85 /* sXXXav: could strdupdelim return NULL ? */
86 charset = strdupdelim (str, charset);
88 /* Do a minimum check on the charset value */
89 if (!check_encoding_name (charset))
95 /*logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset));*/
100 /* Find the locale used, or fall back on a default value */
/* The charset name comes from stringprep_locale_charset () and is cast to
   char * before being returned.
   NOTE(review): the function header is missing from this extract; the name
   find_locale is taken from the call in open_locale_to_utf8 () -- confirm. */
104 return (char *) stringprep_locale_charset ();
107 /* Basic check of an encoding name. */
/* The visible test flags any character of ENCODING that is not ASCII or that
   is whitespace, and reports the whole name at LOG_VERBOSE.
   NOTE(review): the loop header and the return statements are not visible in
   this extract; presumably the function returns false on the first bad
   character and true otherwise -- confirm. */
109 check_encoding_name (char *encoding)
115 if (!c_isascii (*s) || c_isspace (*s))
117 logprintf (LOG_VERBOSE, "Encoding %s isn't valid\n", quote (encoding));
127 /* Try opening an iconv_t descriptor for conversion from locale to UTF-8 */
/* The descriptor is stored in the file-scope variable locale2utf8.
   NOTE(review): the guards and return statements are missing from this
   extract; the bool result declared in the prototype presumably reports
   whether iconv_open succeeded -- confirm. */
129 open_locale_to_utf8 (void)
134 /* sXXXav : That shouldn't happen, just in case */
137 logprintf (LOG_VERBOSE, "open_locale_to_utf8: locale is unset\n");
/* Fill in opt.locale from find_locale () so iconv_open gets a source charset. */
138 opt.locale = find_locale ();
/* iconv_open takes the target encoding first and the source encoding second. */
144 locale2utf8 = iconv_open ("UTF-8", opt.locale);
/* (iconv_t)(-1) is the failure value of iconv_open. */
145 if (locale2utf8 != (iconv_t)(-1))
/* The message below reports an unsupported conversion.  NOTE(review): the
   statement guarded by the test above is not visible here -- confirm that
   this log is only reached on failure. */
148 logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n",
149 quote (opt.locale), quote ("UTF-8"));
154 /* Try converting string str from locale to UTF-8. Return a new string
155 on success, or str on error or if conversion isn't needed. */
157 locale_to_utf8 (const char *str)
/* A locale that already names UTF-8 (compared case-insensitively) needs no
   conversion. */
161 if (!strcasecmp (opt.locale, "utf-8"))
/* Make sure the locale2utf8 descriptor has been opened before converting. */
164 if (!open_locale_to_utf8 ())
/* do_conversion takes non-const pointers, hence the casts on str; when it
   reports success the transcoded string is in new. */
167 if (do_conversion (locale2utf8, (char *) str, strlen ((char *) str), &new))
168 return (const char *) new;
173 /* Do the conversion according to the passed conversion descriptor cd. *out
174 will contain the transcoded string on success. *out content is
175 unspecified otherwise. */
/* IN and INLEN describe the input bytes.  The working output buffer s is
   obtained with xmalloc.
   NOTE(review): several lines of this function (loop header, pointer
   bookkeeping, return statements) are missing from this extract; the
   comments below describe only the statements that are visible. */
177 do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
179 /* sXXXav : hummm hard to guess... */
/* First guess for the output size: twice the input length. */
180 size_t len, done, outlen = inlen * 2;
181 int invalid = 0, tooshort = 0;
/* One extra byte is reserved for the terminating NUL written below. */
184 s = xmalloc (outlen + 1);
/* iconv returns (size_t)(-1) on failure and sets errno. */
191 if (iconv (cd, &in, &inlen, out, &outlen) != (size_t)(-1))
/* NUL-terminate the result.  NOTE(review): the offset relies on the
   len/done/outlen bookkeeping, part of which is not visible here -- verify
   that it is still correct after the buffer has been regrown. */
194 *(s + len - outlen - done) = '\0';
198 /* Incomplete or invalid multibyte sequence */
199 if (errno == EINVAL || errno == EILSEQ)
/* NOTE(review): the word invalide in the message below looks like a typo for
   invalid; it is a runtime string and is left untouched here. */
202 logprintf (LOG_VERBOSE,
203 "Incomplete or invalide multibyte sequence encountered\n");
212 else if (errno == E2BIG) /* Output buffer full */
/* Regrow: the new size is done plus twice the input still pending; the first
   done bytes of the old buffer are copied into the new one. */
218 outlen = done + inlen * 2;
219 new = xmalloc (outlen + 1);
220 memcpy (new, s, done);
226 else /* Weird, we got an unspecified error */
228 logprintf (LOG_VERBOSE, "Unhandled errno %d\n", errno);
236 /* Try to "ASCII encode" UTF-8 host. Return the new domain on success or NULL on error. */
239 idn_encode (char *host, bool utf8_encoded)
244 /* Encode to UTF-8 if not done using current remote */
247 if (!remote_to_utf8 ((const char *) host, (const char **) &new))
249 /* Nothing to encode or an error occurred */
256 /* toASCII UTF-8 NULL terminated string */
257 ret = idna_to_ascii_8z (host, &new, 0);
258 if (ret != IDNA_SUCCESS)
260 /* sXXXav : free new when needed ! */
261 logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret,
262 quote (idna_strerror (ret)));
269 /* Try to decode an "ASCII encoded" host. Return the new domain in the locale
270 on success or NULL on error. */
272 idn_decode (char *host)
/* The decoded name is delivered through &new; the last argument (flags) is 0. */
277 ret = idna_to_unicode_8zlz (host, &new, 0);
278 if (ret != IDNA_SUCCESS)
/* Report the numeric return code together with its idna_strerror text. */
280 logprintf (LOG_VERBOSE, "idn_decode failed (%d): %s\n", ret,
281 quote (idna_strerror (ret)));
288 /* Try to transcode string str from remote encoding to UTF-8. On success, *new
289 contains the transcoded string. *new content is unspecified otherwise. */
/* NOTE(review): the fallback used when opt.encoding_remote is unset, the
   return statements and any iconv_close call are not visible in this
   extract -- confirm that cd is closed on every path. */
291 remote_to_utf8 (const char *str, const char **new)
/* An explicitly configured remote encoding is used as the source charset r. */
297 if (opt.encoding_remote)
298 r = opt.encoding_remote;
/* Open a descriptor converting from r to UTF-8; (iconv_t)(-1) means the
   conversion is not available. */
304 cd = iconv_open ("UTF-8", r);
305 if (cd == (iconv_t)(-1))
/* do_conversion takes non-const pointers, hence the casts. */
308 if (do_conversion (cd, (char *) str, strlen ((char *) str), (char **) new))
313 /* Test if something was converted */
/* When the output is byte-for-byte equal to the input, the freshly allocated
   copy is released again. */
314 if (!strcmp (str, *new))
316 xfree ((char *) *new);
/* Accessor for the remote charset.
   NOTE(review): the body is missing from this extract; presumably it returns
   the file-scope variable remote assigned by set_remote_charset () --
   confirm. */
323 char *get_remote_charset (void)
/* Accessor for the current charset.
   NOTE(review): the body is missing from this extract; presumably it returns
   the file-scope variable current assigned by set_current_charset () --
   confirm. */
328 char *get_current_charset (void)
/* Store in current an xstrdup copy of CHARSET, or NULL when CHARSET is NULL.
   The caller keeps ownership of CHARSET.
   NOTE(review): the release of the previous value of current is not visible
   in this extract -- confirm it is freed before being overwritten. */
333 void set_current_charset (char *charset)
335 /*printf("[ current = `%s'\n", charset);*/
339 current = charset ? xstrdup (charset) : NULL;
/* Store in current an xstrdup copy of opt.locale.  Unlike the other setters
   visible here, opt.locale is not tested against NULL before being copied.
   NOTE(review): the release of the previous value of current is not visible
   in this extract -- confirm. */
342 void set_current_as_locale (void)
344 /*printf("[ current = locale = `%s'\n", opt.locale);*/
348 /* sXXXav : assert opt.locale NULL ? */
349 current = xstrdup (opt.locale);
/* Store in remote an xstrdup copy of CHARSET, or NULL when CHARSET is NULL.
   The caller keeps ownership of CHARSET.
   NOTE(review): the release of the previous value of remote is not visible
   in this extract -- confirm it is freed before being overwritten. */
353 set_remote_charset (char *charset)
355 /*printf("[ remote = `%s'\n", charset);*/
359 remote = charset ? xstrdup (charset) : NULL;
/* Store in remote an xstrdup copy of current, or NULL when current is NULL,
   so the two variables never share the same allocation.
   NOTE(review): the release of the previous value of remote is not visible
   in this extract -- confirm. */
363 set_remote_as_current (void)
365 /*printf("[ remote = current = `%s'\n", current);*/
369 remote = current ? xstrdup (current) : NULL;
/* Reset the UTF-8 encode flag to the value of the opt.enable_iri option. */
372 void reset_utf8_encode (void)
374 set_utf8_encode (opt.enable_iri);
/* Set the utf8_encode flag to ENCODE.  The value reported by
   get_utf8_encode () additionally depends on ugly_no_encode. */
377 void set_utf8_encode (bool encode)
379 utf8_encode = encode;
/* Return true only when utf8_encode is set and ugly_no_encode is not; a set
   ugly_no_encode therefore overrides utf8_encode. */
382 bool get_utf8_encode (void)
384 return (!ugly_no_encode && utf8_encode);
/* Set the ugly_no_encode override to UGLY.  While it is true,
   get_utf8_encode () returns false regardless of utf8_encode. */
387 void set_ugly_no_encode (bool ugly)
389 ugly_no_encode = ugly;