/* IRI related functions.
- Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
- 2008 Free Software Foundation, Inc.
+ Copyright (C) 2008 Free Software Foundation, Inc.
This file is part of GNU Wget.
#include "utils.h"
#include "iri.h"
+/* Note: locale encoding is kept in options struct (opt.locale) */
-static iconv_t locale2utf8;
+/* Hold the encoding used for the current fetch.  NULL when unset; the
+   setters in this file (set_remote_charset (), set_remote_as_current ())
+   store an xstrdup'ed copy and xfree the previous one.  */
+char *remote;
+
+/* Hold the encoding for the future found links.  NULL when unset; the
+   setters in this file (set_current_charset (), set_current_as_locale ())
+   store an xstrdup'ed copy and xfree the previous one.  remote_to_utf8 ()
+   falls back on it when opt.encoding_remote is not set.  */
+char *current;
+
+/* Will/Is the current URL encoded in utf8 ?  Written by set_utf8_encode ()
+   and read through get_utf8_encode ().  */
+bool utf8_encode;
+/* Force no utf8 encoding for url_parse (): while this is true,
+   get_utf8_encode () returns false whatever utf8_encode holds.  */
+bool ugly_no_encode;
+
+static iconv_t locale2utf8;
static bool open_locale_to_utf8 (void);
static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out);
return NULL;
}
- logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset));
+ /*logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset));*/
return charset;
}
+/* Return the charset name that stringprep_locale_charset () reports for the
+   current locale.  The result is cast to a plain char *; callers presumably
+   must neither modify nor free it -- TODO confirm against libidn.  */
char *
find_locale (void)
{
- /* sXXXav, made our own function or use libidn one ?! */
return (char *) stringprep_locale_charset ();
}
while (*s)
{
- if (!c_isascii(*s) || c_isspace(*s))
+ if (!c_isascii (*s) || c_isspace (*s))
{
- logprintf (LOG_VERBOSE, "Encoding %s isn't valid\n", quote(encoding));
+ logprintf (LOG_VERBOSE, "Encoding %s isn't valid\n", quote (encoding));
return false;
}
return true;
logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n",
- quote (opt.locale), quote("UTF-8"));
+ quote (opt.locale), quote ("UTF-8"));
locale2utf8 = NULL;
return false;
}
-/* Return a new string */
+/* Try converting string str from locale to UTF-8. Return a new string
+ on success, or str on error or if conversion isn't needed. */
const char *
locale_to_utf8 (const char *str)
{
return str;
}
-/* */
+/* Do the conversion according to the passed conversion descriptor cd. On
+ success, *out contains the transcoded string. The content of *out is
+ unspecified otherwise. */
static bool
do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
{
len = outlen;
done = 0;
- /* sXXXav : put a maximum looping factor ??? */
for (;;)
{
if (iconv (cd, &in, &inlen, out, &outlen) != (size_t)(-1))
/* Incomplete or invalid multibyte sequence */
if (errno == EINVAL || errno == EILSEQ)
{
+ if (!invalid)
+ logprintf (LOG_VERBOSE,
+ "Incomplete or invalide multibyte sequence encountered\n");
+
invalid++;
**out = *in;
in++;
(*out)++;
outlen--;
}
- else if (errno == E2BIG) /* Output buffer full */
+ else if (errno == E2BIG) /* Output buffer full */
{
char *new;
return false;
}
-/* Try to encode UTF-8 host to ASCII. Return the new domain on success or NULL
+/* Try to "ASCII encode" UTF-8 host. Return the new domain on success or NULL
on error. */
-char *idn_encode (char *host)
+char *
+idn_encode (char *host, bool utf8_encoded)
{
char *new;
int ret;
+ /* Encode to UTF-8 if not done using current remote */
+ if (!utf8_encoded)
+ {
+ if (!remote_to_utf8 ((const char *) host, (const char **) &new))
+ {
+ /* Nothing to encode or an error occurred */
+ return NULL;
+ }
+
+ host = new;
+ }
+
/* toASCII UTF-8 NULL terminated string */
ret = idna_to_ascii_8z (host, &new, 0);
if (ret != IDNA_SUCCESS)
{
+ /* sXXXav : free new when needed ! */
logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", ret,
quote (idna_strerror (ret)));
return NULL;
return new;
}
+/* Try to turn an "ASCII encoded" host back into its readable form, expressed
+   in the locale's encoding.  On success the string produced by
+   idna_to_unicode_8zlz () is returned (allocated by libidn; presumably the
+   caller has to release it -- TODO confirm).  On failure the libidn error is
+   logged at LOG_VERBOSE level and NULL is returned.  */
+char *
+idn_decode (char *host)
+{
+  char *decoded;
+  int status = idna_to_unicode_8zlz (host, &decoded, 0);
+
+  if (status == IDNA_SUCCESS)
+    return decoded;
+
+  logprintf (LOG_VERBOSE, "idn_decode failed (%d): %s\n", status,
+             quote (idna_strerror (status)));
+  return NULL;
+}
+
+/* Try to transcode string str from the remote encoding to UTF-8.  The source
+   charset is opt.encoding_remote when set, otherwise the current charset;
+   when neither is available nothing is attempted.
+
+   Return true only when the conversion succeeded AND produced something
+   different from str; *new then holds the transcoded string.  Return false
+   in every other case; *new must not be used then.  */
+bool
+remote_to_utf8 (const char *str, const char **new)
+{
+  const char *charset;
+  iconv_t cd;
+  bool converted;
+
+  if (opt.encoding_remote)
+    charset = opt.encoding_remote;
+  else if (current)
+    charset = current;
+  else
+    return false;
+
+  cd = iconv_open ("UTF-8", charset);
+  if (cd == (iconv_t)(-1))
+    return false;
+
+  converted = do_conversion (cd, (char *) str, strlen (str), (char **) new);
+  iconv_close (cd);
+
+  /* do_conversion () leaves *new unspecified on failure, so it must be
+     neither compared nor freed in that case.  */
+  if (!converted)
+    return false;
+
+  /* Test if something was converted */
+  if (!strcmp (str, *new))
+    {
+      xfree ((char *) *new);
+      /* Do not hand a dangling pointer back to the caller.  */
+      *new = NULL;
+      return false;
+    }
+
+  return true;
+}
+
+/* Accessor for the remote charset, i.e. the encoding recorded for the
+   current fetch.  Hands back the stored pointer itself (not a copy); it is
+   NULL when no remote charset is set.  */
+char *
+get_remote_charset (void)
+{
+  char *charset = remote;
+
+  return charset;
+}
+
+/* Accessor for the current charset, i.e. the encoding recorded for links
+   found later on.  Hands back the stored pointer itself (not a copy); it is
+   NULL when no current charset is set.  */
+char *
+get_current_charset (void)
+{
+  char *charset = current;
+
+  return charset;
+}
+
+/* Record charset as the current charset (the encoding for the future found
+   links).  A private copy is stored, so the caller keeps ownership of its
+   argument; passing NULL clears the setting.  */
+void
+set_current_charset (char *charset)
+{
+  /* Duplicate before releasing the old value: charset may be the very string
+     held in current (e.g. obtained from get_current_charset ()), and copying
+     it after the xfree would read freed memory.  */
+  char *copy = charset ? xstrdup (charset) : NULL;
+
+  if (current)
+    xfree (current);
+
+  current = copy;
+}
+
+/* Make the locale encoding (opt.locale) the current charset.  A private copy
+   is stored.  When opt.locale is not set, the current charset is cleared
+   rather than handing a NULL pointer to xstrdup ().  */
+void
+set_current_as_locale (void)
+{
+  char *copy = opt.locale ? xstrdup (opt.locale) : NULL;
+
+  if (current)
+    xfree (current);
+
+  current = copy;
+}
+
+/* Record charset as the remote charset (the encoding used for the current
+   fetch).  A private copy is stored, so the caller keeps ownership of its
+   argument; passing NULL clears the setting.  */
+void
+set_remote_charset (char *charset)
+{
+  /* Duplicate before releasing the old value: charset may be the very string
+     held in remote (e.g. obtained from get_remote_charset ()), and copying
+     it after the xfree would read freed memory.  */
+  char *copy = charset ? xstrdup (charset) : NULL;
+
+  if (remote)
+    xfree (remote);
+
+  remote = copy;
+}
+
+/* Make the remote charset a fresh copy of the current charset, releasing
+   whatever remote charset was stored before.  With no current charset set,
+   the remote charset ends up unset (NULL) as well.  */
+void
+set_remote_as_current (void)
+{
+  if (remote)
+    xfree (remote);
+
+  if (current)
+    remote = xstrdup (current);
+  else
+    remote = NULL;
+}
+
+/* Put the utf8_encode flag back to its configured value, opt.enable_iri.  */
+void
+reset_utf8_encode (void)
+{
+  bool configured = opt.enable_iri;
+
+  set_utf8_encode (configured);
+}
+
+/* Store encode in the utf8_encode flag.  Note that get_utf8_encode () still
+   reports false while ugly_no_encode is set.  */
+void
+set_utf8_encode (bool encode)
+{
+  utf8_encode = encode;
+}
+
+/* Tell whether UTF-8 encoding is in effect: true only when the utf8_encode
+   flag is set and it is not being overridden by ugly_no_encode.  */
+bool
+get_utf8_encode (void)
+{
+  /* The "ugly" override wins over the regular flag.  */
+  if (ugly_no_encode)
+    return false;
+
+  return utf8_encode;
+}
+
+/* Switch the ugly_no_encode override on or off.  While it is on,
+   get_utf8_encode () returns false regardless of the utf8_encode flag.  */
+void
+set_ugly_no_encode (bool ugly)
+{
+  ugly_no_encode = ugly;
+}
+