+2008-06-14 Xavier Saint <wget@sxav.eu>
+
+ * configure.ac: Add support for IRIs
+
2008-05-29 Micah Cowan <micah@cowan.name>
* po/*.po: Updated from TP (the 1.11.3 set).
fi
AC_SUBST(COMMENT_IF_NO_POD2MAN)
+
+dnl
+dnl Check for IDN/IRIs
+dnl
+
+dnl --enable-iri / --disable-iri.  Afterwards $iri is "no" when IRIs were
+dnl disabled by the user and "yes" otherwise; $force_iri is "yes" only when
+dnl the user explicitly asked for IRI support.
+AC_ARG_ENABLE(iri,
+  AC_HELP_STRING([--disable-iri],[disable IDN/IRIs support]),
+  [case "${enable_iri}" in
+    no)
+      dnl Disable IRIs checking
+      AC_MSG_NOTICE([disabling IRIs at user request])
+      iri=no
+      ;;
+    yes)
+      dnl IRIs explicitly enabled: a missing Libidn is then a hard error
+      iri=yes
+      force_iri=yes
+      ;;
+    auto)
+      dnl Auto-detect IRI
+      iri=yes
+      ;;
+    *)
+      AC_MSG_ERROR([Invalid --enable-iri argument \`$enable_iri'])
+      ;;
+  esac
+  ], [
+    dnl If nothing is specified, assume auto-detection
+    iri=yes
+  ]
+)
+
+AC_ARG_WITH(libidn, AC_HELP_STRING([--with-libidn=[DIR]],
+                                   [Support IDN/IRIs (needs GNU Libidn)]),
+                                   libidn=$withval, libidn="")
+if test "X$iri" != "Xno"; then
+  dnl A bare --with-libidn or --without-libidn leaves "yes" or "no" in
+  dnl $withval.  Neither is an installation prefix, so only a real
+  dnl directory argument extends the search paths.
+  if test "X$libidn" != "X" && test "X$libidn" != "Xyes" && test "X$libidn" != "Xno"; then
+    LDFLAGS="${LDFLAGS} -L$libidn/lib"
+    CPPFLAGS="${CPPFLAGS} -I$libidn/include"
+  fi
+  AC_CHECK_HEADER(idna.h,
+    AC_CHECK_LIB(idn, stringprep_check_version,
+      [iri=yes LIBS="${LIBS} -lidn"], iri=no),
+    iri=no)
+
+  if test "X$iri" != "Xno" ; then
+    AC_DEFINE(ENABLE_IRI, 1, [Define if IRI support is enabled.])
+    AC_MSG_NOTICE([Enabling support for IRI.])
+  elif test "X$force_iri" = "Xyes" ; then
+    dnl The user insisted on IRI support: do not silently build without it.
+    AC_MSG_ERROR([--enable-iri was given but Libidn could not be found])
+  else
+    AC_MSG_WARN([Libidn not found])
+  fi
+fi
+
+
+dnl Needed by src/Makefile.am
+AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])
+
+
dnl
dnl Create output
dnl
+2008-06-26 Xavier Saint <wget@sxav.eu>
+
+ * iri.c, iri.h : New functions locale_to_utf8() and
+ idn_encode() adding basic capabilities of IRI/IDN.
+
+ * url.c : Convert URLs from locale to UTF-8 allowing a basic
+ support of IRI/IDN
+
2008-06-24 Steven Schubiger <stsc@members.fsf.org>
* http.c (http_loop): Replace escnonprint() occurence with
* http.c: Make -nv --spider include the file's name when it
exists.
-
+
2008-06-22 Micah Cowan <micah@cowan.name>
* Makefile.am (version.c): Fixed version string invocation so it
string vars pointers-to-const, and moved line lengths
below 80 (in Makefile.am, not in version.c).
+2008-06-19 Xavier Saint <wget@sxav.eu>
+
+ * iri.c, iri.h : New function check_encoding_name() as
+ a preliminary encoding name check.
+
+ * main.c, iri.c : Make use of check_encoding_name().
+
+2008-06-19 Xavier Saint <wget@sxav.eu>
+
+ * iri.c : Include missing stringprep.h file and add a
+ cast.
+
+ * init.c : set a default initial value for opt.enable_iri,
+ opt.locale and opt.encoding_remote.
+
+2008-06-19 Xavier Saint <wget@sxav.eu>
+
+ * iri.c, iri.h : Add a new function find_locale() to find
+ out the local system encoding.
+
+ * main.c : Make use of find_locale().
+
+2008-06-19 Xavier Saint <wget@sxav.eu>
+
+ * html-url.c : Add "content-type" meta tag parsing for
+ retrieving page encoding.
+
+ * iri.h : Make no-op version of parse_charset() return
+ NULL.
+
2008-06-16 Micah Cowan <micah@cowan.name>
* http.c (http_loop): When hstat.len is higher than the
successfully completed content's length, but it's because we
_set_ it that way, don't abort.
+2008-06-14 Xavier Saint <wget@sxav.eu>
+
+ * iri.c, iri.h : New files.
+
+ * Makefile.am : Add files iri.h and conditional iri.c.
+
+ * build_info.c : Add compiled feature "iri".
+
+ * http.c : include iri.h and parse charset from Content-Type
+ header.
+
+ * init.c, main.c, options.h : if an option isn't supported
+ at compile time, don't get rid of it; show a dummy
+ message instead if it is used.
+
2008-06-13 Micah Cowan <micah@cowan.name>
* build_info.c: ENABLE_NTLM, not HAVE_NTLM; distinguish OpenSSL
default.
2008-05-17 Kenny Parnell <k.parnell@gmail.com>
-
+
(cmd_spec_prefer_family): Initialize prefer_family to prefer_none.
2008-05-17 Micah Cowan <micah@cowan.name>
-
+
* main.c (main): Handle Ctrl-D on command-line.
2008-05-15 Steven Schubiger <schubiger@gmail.com>
* options.h: Add an according boolean member to the options
struct.
-
+
* sysdep.h: Comment the defines __EXTENSIONS__ and _GNU_SOURCE
out, because they're now defined independently by config.h.
# Version: @VERSION@
#
+if IRI_IS_ENABLED
+IRI_OBJ = iri.c
+endif
+
# The following line is losing on some versions of make!
DEFS = @DEFS@ -DSYSTEM_WGETRC=\"$(sysconfdir)/wgetrc\" -DLOCALEDIR=\"$(localedir)\"
LIBS = @LIBSSL@ @LIBGNUTLS@ @LIBINTL@ @LIBS@
ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \
http.c init.c log.c main.c netrc.c progress.c ptimer.c \
recur.c res.c retr.c snprintf.c spider.c url.c \
- utils.c \
+ utils.c $(IRI_OBJ) \
connect.h convert.h cookies.h \
ftp.h gen-md5.h hash.h host.h html-parse.h \
- http.h http-ntlm.h init.h log.h mswindows.h netrc.h \
+ http.h http-ntlm.h init.h iri.h log.h mswindows.h netrc.h \
options.h progress.h ptimer.h recur.h res.h retr.h \
spider.h ssl.h sysdep.h url.h utils.h wget.h
nodist_wget_SOURCES = version.c
#else
"-gettext",
#endif
+
+#ifdef ENABLE_IRI
+ "+iri",
+#else
+ "-iri",
+#endif
+
/* sentinel value */
NULL
};
#include "hash.h"
#include "convert.h"
#include "recur.h" /* declaration of get_urls_html */
+#include "iri.h"
struct map_context;
entry->link_expect_html = 1;
}
}
+ else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
+ {
+ /* Handle stuff like:
+ <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
+
+ char *mcharset;
+ char *content = find_attr (tag, "content", NULL);
+ if (!content)
+ return;
+
+ mcharset = parse_charset (content);
+ if (!mcharset)
+ return;
+
+ logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));
+
+ /* sXXXav: Not used yet */
+ xfree (mcharset);
+ }
else if (name && 0 == strcasecmp (name, "robots"))
{
/* Handle stuff like:
#include "retr.h"
#include "connect.h"
#include "netrc.h"
+#include "iri.h"
#ifdef HAVE_SSL
# include "ssl.h"
#endif
char *tmp = strchr (type, ';');
if (tmp)
{
+ /* sXXXav: only needed if IRI support is enabled */
+ char *tmp2 = tmp + 1;
+
while (tmp > type && c_isspace (tmp[-1]))
--tmp;
*tmp = '\0';
+
+ /* Try to get remote encoding if needed */
+ if (opt.enable_iri && !opt.encoding_remote)
+ /* xxx = */ parse_charset (tmp2);
}
}
hs->newloc = resp_header_strdup (resp, "Location");
{ "inet6only", &opt.ipv6_only, cmd_boolean },
#endif
{ "input", &opt.input_filename, cmd_file },
+ { "iri", &opt.enable_iri, cmd_boolean },
{ "keepsessioncookies", &opt.keep_session_cookies, cmd_boolean },
{ "limitrate", &opt.limit_rate, cmd_bytes },
{ "loadcookies", &opt.cookies_input, cmd_file },
+ { "locale", &opt.locale, cmd_string },
{ "logfile", &opt.lfilename, cmd_file },
{ "login", &opt.ftp_user, cmd_string },/* deprecated*/
{ "maxredirect", &opt.max_redirect, cmd_number },
{ "referer", &opt.referer, cmd_string },
{ "reject", &opt.rejects, cmd_vector },
{ "relativeonly", &opt.relative_only, cmd_boolean },
+ { "remoteencoding", &opt.encoding_remote, cmd_string },
{ "removelisting", &opt.remove_listing, cmd_boolean },
{ "restrictfilenames", NULL, cmd_spec_restrict_file_names },
{ "retrsymlinks", &opt.retr_symlinks, cmd_boolean },
opt.restrict_files_case = restrict_no_case_restriction;
opt.max_redirect = 20;
+
+#ifdef ENABLE_IRI
+ opt.enable_iri = true;
+#else
+ opt.enable_iri = false;
+#endif
+ opt.locale = NULL;
+ opt.encoding_remote = NULL;
}
\f
/* Return the user's home directory (strdup-ed), or NULL if none is
--- /dev/null
+/* IRI related functions.
+ Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
+ 2008 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at
+your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget. If not, see <http://www.gnu.org/licenses/>.
+
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work. */
+
+#include "wget.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <iconv.h>
+#include <stringprep.h>
+#include <idna.h>
+#include <errno.h>
+
+#include "utils.h"
+#include "iri.h"
+
+
+static iconv_t locale2utf8;
+
+
+static bool open_locale_to_utf8 (void);
+static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out);
+
+
+/* Given a string containing "charset=XXX", return the encoding name XXX
+   as a newly allocated string, or NULL otherwise.
+
+   NULL is returned when STR is NULL or empty, when it holds no
+   "charset=" parameter, when the value is empty, or when the value is
+   rejected by check_encoding_name ().
+
+   The value ends at the first whitespace character, ';' or quote, and a
+   single opening quote is skipped, so `charset=utf-8; foo' and
+   `charset="utf-8"' both yield "utf-8".  The caller releases the result
+   with xfree ().  */
+char *
+parse_charset (char *str)
+{
+  char *end;
+  char *charset;
+
+  if (!str || !*str)
+    return NULL;
+
+  str = strcasestr (str, "charset=");
+  if (!str)
+    return NULL;
+
+  /* Step over the "charset=" key itself.  */
+  str += 8;
+
+  /* The value may be written as a quoted string; skip the opening quote.  */
+  if (*str == '"' || *str == '\'')
+    str++;
+
+  /* sXXXav: which chars should be banned ??? */
+  end = str;
+  while (*end && !c_isspace (*end)
+         && *end != ';' && *end != '"' && *end != '\'')
+    end++;
+
+  /* An empty value can never name an encoding.  */
+  if (end == str)
+    return NULL;
+
+  /* sXXXav: could strdupdelim return NULL ? */
+  charset = strdupdelim (str, end);
+
+  /* Do a minimum check on the charset value */
+  if (!check_encoding_name (charset))
+    {
+      xfree (charset);
+      return NULL;
+    }
+
+  logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset));
+
+  return charset;
+}
+
+/* Report the character encoding of the current locale, as given by
+   libidn's stringprep_locale_charset ().  */
+char *
+find_locale (void)
+{
+  /* sXXXav, made our own function or use libidn one ?! */
+  const char *charset = stringprep_locale_charset ();
+
+  /* The rest of the code keeps encoding names as plain `char *'.  */
+  return (char *) charset;
+}
+
+/* Minimal sanity check of an encoding name: return true when every
+   character of ENCODING is ASCII and not whitespace.  Otherwise log the
+   offending name and return false.  */
+bool
+check_encoding_name (char *encoding)
+{
+  char *p;
+
+  for (p = encoding; *p != '\0'; p++)
+    {
+      if (c_isascii (*p) && !c_isspace (*p))
+        continue;
+
+      logprintf (LOG_VERBOSE, "Encoding %s isn't valid\n", quote(encoding));
+      return false;
+    }
+
+  return true;
+}
+
+/* Make sure the cached descriptor converting from opt.locale to UTF-8 is
+   open.  Return true when it is usable, false when the locale is unknown
+   or iconv cannot perform the conversion.  */
+static bool
+open_locale_to_utf8 (void)
+{
+  /* Already opened by an earlier call.  */
+  if (locale2utf8)
+    return true;
+
+  /* sXXXav : That shouldn't happen, just in case */
+  if (!opt.locale)
+    {
+      logprintf (LOG_VERBOSE, "open_locale_to_utf8: locale is unset\n");
+      opt.locale = find_locale ();
+
+      if (!opt.locale)
+        return false;
+    }
+
+  locale2utf8 = iconv_open ("UTF-8", opt.locale);
+  if (locale2utf8 == (iconv_t)(-1))
+    {
+      logprintf (LOG_VERBOSE, "Conversion from %s to %s isn't supported\n",
+                 quote (opt.locale), quote("UTF-8"));
+      /* NULL is the "not opened" marker tested at the top.  */
+      locale2utf8 = NULL;
+      return false;
+    }
+
+  return true;
+}
+
+/* Convert STR from the encoding named by opt.locale to UTF-8.
+
+   Return STR itself when there is nothing to do or nothing can be done:
+   STR is NULL, the locale is already UTF-8, no conversion descriptor can
+   be opened, or the conversion fails.  Otherwise return a newly
+   allocated string holding the converted text.  Callers can tell the
+   two cases apart by comparing the result with STR.  */
+const char *
+locale_to_utf8 (const char *str)
+{
+  char *new;
+
+  if (!str)
+    return str;
+
+  /* opt.locale may still be unset at this point.  open_locale_to_utf8 ()
+     recovers from that, so the shortcut must not hand a NULL name to
+     strcasecmp ().  */
+  if (opt.locale && !strcasecmp (opt.locale, "utf-8"))
+    return str;
+
+  if (!open_locale_to_utf8 ())
+    return str;
+
+  if (do_conversion (locale2utf8, (char *) str, strlen (str), &new))
+    return (const char *) new;
+
+  return str;
+}
+
+/* Convert the INLEN bytes starting at IN with the iconv descriptor CD.
+
+   On success store a newly allocated, NUL-terminated result in *OUT and
+   return true.  On an error that cannot be worked around, log it, free
+   the work buffer, set *OUT to NULL and return false.
+
+   Input bytes that iconv rejects as an invalid or incomplete multibyte
+   sequence are copied to the output unchanged, one byte at a time.  The
+   output buffer grows whenever it runs out of room.  */
+static bool
+do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
+{
+  /* Smallest amount by which the buffer grows, so that every regrow
+     leaves room for at least one more converted character.  */
+  enum { MIN_GROWTH = 16 };
+
+  /* sXXXav : hummm hard to guess... */
+  size_t size = inlen * 2;          /* usable bytes in BUF, NUL excluded */
+  size_t avail = size;              /* bytes still free starting at DST */
+  char *buf = xmalloc (size + 1);
+  char *dst = buf;                  /* next output byte to be written */
+
+  /* CD may be reused across calls (locale_to_utf8 () passes a cached
+     descriptor), so put it back into its initial conversion state.  */
+  iconv (cd, NULL, NULL, NULL, NULL);
+
+  /* sXXXav : put a maximum looping factor ??? */
+  for (;;)
+    {
+      bool grow = false;
+
+      if (iconv (cd, &in, &inlen, &dst, &avail) != (size_t)(-1))
+        {
+          /* BUF always keeps one spare byte for the terminator.  */
+          *dst = '\0';
+          *out = buf;
+          return true;
+        }
+
+      /* Incomplete or invalid multibyte sequence */
+      if ((errno == EINVAL || errno == EILSEQ) && inlen > 0)
+        {
+          if (avail > 0)
+            {
+              /* Pass the offending byte through untouched.  */
+              *dst++ = *in++;
+              inlen--;
+              avail--;
+            }
+          else
+            grow = true;            /* no room even for one raw byte */
+        }
+      else if (errno == E2BIG)      /* Output buffer full */
+        grow = true;
+      else                          /* Weird, we got an unspecified error */
+        {
+          logprintf (LOG_VERBOSE, "Unhandled errno %d\n", errno);
+          break;
+        }
+
+      if (grow)
+        {
+          /* Only the bytes actually produced so far are carried over.
+             AVAIL keeps describing the space after DST, never the
+             whole buffer.  */
+          size_t used = (size_t) (dst - buf);
+          size_t extra = inlen * 2 + MIN_GROWTH;
+          char *bigger = xmalloc (size + extra + 1);
+
+          memcpy (bigger, buf, used);
+          xfree (buf);
+          buf = bigger;
+          dst = buf + used;
+          size += extra;
+          avail += extra;
+        }
+    }
+
+  /* Do not leak the partial result or hand out a dangling pointer.  */
+  xfree (buf);
+  *out = NULL;
+  return false;
+}
+
+/* Convert the UTF-8 host name HOST to its ASCII form with libidn.
+   Return the new domain on success.  On error, log the libidn status
+   and return NULL.  */
+char *
+idn_encode (char *host)
+{
+  char *ascii_host;
+
+  /* toASCII UTF-8 NULL terminated string */
+  int status = idna_to_ascii_8z (host, &ascii_host, 0);
+
+  if (status == IDNA_SUCCESS)
+    return ascii_host;
+
+  logprintf (LOG_VERBOSE, "idn_encode failed (%d): %s\n", status,
+             quote (idna_strerror (status)));
+  return NULL;
+}
+
--- /dev/null
+/* Internationalization related declarations.
+ Copyright (C) 2000, 2007, 2008 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or
+(at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget. If not, see <http://www.gnu.org/licenses/>.
+
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work. */
+
+#ifndef IRI_H
+#define IRI_H
+
+#ifdef ENABLE_IRI
+
+/* Return the encoding name found after "charset=" in STR, or NULL.  */
+char *parse_charset (char *str);
+/* Return the character encoding of the current locale, as reported by
+   libidn.  */
+char *find_locale (void);
+/* Basic check of an encoding name: false if ENCODING contains a
+   non-ASCII or whitespace character, true otherwise.  */
+bool check_encoding_name (char *encoding);
+/* Convert STR from opt.locale to UTF-8.  Returns STR itself when no
+   conversion is performed, otherwise a newly allocated string.  */
+const char *locale_to_utf8 (const char *str);
+/* Encode the UTF-8 HOST to ASCII.  Returns the new domain on success or
+   NULL on error.  */
+char *idn_encode (char *host);
+
+#else /* ENABLE_IRI */
+
+/* Without IRI support the functions above become constant expressions.
+   Their argument is not evaluated, except that locale_to_utf8 () hands
+   its argument back unchanged.  */
+#define parse_charset(str) NULL
+#define find_locale() NULL
+#define check_encoding_name(str) false
+#define locale_to_utf8(str) (str)
+#define idn_encode(str) NULL
+
+#endif /* ENABLE_IRI */
+#endif /* IRI_H */
#include <assert.h>
#include <errno.h>
#include <time.h>
+#ifdef ENABLE_IRI
+#include <langinfo.h>
+#endif
#include "utils.h"
#include "init.h"
#include "convert.h"
#include "spider.h"
#include "http.h" /* for save_cookies */
+#include "iri.h"
#include <getopt.h>
#include <getpass.h>
{ "inet6-only", '6', OPT_BOOLEAN, "inet6only", -1 },
#endif
{ "input-file", 'i', OPT_VALUE, "input", -1 },
+ { "iri", 0, OPT_BOOLEAN, "iri", -1 },
{ "keep-session-cookies", 0, OPT_BOOLEAN, "keepsessioncookies", -1 },
{ "level", 'l', OPT_VALUE, "reclevel", -1 },
{ "limit-rate", 0, OPT_VALUE, "limitrate", -1 },
{ "load-cookies", 0, OPT_VALUE, "loadcookies", -1 },
+ { "locale", 0, OPT_VALUE, "locale", -1 },
{ "max-redirect", 0, OPT_VALUE, "maxredirect", -1 },
{ "mirror", 'm', OPT_BOOLEAN, "mirror", -1 },
{ "no", 'n', OPT__NO, NULL, required_argument },
{ "referer", 0, OPT_VALUE, "referer", -1 },
{ "reject", 'R', OPT_VALUE, "reject", -1 },
{ "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
+ { "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1},
{ "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
{ "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 },
{ "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 },
exit (1);
}
+#ifdef ENABLE_IRI
+ if (opt.enable_iri)
+ {
+ if (opt.locale && !check_encoding_name(opt.locale))
+ opt.locale = NULL;
+
+ if (!opt.locale)
+ opt.locale = find_locale ();
+
+ if (opt.encoding_remote && !check_encoding_name(opt.encoding_remote))
+ opt.encoding_remote = NULL;
+
+ logprintf (LOG_VERBOSE, "Locale = %s\n", quote (opt.locale));
+ }
+#else
+ if (opt.enable_iri || opt.locale || opt.encoding_remote)
+ {
+ /* sXXXav : be more specific... */
+ printf(_("This version does not have support for IRIs\n"));
+ exit(1);
+ }
+#endif
+
if (opt.ask_passwd)
{
opt.passwd = prompt_for_password ();
bool content_disposition; /* Honor HTTP Content-Disposition header. */
bool auth_without_challenge; /* Issue Basic authentication creds without
waiting for a challenge. */
+
+ bool enable_iri;
+ char *encoding_remote;
+ char *locale;
};
extern struct options opt;
#include "utils.h"
#include "url.h"
#include "host.h" /* for is_valid_ipv6_address */
+#include "iri.h"
#ifdef TESTING
#include "test.h"
goto error;
}
+ if (opt.enable_iri)
+ {
+ url_unescape ((char *) url);
+ url = locale_to_utf8(url);
+ }
+
url_encoded = reencode_escapes (url);
p = url_encoded;
host_modified = true;
}
+ if (opt.enable_iri)
+ {
+ char *new = idn_encode (u->host);
+ if (new)
+ {
+ xfree (u->host);
+ u->host = new;
+ host_modified = true;
+ }
+ }
+
if (params_b)
u->params = strdupdelim (params_b, params_e);
if (query_b)
if (fragment_b)
u->fragment = strdupdelim (fragment_b, fragment_e);
- if (path_modified || u->fragment || host_modified || path_b == path_e)
+ if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
{
/* If we suspect that a transformation has rendered what
url_string might return different from URL_ENCODED, rebuild