--- /dev/null
+dnl Template file for GNU Autoconf
+dnl Copyright (C) 1995, 1996, 1997, 2001, 2007,
+dnl 2008 Free Software Foundation, Inc.
+
+dnl This program is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU General Public License as published by
+dnl the Free Software Foundation; either version 3 of the License, or
+dnl (at your option) any later version.
+
+dnl This program is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+dnl GNU General Public License for more details.
+
+dnl You should have received a copy of the GNU General Public License
+dnl along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+dnl Additional permission under GNU GPL version 3 section 7
+
+dnl If you modify this program, or any covered work, by linking or
+dnl combining it with the OpenSSL project's OpenSSL library (or a
+dnl modified version of that library), containing parts covered by the
+dnl terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+dnl grants you additional permission to convey the resulting work.
+dnl Corresponding Source for a non-source form of such a combination
+dnl shall include the source code for the parts of OpenSSL used as well
+dnl as that of the covered work.
+
+dnl
+dnl Process this file with autoconf to produce a configure script.
+dnl
+
+AC_INIT([wget],
+ [1.12-devel],
+ [bug-wget@gnu.org])
+AC_PREREQ(2.61)
+
+dnl
+dnl What version of Wget are we building?
+dnl
+AC_MSG_NOTICE([configuring for GNU Wget $PACKAGE_VERSION])
+
+AC_CONFIG_MACRO_DIR([m4])
+AC_CONFIG_AUX_DIR([.])
+
+dnl
+dnl Automake setup
+dnl
+AM_INIT_AUTOMAKE(1.9)
+
+dnl
+dnl Gettext
+dnl
+AM_GNU_GETTEXT([external],[need-ngettext])
+AM_GNU_GETTEXT_VERSION([0.16.1])
+
+dnl
+dnl Get canonical host
+dnl
+AC_CANONICAL_HOST
+AC_DEFINE_UNQUOTED([OS_TYPE], "$host_os",
+ [Define to be the name of the operating system.])
+
+dnl
+dnl Process features.
+dnl
+
+AC_ARG_WITH(ssl,
+[[ --without-ssl disable SSL autodetection]])
+
+AC_ARG_ENABLE(opie,
+[ --disable-opie disable support for opie or s/key FTP login],
+ENABLE_OPIE=$enableval, ENABLE_OPIE=yes)
+test x"${ENABLE_OPIE}" = xyes && AC_DEFINE([ENABLE_OPIE], 1,
+ [Define if you want the Opie support for FTP compiled in.])
+
+AC_ARG_ENABLE(digest,
+[ --disable-digest disable support for HTTP digest authorization],
+ENABLE_DIGEST=$enableval, ENABLE_DIGEST=yes)
+test x"${ENABLE_DIGEST}" = xyes && AC_DEFINE([ENABLE_DIGEST], 1,
+ [Define if you want the HTTP Digest Authorization compiled in.])
+
+AC_ARG_ENABLE(ntlm,
+[ --disable-ntlm disable support for NTLM authorization],
+[ENABLE_NTLM=$enableval], [ENABLE_NTLM=auto])
+
+AC_ARG_ENABLE(debug,
+[ --disable-debug disable support for debugging output],
+ENABLE_DEBUG=$enableval, ENABLE_DEBUG=yes)
+test x"${ENABLE_DEBUG}" = xyes && AC_DEFINE([ENABLE_DEBUG], 1,
+ [Define if you want the debug output support compiled in.])
+
+wget_need_md5=no
+
+case "${ENABLE_OPIE}${ENABLE_DIGEST}" in
+*yes*)
+ wget_need_md5=yes
+esac
+
+dnl
+dnl Find the compiler
+dnl
+
+dnl We want these before the checks, so the checks can modify their values.
+test -z "$CFLAGS" && CFLAGS= auto_cflags=1
+test -z "$CC" && cc_specified=yes
+
+AC_PROG_CC
+AM_PROG_CC_C_O
+AC_AIX
+gl_EARLY
+md5_EARLY
+
+AC_PROG_RANLIB
+
++AC_PROG_LEX
++
+dnl Turn on optimization by default. Specifically:
+dnl
+dnl if the user hasn't specified CFLAGS, then
+dnl if compiler is gcc, then
+dnl use -O2 and some warning flags
+dnl else
+dnl use os-specific flags or -O
+if test -n "$auto_cflags"; then
+ if test -n "$GCC"; then
+ CFLAGS="$CFLAGS -O2 -Wall"
+ else
+ case "$host_os" in
+ *hpux*) CFLAGS="$CFLAGS +O3" ;;
+ *ultrix* | *osf*) CFLAGS="$CFLAGS -O -Olimit 2000" ;;
+ *) CFLAGS="$CFLAGS -O" ;;
+ esac
+ fi
+fi
+
+dnl
+dnl Checks for basic compiler characteristics.
+dnl
+AC_C_CONST
+AC_C_INLINE
+AC_C_VOLATILE
+
+dnl Check for basic headers, even though we expect them to exist and
+dnl #include them unconditionally in the code. Their detection is
+dnl still needed because test programs used by Autoconf macros check
+dnl for STDC_HEADERS, HAVE_SYS_TYPES_H, etc. before using them.
+dnl Without the checks they will fail to be included in test programs,
+dnl which will subsequently fail.
+AC_HEADER_STDC
+
+dnl Check for large file support. This check needs to come fairly
+dnl early because it could (in principle) affect whether functions and
+dnl headers are available, whether they work, etc.
+AC_SYS_LARGEFILE
+AC_CHECK_SIZEOF(off_t)
+
+dnl
+dnl Checks for system header files that might be missing.
+dnl
+AC_HEADER_STDBOOL
+AC_CHECK_HEADERS(unistd.h sys/time.h)
+AC_CHECK_HEADERS(termios.h sys/ioctl.h sys/select.h utime.h sys/utime.h)
+AC_CHECK_HEADERS(stdint.h inttypes.h pwd.h wchar.h)
+
+dnl
+dnl Check sizes of integer types. These are used to find n-bit
+dnl integral types on older systems that fail to provide intN_t and
+dnl uintN_t typedefs.
+dnl
+AC_CHECK_SIZEOF(short)
+AC_CHECK_SIZEOF(int)
+AC_CHECK_SIZEOF(long)
+AC_CHECK_SIZEOF(long long)
+AC_CHECK_SIZEOF(void *)
+
+dnl
+dnl Checks for non-universal or system-specific types.
+dnl
+AC_TYPE_SIZE_T
+AC_TYPE_PID_T
+AC_CHECK_TYPES([uint32_t, uintptr_t, intptr_t, int64_t])
+AC_CHECK_TYPES(sig_atomic_t, [], [], [
+#include <stdio.h>
+#include <sys/types.h>
+#if HAVE_INTTYPES_H
+# include <inttypes.h>
+#endif
+#include <signal.h>
+])
+
+# gnulib
+gl_INIT
+
+dnl
+dnl Checks for library functions.
+dnl
+AC_FUNC_ALLOCA
+AC_FUNC_MMAP
+AC_FUNC_FSEEKO
+AC_CHECK_FUNCS(strptime timegm snprintf vsnprintf vasprintf drand48)
+AC_CHECK_FUNCS(strtoll usleep ftello sigblock sigsetjmp memrchr wcwidth mbtowc)
+
+if test x"$ENABLE_OPIE" = xyes; then
+ AC_LIBOBJ([ftp-opie])
+fi
+
+dnl We expect to have these functions on Unix-like systems configure
+dnl runs on. The defines are provided to get them in config.h.in so
+dnl Wget can still be ported to non-Unix systems (such as Windows)
+dnl that lack some of these functions.
+AC_DEFINE([HAVE_STRCASECMP], 1, [Define to 1 if you have the `strcasecmp' function.])
+AC_DEFINE([HAVE_STRNCASECMP], 1, [Define to 1 if you have the `strncasecmp' function.])
+AC_DEFINE([HAVE_STRDUP], 1, [Define to 1 if you have the `strdup' function.])
+AC_DEFINE([HAVE_ISATTY], 1, [Define to 1 if you have the `isatty' function.])
+AC_DEFINE([HAVE_SYMLINK], 1, [Define to 1 if you have the `symlink' function.])
+
+dnl
+dnl Call Wget-specific macros defined in aclocal.
+dnl
+WGET_STRUCT_UTIMBUF
+WGET_SOCKLEN_T
+WGET_FNMATCH
+WGET_NANOSLEEP
+WGET_POSIX_CLOCK
+WGET_NSL_SOCKET
+
+dnl
+dnl Checks for libraries.
+dnl
+
+AS_IF([test x"$with_ssl" = xgnutls], [
+ dnl Now actually check for -lssl
+ AC_LIB_HAVE_LINKFLAGS([gnutls], [], [
+#include <gnutls/gnutls.h>
+ ], [gnutls_global_init()])
+ if test x"$LIBGNUTLS" != x
+ then
+ AC_MSG_NOTICE([compiling in support for SSL via GnuTLS])
+ AC_LIBOBJ([gnutls])
+ else
+ AC_MSG_ERROR([--with-ssl=gnutls was given, but GNUTLS is not available.])
+ fi
+], [
+ # --with-ssl is not gnutls: check if it's no
+ AS_IF([test x"$with_ssl" != xno], [
+ dnl As of this writing (OpenSSL 0.9.6), the libcrypto shared library
+ dnl doesn't record its dependency on libdl, so we need to make sure
+ dnl -ldl ends up in LIBS on systems that have it. Most OSes use
+ dnl dlopen(), but HP-UX uses shl_load().
+ AC_CHECK_LIB(dl, dlopen, [], [
+ AC_CHECK_LIB(dl, shl_load)
+ ])
+
+ dnl Now actually check for -lssl
+ AC_LIB_HAVE_LINKFLAGS([ssl], [crypto], [
+ #include <openssl/ssl.h>
+ #include <openssl/x509.h>
+ #include <openssl/err.h>
+ #include <openssl/rand.h>
+ #include <openssl/des.h>
+ #include <openssl/md4.h>
+ #include <openssl/md5.h>
+ ], [SSL_library_init ()])
+ if test x"$LIBSSL" != x
+ then
+ AC_MSG_NOTICE([compiling in support for SSL via OpenSSL])
+ AC_LIBOBJ([openssl])
+ elif test x"$with_ssl" != x
+ then
+ AC_MSG_ERROR([--with-ssl was given, but SSL is not available.])
+ fi
+ ]) # endif: --with-ssl == no?
+]) # endif: --with-ssl == gnutls?
+
+
+dnl Enable NTLM if requested and if SSL is available.
+if test x"$LIBSSL" != x
+then
+ if test x"$ENABLE_NTLM" != xno
+ then
+ AC_DEFINE([ENABLE_NTLM], 1,
+ [Define if you want the NTLM authorization support compiled in.])
+ AC_LIBOBJ([http-ntlm])
+ fi
+else
+ dnl If SSL is unavailable and the user explicitly requested NTLM,
+ dnl abort.
+ if test x"$ENABLE_NTLM" = xyes
+ then
+ AC_MSG_ERROR([NTLM authorization requested and OpenSSL not found; aborting])
+ fi
+fi
+
+dnl
+dnl Find an MD5 implementation. Since Wget rarely needs MD5, we try
+dnl to use an existing library implementation to save on code size.
+dnl
+
+if test x"$wget_need_md5" = xyes
+then
+ dnl This should be moved to an AC_DEFUN, but I'm not sure how to
+ dnl manipulate MD5_OBJ from the defun.
+
+ AC_LIBOBJ([gen-md5])
+ found_md5=no
+
+ dnl Check for the system MD5 library on Solaris. We don't check for
+ dnl something simple like "MD5Update" because there are a number of
+ dnl MD5 implementations that use that name, but have an otherwise
+ dnl incompatible interface. md5_calc is, hopefully, specific to the
+ dnl Solaris MD5 library.
+ if test x"$found_md5" = xno; then
+ AC_CHECK_LIB(md5, md5_calc, [
+ dnl Some installations have bogus <md5.h> in the compiler's
+ dnl include path, making the system md5 library useless.
+ AC_MSG_CHECKING([for working md5.h])
+ AC_COMPILE_IFELSE([#include <md5.h>
+ ], [
+ AC_MSG_RESULT(yes)
+ AC_DEFINE([HAVE_SOLARIS_MD5], 1, [Define when using Solaris MD5.])
+ LIBS="-lmd5 $LIBS"
+ found_md5=yes
+ AC_MSG_NOTICE([using the Solaris MD5 implementation])
+ ], [AC_MSG_RESULT(no)])
+ ])
+ fi
+
+ dnl Then see if we're linking OpenSSL anyway; if yes, use its md5
+ dnl implementation.
+ if test x"$found_md5" = xno; then
+ if test x"$LIBSSL" != x; then
+ AC_DEFINE([HAVE_OPENSSL_MD5], 1, [Define when using OpenSSL MD5.])
+ found_md5=yes
+ AC_MSG_NOTICE([using the OpenSSL MD5 implementation])
+ fi
+ fi
+
+ dnl If none of the above worked, use the one we ship with Wget.
+ if test x"$found_md5" = xno; then
+ AC_DEFINE([HAVE_BUILTIN_MD5], 1, [Define when using built-in MD5.])
+ found_md5=yes
+ AC_MSG_NOTICE([using the built-in (GNU) MD5 implementation])
+ AC_C_BIGENDIAN
+
+ AC_SUBST(MD5_CPPFLAGS, '-I $(top_srcdir)/md5')
+ AC_SUBST(MD5_LDADD, '../md5/libmd5.a')
+ AC_SUBST(MD5_SUBDIR, md5)
+ md5_INIT
+ fi
+ AC_DEFINE([HAVE_MD5], 1, [Define if we're compiling support for MD5.])
+fi
+
+dnl **********************************************************************
+dnl Checks for IPv6
+dnl **********************************************************************
+
+dnl
+dnl We test for IPv6 by checking, in turn, for availability of
+dnl getaddrinfo, presence of the INET6 address/protocol family, and
+dnl the existence of struct sockaddr_in6. If any of them is missing,
+dnl IPv6 is disabled, and the code reverts to old-style gethostbyname.
+dnl
+dnl If --enable-ipv6 is explicitly specified on the configure command
+dnl line, we check for IPv6 and abort if not found. If --disable-ipv6
+dnl is specified, we disable IPv6 and don't check for it. The default
+dnl is to autodetect IPv6 and use it where available.
+dnl
+
+AC_ARG_ENABLE(ipv6,
+ AC_HELP_STRING([--disable-ipv6],[disable IPv6 support]),
+ [case "${enable_ipv6}" in
+ no)
+ AC_MSG_NOTICE([disabling IPv6 at user request])
+ dnl Disable IPv6 checking
+ ipv6=no
+ ;;
+ yes)
+ dnl IPv6 explicitly enabled: force its use (abort if unavailable).
+ ipv6=yes
+ force_ipv6=yes
+ ;;
+ auto)
+ dnl Auto-detect IPv6, i.e. check for IPv6, but don't force it.
+ ipv6=yes
+ ;;
+ *)
+ AC_MSG_ERROR([Invalid --enable-ipv6 argument \`$enable_ipv6'])
+ ;;
+ esac
+ ], [
+ dnl If nothing is specified, assume auto-detection.
+ ipv6=yes
+ ]
+)
+
+if test "X$ipv6" = "Xyes"; then
+ AC_CHECK_FUNCS(getaddrinfo, [], [
+ AC_MSG_NOTICE([Disabling IPv6 support: your system does not support getaddrinfo(3)])
+ ipv6=no
+ ])
+fi
+
+if test "X$ipv6" = "Xyes"; then
+ PROTO_INET6([], [
+ AC_MSG_NOTICE([Disabling IPv6 support: your system does not support the PF_INET6 protocol family])
+ ipv6=no
+ ])
+fi
+
+if test "X$ipv6" = "Xyes"; then
+ TYPE_STRUCT_SOCKADDR_IN6([],[
+ AC_MSG_NOTICE([Disabling IPv6 support: your system does not support \`struct sockaddr_in6'])
+ ipv6=no
+ ])
+ if test "X$ipv6" = "Xyes"; then
+ WGET_STRUCT_SOCKADDR_STORAGE
+ MEMBER_SIN6_SCOPE_ID
+ fi
+fi
+
+if test "X$ipv6" = "Xyes"; then
+ AC_DEFINE([ENABLE_IPV6], 1, [Define if IPv6 support is enabled.])
+ AC_MSG_NOTICE([Enabling support for IPv6.])
+elif test "x$force_ipv6" = "xyes"; then
+ AC_MSG_ERROR([IPv6 support requested but not found; aborting])
+fi
+
+
+dnl
+dnl Set of available languages.
+dnl
+dnl Originally this used to be static, looking like this:
+dnl ALL_LINGUAS="cs de hr it ..."
+dnl The downside was that configure needed to be rebuilt whenever a
+dnl new language was added.
+dnl
+ALL_LINGUAS="en@quot en@boldquot en_US $(cd ${srcdir}/po && ls *.po | grep -v 'en@.*quot' | grep -v 'en_US\.po' | sed -e 's/\.po$//' | tr '\012' ' ')"
+
+dnl
+dnl Find makeinfo. We used to provide support for Emacs processing
+dnl Texinfo using `emacs -batch -eval ...' where makeinfo is
+dnl unavailable, but that broke with the addition of makeinfo-specific
+dnl command-line options, such as `-I'. Now we depend on makeinfo to
+dnl build the Info documentation.
+dnl
+
+AC_CHECK_PROGS(MAKEINFO, [makeinfo], [true])
+
+dnl
+dnl Find perl and pod2man
+dnl
+
+AC_PATH_PROGS(PERL, [perl5 perl], no)
+AC_PATH_PROG(POD2MAN, pod2man, no)
+
+if test "x${POD2MAN}" = xno; then
+ COMMENT_IF_NO_POD2MAN="# "
+else
+ COMMENT_IF_NO_POD2MAN=
+fi
+AC_SUBST(COMMENT_IF_NO_POD2MAN)
+
+dnl
+dnl Create output
+dnl
+AC_CONFIG_FILES([Makefile src/Makefile doc/Makefile util/Makefile
+ po/Makefile.in tests/Makefile tests/WgetTest.pm
+ lib/Makefile md5/Makefile windows/Makefile])
+AC_CONFIG_HEADERS([src/config.h])
+AC_OUTPUT
--- /dev/null
- wget_SOURCES = cmpt.c connect.c convert.c cookies.c ftp.c ftp-basic.c \
+# Makefile for `wget' utility
+# Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
+# 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with Wget. If not, see <http://www.gnu.org/licenses/>.
+
+# Additional permission under GNU GPL version 3 section 7
+
+# If you modify this program, or any covered work, by linking or
+# combining it with the OpenSSL project's OpenSSL library (or a
+# modified version of that library), containing parts covered by the
+# terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+# grants you additional permission to convey the resulting work.
+# Corresponding Source for a non-source form of such a combination
+# shall include the source code for the parts of OpenSSL used as well
+# as that of the covered work.
+
+#
+# Version: @VERSION@
+#
+
+# The following line is losing on some versions of make!
+DEFS = @DEFS@ -DSYSTEM_WGETRC=\"$(sysconfdir)/wgetrc\" -DLOCALEDIR=\"$(localedir)\"
+LIBS = @LIBS@ @LIBSSL@ @LIBGNUTLS@ @LIBINTL@
+
+bin_PROGRAMS = wget
- connect.h convert.h cookies.h \
- ftp.h gen-md5.h hash.h host.h html-parse.h \
++wget_SOURCES = cmpt.c connect.c convert.c cookies.c \
++ css.lex css-url.c \
++ ftp.c ftp-basic.c \
+ ftp-ls.c hash.c host.c html-parse.c html-url.c http.c \
+ init.c log.c main.c netrc.c progress.c ptimer.c recur.c \
+ res.c retr.c snprintf.c spider.c url.c \
+ utils.c xmalloc.c \
++ css-url.h connect.h convert.h cookies.h \
++ ftp.h gen-md5.h hash.h host.h html-parse.h html-url.h \
+ http.h http-ntlm.h init.h log.h mswindows.h netrc.h \
+ options.h progress.h ptimer.h recur.h res.h retr.h \
+ spider.h ssl.h sysdep.h url.h utils.h wget.h xmalloc.h
+nodist_wget_SOURCES = version.c
+EXTRA_wget_SOURCES = mswindows.c
+LDADD = $(ALLOCA) $(LIBOBJS) ../lib/libgnu.a @MD5_LDADD@
+AM_CPPFLAGS = -I $(top_srcdir)/lib @MD5_CPPFLAGS@
+
+version.c: $(wget_SOURCES) $(LDADD) $(srcdir)/Makefile.am
+ echo 'const char *version_string = "@VERSION@"' > $@
+ -hg log -r . --template='" ({node|short})"\n' 2>/dev/null >> $@
+ echo ';' >> $@
+
+check_LIBRARIES = libunittest.a
+libunittest_a_SOURCES = $(wget_SOURCES) test.c test.h
+nodist_libunittest_a_SOURCES = version.c
+libunittest_a_CPPFLAGS = -DTESTING -I$(top_srcdir)/lib
+libunittest_a_LIBADD = $(ALLOCA) $(LIBOBJS)
+
+CLEANFILES = *~ *.bak core core.[0-9]* version.c
char **file_array;
cnt = 0;
- if (downloaded_html_set)
- cnt = hash_table_count (downloaded_html_set);
+ if (downloaded_set)
+ cnt = hash_table_count (downloaded_set);
if (cnt == 0)
- return;
+ goto cleanup;
file_array = alloca_array (char *, cnt);
- string_set_to_array (downloaded_html_set, file_array);
+ string_set_to_array (downloaded_set, file_array);
for (i = 0; i < cnt; i++)
{
/* Free the data. */
free_urlpos (urls);
}
+ }
+
+ /* This function is called when the retrieval is done to convert the
+ links that have been downloaded. It has to be called at the end of
+ the retrieval, because only then does Wget know conclusively which
+ URLs have been downloaded, and which not, so it can tell which
+ direction to convert to.
+
+ The "direction" means that the URLs to the files that have been
+ downloaded get converted to the relative URL which will point to
+ that file. And the other URLs get converted to the remote URL on
+ the server.
+
+ All the downloaded HTML documents are kept in downloaded_html_set,
+ and the downloaded CSS files in downloaded_css_set.  All the
+ information is extracted from those sets.  */
+
+ void
+ convert_all_links (void)
+ {
+ double secs;
+ int file_count = 0;
+
+ struct ptimer *timer = ptimer_new ();
+
+ convert_links_in_hashtable (downloaded_html_set, 0, &file_count);
+ convert_links_in_hashtable (downloaded_css_set, 1, &file_count);
secs = ptimer_measure (timer);
- ptimer_destroy (timer);
logprintf (LOG_VERBOSE, _("Converted %d files in %s seconds.\n"),
file_count, print_decimal (secs));
+cleanup:
+ ptimer_destroy (timer);
}
static void write_backup_file (const char *, downloaded_file_t);
COMPACT="compact">. Even if such attributes are not
useful to Wget, we need to support them, so that the
tags containing them can be parsed correctly. */
- attr_raw_value_begin = attr_value_begin = attr_name_begin;
- attr_raw_value_end = attr_value_end = attr_name_end;
- }
- else if (*p == '=')
- {
- ADVANCE (p);
- SKIP_WS (p);
- if (*p == '\"' || *p == '\'')
- {
- bool newline_seen = false;
- char quote_char = *p;
- attr_raw_value_begin = p;
- ADVANCE (p);
- attr_value_begin = p; /* <foo bar="baz"> */
- /* ^ */
- while (*p != quote_char)
- {
- if (!newline_seen && *p == '\n')
- {
- /* If a newline is seen within the quotes, it
- is most likely that someone forgot to close
- the quote. In that case, we back out to
- the value beginning, and terminate the tag
- at either `>' or the delimiter, whichever
- comes first. Such a tag terminated at `>'
- is discarded. */
- p = attr_value_begin;
- newline_seen = true;
- continue;
- }
- else if (newline_seen && *p == '>')
- break;
- ADVANCE (p);
- }
- attr_value_end = p; /* <foo bar="baz"> */
- /* ^ */
- if (*p == quote_char)
- ADVANCE (p);
- else
- goto look_for_tag;
- attr_raw_value_end = p; /* <foo bar="baz"> */
- /* ^ */
- operation = AP_DECODE_ENTITIES;
- if (flags & MHT_TRIM_VALUES)
- operation |= AP_TRIM_BLANKS;
- }
- else
- {
- attr_value_begin = p; /* <foo bar=baz> */
- /* ^ */
- /* According to SGML, a name token should consist only
- of alphanumerics, . and -. However, this is often
- violated by, for instance, `%' in `width=75%'.
- We'll be liberal and allow just about anything as
- an attribute value. */
- while (!ISSPACE (*p) && *p != '>')
- ADVANCE (p);
- attr_value_end = p; /* <foo bar=baz qux=quix> */
- /* ^ */
- if (attr_value_begin == attr_value_end)
- /* <foo bar=> */
- /* ^ */
- goto backout_tag;
- attr_raw_value_begin = attr_value_begin;
- attr_raw_value_end = attr_value_end;
- operation = AP_DECODE_ENTITIES;
- }
- }
- else
- {
- /* We skipped the whitespace and found something that is
- neither `=' nor the beginning of the next attribute's
- name. Back out. */
- goto backout_tag; /* <foo bar [... */
- /* ^ */
- }
-
- /* If we're not interested in the tag, don't bother with any
+ attr_raw_value_begin = attr_value_begin = attr_name_begin;
+ attr_raw_value_end = attr_value_end = attr_name_end;
+ }
+ else if (*p == '=')
+ {
+ ADVANCE (p);
+ SKIP_WS (p);
+ if (*p == '\"' || *p == '\'')
+ {
+ bool newline_seen = false;
+ char quote_char = *p;
+ attr_raw_value_begin = p;
+ ADVANCE (p);
+ attr_value_begin = p; /* <foo bar="baz"> */
+ /* ^ */
+ while (*p != quote_char)
+ {
+ if (!newline_seen && *p == '\n')
+ {
+ /* If a newline is seen within the quotes, it
+ is most likely that someone forgot to close
+ the quote. In that case, we back out to
+ the value beginning, and terminate the tag
+ at either `>' or the delimiter, whichever
+ comes first. Such a tag terminated at `>'
+ is discarded. */
+ p = attr_value_begin;
+ newline_seen = true;
+ continue;
+ }
+ else if (newline_seen && *p == '>')
+ break;
+ ADVANCE (p);
+ }
+ attr_value_end = p; /* <foo bar="baz"> */
+ /* ^ */
+ if (*p == quote_char)
+ ADVANCE (p);
+ else
+ goto look_for_tag;
+ attr_raw_value_end = p; /* <foo bar="baz"> */
+ /* ^ */
+ operation = AP_DECODE_ENTITIES;
+ if (flags & MHT_TRIM_VALUES)
+ operation |= AP_TRIM_BLANKS;
+ }
+ else
+ {
+ attr_value_begin = p; /* <foo bar=baz> */
+ /* ^ */
+ /* According to SGML, a name token should consist only
+ of alphanumerics, . and -. However, this is often
+ violated by, for instance, `%' in `width=75%'.
+ We'll be liberal and allow just about anything as
+ an attribute value. */
+ while (!c_isspace (*p) && *p != '>')
+ ADVANCE (p);
+ attr_value_end = p; /* <foo bar=baz qux=quix> */
+ /* ^ */
+ if (attr_value_begin == attr_value_end)
+ /* <foo bar=> */
+ /* ^ */
+ goto backout_tag;
+ attr_raw_value_begin = attr_value_begin;
+ attr_raw_value_end = attr_value_end;
+ operation = AP_DECODE_ENTITIES;
+ }
+ }
+ else
+ {
+ /* We skipped the whitespace and found something that is
+ neither `=' nor the beginning of the next attribute's
+ name. Back out. */
+ goto backout_tag; /* <foo bar [... */
+ /* ^ */
+ }
+
+ /* If we're not interested in the tag, don't bother with any
of the attributes. */
- if (uninteresting_tag)
- continue;
+ if (uninteresting_tag)
+ continue;
- /* If we aren't interested in the attribute, skip it. We
+ /* If we aren't interested in the attribute, skip it. We
cannot do this test any sooner, because our text pointer
needs to correctly advance over the attribute. */
- if (!name_allowed (allowed_attributes, attr_name_begin, attr_name_end))
- continue;
+ if (!name_allowed (allowed_attributes, attr_name_begin, attr_name_end))
+ continue;
- GROW_ARRAY (pairs, attr_pair_size, nattrs + 1, attr_pair_resized,
- struct attr_pair);
+ GROW_ARRAY (pairs, attr_pair_size, nattrs + 1, attr_pair_resized,
+ struct attr_pair);
- pairs[nattrs].name_pool_index = pool.tail;
- convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
+ pairs[nattrs].name_pool_index = pool.tail;
+ convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
- pairs[nattrs].value_pool_index = pool.tail;
- convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
- pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
- pairs[nattrs].value_raw_size = (attr_raw_value_end
- - attr_raw_value_begin);
- ++nattrs;
+ pairs[nattrs].value_pool_index = pool.tail;
+ convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
+ pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
+ pairs[nattrs].value_raw_size = (attr_raw_value_end
+ - attr_raw_value_begin);
+ ++nattrs;
}
+ if (!end_tag && tail && (tail->tagname_begin == tag_name_begin))
+ {
+ tail->contents_begin = p+1;
+ }
+
if (uninteresting_tag)
{
- ADVANCE (p);
- goto look_for_tag;
+ ADVANCE (p);
+ goto look_for_tag;
}
/* By now, we have a valid tag with a name and zero or more
from the information above. However, some places in the code refer
to the attributes not mentioned here. We add them manually. */
static const char *additional_attributes[] = {
- "rel", /* used by tag_handle_link */
- "http-equiv", /* used by tag_handle_meta */
- "name", /* used by tag_handle_meta */
- "content", /* used by tag_handle_meta */
- "action" /* used by tag_handle_form */
- "rel", /* used by tag_handle_link */
- "http-equiv", /* used by tag_handle_meta */
- "name", /* used by tag_handle_meta */
- "content", /* used by tag_handle_meta */
- "action", /* used by tag_handle_form */
- "style" /* used by check_style_attr */
++ "rel", /* used by tag_handle_link */
++ "http-equiv", /* used by tag_handle_meta */
++ "name", /* used by tag_handle_meta */
++ "content", /* used by tag_handle_meta */
++ "action", /* used by tag_handle_form */
++ "style" /* used by check_style_attr */
};
static struct hash_table *interesting_tags;
const int size = countof (tag_url_attributes);
/* If you're cringing at the inefficiency of the nested loops,
- remember that they both iterate over a very small number of
- items. The worst-case inner loop is for the IMG tag, which
- has three attributes. */
+ remember that they both iterate over a very small number of
+ items. The worst-case inner loop is for the IMG tag, which
+ has three attributes. */
for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
- {
- if (0 == strcasecmp (tag->attrs[attrind].name,
- tag_url_attributes[i].attr_name))
- {
- struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
+ {
+ if (0 == strcasecmp (tag->attrs[attrind].name,
+ tag_url_attributes[i].attr_name))
+ {
- struct urlpos *up = append_url (link, tag, attrind, ctx);
++ struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
+ ATTR_SIZE(tag,attrind), ctx);
- if (up)
- {
- int flags = tag_url_attributes[i].flags;
- if (flags & ATTR_INLINE)
- up->link_inline_p = 1;
- if (flags & ATTR_HTML)
- up->link_expect_html = 1;
- }
- }
- }
+ if (up)
+ {
+ int flags = tag_url_attributes[i].flags;
+ if (flags & ATTR_INLINE)
+ up->link_inline_p = 1;
+ if (flags & ATTR_HTML)
+ up->link_expect_html = 1;
+ }
+ }
+ }
}
}
{
int attrind;
char *action = find_attr (tag, "action", &attrind);
+
if (action)
{
- struct urlpos *up = append_url (action, tag, attrind, ctx);
+ struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
+ ATTR_SIZE(tag,attrind), ctx);
if (up)
- up->ignore_when_downloading = 1;
+ up->ignore_when_downloading = 1;
}
}
*/
if (href)
{
- struct urlpos *up = append_url (href, tag, attrind, ctx);
+ struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
+ ATTR_SIZE(tag,attrind), ctx);
if (up)
- {
- char *rel = find_attr (tag, "rel", NULL);
- if (rel)
+ {
+ char *rel = find_attr (tag, "rel", NULL);
- if (rel
- && (0 == strcasecmp (rel, "stylesheet")
- || 0 == strcasecmp (rel, "shortcut icon")))
- up->link_inline_p = 1;
++ if (rel)
+ {
- if (0 == strcasecmp (rel, "stylesheet"))
++ if (0 == strcasecmp (rel, "stylesheet"))
+ {
+ up->link_inline_p = 1;
+ up->link_expect_css = 1;
+ }
- else if (0 == strcasecmp (rel, "shortcut icon"))
++ else if (0 == strcasecmp (rel, "shortcut icon"))
+ {
+ up->link_inline_p = 1;
+ }
+ }
- else
- /* The external ones usually point to HTML pages, such as
- <link rel="next" href="..."> */
- up->link_expect_html = 1;
- }
+ else
+ /* The external ones usually point to HTML pages, such as
+ <link rel="next" href="..."> */
+ up->link_expect_html = 1;
+ }
}
}
char *refresh = find_attr (tag, "content", &attrind);
if (!refresh)
- return;
+ return;
- for (p = refresh; ISDIGIT (*p); p++)
- timeout = 10 * timeout + *p - '0';
+ for (p = refresh; c_isdigit (*p); p++)
+ timeout = 10 * timeout + *p - '0';
if (*p++ != ';')
- return;
-
- while (ISSPACE (*p))
- ++p;
- if (!( TOUPPER (*p) == 'U'
- && TOUPPER (*(p + 1)) == 'R'
- && TOUPPER (*(p + 2)) == 'L'
- && *(p + 3) == '='))
- return;
+ return;
+
+ while (c_isspace (*p))
+ ++p;
+ if (!( c_toupper (*p) == 'U'
+ && c_toupper (*(p + 1)) == 'R'
+ && c_toupper (*(p + 2)) == 'L'
+ && *(p + 3) == '='))
+ return;
p += 4;
- while (ISSPACE (*p))
- ++p;
+ while (c_isspace (*p))
+ ++p;
- entry = append_url (p, tag, attrind, ctx);
+ entry = append_url (p, ATTR_POS(tag,attrind,ctx),
+ ATTR_SIZE(tag,attrind), ctx);
if (entry)
- {
- entry->link_refresh_p = 1;
- entry->refresh_timeout = timeout;
- entry->link_expect_html = 1;
- }
+ {
+ entry->link_refresh_p = 1;
+ entry->refresh_timeout = timeout;
+ entry->link_expect_html = 1;
+ }
}
else if (name && 0 == strcasecmp (name, "robots"))
{
if (opt.strict_comments)
flags |= MHT_STRICT_COMMENTS;
+ /* the NULL here used to be interesting_tags */
map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
- interesting_tags, interesting_attributes);
- NULL, interesting_attributes);
++ NULL, interesting_attributes);
DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
if (meta_disallow_follow)
extern char *version_string;
+/* Forward decls. */
+static char *create_authorization_line (const char *, const char *,
+ const char *, const char *,
+ const char *, bool *);
+static char *basic_authentication_encode (const char *, const char *);
+static bool known_authentication_scheme_p (const char *, const char *);
++static void ensure_extension (struct http_stat *, const char *, int *);
+static void load_cookies (void);
+
#ifndef MIN
# define MIN(x, y) ((x) > (y) ? (y) : (x))
#endif
/* Functions for maintaining the URL queue. */
struct queue_element {
- const char *url; /* the URL to download */
- const char *url; /* the URL to download */
- const char *referer; /* the referring document */
- int depth; /* the depth */
- bool html_allowed; /* whether the document is allowed to
- be treated as HTML. */
- bool css_allowed; /* whether the document is allowed to
- be treated as CSS. */
- struct queue_element *next; /* next element in queue */
+ const char *referer; /* the referring document */
+ int depth; /* the depth */
+ bool html_allowed; /* whether the document is allowed to
+ be treated as HTML. */
-
++ bool css_allowed; /* whether the document is allowed to
++ be treated as CSS. */
+ struct queue_element *next; /* next element in queue */
};
struct url_queue {
static void
url_enqueue (struct url_queue *queue,
- const char *url, const char *referer, int depth, bool html_allowed)
- const char *url, const char *referer, int depth,
++ const char *url, const char *referer, int depth,
+ bool html_allowed, bool css_allowed)
{
struct queue_element *qel = xnew (struct queue_element);
qel->url = url;
static bool
url_dequeue (struct url_queue *queue,
- const char **url, const char **referer, int *depth,
- bool *html_allowed, bool *css_allowed)
+ const char **url, const char **referer, int *depth,
- bool *html_allowed)
++ bool *html_allowed, bool *css_allowed)
{
struct queue_element *qel = queue->head;
/* Get the next URL from the queue... */
if (!url_dequeue (queue,
- (const char **)&url, (const char **)&referer,
- &depth, &html_allowed, &css_allowed))
- break;
+ (const char **)&url, (const char **)&referer,
- &depth, &html_allowed))
++ &depth, &html_allowed, &css_allowed))
+ break;
/* ...and download it. Note that this download is in most cases
- unconditional, as download_child_p already makes sure a file
- doesn't get enqueued twice -- and yet this check is here, and
- not in download_child_p. This is so that if you run `wget -r
- URL1 URL2', and a random URL is encountered once under URL1
- and again under URL2, but at a different (possibly smaller)
- depth, we want the URL's children to be taken into account
- the second time. */
+ unconditional, as download_child_p already makes sure a file
+ doesn't get enqueued twice -- and yet this check is here, and
+ not in download_child_p. This is so that if you run `wget -r
+ URL1 URL2', and a random URL is encountered once under URL1
+ and again under URL2, but at a different (possibly smaller)
+ depth, we want the URL's children to be taken into account
+ the second time. */
if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
- {
- file = xstrdup (hash_table_get (dl_url_file_map, url));
+ {
+ file = xstrdup (hash_table_get (dl_url_file_map, url));
- DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
- url, file));
+ DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
+ url, file));
- if (html_allowed
- && downloaded_html_set
- && string_set_contains (downloaded_html_set, file))
+ /* this sucks, needs to be combined! */
- descend = true;
+ if (html_allowed
+ && downloaded_html_set
+ && string_set_contains (downloaded_html_set, file))
- descend = 1;
+ {
+ descend = true;
+ is_css = false;
+ }
+ if (css_allowed
+ && downloaded_css_set
+ && string_set_contains (downloaded_css_set, file))
+ {
- }
++ descend = true;
+ is_css = true;
+ }
+ }
else
- {
- int dt = 0;
- char *redirected = NULL;
+ {
+ int dt = 0;
+ char *redirected = NULL;
- status = retrieve_url (url, &file, &redirected, referer, &dt, false);
+ status = retrieve_url (url, &file, &redirected, referer, &dt, false);
- if (html_allowed && file && status == RETROK
- && (dt & RETROKF) && (dt & TEXTHTML))
+ if (html_allowed && file && status == RETROK
+ && (dt & RETROKF) && (dt & TEXTHTML))
- descend = true;
+ {
+ descend = true;
+ is_css = false;
+ }
+
+ /* a little different, css_allowed can override content type
+ lots of web servers serve css with an incorrect content type
+ */
+ if (file && status == RETROK
+ && (dt & RETROKF) &&
+ ((dt & TEXTCSS) || css_allowed))
+ {
+ descend = true;
+ is_css = false;
+ }
- if (redirected)
- {
- /* We have been redirected, possibly to another host, or
- different path, or wherever. Check whether we really
- want to follow it. */
- if (descend)
- {
- if (!descend_redirect_p (redirected, url, depth,
- start_url_parsed, blacklist))
- descend = false;
- else
- /* Make sure that the old pre-redirect form gets
- blacklisted. */
- string_set_add (blacklist, url);
- }
-
- xfree (url);
- url = redirected;
- }
- }
+ if (redirected)
+ {
+ /* We have been redirected, possibly to another host, or
+ different path, or wherever. Check whether we really
+ want to follow it. */
+ if (descend)
+ {
+ if (!descend_redirect_p (redirected, url, depth,
+ start_url_parsed, blacklist))
+ descend = false;
+ else
+ /* Make sure that the old pre-redirect form gets
+ blacklisted. */
+ string_set_add (blacklist, url);
+ }
+
+ xfree (url);
+ url = redirected;
+ }
+ }
if (opt.spider)
- {
+ {
visited_url (url, referer);
- }
+ }
if (descend
- && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
- {
- if (opt.page_requisites
- && (depth == opt.reclevel || depth == opt.reclevel + 1))
- {
- /* When -p is specified, we are allowed to exceed the
- maximum depth, but only for the "inline" links,
- i.e. those that are needed to display the page.
- Originally this could exceed the depth at most by
- one, but we allow one more level so that the leaf
- pages that contain frames can be loaded
- correctly. */
- dash_p_leaf_HTML = true;
- }
- else
- {
- /* Either -p wasn't specified or it was and we've
- already spent the two extra (pseudo-)levels that it
- affords us, so we need to bail out. */
- DEBUGP (("Not descending further; at depth %d, max. %d.\n",
- depth, opt.reclevel));
- descend = false;
- }
- }
+ && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
+ {
+ if (opt.page_requisites
+ && (depth == opt.reclevel || depth == opt.reclevel + 1))
+ {
+ /* When -p is specified, we are allowed to exceed the
+ maximum depth, but only for the "inline" links,
+ i.e. those that are needed to display the page.
+ Originally this could exceed the depth at most by
+ one, but we allow one more level so that the leaf
+ pages that contain frames can be loaded
+ correctly. */
+ dash_p_leaf_HTML = true;
+ }
+ else
+ {
+ /* Either -p wasn't specified or it was and we've
+ already spent the two extra (pseudo-)levels that it
+ affords us, so we need to bail out. */
+ DEBUGP (("Not descending further; at depth %d, max. %d.\n",
+ depth, opt.reclevel));
+ descend = false;
+ }
+ }
- /* If the downloaded document was HTML, parse it and enqueue the
+ /* If the downloaded document was HTML or CSS, parse it and enqueue the
- links it contains. */
+ links it contains. */
if (descend)
- {
- bool meta_disallow_follow = false;
- struct urlpos *children
- = is_css ? get_urls_css_file (file, url) :
+ {
+ bool meta_disallow_follow = false;
+ struct urlpos *children
- = get_urls_html (file, url, &meta_disallow_follow);
++ = is_css ? get_urls_css_file (file, url) :
+ get_urls_html (file, url, &meta_disallow_follow);
- if (opt.use_robots && meta_disallow_follow)
- {
- free_urlpos (children);
- children = NULL;
- }
-
- if (children)
- {
- struct urlpos *child = children;
- struct url *url_parsed = url_parsed = url_parse (url, NULL);
- assert (url_parsed != NULL);
-
- for (; child; child = child->next)
- {
- if (child->ignore_when_downloading)
- continue;
- if (dash_p_leaf_HTML && !child->link_inline_p)
- continue;
- if (download_child_p (child, url_parsed, depth, start_url_parsed,
- blacklist))
- {
- url_enqueue (queue, xstrdup (child->url->url),
- xstrdup (url), depth + 1,
- child->link_expect_html,
- child->link_expect_css);
- /* We blacklist the URL we have enqueued, because we
- don't want to enqueue (and hence download) the
- same URL twice. */
- string_set_add (blacklist, child->url->url);
- }
- }
-
- url_free (url_parsed);
- free_urlpos (children);
- }
- }
+ if (opt.use_robots && meta_disallow_follow)
+ {
+ free_urlpos (children);
+ children = NULL;
+ }
+
+ if (children)
+ {
+ struct urlpos *child = children;
+ struct url *url_parsed = url_parsed = url_parse (url, NULL);
+ char *referer_url = url;
+ bool strip_auth = (url_parsed != NULL
+ && url_parsed->user != NULL);
+ assert (url_parsed != NULL);
+
+ /* Strip auth info if present */
+ if (strip_auth)
+ referer_url = url_string (url_parsed, URL_AUTH_HIDE);
+
+ for (; child; child = child->next)
+ {
+ if (child->ignore_when_downloading)
+ continue;
+ if (dash_p_leaf_HTML && !child->link_inline_p)
+ continue;
+ if (download_child_p (child, url_parsed, depth, start_url_parsed,
+ blacklist))
+ {
+ url_enqueue (queue, xstrdup (child->url->url),
+ xstrdup (referer_url), depth + 1,
- child->link_expect_html);
++ child->link_expect_html,
++ child->link_expect_css);
+ /* We blacklist the URL we have enqueued, because we
+ don't want to enqueue (and hence download) the
+ same URL twice. */
+ string_set_add (blacklist, child->url->url);
+ }
+ }
+
+ if (strip_auth)
+ xfree (referer_url);
+ url_free (url_parsed);
+ free_urlpos (children);
+ }
+ }
if (file
&& (opt.delete_after
{
char *d1, *d2;
int d3;
- bool d4;
+ bool d4, d5;
while (url_dequeue (queue,
- (const char **)&d1, (const char **)&d2, &d3, &d4))
- (const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
++ (const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
{
- xfree (d1);
- xfree_null (d2);
+ xfree (d1);
+ xfree_null (d2);
}
}
url_queue_delete (queue);
if (local_file)
{
if (*dt & RETROKF)
- {
- register_download (u->url, local_file);
- if (redirection_count && 0 != strcmp (origurl, u->url))
- register_redirection (origurl, u->url);
- if (*dt & TEXTHTML)
- register_html (u->url, local_file);
- if (*dt & TEXTCSS)
- register_css (u->url, local_file);
- }
+ {
+ register_download (u->url, local_file);
+ if (redirection_count && 0 != strcmp (origurl, u->url))
+ register_redirection (origurl, u->url);
+ if (*dt & TEXTHTML)
+ register_html (u->url, local_file);
++ if (*dt & TEXTCSS)
++ register_css (u->url, local_file);
+ }
}
if (file)