From: Micah Cowan
Date: Tue, 22 Apr 2008 08:28:15 +0000 (-0700)
Subject: Merging Ted Mielczarek's CSS changes with tip.
X-Git-Tag: v1.13~429
X-Git-Url: http://sjero.net/git/?p=wget;a=commitdiff_plain;h=caae3b70f46bd519857b595f7f06ea0179551336

Merging Ted Mielczarek's CSS changes with tip.
---

caae3b70f46bd519857b595f7f06ea0179551336
diff --cc configure.ac
index a49de3cd,00000000..cf201aea
mode 100644,000000..100644
--- a/configure.ac
+++ b/configure.ac
@@@ -1,469 -1,0 +1,471 @@@
+dnl Template file for GNU Autoconf
+dnl Copyright (C) 1995, 1996, 1997, 2001, 2007,
+dnl 2008 Free Software Foundation, Inc.
+
+dnl This program is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU General Public License as published by
+dnl the Free Software Foundation; either version 3 of the License, or
+dnl (at your option) any later version.
+
+dnl This program is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+dnl GNU General Public License for more details.
+
+dnl You should have received a copy of the GNU General Public License
+dnl along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+dnl Additional permission under GNU GPL version 3 section 7
+
+dnl If you modify this program, or any covered work, by linking or
+dnl combining it with the OpenSSL project's OpenSSL library (or a
+dnl modified version of that library), containing parts covered by the
+dnl terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+dnl grants you additional permission to convey the resulting work.
+dnl Corresponding Source for a non-source form of such a combination
+dnl shall include the source code for the parts of OpenSSL used as well
+dnl as that of the covered work.
+
+dnl
+dnl Process this file with autoconf to produce a configure script.
+dnl
+
+AC_INIT([wget],
+        [1.12-devel],
+        [bug-wget@gnu.org])
+AC_PREREQ(2.61)
+
+dnl
+dnl What version of Wget are we building?
+dnl
+AC_MSG_NOTICE([configuring for GNU Wget $PACKAGE_VERSION])
+
+AC_CONFIG_MACRO_DIR([m4])
+AC_CONFIG_AUX_DIR([.])
+
+dnl
+dnl Automake setup
+dnl
+AM_INIT_AUTOMAKE(1.9)
+
+dnl
+dnl Gettext
+dnl
+AM_GNU_GETTEXT([external],[need-ngettext])
+AM_GNU_GETTEXT_VERSION([0.16.1])
+
+dnl
+dnl Get canonical host
+dnl
+AC_CANONICAL_HOST
+AC_DEFINE_UNQUOTED([OS_TYPE], "$host_os",
+                   [Define to be the name of the operating system.])
+
+dnl
+dnl Process features.
+dnl
+
+AC_ARG_WITH(ssl,
+[[  --without-ssl           disable SSL autodetection]])
+
+AC_ARG_ENABLE(opie,
+[  --disable-opie          disable support for opie or s/key FTP login],
+ENABLE_OPIE=$enableval, ENABLE_OPIE=yes)
+test x"${ENABLE_OPIE}" = xyes && AC_DEFINE([ENABLE_OPIE], 1,
+   [Define if you want the Opie support for FTP compiled in.])
+
+AC_ARG_ENABLE(digest,
+[  --disable-digest        disable support for HTTP digest authorization],
+ENABLE_DIGEST=$enableval, ENABLE_DIGEST=yes)
+test x"${ENABLE_DIGEST}" = xyes && AC_DEFINE([ENABLE_DIGEST], 1,
+   [Define if you want the HTTP Digest Authorization compiled in.])
+
+AC_ARG_ENABLE(ntlm,
+[  --disable-ntlm          disable support for NTLM authorization],
+[ENABLE_NTLM=$enableval], [ENABLE_NTLM=auto])
+
+AC_ARG_ENABLE(debug,
+[  --disable-debug         disable support for debugging output],
+ENABLE_DEBUG=$enableval, ENABLE_DEBUG=yes)
+test x"${ENABLE_DEBUG}" = xyes && AC_DEFINE([ENABLE_DEBUG], 1,
+   [Define if you want the debug output support compiled in.])
+
+wget_need_md5=no
+
+case "${ENABLE_OPIE}${ENABLE_DIGEST}" in
+*yes*)
+  wget_need_md5=yes
+esac
+
+dnl
+dnl Find the compiler
+dnl
+
+dnl We want these before the checks, so the checks can modify their values.
+test -z "$CFLAGS" && CFLAGS= auto_cflags=1
+test -z "$CC" && cc_specified=yes
+
+AC_PROG_CC
+AM_PROG_CC_C_O
+AC_AIX
+gl_EARLY
+md5_EARLY
+
+AC_PROG_RANLIB
+
++AC_PROG_LEX
++
+dnl Turn on optimization by default. Specifically:
+dnl
+dnl if the user hasn't specified CFLAGS, then
+dnl   if compiler is gcc, then
+dnl     use -O2 and some warning flags
+dnl   else
+dnl     use os-specific flags or -O
+if test -n "$auto_cflags"; then
+  if test -n "$GCC"; then
+    CFLAGS="$CFLAGS -O2 -Wall"
+  else
+    case "$host_os" in
+      *hpux*)  CFLAGS="$CFLAGS +O3" ;;
+      *ultrix* | *osf*) CFLAGS="$CFLAGS -O -Olimit 2000" ;;
+      *)       CFLAGS="$CFLAGS -O" ;;
+    esac
+  fi
+fi
+
+dnl
+dnl Checks for basic compiler characteristics.
+dnl
+AC_C_CONST
+AC_C_INLINE
+AC_C_VOLATILE
+
+dnl Check for basic headers, even though we expect them to exist and
+dnl #include them unconditionally in the code. Their detection is
+dnl still needed because test programs used by Autoconf macros check
+dnl for STDC_HEADERS, HAVE_SYS_TYPES_H, etc. before using them.
+dnl Without the checks they will fail to be included in test programs,
+dnl which will subsequently fail.
+AC_HEADER_STDC
+
+dnl Check for large file support. This check needs to come fairly
+dnl early because it could (in principle) affect whether functions and
+dnl headers are available, whether they work, etc.
+AC_SYS_LARGEFILE
+AC_CHECK_SIZEOF(off_t)
+
+dnl
+dnl Checks for system header files that might be missing.
+dnl
+AC_HEADER_STDBOOL
+AC_CHECK_HEADERS(unistd.h sys/time.h)
+AC_CHECK_HEADERS(termios.h sys/ioctl.h sys/select.h utime.h sys/utime.h)
+AC_CHECK_HEADERS(stdint.h inttypes.h pwd.h wchar.h)
+
+dnl
+dnl Check sizes of integer types. These are used to find n-bit
+dnl integral types on older systems that fail to provide intN_t and
+dnl uintN_t typedefs.
+dnl
+AC_CHECK_SIZEOF(short)
+AC_CHECK_SIZEOF(int)
+AC_CHECK_SIZEOF(long)
+AC_CHECK_SIZEOF(long long)
+AC_CHECK_SIZEOF(void *)
+
+dnl
+dnl Checks for non-universal or system-specific types.
+dnl
+AC_TYPE_SIZE_T
+AC_TYPE_PID_T
+AC_CHECK_TYPES([uint32_t, uintptr_t, intptr_t, int64_t])
+AC_CHECK_TYPES(sig_atomic_t, [], [], [
+#include
+#include
+#if HAVE_INTTYPES_H
+# include <inttypes.h>
+#endif
+#include <signal.h>
+])
+
+# gnulib
+gl_INIT
+
+dnl
+dnl Checks for library functions.
+dnl
+AC_FUNC_ALLOCA
+AC_FUNC_MMAP
+AC_FUNC_FSEEKO
+AC_CHECK_FUNCS(strptime timegm snprintf vsnprintf vasprintf drand48)
+AC_CHECK_FUNCS(strtoll usleep ftello sigblock sigsetjmp memrchr wcwidth mbtowc)
+
+if test x"$ENABLE_OPIE" = xyes; then
+  AC_LIBOBJ([ftp-opie])
+fi
+
+dnl We expect to have these functions on Unix-like systems configure
+dnl runs on. The defines are provided to get them in config.h.in so
+dnl Wget can still be ported to non-Unix systems (such as Windows)
+dnl that lack some of these functions.
+AC_DEFINE([HAVE_STRCASECMP], 1, [Define to 1 if you have the `strcasecmp' function.])
+AC_DEFINE([HAVE_STRNCASECMP], 1, [Define to 1 if you have the `strncasecmp' function.])
+AC_DEFINE([HAVE_STRDUP], 1, [Define to 1 if you have the `strdup' function.])
+AC_DEFINE([HAVE_ISATTY], 1, [Define to 1 if you have the `isatty' function.])
+AC_DEFINE([HAVE_SYMLINK], 1, [Define to 1 if you have the `symlink' function.])
+
+dnl
+dnl Call Wget-specific macros defined in aclocal.
+dnl
+WGET_STRUCT_UTIMBUF
+WGET_SOCKLEN_T
+WGET_FNMATCH
+WGET_NANOSLEEP
+WGET_POSIX_CLOCK
+WGET_NSL_SOCKET
+
+dnl
+dnl Checks for libraries.
+dnl
+
+AS_IF([test x"$with_ssl" = xgnutls], [
+  dnl Now actually check for -lssl
+  AC_LIB_HAVE_LINKFLAGS([gnutls], [], [
+#include <gnutls/gnutls.h>
+  ], [gnutls_global_init()])
+  if test x"$LIBGNUTLS" != x
+  then
+    AC_MSG_NOTICE([compiling in support for SSL via GnuTLS])
+    AC_LIBOBJ([gnutls])
+  else
+    AC_MSG_ERROR([--with-ssl=gnutls was given, but GNUTLS is not available.])
+  fi
+], [
+  # --with-ssl is not gnutls: check if it's no
+  AS_IF([test x"$with_ssl" != xno], [
+    dnl As of this writing (OpenSSL 0.9.6), the libcrypto shared library
+    dnl doesn't record its dependency on libdl, so we need to make sure
+    dnl -ldl ends up in LIBS on systems that have it. Most OSes use
+    dnl dlopen(), but HP-UX uses shl_load().
+    AC_CHECK_LIB(dl, dlopen, [], [
+      AC_CHECK_LIB(dl, shl_load)
+    ])
+
+    dnl Now actually check for -lssl
+    AC_LIB_HAVE_LINKFLAGS([ssl], [crypto], [
+    #include
+    #include
+    #include
+    #include
+    #include
+    #include
+    #include
+    ], [SSL_library_init ()])
+    if test x"$LIBSSL" != x
+    then
+      AC_MSG_NOTICE([compiling in support for SSL via OpenSSL])
+      AC_LIBOBJ([openssl])
+    elif test x"$with_ssl" != x
+    then
+      AC_MSG_ERROR([--with-ssl was given, but SSL is not available.])
+    fi
+  ]) # endif: --with-ssl == no?
+]) # endif: --with-ssl == gnutls?
+
+
+dnl Enable NTLM if requested and if SSL is available.
+if test x"$LIBSSL" != x
+then
+  if test x"$ENABLE_NTLM" != xno
+  then
+    AC_DEFINE([ENABLE_NTLM], 1,
+      [Define if you want the NTLM authorization support compiled in.])
+    AC_LIBOBJ([http-ntlm])
+  fi
+else
+  dnl If SSL is unavailable and the user explicitly requested NTLM,
+  dnl abort.
+  if test x"$ENABLE_NTLM" = xyes
+  then
+    AC_MSG_ERROR([NTLM authorization requested and OpenSSL not found; aborting])
+  fi
+fi
+
+dnl
+dnl Find an MD5 implementation. Since Wget rarely needs MD5, we try
+dnl to use an existing library implementation to save on code size.
+dnl
+
+if test x"$wget_need_md5" = xyes
+then
+  dnl This should be moved to an AC_DEFUN, but I'm not sure how to
+  dnl manipulate MD5_OBJ from the defun.
+
+  AC_LIBOBJ([gen-md5])
+  found_md5=no
+
+  dnl Check for the system MD5 library on Solaris. We don't check for
+  dnl something simple like "MD5Update" because there are a number of
+  dnl MD5 implementations that use that name, but have an otherwise
+  dnl incompatible interface. md5_calc is, hopefully, specific to the
+  dnl Solaris MD5 library.
+  if test x"$found_md5" = xno; then
+    AC_CHECK_LIB(md5, md5_calc, [
+      dnl Some installations have bogus <md5.h> in the compiler's
+      dnl include path, making the system md5 library useless.
+      AC_MSG_CHECKING([for working md5.h])
+      AC_COMPILE_IFELSE([#include <md5.h>
+                        ], [
+        AC_MSG_RESULT(yes)
+        AC_DEFINE([HAVE_SOLARIS_MD5], 1, [Define when using Solaris MD5.])
+        LIBS="-lmd5 $LIBS"
+        found_md5=yes
+        AC_MSG_NOTICE([using the Solaris MD5 implementation])
+      ], [AC_MSG_RESULT(no)])
+    ])
+  fi
+
+  dnl Then see if we're linking OpenSSL anyway; if yes, use its md5
+  dnl implementation.
+  if test x"$found_md5" = xno; then
+    if test x"$LIBSSL" != x; then
+      AC_DEFINE([HAVE_OPENSSL_MD5], 1, [Define when using OpenSSL MD5.])
+      found_md5=yes
+      AC_MSG_NOTICE([using the OpenSSL MD5 implementation])
+    fi
+  fi
+
+  dnl If none of the above worked, use the one we ship with Wget.
+  if test x"$found_md5" = xno; then
+    AC_DEFINE([HAVE_BUILTIN_MD5], 1, [Define when using built-in MD5.])
+    found_md5=yes
+    AC_MSG_NOTICE([using the built-in (GNU) MD5 implementation])
+    AC_C_BIGENDIAN
+
+    AC_SUBST(MD5_CPPFLAGS, '-I $(top_srcdir)/md5')
+    AC_SUBST(MD5_LDADD, '../md5/libmd5.a')
+    AC_SUBST(MD5_SUBDIR, md5)
+    md5_INIT
+  fi
+  AC_DEFINE([HAVE_MD5], 1, [Define if we're compiling support for MD5.])
+fi
+
+dnl **********************************************************************
+dnl Checks for IPv6
+dnl **********************************************************************
+
+dnl
+dnl We test for IPv6 by checking, in turn, for availability of
+dnl getaddrinfo, presence of the INET6 address/protocol family, and
+dnl the existence of struct sockaddr_in6. If any of them is missing,
+dnl IPv6 is disabled, and the code reverts to old-style gethostbyname.
+dnl
+dnl If --enable-ipv6 is explicitly specified on the configure command
+dnl line, we check for IPv6 and abort if not found. If --disable-ipv6
+dnl is specified, we disable IPv6 and don't check for it. The default
+dnl is to autodetect IPv6 and use it where available.
+dnl
+
+AC_ARG_ENABLE(ipv6,
+  AC_HELP_STRING([--disable-ipv6],[disable IPv6 support]),
+  [case "${enable_ipv6}" in
+    no)
+      AC_MSG_NOTICE([disabling IPv6 at user request])
+      dnl Disable IPv6 checking
+      ipv6=no
+      ;;
+    yes)
+      dnl IPv6 explicitly enabled: force its use (abort if unavailable).
+      ipv6=yes
+      force_ipv6=yes
+      ;;
+    auto)
+      dnl Auto-detect IPv6, i.e. check for IPv6, but don't force it.
+      ipv6=yes
+      ;;
+    *)
+      AC_MSG_ERROR([Invalid --enable-ipv6 argument \`$enable_ipv6'])
+      ;;
+  esac
+  ], [
+    dnl If nothing is specified, assume auto-detection.
+    ipv6=yes
+  ]
+)
+
+if test "X$ipv6" = "Xyes"; then
+  AC_CHECK_FUNCS(getaddrinfo, [], [
+    AC_MSG_NOTICE([Disabling IPv6 support: your system does not support getaddrinfo(3)])
+    ipv6=no
+  ])
+fi
+
+if test "X$ipv6" = "Xyes"; then
+  PROTO_INET6([], [
+    AC_MSG_NOTICE([Disabling IPv6 support: your system does not support the PF_INET6 protocol family])
+    ipv6=no
+  ])
+fi
+
+if test "X$ipv6" = "Xyes"; then
+  TYPE_STRUCT_SOCKADDR_IN6([],[
+    AC_MSG_NOTICE([Disabling IPv6 support: your system does not support \`struct sockaddr_in6'])
+    ipv6=no
+  ])
+  if test "X$ipv6" = "Xyes"; then
+    WGET_STRUCT_SOCKADDR_STORAGE
+    MEMBER_SIN6_SCOPE_ID
+  fi
+fi
+
+if test "X$ipv6" = "Xyes"; then
+  AC_DEFINE([ENABLE_IPV6], 1, [Define if IPv6 support is enabled.])
+  AC_MSG_NOTICE([Enabling support for IPv6.])
+elif test "x$force_ipv6" = "xyes"; then
+  AC_MSG_ERROR([IPv6 support requested but not found; aborting])
+fi
+
+
+dnl
+dnl Set of available languages.
+dnl +dnl Originally this used to be static, looking like this: +dnl ALL_LINGUAS="cs de hr it ..." +dnl The downside was that configure needed to be rebuilt whenever a +dnl new language was added. +dnl +ALL_LINGUAS="en@quot en@boldquot en_US $(cd ${srcdir}/po && ls *.po | grep -v 'en@.*quot' | grep -v 'en_US\.po' | sed -e 's/\.po$//' | tr '\012' ' ')" + +dnl +dnl Find makeinfo. We used to provide support for Emacs processing +dnl Texinfo using `emacs -batch -eval ...' where makeinfo is +dnl unavailable, but that broke with the addition of makeinfo-specific +dnl command-line options, such as `-I'. Now we depend on makeinfo to +dnl build the Info documentation. +dnl + +AC_CHECK_PROGS(MAKEINFO, [makeinfo], [true]) + +dnl +dnl Find perl and pod2man +dnl + +AC_PATH_PROGS(PERL, [perl5 perl], no) +AC_PATH_PROG(POD2MAN, pod2man, no) + +if test "x${POD2MAN}" = xno; then + COMMENT_IF_NO_POD2MAN="# " +else + COMMENT_IF_NO_POD2MAN= +fi +AC_SUBST(COMMENT_IF_NO_POD2MAN) + +dnl +dnl Create output +dnl +AC_CONFIG_FILES([Makefile src/Makefile doc/Makefile util/Makefile + po/Makefile.in tests/Makefile tests/WgetTest.pm + lib/Makefile md5/Makefile windows/Makefile]) +AC_CONFIG_HEADERS([src/config.h]) +AC_OUTPUT diff --cc src/Makefile.am index f598d908,00000000..2403f671 mode 100644,000000..100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@@ -1,64 -1,0 +1,66 @@@ +# Makefile for `wget' utility +# Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, +# 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with Wget. If not, see . + +# Additional permission under GNU GPL version 3 section 7 + +# If you modify this program, or any covered work, by linking or +# combining it with the OpenSSL project's OpenSSL library (or a +# modified version of that library), containing parts covered by the +# terms of the OpenSSL or SSLeay licenses, the Free Software Foundation +# grants you additional permission to convey the resulting work. +# Corresponding Source for a non-source form of such a combination +# shall include the source code for the parts of OpenSSL used as well +# as that of the covered work. + +# +# Version: @VERSION@ +# + +# The following line is losing on some versions of make! 
+DEFS = @DEFS@ -DSYSTEM_WGETRC=\"$(sysconfdir)/wgetrc\" -DLOCALEDIR=\"$(localedir)\" +LIBS = @LIBS@ @LIBSSL@ @LIBGNUTLS@ @LIBINTL@ + +bin_PROGRAMS = wget - wget_SOURCES = cmpt.c connect.c convert.c cookies.c ftp.c ftp-basic.c \ ++wget_SOURCES = cmpt.c connect.c convert.c cookies.c \ ++ css.lex css-url.c \ ++ ftp.c ftp-basic.c \ + ftp-ls.c hash.c host.c html-parse.c html-url.c http.c \ + init.c log.c main.c netrc.c progress.c ptimer.c recur.c \ + res.c retr.c snprintf.c spider.c url.c \ + utils.c xmalloc.c \ - connect.h convert.h cookies.h \ - ftp.h gen-md5.h hash.h host.h html-parse.h \ ++ css-url.h connect.h convert.h cookies.h \ ++ ftp.h gen-md5.h hash.h host.h html-parse.h html-url.h \ + http.h http-ntlm.h init.h log.h mswindows.h netrc.h \ + options.h progress.h ptimer.h recur.h res.h retr.h \ + spider.h ssl.h sysdep.h url.h utils.h wget.h xmalloc.h +nodist_wget_SOURCES = version.c +EXTRA_wget_SOURCES = mswindows.c +LDADD = $(ALLOCA) $(LIBOBJS) ../lib/libgnu.a @MD5_LDADD@ +AM_CPPFLAGS = -I $(top_srcdir)/lib @MD5_CPPFLAGS@ + +version.c: $(wget_SOURCES) $(LDADD) $(srcdir)/Makefile.am + echo 'const char *version_string = "@VERSION@"' > $@ + -hg log -r . --template='" ({node|short})"\n' 2>/dev/null >> $@ + echo ';' >> $@ + +check_LIBRARIES = libunittest.a +libunittest_a_SOURCES = $(wget_SOURCES) test.c test.h +nodist_libunittest_a_SOURCES = version.c +libunittest_a_CPPFLAGS = -DTESTING -I$(top_srcdir)/lib +libunittest_a_LIBADD = $(ALLOCA) $(LIBOBJS) + +CLEANFILES = *~ *.bak core core.[0-9]* version.c diff --cc src/convert.c index 2811bff7,7b38550b..4f90bb3b --- a/src/convert.c +++ b/src/convert.c @@@ -83,12 -71,12 +70,12 @@@ convert_links_in_hashtable (struct hash char **file_array; cnt = 0; - if (downloaded_html_set) - cnt = hash_table_count (downloaded_html_set); + if (downloaded_set) + cnt = hash_table_count (downloaded_set); if (cnt == 0) - return; + goto cleanup; file_array = alloca_array (char *, cnt); - string_set_to_array (downloaded_html_set, file_array); + string_set_to_array (downloaded_set, file_array); for (i = 0; i < cnt; i++) { @@@ -165,12 -154,38 +153,39 @@@ /* Free the data. */ free_urlpos (urls); } + } + + /* This function is called when the retrieval is done to convert the + links that have been downloaded. It has to be called at the end of + the retrieval, because only then does Wget know conclusively which + URLs have been downloaded, and which not, so it can tell which + direction to convert to. + + The "direction" means that the URLs to the files that have been + downloaded get converted to the relative URL which will point to + that file. And the other URLs get converted to the remote URL on + the server. + + All the downloaded HTMLs are kept in downloaded_html_files, and + downloaded URLs in urls_downloaded. All the information is + extracted from these two lists. 
*/ + + void + convert_all_links (void) + { + double secs; + int file_count = 0; + + struct ptimer *timer = ptimer_new (); + + convert_links_in_hashtable (downloaded_html_set, 0, &file_count); + convert_links_in_hashtable (downloaded_css_set, 1, &file_count); secs = ptimer_measure (timer); - ptimer_destroy (timer); logprintf (LOG_VERBOSE, _("Converted %d files in %s seconds.\n"), file_count, print_decimal (secs)); +cleanup: + ptimer_destroy (timer); } static void write_backup_file (const char *, downloaded_file_t); diff --cc src/html-parse.c index ade82f2b,8254c6dc..f744597b --- a/src/html-parse.c +++ b/src/html-parse.c @@@ -881,112 -984,117 +984,117 @@@ map_html_tags (const char *text, int si COMPACT="compact">. Even if such attributes are not useful to Wget, we need to support them, so that the tags containing them can be parsed correctly. */ - attr_raw_value_begin = attr_value_begin = attr_name_begin; - attr_raw_value_end = attr_value_end = attr_name_end; - } - else if (*p == '=') - { - ADVANCE (p); - SKIP_WS (p); - if (*p == '\"' || *p == '\'') - { - bool newline_seen = false; - char quote_char = *p; - attr_raw_value_begin = p; - ADVANCE (p); - attr_value_begin = p; /* */ - /* ^ */ - while (*p != quote_char) - { - if (!newline_seen && *p == '\n') - { - /* If a newline is seen within the quotes, it - is most likely that someone forgot to close - the quote. In that case, we back out to - the value beginning, and terminate the tag - at either `>' or the delimiter, whichever - comes first. Such a tag terminated at `>' - is discarded. */ - p = attr_value_begin; - newline_seen = true; - continue; - } - else if (newline_seen && *p == '>') - break; - ADVANCE (p); - } - attr_value_end = p; /* */ - /* ^ */ - if (*p == quote_char) - ADVANCE (p); - else - goto look_for_tag; - attr_raw_value_end = p; /* */ - /* ^ */ - operation = AP_DECODE_ENTITIES; - if (flags & MHT_TRIM_VALUES) - operation |= AP_TRIM_BLANKS; - } - else - { - attr_value_begin = p; /* */ - /* ^ */ - /* According to SGML, a name token should consist only - of alphanumerics, . and -. However, this is often - violated by, for instance, `%' in `width=75%'. - We'll be liberal and allow just about anything as - an attribute value. */ - while (!ISSPACE (*p) && *p != '>') - ADVANCE (p); - attr_value_end = p; /* */ - /* ^ */ - if (attr_value_begin == attr_value_end) - /* */ - /* ^ */ - goto backout_tag; - attr_raw_value_begin = attr_value_begin; - attr_raw_value_end = attr_value_end; - operation = AP_DECODE_ENTITIES; - } - } - else - { - /* We skipped the whitespace and found something that is - neither `=' nor the beginning of the next attribute's - name. Back out. */ - goto backout_tag; /* */ + /* ^ */ + while (*p != quote_char) + { + if (!newline_seen && *p == '\n') + { + /* If a newline is seen within the quotes, it + is most likely that someone forgot to close + the quote. In that case, we back out to + the value beginning, and terminate the tag + at either `>' or the delimiter, whichever + comes first. Such a tag terminated at `>' + is discarded. 
*/ + p = attr_value_begin; + newline_seen = true; + continue; + } + else if (newline_seen && *p == '>') + break; + ADVANCE (p); + } + attr_value_end = p; /* */ + /* ^ */ + if (*p == quote_char) + ADVANCE (p); + else + goto look_for_tag; + attr_raw_value_end = p; /* */ + /* ^ */ + operation = AP_DECODE_ENTITIES; + if (flags & MHT_TRIM_VALUES) + operation |= AP_TRIM_BLANKS; + } + else + { + attr_value_begin = p; /* */ + /* ^ */ + /* According to SGML, a name token should consist only + of alphanumerics, . and -. However, this is often + violated by, for instance, `%' in `width=75%'. + We'll be liberal and allow just about anything as + an attribute value. */ + while (!c_isspace (*p) && *p != '>') + ADVANCE (p); + attr_value_end = p; /* */ + /* ^ */ + if (attr_value_begin == attr_value_end) + /* */ + /* ^ */ + goto backout_tag; + attr_raw_value_begin = attr_value_begin; + attr_raw_value_end = attr_value_end; + operation = AP_DECODE_ENTITIES; + } + } + else + { + /* We skipped the whitespace and found something that is + neither `=' nor the beginning of the next attribute's + name. Back out. */ + goto backout_tag; /* tagname_begin == tag_name_begin)) + { + tail->contents_begin = p+1; + } + if (uninteresting_tag) { - ADVANCE (p); - goto look_for_tag; + ADVANCE (p); + goto look_for_tag; } /* By now, we have a valid tag with a name and zero or more diff --cc src/html-url.c index e9f2773a,ebf8494d..c9cf28f6 --- a/src/html-url.c +++ b/src/html-url.c @@@ -163,11 -163,12 +163,12 @@@ static struct from the information above. However, some places in the code refer to the attributes not mentioned here. We add them manually. */ static const char *additional_attributes[] = { - "rel", /* used by tag_handle_link */ - "http-equiv", /* used by tag_handle_meta */ - "name", /* used by tag_handle_meta */ - "content", /* used by tag_handle_meta */ - "action" /* used by tag_handle_form */ - "rel", /* used by tag_handle_link */ - "http-equiv", /* used by tag_handle_meta */ - "name", /* used by tag_handle_meta */ - "content", /* used by tag_handle_meta */ - "action", /* used by tag_handle_form */ - "style" /* used by check_style_attr */ ++ "rel", /* used by tag_handle_link */ ++ "http-equiv", /* used by tag_handle_meta */ ++ "name", /* used by tag_handle_meta */ ++ "content", /* used by tag_handle_meta */ ++ "action", /* used by tag_handle_form */ ++ "style" /* used by check_style_attr */ }; static struct hash_table *interesting_tags; @@@ -385,25 -390,26 +390,26 @@@ tag_find_urls (int tagid, struct taginf const int size = countof (tag_url_attributes); /* If you're cringing at the inefficiency of the nested loops, - remember that they both iterate over a very small number of - items. The worst-case inner loop is for the IMG tag, which - has three attributes. */ + remember that they both iterate over a very small number of + items. The worst-case inner loop is for the IMG tag, which + has three attributes. 
*/ for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++) - { - if (0 == strcasecmp (tag->attrs[attrind].name, - tag_url_attributes[i].attr_name)) - { - struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx), + { + if (0 == strcasecmp (tag->attrs[attrind].name, + tag_url_attributes[i].attr_name)) + { - struct urlpos *up = append_url (link, tag, attrind, ctx); ++ struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx), + ATTR_SIZE(tag,attrind), ctx); - if (up) - { - int flags = tag_url_attributes[i].flags; - if (flags & ATTR_INLINE) - up->link_inline_p = 1; - if (flags & ATTR_HTML) - up->link_expect_html = 1; - } - } - } + if (up) + { + int flags = tag_url_attributes[i].flags; + if (flags & ATTR_INLINE) + up->link_inline_p = 1; + if (flags & ATTR_HTML) + up->link_expect_html = 1; + } + } + } } } @@@ -439,11 -446,13 +446,13 @@@ tag_handle_form (int tagid, struct tagi { int attrind; char *action = find_attr (tag, "action", &attrind); + if (action) { - struct urlpos *up = append_url (action, tag, attrind, ctx); + struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx), + ATTR_SIZE(tag,attrind), ctx); if (up) - up->ignore_when_downloading = 1; + up->ignore_when_downloading = 1; } } @@@ -464,19 -473,28 +473,28 @@@ tag_handle_link (int tagid, struct tagi */ if (href) { - struct urlpos *up = append_url (href, tag, attrind, ctx); + struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx), + ATTR_SIZE(tag,attrind), ctx); if (up) - { - char *rel = find_attr (tag, "rel", NULL); - if (rel) + { + char *rel = find_attr (tag, "rel", NULL); - if (rel - && (0 == strcasecmp (rel, "stylesheet") - || 0 == strcasecmp (rel, "shortcut icon"))) - up->link_inline_p = 1; ++ if (rel) + { - if (0 == strcasecmp (rel, "stylesheet")) ++ if (0 == strcasecmp (rel, "stylesheet")) + { + up->link_inline_p = 1; + up->link_expect_css = 1; + } - else if (0 == strcasecmp (rel, "shortcut icon")) ++ else if (0 == strcasecmp (rel, "shortcut icon")) + { + up->link_inline_p = 1; + } + } - else - /* The external ones usually point to HTML pages, such as - */ - up->link_expect_html = 1; - } + else + /* The external ones usually point to HTML pages, such as + */ + up->link_expect_html = 1; + } } } @@@ -507,31 -525,32 +525,32 @@@ tag_handle_meta (int tagid, struct tagi char *refresh = find_attr (tag, "content", &attrind); if (!refresh) - return; + return; - for (p = refresh; ISDIGIT (*p); p++) - timeout = 10 * timeout + *p - '0'; + for (p = refresh; c_isdigit (*p); p++) + timeout = 10 * timeout + *p - '0'; if (*p++ != ';') - return; - - while (ISSPACE (*p)) - ++p; - if (!( TOUPPER (*p) == 'U' - && TOUPPER (*(p + 1)) == 'R' - && TOUPPER (*(p + 2)) == 'L' - && *(p + 3) == '=')) - return; + return; + + while (c_isspace (*p)) + ++p; + if (!( c_toupper (*p) == 'U' + && c_toupper (*(p + 1)) == 'R' + && c_toupper (*(p + 2)) == 'L' + && *(p + 3) == '=')) + return; p += 4; - while (ISSPACE (*p)) - ++p; + while (c_isspace (*p)) + ++p; - entry = append_url (p, tag, attrind, ctx); + entry = append_url (p, ATTR_POS(tag,attrind,ctx), + ATTR_SIZE(tag,attrind), ctx); if (entry) - { - entry->link_refresh_p = 1; - entry->refresh_timeout = timeout; - entry->link_expect_html = 1; - } + { + entry->link_refresh_p = 1; + entry->refresh_timeout = timeout; + entry->link_expect_html = 1; + } } else if (name && 0 == strcasecmp (name, "robots")) { @@@ -618,8 -652,9 +652,9 @@@ get_urls_html (const char *file, const if (opt.strict_comments) flags |= MHT_STRICT_COMMENTS; + /* the NULL here used to be interesting_tags */ 
map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags, - interesting_tags, interesting_attributes); - NULL, interesting_attributes); ++ NULL, interesting_attributes); DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow)); if (meta_disallow_follow) diff --cc src/http.c index ec815c8f,d3f6704f..fb8184f1 --- a/src/http.c +++ b/src/http.c @@@ -68,14 -67,6 +68,15 @@@ as that of the covered work. * extern char *version_string; +/* Forward decls. */ +static char *create_authorization_line (const char *, const char *, + const char *, const char *, + const char *, bool *); +static char *basic_authentication_encode (const char *, const char *); +static bool known_authentication_scheme_p (const char *, const char *); ++static void ensure_extension (struct http_stat *, const char *, int *); +static void load_cookies (void); + #ifndef MIN # define MIN(x, y) ((x) > (y) ? (y) : (x)) #endif diff --cc src/recur.c index c11cfdad,024073ce..daf8a374 --- a/src/recur.c +++ b/src/recur.c @@@ -53,13 -55,14 +55,13 @@@ as that of the covered work. * /* Functions for maintaining the URL queue. */ struct queue_element { - const char *url; /* the URL to download */ - const char *url; /* the URL to download */ - const char *referer; /* the referring document */ - int depth; /* the depth */ - bool html_allowed; /* whether the document is allowed to - be treated as HTML. */ - bool css_allowed; /* whether the document is allowed to - be treated as CSS. */ - struct queue_element *next; /* next element in queue */ + const char *referer; /* the referring document */ + int depth; /* the depth */ + bool html_allowed; /* whether the document is allowed to + be treated as HTML. */ - ++ bool css_allowed; /* whether the document is allowed to ++ be treated as CSS. */ + struct queue_element *next; /* next element in queue */ }; struct url_queue { @@@ -91,7 -94,8 +93,8 @@@ url_queue_delete (struct url_queue *que static void url_enqueue (struct url_queue *queue, - const char *url, const char *referer, int depth, bool html_allowed) - const char *url, const char *referer, int depth, ++ const char *url, const char *referer, int depth, + bool html_allowed, bool css_allowed) { struct queue_element *qel = xnew (struct queue_element); qel->url = url; @@@ -120,8 -125,8 +124,8 @@@ static bool url_dequeue (struct url_queue *queue, - const char **url, const char **referer, int *depth, - bool *html_allowed, bool *css_allowed) + const char **url, const char **referer, int *depth, - bool *html_allowed) ++ bool *html_allowed, bool *css_allowed) { struct queue_element *qel = queue->head; @@@ -219,146 -226,164 +225,173 @@@ retrieve_tree (const char *start_url /* Get the next URL from the queue... */ if (!url_dequeue (queue, - (const char **)&url, (const char **)&referer, - &depth, &html_allowed, &css_allowed)) - break; + (const char **)&url, (const char **)&referer, - &depth, &html_allowed)) ++ &depth, &html_allowed, &css_allowed)) + break; /* ...and download it. Note that this download is in most cases - unconditional, as download_child_p already makes sure a file - doesn't get enqueued twice -- and yet this check is here, and - not in download_child_p. This is so that if you run `wget -r - URL1 URL2', and a random URL is encountered once under URL1 - and again under URL2, but at a different (possibly smaller) - depth, we want the URL's children to be taken into account - the second time. 
*/ + unconditional, as download_child_p already makes sure a file + doesn't get enqueued twice -- and yet this check is here, and + not in download_child_p. This is so that if you run `wget -r + URL1 URL2', and a random URL is encountered once under URL1 + and again under URL2, but at a different (possibly smaller) + depth, we want the URL's children to be taken into account + the second time. */ if (dl_url_file_map && hash_table_contains (dl_url_file_map, url)) - { - file = xstrdup (hash_table_get (dl_url_file_map, url)); + { + file = xstrdup (hash_table_get (dl_url_file_map, url)); - DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n", - url, file)); + DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n", + url, file)); + /* this sucks, needs to be combined! */ - if (html_allowed - && downloaded_html_set - && string_set_contains (downloaded_html_set, file)) + if (html_allowed + && downloaded_html_set + && string_set_contains (downloaded_html_set, file)) - descend = true; + { + descend = true; + is_css = false; + } + if (css_allowed + && downloaded_css_set + && string_set_contains (downloaded_css_set, file)) + { - descend = 1; ++ descend = true; + is_css = true; + } - } + } else - { - int dt = 0; - char *redirected = NULL; + { + int dt = 0; + char *redirected = NULL; - status = retrieve_url (url, &file, &redirected, referer, &dt, false); + status = retrieve_url (url, &file, &redirected, referer, &dt, false); - if (html_allowed && file && status == RETROK - && (dt & RETROKF) && (dt & TEXTHTML)) + if (html_allowed && file && status == RETROK + && (dt & RETROKF) && (dt & TEXTHTML)) - descend = true; + { + descend = true; + is_css = false; + } + + /* a little different, css_allowed can override content type + lots of web servers serve css with an incorrect content type + */ + if (file && status == RETROK + && (dt & RETROKF) && + ((dt & TEXTCSS) || css_allowed)) + { + descend = true; + is_css = false; + } - if (redirected) - { - /* We have been redirected, possibly to another host, or - different path, or wherever. Check whether we really - want to follow it. */ - if (descend) - { - if (!descend_redirect_p (redirected, url, depth, - start_url_parsed, blacklist)) - descend = false; - else - /* Make sure that the old pre-redirect form gets - blacklisted. */ - string_set_add (blacklist, url); - } - - xfree (url); - url = redirected; - } - } + if (redirected) + { + /* We have been redirected, possibly to another host, or + different path, or wherever. Check whether we really + want to follow it. */ + if (descend) + { + if (!descend_redirect_p (redirected, url, depth, + start_url_parsed, blacklist)) + descend = false; + else + /* Make sure that the old pre-redirect form gets + blacklisted. */ + string_set_add (blacklist, url); + } + + xfree (url); + url = redirected; + } + } if (opt.spider) - { + { visited_url (url, referer); - } + } if (descend - && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION) - { - if (opt.page_requisites - && (depth == opt.reclevel || depth == opt.reclevel + 1)) - { - /* When -p is specified, we are allowed to exceed the - maximum depth, but only for the "inline" links, - i.e. those that are needed to display the page. - Originally this could exceed the depth at most by - one, but we allow one more level so that the leaf - pages that contain frames can be loaded - correctly. 
*/ - dash_p_leaf_HTML = true; - } - else - { - /* Either -p wasn't specified or it was and we've - already spent the two extra (pseudo-)levels that it - affords us, so we need to bail out. */ - DEBUGP (("Not descending further; at depth %d, max. %d.\n", - depth, opt.reclevel)); - descend = false; - } - } + && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION) + { + if (opt.page_requisites + && (depth == opt.reclevel || depth == opt.reclevel + 1)) + { + /* When -p is specified, we are allowed to exceed the + maximum depth, but only for the "inline" links, + i.e. those that are needed to display the page. + Originally this could exceed the depth at most by + one, but we allow one more level so that the leaf + pages that contain frames can be loaded + correctly. */ + dash_p_leaf_HTML = true; + } + else + { + /* Either -p wasn't specified or it was and we've + already spent the two extra (pseudo-)levels that it + affords us, so we need to bail out. */ + DEBUGP (("Not descending further; at depth %d, max. %d.\n", + depth, opt.reclevel)); + descend = false; + } + } - /* If the downloaded document was HTML, parse it and enqueue the + /* If the downloaded document was HTML or CSS, parse it and enqueue the - links it contains. */ + links it contains. */ if (descend) - { - bool meta_disallow_follow = false; - struct urlpos *children - = is_css ? get_urls_css_file (file, url) : + { + bool meta_disallow_follow = false; + struct urlpos *children - = get_urls_html (file, url, &meta_disallow_follow); ++ = is_css ? get_urls_css_file (file, url) : + get_urls_html (file, url, &meta_disallow_follow); - if (opt.use_robots && meta_disallow_follow) - { - free_urlpos (children); - children = NULL; - } - - if (children) - { - struct urlpos *child = children; - struct url *url_parsed = url_parsed = url_parse (url, NULL); - assert (url_parsed != NULL); - - for (; child; child = child->next) - { - if (child->ignore_when_downloading) - continue; - if (dash_p_leaf_HTML && !child->link_inline_p) - continue; - if (download_child_p (child, url_parsed, depth, start_url_parsed, - blacklist)) - { - url_enqueue (queue, xstrdup (child->url->url), - xstrdup (url), depth + 1, - child->link_expect_html, - child->link_expect_css); - /* We blacklist the URL we have enqueued, because we - don't want to enqueue (and hence download) the - same URL twice. */ - string_set_add (blacklist, child->url->url); - } - } - - url_free (url_parsed); - free_urlpos (children); - } - } + if (opt.use_robots && meta_disallow_follow) + { + free_urlpos (children); + children = NULL; + } + + if (children) + { + struct urlpos *child = children; + struct url *url_parsed = url_parsed = url_parse (url, NULL); + char *referer_url = url; + bool strip_auth = (url_parsed != NULL + && url_parsed->user != NULL); + assert (url_parsed != NULL); + + /* Strip auth info if present */ + if (strip_auth) + referer_url = url_string (url_parsed, URL_AUTH_HIDE); + + for (; child; child = child->next) + { + if (child->ignore_when_downloading) + continue; + if (dash_p_leaf_HTML && !child->link_inline_p) + continue; + if (download_child_p (child, url_parsed, depth, start_url_parsed, + blacklist)) + { + url_enqueue (queue, xstrdup (child->url->url), + xstrdup (referer_url), depth + 1, - child->link_expect_html); ++ child->link_expect_html, ++ child->link_expect_css); + /* We blacklist the URL we have enqueued, because we + don't want to enqueue (and hence download) the + same URL twice. 
*/ + string_set_add (blacklist, child->url->url); + } + } + + if (strip_auth) + xfree (referer_url); + url_free (url_parsed); + free_urlpos (children); + } + } if (file && (opt.delete_after @@@ -394,12 -419,12 +427,12 @@@ { char *d1, *d2; int d3; - bool d4; + bool d4, d5; while (url_dequeue (queue, - (const char **)&d1, (const char **)&d2, &d3, &d4)) - (const char **)&d1, (const char **)&d2, &d3, &d4, &d5)) ++ (const char **)&d1, (const char **)&d2, &d3, &d4, &d5)) { - xfree (d1); - xfree_null (d2); + xfree (d1); + xfree_null (d2); } } url_queue_delete (queue); diff --cc src/retr.c index 179430ac,245eb129..7bdd4193 --- a/src/retr.c +++ b/src/retr.c @@@ -772,13 -779,15 +773,15 @@@ retrieve_url (const char *origurl, cha if (local_file) { if (*dt & RETROKF) - { - register_download (u->url, local_file); - if (redirection_count && 0 != strcmp (origurl, u->url)) - register_redirection (origurl, u->url); - if (*dt & TEXTHTML) - register_html (u->url, local_file); - if (*dt & TEXTCSS) - register_css (u->url, local_file); - } + { + register_download (u->url, local_file); + if (redirection_count && 0 != strcmp (origurl, u->url)) + register_redirection (origurl, u->url); + if (*dt & TEXTHTML) + register_html (u->url, local_file); ++ if (*dt & TEXTCSS) ++ register_css (u->url, local_file); + } } if (file)