+2000-11-10 Hrvoje Niksic <hniksic@arsdigita.com>
+
+ * configure.in: Test for MMAP.
+
2000-11-16 Hrvoje Niksic <hniksic@arsdigita.com>
* windows/config.h.ms: snprintf and vsnprintf exist under Windows.
* Make `-k' check for files that were downloaded in the past and convert links
to them in newly-downloaded documents.
-* -k should convert relative references to absolute if not downloaded.
-
-* -k should convert "hostless absolute" URLs, like <A HREF="/index.html">.
- However, Brian McMahon <bm@iucr.org> wants the old incorrect behavior to still
- be available as an option, as he depends on it to allow mirrors of his site to
- send CGI queries to his original site, but still get graphics off of the
- mirror site. Perhaps this would be better dealt with by adding an option to
- tell -k not to convert certain URL patterns?
-
* Add option to clobber existing file names (no `.N' suffixes).
* Introduce a concept of "boolean" options. For instance, every
* Allow size limit to files (perhaps with an option to download oversize files
up through the limit or not at all, to get more functionality than [u]limit.
-* Recognize HTML comments correctly. Add more options for handling
- bogus HTML found all over the 'net.
-
* Implement breadth-first retrieval.
* Download to .in* when mirroring.
fi
+for ac_hdr in unistd.h
+do
+ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'`
+echo $ac_n "checking for $ac_hdr""... $ac_c" 1>&6
+echo "configure:2048: checking for $ac_hdr" >&5
+if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ cat > conftest.$ac_ext <<EOF
+#line 2053 "configure"
+#include "confdefs.h"
+#include <$ac_hdr>
+EOF
+ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
+{ (eval echo configure:2058: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
+if test -z "$ac_err"; then
+ rm -rf conftest*
+ eval "ac_cv_header_$ac_safe=yes"
+else
+ echo "$ac_err" >&5
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ eval "ac_cv_header_$ac_safe=no"
+fi
+rm -f conftest*
+fi
+if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then
+ echo "$ac_t""yes" 1>&6
+ ac_tr_hdr=HAVE_`echo $ac_hdr | sed 'y%abcdefghijklmnopqrstuvwxyz./-%ABCDEFGHIJKLMNOPQRSTUVWXYZ___%'`
+ cat >> confdefs.h <<EOF
+#define $ac_tr_hdr 1
+EOF
+
+else
+ echo "$ac_t""no" 1>&6
+fi
+done
+
+for ac_func in getpagesize
+do
+echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
+echo "configure:2087: checking for $ac_func" >&5
+if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ cat > conftest.$ac_ext <<EOF
+#line 2092 "configure"
+#include "confdefs.h"
+/* System header to define __stub macros and hopefully few prototypes,
+ which can conflict with char $ac_func(); below. */
+#include <assert.h>
+/* Override any gcc2 internal prototype to avoid an error. */
+/* We use char because int might match the return type of a gcc2
+ builtin and then its argument prototype would still apply. */
+char $ac_func();
+
+int main() {
+
+/* The GNU C library defines this for functions which it implements
+ to always fail with ENOSYS. Some functions are actually named
+ something starting with __ and the normal name is an alias. */
+#if defined (__stub_$ac_func) || defined (__stub___$ac_func)
+choke me
+#else
+$ac_func();
+#endif
+
+; return 0; }
+EOF
+if { (eval echo configure:2115: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ rm -rf conftest*
+ eval "ac_cv_func_$ac_func=yes"
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ eval "ac_cv_func_$ac_func=no"
+fi
+rm -f conftest*
+fi
+
+if eval "test \"`echo '$ac_cv_func_'$ac_func`\" = yes"; then
+ echo "$ac_t""yes" 1>&6
+ ac_tr_func=HAVE_`echo $ac_func | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'`
+ cat >> confdefs.h <<EOF
+#define $ac_tr_func 1
+EOF
+
+else
+ echo "$ac_t""no" 1>&6
+fi
+done
+
+echo $ac_n "checking for working mmap""... $ac_c" 1>&6
+echo "configure:2140: checking for working mmap" >&5
+if eval "test \"`echo '$''{'ac_cv_func_mmap_fixed_mapped'+set}'`\" = set"; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ if test "$cross_compiling" = yes; then
+ ac_cv_func_mmap_fixed_mapped=no
+else
+ cat > conftest.$ac_ext <<EOF
+#line 2148 "configure"
+#include "confdefs.h"
+
+/* Thanks to Mike Haertel and Jim Avera for this test.
+ Here is a matrix of mmap possibilities:
+ mmap private not fixed
+ mmap private fixed at somewhere currently unmapped
+ mmap private fixed at somewhere already mapped
+ mmap shared not fixed
+ mmap shared fixed at somewhere currently unmapped
+ mmap shared fixed at somewhere already mapped
+ For private mappings, we should verify that changes cannot be read()
+ back from the file, nor mmap's back from the file at a different
+ address. (There have been systems where private was not correctly
+ implemented like the infamous i386 svr4.0, and systems where the
+ VM page cache was not coherent with the filesystem buffer cache
+ like early versions of FreeBSD and possibly contemporary NetBSD.)
+ For shared mappings, we should conversely verify that changes get
+ propagated back to all the places they're supposed to be.
+
+ Grep wants private fixed already mapped.
+ The main things grep needs to know about mmap are:
+ * does it exist and is it safe to write into the mmap'd area
+ * how to use it (BSD variants) */
+#include <sys/types.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+
+/* This mess was copied from the GNU getpagesize.h. */
+#ifndef HAVE_GETPAGESIZE
+# ifdef HAVE_UNISTD_H
+# include <unistd.h>
+# endif
+
+/* Assume that all systems that can run configure have sys/param.h. */
+# ifndef HAVE_SYS_PARAM_H
+# define HAVE_SYS_PARAM_H 1
+# endif
+
+# ifdef _SC_PAGESIZE
+# define getpagesize() sysconf(_SC_PAGESIZE)
+# else /* no _SC_PAGESIZE */
+# ifdef HAVE_SYS_PARAM_H
+# include <sys/param.h>
+# ifdef EXEC_PAGESIZE
+# define getpagesize() EXEC_PAGESIZE
+# else /* no EXEC_PAGESIZE */
+# ifdef NBPG
+# define getpagesize() NBPG * CLSIZE
+# ifndef CLSIZE
+# define CLSIZE 1
+# endif /* no CLSIZE */
+# else /* no NBPG */
+# ifdef NBPC
+# define getpagesize() NBPC
+# else /* no NBPC */
+# ifdef PAGESIZE
+# define getpagesize() PAGESIZE
+# endif /* PAGESIZE */
+# endif /* no NBPC */
+# endif /* no NBPG */
+# endif /* no EXEC_PAGESIZE */
+# else /* no HAVE_SYS_PARAM_H */
+# define getpagesize() 8192 /* punt totally */
+# endif /* no HAVE_SYS_PARAM_H */
+# endif /* no _SC_PAGESIZE */
+
+#endif /* no HAVE_GETPAGESIZE */
+
+#ifdef __cplusplus
+extern "C" { void *malloc(unsigned); }
+#else
+char *malloc();
+#endif
+
+int
+main()
+{
+ char *data, *data2, *data3;
+ int i, pagesize;
+ int fd;
+
+ pagesize = getpagesize();
+
+ /*
+ * First, make a file with some known garbage in it.
+ */
+ data = malloc(pagesize);
+ if (!data)
+ exit(1);
+ for (i = 0; i < pagesize; ++i)
+ *(data + i) = rand();
+ umask(0);
+ fd = creat("conftestmmap", 0600);
+ if (fd < 0)
+ exit(1);
+ if (write(fd, data, pagesize) != pagesize)
+ exit(1);
+ close(fd);
+
+ /*
+ * Next, try to mmap the file at a fixed address which
+ * already has something else allocated at it. If we can,
+ * also make sure that we see the same garbage.
+ */
+ fd = open("conftestmmap", O_RDWR);
+ if (fd < 0)
+ exit(1);
+ data2 = malloc(2 * pagesize);
+ if (!data2)
+ exit(1);
+ data2 += (pagesize - ((int) data2 & (pagesize - 1))) & (pagesize - 1);
+ if (data2 != mmap(data2, pagesize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_FIXED, fd, 0L))
+ exit(1);
+ for (i = 0; i < pagesize; ++i)
+ if (*(data + i) != *(data2 + i))
+ exit(1);
+
+ /*
+ * Finally, make sure that changes to the mapped area
+ * do not percolate back to the file as seen by read().
+ * (This is a bug on some variants of i386 svr4.0.)
+ */
+ for (i = 0; i < pagesize; ++i)
+ *(data2 + i) = *(data2 + i) + 1;
+ data3 = malloc(pagesize);
+ if (!data3)
+ exit(1);
+ if (read(fd, data3, pagesize) != pagesize)
+ exit(1);
+ for (i = 0; i < pagesize; ++i)
+ if (*(data + i) != *(data3 + i))
+ exit(1);
+ close(fd);
+ unlink("conftestmmap");
+ exit(0);
+}
+
+EOF
+if { (eval echo configure:2288: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null
+then
+ ac_cv_func_mmap_fixed_mapped=yes
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -fr conftest*
+ ac_cv_func_mmap_fixed_mapped=no
+fi
+rm -fr conftest*
+fi
+
+fi
+
+echo "$ac_t""$ac_cv_func_mmap_fixed_mapped" 1>&6
+if test $ac_cv_func_mmap_fixed_mapped = yes; then
+ cat >> confdefs.h <<\EOF
+#define HAVE_MMAP 1
+EOF
+
+fi
+
for ac_func in strdup strstr strcasecmp strncasecmp
do
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
-echo "configure:2047: checking for $ac_func" >&5
+echo "configure:2313: checking for $ac_func" >&5
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 2052 "configure"
+#line 2318 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char $ac_func(); below. */
; return 0; }
EOF
-if { (eval echo configure:2075: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:2341: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_$ac_func=yes"
else
for ac_func in gettimeofday mktime strptime
do
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
-echo "configure:2102: checking for $ac_func" >&5
+echo "configure:2368: checking for $ac_func" >&5
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 2107 "configure"
+#line 2373 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char $ac_func(); below. */
; return 0; }
EOF
-if { (eval echo configure:2130: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:2396: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_$ac_func=yes"
else
for ac_func in strerror snprintf vsnprintf select signal symlink access isatty
do
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
-echo "configure:2157: checking for $ac_func" >&5
+echo "configure:2423: checking for $ac_func" >&5
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 2162 "configure"
+#line 2428 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char $ac_func(); below. */
; return 0; }
EOF
-if { (eval echo configure:2185: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:2451: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_$ac_func=yes"
else
for ac_func in uname gethostname
do
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
-echo "configure:2212: checking for $ac_func" >&5
+echo "configure:2478: checking for $ac_func" >&5
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 2217 "configure"
+#line 2483 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char $ac_func(); below. */
; return 0; }
EOF
-if { (eval echo configure:2240: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:2506: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_$ac_func=yes"
else
for ac_func in gethostbyname
do
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
-echo "configure:2268: checking for $ac_func" >&5
+echo "configure:2534: checking for $ac_func" >&5
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 2273 "configure"
+#line 2539 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char $ac_func(); below. */
; return 0; }
EOF
-if { (eval echo configure:2296: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:2562: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_$ac_func=yes"
else
else
echo "$ac_t""no" 1>&6
echo $ac_n "checking for gethostbyname in -lnsl""... $ac_c" 1>&6
-echo "configure:2318: checking for gethostbyname in -lnsl" >&5
+echo "configure:2584: checking for gethostbyname in -lnsl" >&5
ac_lib_var=`echo nsl'_'gethostbyname | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-lnsl $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 2326 "configure"
+#line 2592 "configure"
#include "confdefs.h"
/* Override any gcc2 internal prototype to avoid an error. */
/* We use char because int might match the return type of a gcc2
gethostbyname()
; return 0; }
EOF
-if { (eval echo configure:2337: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:2603: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
echo $ac_n "checking for socket in -lsocket""... $ac_c" 1>&6
-echo "configure:2371: checking for socket in -lsocket" >&5
+echo "configure:2637: checking for socket in -lsocket" >&5
ac_lib_var=`echo socket'_'socket | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-lsocket $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 2379 "configure"
+#line 2645 "configure"
#include "confdefs.h"
/* Override any gcc2 internal prototype to avoid an error. */
/* We use char because int might match the return type of a gcc2
socket()
; return 0; }
EOF
-if { (eval echo configure:2390: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:2656: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
if test "x${with_socks}" = xyes
then
echo $ac_n "checking for main in -lresolv""... $ac_c" 1>&6
-echo "configure:2421: checking for main in -lresolv" >&5
+echo "configure:2687: checking for main in -lresolv" >&5
ac_lib_var=`echo resolv'_'main | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-lresolv $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 2429 "configure"
+#line 2695 "configure"
#include "confdefs.h"
int main() {
main()
; return 0; }
EOF
-if { (eval echo configure:2436: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:2702: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
fi
echo $ac_n "checking for Rconnect in -lsocks""... $ac_c" 1>&6
-echo "configure:2464: checking for Rconnect in -lsocks" >&5
+echo "configure:2730: checking for Rconnect in -lsocks" >&5
ac_lib_var=`echo socks'_'Rconnect | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-lsocks $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 2472 "configure"
+#line 2738 "configure"
#include "confdefs.h"
/* Override any gcc2 internal prototype to avoid an error. */
/* We use char because int might match the return type of a gcc2
Rconnect()
; return 0; }
EOF
-if { (eval echo configure:2483: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:2749: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
ALL_LINGUAS="cs de hr it no pl pt_BR ru"
echo $ac_n "checking whether NLS is requested""... $ac_c" 1>&6
-echo "configure:2515: checking whether NLS is requested" >&5
+echo "configure:2781: checking whether NLS is requested" >&5
# Check whether --enable-nls or --disable-nls was given.
if test "${enable_nls+set}" = set; then
enableval="$enable_nls"
# Extract the first word of "msgfmt", so it can be a program name with args.
set dummy msgfmt; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:2532: checking for $ac_word" >&5
+echo "configure:2798: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_path_MSGFMT'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
# Extract the first word of "xgettext", so it can be a program name with args.
set dummy xgettext; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:2566: checking for $ac_word" >&5
+echo "configure:2832: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_path_XGETTEXT'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
# Extract the first word of "gmsgfmt", so it can be a program name with args.
set dummy gmsgfmt; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:2601: checking for $ac_word" >&5
+echo "configure:2867: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_path_GMSGFMT'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
do
ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'`
echo $ac_n "checking for $ac_hdr""... $ac_c" 1>&6
-echo "configure:2651: checking for $ac_hdr" >&5
+echo "configure:2917: checking for $ac_hdr" >&5
if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 2656 "configure"
+#line 2922 "configure"
#include "confdefs.h"
#include <$ac_hdr>
EOF
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:2661: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+{ (eval echo configure:2927: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
if test -z "$ac_err"; then
rm -rf conftest*
for ac_func in gettext
do
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
-echo "configure:2691: checking for $ac_func" >&5
+echo "configure:2957: checking for $ac_func" >&5
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 2696 "configure"
+#line 2962 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char $ac_func(); below. */
; return 0; }
EOF
-if { (eval echo configure:2719: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:2985: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_$ac_func=yes"
else
else
echo "$ac_t""no" 1>&6
echo $ac_n "checking for gettext in -lintl""... $ac_c" 1>&6
-echo "configure:2741: checking for gettext in -lintl" >&5
+echo "configure:3007: checking for gettext in -lintl" >&5
ac_lib_var=`echo intl'_'gettext | sed 'y%./+-%__p_%'`
if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
ac_save_LIBS="$LIBS"
LIBS="-lintl $LIBS"
cat > conftest.$ac_ext <<EOF
-#line 2749 "configure"
+#line 3015 "configure"
#include "confdefs.h"
/* Override any gcc2 internal prototype to avoid an error. */
/* We use char because int might match the return type of a gcc2
gettext()
; return 0; }
EOF
-if { (eval echo configure:2760: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:3026: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_lib_$ac_lib_var=yes"
else
# Extract the first word of "$ac_prog", so it can be a program name with args.
set dummy $ac_prog; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:2828: checking for $ac_word" >&5
+echo "configure:3094: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_prog_MAKEINFO'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
dnl Checks for library functions.
dnl
AC_FUNC_ALLOCA
+AC_FUNC_MMAP
AC_CHECK_FUNCS(strdup strstr strcasecmp strncasecmp)
AC_CHECK_FUNCS(gettimeofday mktime strptime)
AC_CHECK_FUNCS(strerror snprintf vsnprintf select signal symlink access isatty)
+2000-11-15 Hrvoje Niksic <hniksic@arsdigita.com>
+
+ * wget.texi (Robots): Document that we now support the meta tag
+ exclusion.
+
2000-11-16 Hrvoje Niksic <hniksic@arsdigita.com>
* wget.texi: Use --- consistently.
This is explained in some detail at
@url{http://info.webcrawler.com/mak/projects/robots/meta-user.html}.
-Unfortunately, Wget does not support this method of robot exclusion yet,
-but it will be implemented in the next release.
+Wget supports this method of robot exclusion in addition to the usual
+@file{/robots.txt} exclusion.
@node Security Considerations, Contributors, Robots, Appendices
@section Security Considerations
+2000-11-19 Hrvoje Niksic <hniksic@arsdigita.com>
+
+ * retr.c (get_contents): If use_expected, make sure that the
+ appropriate amount of data is being read.
+
+ * http.c (gethttp): Check for both `Keep-Alive: ...' and
+ `Connection: Keep-Alive'.
+
+ * wget.h (DEBUGP): Call debug_logprintf only if opt.debug is
+ turned on.
+
+2000-11-19 Hrvoje Niksic <hniksic@arsdigita.com>
+
+ * http.c (connection_available_p): Use it.
+
+ * connect.c (test_socket_open): New function.
+
+ * http.c (gethttp): Support persistent connections. Based on the
+ ideas, and partly on code, by Sam Horrocks <sam@daemoninc.com>.
+ (register_persistent): New function.
+ (connection_available_p): Ditto.
+ (invalidate_connection): Ditto.
+
+2000-11-19 Hrvoje Niksic <hniksic@arsdigita.com>
+
+ * url.c (convert_links): Handle UREL2ABS case.
+
+ * recur.c (recursive_retrieve): Instead of the list
+ urls_downloaded, use hash tables dl_file_url_map and
+ dl_url_file_map.
+ (convert_all_links): Use them to retrieve data.
+
+ * host.c (clean_hosts): Free the hash tables.
+
+ * main.c (private_initialize): Call host_init().
+
+ * host.c (store_hostaddress): Use a saner, hash table-based data
+ model.
+ (realhost): Ditto.
+ (host_init): Initialize the hash tables.
+
+2000-11-18 Hrvoje Niksic <hniksic@arsdigita.com>
+
+ * utils.c (slist_append): Eviscerate NOSORT. Hash tables are now
+ used for what the sorted slists used to be used for.
+ (slist_contains): Don't rely on the list being sorted.
+ (slist_append): Simplify the code.
+
+ * recur.c (recursive_cleanup): Use free_string_set.
+
+ * utils.c (string_set_add, string_set_exists, string_set_free):
+ New functions for easier freeing of hash tables whose keys are
+ strdup'ed strings.
+
+ * recur.c (recursive_retrieve): Use the hash table functions for
+ storing undesirable URLs.
+
+ * hash.c: New file.
+
+2000-11-17 Hrvoje Niksic <hniksic@arsdigita.com>
+
+ * main.c (private_initialize): Call url_init.
+ (main): Call private_initialize.
+
+ * url.c (unsafe_char_table): New table.
+ (UNSAFE_CHAR): Use it.
+ (init_unsafe_char_table): New function.
+ (url_init): New function; call init_unsafe_char_table.
+
+2000-11-15 Hrvoje Niksic <hniksic@arsdigita.com>
+
+ * html-url.c (handle_link): Handle HTML fragment identifiers.
+
+ * recur.c (recursive_retrieve): If norobot info is respected and
+ the file is specified not to be followed by robots, respect that.
+
+ * html-url.c (collect_tags_mapper): Handle <meta name=robots
+ content=X>. For us the important cases are where X is NONE or
+ where X contains NOFOLLOW.
+ (get_urls_html): Propagate that information to the caller.
+
+2000-11-13 Hrvoje Niksic <hniksic@arsdigita.com>
+
+ * url.c (convert_links): Unlink the file we might be reading from
+ before writing to it.
+ (convert_links): Use alloca instead of malloc for
+ filename_plus_orig_suffix.
+
+2000-11-10 Hrvoje Niksic <hniksic@arsdigita.com>
+
+ * url.c (get_urls_file): Ditto.
+ (convert_links): Ditto.
+
+ * html-url.c (get_urls_html): Use read_file() instead of
+ load_file().
+
+ * utils.c (read_file): New function, instead of the old
+ load_file().
+ (read_file_free): Ditto.
+
+ * url.c (findurl): Search only for the supported protocols.
+ (convert_links): Use fwrite() when writing out a region of
+ characters.
+
+2000-11-10 Hrvoje Niksic <hniksic@arsdigita.com>
+
+ * ftp-ls.c: Move html_quote_string and ftp_index here.
+
+ * url.c: Remove get_urls_html, since that's now in html-url.c.
+
+ * html-url.c: New file.
+
+ * html-parse.c: New file.
+
2000-11-16 Hrvoje Niksic <hniksic@arsdigita.com>
* mswindows.h: Define snprintf and vsnprintf to _snprintf and
OPIE_OBJ = @OPIE_OBJ@
OBJ = $(ALLOCA) cmpt$o connect$o fnmatch$o ftp$o ftp-basic$o \
- ftp-ls$o $(OPIE_OBJ) getopt$o headers$o host$o html$o \
- http$o init$o log$o main$o $(MD5_OBJ) netrc$o rbuf$o \
- recur$o retr$o snprintf$o url$o utils$o version$o
+ ftp-ls$o $(OPIE_OBJ) getopt$o hash$o headers$o host$o \
+ html-parse$o html-url$o http$o init$o log$o main$o \
+ $(MD5_OBJ) netrc$o rbuf$o recur$o retr$o snprintf$o \
+ url$o utils$o version$o
.SUFFIXES:
.SUFFIXES: .c .o ._c ._o
# DO NOT DELETE THIS LINE -- make depend depends on it.
-cmpt$o: config.h wget.h sysdep.h options.h
-connect$o: config.h wget.h sysdep.h options.h connect.h host.h
-fnmatch$o: config.h wget.h sysdep.h options.h fnmatch.h
-ftp-basic$o: config.h wget.h sysdep.h options.h utils.h rbuf.h connect.h host.h
-ftp-ls$o: config.h wget.h sysdep.h options.h utils.h ftp.h rbuf.h
-ftp-opie$o: config.h wget.h sysdep.h options.h md5.h
-ftp$o: config.h wget.h sysdep.h options.h utils.h url.h rbuf.h retr.h ftp.h html.h connect.h host.h fnmatch.h netrc.h
-getopt$o: wget.h sysdep.h options.h
-headers$o: config.h wget.h sysdep.h options.h connect.h rbuf.h headers.h
-host$o: config.h wget.h sysdep.h options.h utils.h host.h url.h
-html$o: config.h wget.h sysdep.h options.h url.h utils.h ftp.h rbuf.h html.h
-http$o: config.h wget.h sysdep.h options.h utils.h url.h host.h rbuf.h retr.h headers.h connect.h fnmatch.h netrc.h
-init$o: config.h wget.h sysdep.h options.h utils.h init.h host.h recur.h netrc.h
-log$o: config.h wget.h sysdep.h options.h utils.h
-main$o: config.h wget.h sysdep.h options.h utils.h getopt.h init.h retr.h rbuf.h recur.h host.h
-md5$o: wget.h sysdep.h options.h md5.h
-mswindows$o: config.h winsock.h wget.h sysdep.h options.h url.h
-netrc$o: wget.h sysdep.h options.h utils.h netrc.h init.h
-rbuf$o: config.h wget.h sysdep.h options.h rbuf.h connect.h
-recur$o: config.h wget.h sysdep.h options.h url.h recur.h utils.h retr.h rbuf.h ftp.h fnmatch.h host.h
-retr$o: config.h wget.h sysdep.h options.h utils.h retr.h rbuf.h url.h recur.h ftp.h host.h connect.h
-url$o: config.h wget.h sysdep.h options.h utils.h url.h host.h html.h
-utils$o: config.h wget.h sysdep.h options.h utils.h fnmatch.h
+cmpt$o: wget.h
+connect$o: wget.h connect.h host.h
+fnmatch$o: wget.h fnmatch.h
+ftp-basic$o: wget.h utils.h rbuf.h connect.h host.h
+ftp-ls$o: wget.h utils.h ftp.h url.h
+ftp-opie$o: wget.h md5.h
+ftp$o: wget.h utils.h url.h rbuf.h retr.h ftp.h connect.h host.h fnmatch.h netrc.h
+getopt$o: wget.h getopt.h
+hash$o: wget.h utils.h hash.h
+headers$o: wget.h connect.h rbuf.h headers.h
+host$o: wget.h utils.h host.h url.h hash.h
+html-parse$o: wget.h html-parse.h
+html-url$o: wget.h html-parse.h url.h utils.h
+html$o: wget.h url.h utils.h ftp.h
+http$o: wget.h utils.h url.h host.h rbuf.h retr.h headers.h connect.h fnmatch.h netrc.h md5.h
+init$o: wget.h utils.h init.h host.h recur.h netrc.h
+log$o: wget.h utils.h
+main$o: wget.h utils.h getopt.h init.h retr.h recur.h host.h
+md5$o: wget.h md5.h
+mswindows$o: wget.h url.h
+netrc$o: wget.h utils.h netrc.h init.h
+rbuf$o: wget.h rbuf.h connect.h
+recur$o: wget.h url.h recur.h utils.h retr.h ftp.h fnmatch.h host.h hash.h
+retr$o: wget.h utils.h retr.h url.h recur.h ftp.h host.h connect.h hash.h
+snprintf$o:
+url$o: wget.h utils.h url.h host.h
+utils$o: wget.h utils.h fnmatch.h hash.h
+version$o:
/* Define if you have the uname function. */
#undef HAVE_UNAME
+/* Define if you have a working version of mmap. */
+#undef HAVE_MMAP
+
/* Define if you have the gethostname function. */
#undef HAVE_GETHOSTNAME
return NOCONERROR;
}
+int
+test_socket_open (int sock)
+{
+#ifdef HAVE_SELECT
+ fd_set check_set;
+ struct timeval to;
+
+ /* Check if we still have a valid (non-EOF) connection. From Andrew
+ * Maholski's code in the Unix Socket FAQ. */
+
+ FD_ZERO (&check_set);
+ FD_SET (sock, &check_set);
+
+ /* Wait one microsecond */
+ to.tv_sec = 0;
+ to.tv_usec = 1;
+
+ /* If we get a timeout, then that means still connected */
+ if (select (sock + 1, &check_set, NULL, NULL, &to) == 0)
+ {
+ /* Connection is valid (not EOF), so continue */
+ return 1;
+ }
+ else
+ return 0;
+#else
+ /* Without select, it's hard to know for sure. */
+ return 1;
+#endif
+}
+
/* Bind the local port PORT. This does all the necessary work, which
is creating a socket, setting SO_REUSEADDR option on it, then
calling bind() and listen(). If *PORT is 0, a random port is
#include "wget.h"
#include "utils.h"
#include "ftp.h"
+#include "url.h"
/* Converts symbolic permissions to number-style ones, e.g. string
rwxr-xr-x to 755. For now, it knows nothing of
{
return ftp_parse_unix_ls (file);
}
+\f
+/* Stuff for creating FTP index. */
+
+/* The function returns the pointer to the malloc-ed quoted version of
+ string s. It will recognize and quote numeric and special graphic
+ entities, as per RFC1866:
+
+   `&' -> `&amp;'
+   `<' -> `&lt;'
+   `>' -> `&gt;'
+   `"' -> `&quot;'
+
+ No other entities are recognized or replaced. */
+static char *
+html_quote_string (const char *s)
+{
+ const char *b = s;
+ char *p, *res;
+ int i;
+
+ /* Pass through the string, and count the new size. */
+ for (i = 0; *s; s++, i++)
+ {
+ if (*s == '&')
+ i += 4; /* `amp;' */
+ else if (*s == '<' || *s == '>')
+ i += 3; /* `lt;' and `gt;' */
+ else if (*s == '\"')
+ i += 5; /* `quot;' */
+ }
+ res = (char *)xmalloc (i + 1);
+ s = b;
+ for (p = res; *s; s++)
+ {
+ switch (*s)
+ {
+ case '&':
+ *p++ = '&';
+ *p++ = 'a';
+ *p++ = 'm';
+ *p++ = 'p';
+ *p++ = ';';
+ break;
+ case '<': case '>':
+ *p++ = '&';
+ *p++ = (*s == '<' ? 'l' : 'g');
+ *p++ = 't';
+ *p++ = ';';
+ break;
+ case '\"':
+ *p++ = '&';
+ *p++ = 'q';
+ *p++ = 'u';
+ *p++ = 'o';
+ *p++ = 't';
+ *p++ = ';';
+ break;
+ default:
+ *p++ = *s;
+ }
+ }
+ *p = '\0';
+ return res;
+}
+
+/* The function creates an HTML index containing references to given
+ directories and files on the appropriate host. The references are
+ FTP. */
+uerr_t
+ftp_index (const char *file, struct urlinfo *u, struct fileinfo *f)
+{
+ FILE *fp;
+ char *upwd;
+ char *htclfile; /* HTML-clean file name */
+
+ if (!opt.dfp)
+ {
+ fp = fopen (file, "wb");
+ if (!fp)
+ {
+ logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
+ return FOPENERR;
+ }
+ }
+ else
+ fp = opt.dfp;
+ if (u->user)
+ {
+ char *tmpu, *tmpp; /* temporary, clean user and passwd */
+
+ tmpu = CLEANDUP (u->user);
+ tmpp = u->passwd ? CLEANDUP (u->passwd) : NULL;
+ upwd = (char *)xmalloc (strlen (tmpu)
+ + (tmpp ? (1 + strlen (tmpp)) : 0) + 2);
+ sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? tmpp : "");
+ free (tmpu);
+ FREE_MAYBE (tmpp);
+ }
+ else
+ upwd = xstrdup ("");
+ fprintf (fp, "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n");
+ fprintf (fp, "<html>\n<head>\n<title>");
+ fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
+ fprintf (fp, "</title>\n</head>\n<body>\n<h1>");
+ fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port);
+ fprintf (fp, "</h1>\n<hr>\n<pre>\n");
+ while (f)
+ {
+ fprintf (fp, " ");
+ if (f->tstamp != -1)
+ {
+ /* #### Should we translate the months? */
+ static char *months[] = {
+ "Jan", "Feb", "Mar", "Apr", "May", "Jun",
+ "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
+ };
+ struct tm *ptm = localtime ((time_t *)&f->tstamp);
+
+ fprintf (fp, "%d %s %02d ", ptm->tm_year + 1900, months[ptm->tm_mon],
+ ptm->tm_mday);
+ if (ptm->tm_hour)
+ fprintf (fp, "%02d:%02d ", ptm->tm_hour, ptm->tm_min);
+ else
+ fprintf (fp, " ");
+ }
+ else
+ fprintf (fp, _("time unknown "));
+ switch (f->type)
+ {
+ case FT_PLAINFILE:
+ fprintf (fp, _("File "));
+ break;
+ case FT_DIRECTORY:
+ fprintf (fp, _("Directory "));
+ break;
+ case FT_SYMLINK:
+ fprintf (fp, _("Link "));
+ break;
+ default:
+ fprintf (fp, _("Not sure "));
+ break;
+ }
+ htclfile = html_quote_string (f->name);
+ fprintf (fp, "<a href=\"ftp://%s%s:%hu", upwd, u->host, u->port);
+ if (*u->dir != '/')
+ putc ('/', fp);
+ fprintf (fp, "%s", u->dir);
+ if (*u->dir)
+ putc ('/', fp);
+ fprintf (fp, "%s", htclfile);
+ if (f->type == FT_DIRECTORY)
+ putc ('/', fp);
+ fprintf (fp, "\">%s", htclfile);
+ if (f->type == FT_DIRECTORY)
+ putc ('/', fp);
+ fprintf (fp, "</a> ");
+ if (f->type == FT_PLAINFILE)
+ fprintf (fp, _(" (%s bytes)"), legible (f->size));
+ else if (f->type == FT_SYMLINK)
+ fprintf (fp, "-> %s", f->linkto ? f->linkto : "(nil)");
+ putc ('\n', fp);
+ free (htclfile);
+ f = f->next;
+ }
+ fprintf (fp, "</pre>\n</body>\n</html>\n");
+ free (upwd);
+ if (!opt.dfp)
+ fclose (fp);
+ else
+ fflush (fp);
+ return FTPOK;
+}
#include "rbuf.h"
#include "retr.h"
#include "ftp.h"
-#include "html.h"
#include "connect.h"
#include "host.h"
#include "fnmatch.h"
}
reset_timer ();
/* Get the contents of the document. */
- res = get_contents (dtsock, fp, len, restval, expected_bytes, &con->rbuf);
+ res = get_contents (dtsock, fp, len, restval, expected_bytes, &con->rbuf, 0);
con->dltime = elapsed_time ();
tms = time_str (NULL);
tmrate = rate (*len - restval, con->dltime);
struct fileinfo *ftp_parse_ls PARAMS ((const char *));
uerr_t ftp_loop PARAMS ((struct urlinfo *, int *));
+uerr_t ftp_index (const char *, struct urlinfo *, struct fileinfo *);
+
#endif /* FTP_H */
--- /dev/null
+/* Hash tables.
+ Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of Wget.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <assert.h>
+
+#include "wget.h"
+#include "utils.h"
+
+#include "hash.h"
+
+#ifdef STANDALONE
+# define xmalloc malloc
+# define xrealloc realloc
+#endif
+
+/* This file implements simple hash tables based on linear probing.
+ The hash table stores key-value pairs in a contiguous array. Both
+ key and value are void pointers that the hash and test functions
+ know how to handle.
+
+ Although Knuth & co. recommend double hashing over linear probing,
+ we use the latter because it accesses array elements sequentially
+ in case of a collision, yielding better cache behaviour and
+ ultimately in better speed. To avoid collision problems with
+ linear probing, we make sure that the table grows as soon as the
+ fullness/size ratio exceeds 75%. */
+
+struct ht_pair {
+ void *key;
+ void *value;
+};
+
+struct hash_table {
+ unsigned long (*hash_function) (const void *);
+ int (*test_function) (const void *, const void *);
+
+ int size; /* size of the array */
+ int fullness; /* number of non-empty fields */
+ int count; /* number of non-empty, non-deleted
+ fields. */
+
+ struct ht_pair *pairs;
+};
+
+#define ENTRY_DELETED ((void *)0xdeadbeef)
+
+#define DELETED_ENTRY_P(ptr) ((ptr) == ENTRY_DELETED)
+#define EMPTY_ENTRY_P(ptr) ((ptr) == NULL)
+
+/* Find a prime near, but greater than or equal to SIZE. */
+
+int
+prime_size (int size)
+{
+ static const unsigned long primes [] = {
+ 19, 29, 41, 59, 79, 107, 149, 197, 263, 347, 457, 599, 787, 1031,
+ 1361, 1777, 2333, 3037, 3967, 5167, 6719, 8737, 11369, 14783,
+ 19219, 24989, 32491, 42257, 54941, 71429, 92861, 120721, 156941,
+ 204047, 265271, 344857, 448321, 582821, 757693, 985003, 1280519,
+ 1664681, 2164111, 2813353, 3657361, 4754591, 6180989, 8035301,
+ 10445899, 13579681, 17653589, 22949669, 29834603, 38784989,
+ 50420551, 65546729, 85210757, 110774011, 144006217, 187208107,
+ 243370577, 316381771, 411296309, 534685237, 695090819, 903618083,
+ 1174703521, 1527114613, 1985248999, 2580823717UL, 3355070839UL
+ };
+ int i;
+ for (i = 0; i < ARRAY_SIZE (primes); i++)
+ if (primes[i] >= size)
+ return primes[i];
+ /* huh? */
+ return size;
+}
+
+/* Create a hash table of INITIAL_SIZE with hash function
+ HASH_FUNCTION and test function TEST_FUNCTION. If you wish to
+ start out with a "small" table which will be regrown as needed,
+ specify 0 as INITIAL_SIZE. */
+
+struct hash_table *
+hash_table_new (int initial_size,
+ unsigned long (*hash_function) (const void *),
+ int (*test_function) (const void *, const void *))
+{
+ struct hash_table *ht
+ = (struct hash_table *)xmalloc (sizeof (struct hash_table));
+ ht->hash_function = hash_function;
+ ht->test_function = test_function;
+ ht->size = prime_size (initial_size);
+ ht->fullness = 0;
+ ht->count = 0;
+ ht->pairs = xmalloc (ht->size * sizeof (struct ht_pair));
+ memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair));
+ return ht;
+}
+
+/* Free the data associated with hash table HT. */
+
+void
+hash_table_destroy (struct hash_table *ht)
+{
+ free (ht->pairs);
+ free (ht);
+}
+
+/* Get the value that corresponds to the key KEY in the hash table HT.
+ If no value is found, return NULL. Note that NULL is a legal value
+ for value; if you are storing NULLs in your hash table, you can use
+ hash_table_exists to be sure that a (possibly NULL) value exists in
+ the table. */
+
+void *
+hash_table_get (struct hash_table *ht, const void *key)
+{
+ int location = ht->hash_function (key) % ht->size;
+ while (1)
+ {
+ struct ht_pair *the_pair = ht->pairs + location;
+ if (EMPTY_ENTRY_P (the_pair->key))
+ return NULL;
+ else if (DELETED_ENTRY_P (the_pair->key)
+ || !ht->test_function (key, the_pair->key))
+ {
+ ++location;
+ if (location == ht->size)
+ location = 0;
+ }
+ else
+ return the_pair->value;
+ }
+}
+
+/* Return 1 if KEY exists in HT, 0 otherwise. */
+
+int
+hash_table_exists (struct hash_table *ht, const void *key)
+{
+ int location = ht->hash_function (key) % ht->size;
+ while (1)
+ {
+ struct ht_pair *the_pair = ht->pairs + location;
+ if (EMPTY_ENTRY_P (the_pair->key))
+ return 0;
+ else if (DELETED_ENTRY_P (the_pair->key)
+ || !ht->test_function (key, the_pair->key))
+ {
+ ++location;
+ if (location == ht->size)
+ location = 0;
+ }
+ else
+ return 1;
+ }
+}
+
+#define MAX(i, j) (((i) >= (j)) ? (i) : (j))
+
+/* Grow hash table HT as necessary, and rehash all the key-value
+ pairs. */
+
+static void
+grow_hash_table (struct hash_table *ht)
+{
+ int i;
+ struct ht_pair *old_pairs = ht->pairs;
+ int old_count = ht->count; /* for assert() below */
+ int old_size = ht->size;
+
+ /* Normally, the idea is to double ht->size (and round it to next
+ prime) on each regrow:
+
+ ht->size = prime_size (ht->size * 2);
+
+ But it is possible that the table has large fullness because of
+ the many deleted entries. If that is the case, we don't want to
+ blindly grow the table; we just want to rehash it. For that
+ reason, we use ht->count as the relevant parameter. MAX is used
+ only because we don't want to actually shrink the table. (But
+ maybe that's wrong.) */
+
+ int needed_size = prime_size (ht->count * 2);
+ ht->size = MAX (old_size, needed_size);
+
+ ht->pairs = xmalloc (ht->size * sizeof (struct ht_pair));
+ memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair));
+
+ /* Need to reset these two; hash_table_put will reinitialize them. */
+ ht->fullness = 0;
+ ht->count = 0;
+ for (i = 0; i < old_size; i++)
+ {
+ struct ht_pair *the_pair = old_pairs + i;
+ if (!EMPTY_ENTRY_P (the_pair->key)
+ && !DELETED_ENTRY_P (the_pair->key))
+ hash_table_put (ht, the_pair->key, the_pair->value);
+ }
+ assert (ht->count == old_count);
+ free (old_pairs);
+}
+
+/* Put VALUE in the hash table HT under the key KEY. This regrows the
+ table if necessary. */
+
+void
+hash_table_put (struct hash_table *ht, const void *key, void *value)
+{
+ int location = ht->hash_function (key) % ht->size;
+ while (1)
+ {
+ struct ht_pair *the_pair = ht->pairs + location;
+ if (EMPTY_ENTRY_P (the_pair->key))
+ {
+ ++ht->fullness;
+ ++ht->count;
+ just_insert:
+ the_pair->key = (void *)key; /* const? */
+ the_pair->value = value;
+ break;
+ }
+ else if (DELETED_ENTRY_P (the_pair->key))
+ {
+ /* We're replacing a deleted entry, so ht->count gets
+ increased, but ht->fullness remains unchanged. */
+ ++ht->count;
+ goto just_insert;
+ }
+ else if (ht->test_function (key, the_pair->key))
+ {
+ /* We're replacing an existing entry, so ht->count and
+ ht->fullness remain unchanged. */
+ goto just_insert;
+ }
+ else
+ {
+ ++location;
+ if (location == ht->size)
+ location = 0;
+ }
+ }
+ if (ht->fullness * 4 > ht->size * 3)
+ /* When fullness exceeds 75% of size, regrow the table. */
+ grow_hash_table (ht);
+}
+
+/* Remove KEY from HT. */
+
+int
+hash_table_remove (struct hash_table *ht, const void *key)
+{
+ int location = ht->hash_function (key) % ht->size;
+ while (1)
+ {
+ struct ht_pair *the_pair = ht->pairs + location;
+ if (EMPTY_ENTRY_P (the_pair->key))
+ return 0;
+ else if (DELETED_ENTRY_P (the_pair->key)
+ || !ht->test_function (key, the_pair->key))
+ {
+ ++location;
+ if (location == ht->size)
+ location = 0;
+ }
+ else
+ {
+ /* We don't really remove an entry from the hash table: we
+ just mark it as deleted. This is because there may be
+ other entries located after this entry whose hash number
+ points to a location before this entry. (Example: keys
+ A, B and C have the same hash. If you were to really
+ *delete* B from the table, C could no longer be found.)
+
+ As an optimization, it might be worthwhile to check
+ whether the immediately preceding entry is empty and, if
+ so, really delete the pair (set it to empty and decrease
+ the fullness along with the count). I *think* it should
+ be safe. */
+ the_pair->key = ENTRY_DELETED;
+ --ht->count;
+ return 1;
+ }
+ }
+}
+
+void
+hash_table_clear (struct hash_table *ht)
+{
+ memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair));
+ ht->fullness = 0;
+ ht->count = 0;
+}
+
+void
+hash_table_map (struct hash_table *ht,
+ int (*mapfun) (void *, void *, void *),
+ void *closure)
+{
+ int i;
+ for (i = 0; i < ht->size; i++)
+ {
+ struct ht_pair *the_pair = ht->pairs + i;
+ if (!EMPTY_ENTRY_P (the_pair->key)
+ && !DELETED_ENTRY_P (the_pair->key))
+ if (mapfun (the_pair->key, the_pair->value, closure))
+ return;
+ }
+}
+\f
+/* Support for hash tables whose keys are strings. */
+
+/* supposedly from the Dragon Book P436. */
+unsigned long
+string_hash (const void *sv)
+{
+ unsigned int h = 0;
+ unsigned const char *x = (unsigned const char *) sv;
+
+ while (*x)
+ {
+ unsigned int g;
+ h = (h << 4) + *x++;
+ if ((g = h & 0xf0000000) != 0)
+ h = (h ^ (g >> 24)) ^ g;
+ }
+
+ return h;
+}
+
+int
+string_cmp (const void *s1, const void *s2)
+{
+ return !strcmp ((const char *)s1, (const char *)s2);
+}
+
+struct hash_table *
+make_string_hash_table (int initial_size)
+{
+ return hash_table_new (initial_size, string_hash, string_cmp);
+}
+
+\f
+#ifdef STANDALONE
+
+#include <stdio.h>
+#include <string.h>
+
+int
+print_hash_table_mapper (const void *key, void *value, void *count)
+{
+ ++*(int *)count;
+ printf ("%s: %s\n", (const char *)key, (char *)value);
+ return 0;
+}
+
+void
+print_hash (struct hash_table *sht)
+{
+ int debug_count = 0;
+ hash_table_map (sht, print_hash_table_mapper, &debug_count);
+ assert (debug_count == sht->count);
+}
+
+int
+main (void)
+{
+ struct hash_table *ht = make_string_hash_table (0);
+ char line[80];
+ while ((fgets (line, sizeof (line), stdin)))
+ {
+ int len = strlen (line);
+ if (len <= 1)
+ continue;
+ line[--len] = '\0';
+ hash_table_put (ht, strdup (line), "here I am!");
+ if (len % 2)
+ hash_table_remove (ht, line);
+ }
+ print_hash (ht);
+#if 0
+ printf ("%d %d %d\n", ht->count, ht->fullness, ht->size);
+#endif
+ return 0;
+}
+#endif
--- /dev/null
+/* Hash table declarations.
+ Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of Wget.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/* From XEmacs, and hence from Dragon book. */
+
+#define GOOD_HASH 65599 /* prime number just over 2^16; Dragon book, p. 435 */
+#define HASH2(a,b) (GOOD_HASH * (a) + (b))
+#define HASH3(a,b,c) (GOOD_HASH * HASH2 (a,b) + (c))
+#define HASH4(a,b,c,d) (GOOD_HASH * HASH3 (a,b,c) + (d))
+#define HASH5(a,b,c,d,e) (GOOD_HASH * HASH4 (a,b,c,d) + (e))
+#define HASH6(a,b,c,d,e,f) (GOOD_HASH * HASH5 (a,b,c,d,e) + (f))
+#define HASH7(a,b,c,d,e,f,g) (GOOD_HASH * HASH6 (a,b,c,d,e,f) + (g))
+#define HASH8(a,b,c,d,e,f,g,h) (GOOD_HASH * HASH7 (a,b,c,d,e,f,g) + (h))
+#define HASH9(a,b,c,d,e,f,g,h,i) (GOOD_HASH * HASH8 (a,b,c,d,e,f,g,h) + (i))
+
+struct hash_table;
+
+struct hash_table *hash_table_new PARAMS ((int,
+ unsigned long (*) (const void *),
+ int (*) (const void *,
+ const void *)));
+void hash_table_destroy PARAMS ((struct hash_table *));
+void *hash_table_get PARAMS ((struct hash_table *, const void *));
+int hash_table_exists PARAMS ((struct hash_table *, const void *));
+void hash_table_put PARAMS ((struct hash_table *, const void *, void *));
+int hash_table_remove PARAMS ((struct hash_table *, const void *));
+void hash_table_clear PARAMS ((struct hash_table *));
+void hash_table_map PARAMS ((struct hash_table *,
+ int (*) (void *, void *, void *),
+ void *));
+
+unsigned long string_hash PARAMS ((const void *));
+int string_cmp PARAMS ((const void *, const void *));
+struct hash_table *make_string_hash_table PARAMS ((int));
return 1;
}
+/* Write the value 1 into the integer pointed to by CLOSURE. */
+int
+header_exists (const char *header, void *closure)
+{
+ *(int *)closure = 1;
+ return 1;
+}
+
/* Skip LWS (linear white space), if present. Returns number of
characters to skip. */
int
int header_extract_number PARAMS ((const char *, void *));
int header_strdup PARAMS ((const char *, void *));
+int header_exists PARAMS ((const char *, void *));
int skip_lws PARAMS ((const char *));
/* Dealing with host names.
- Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
+ Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
This file is part of Wget.
#include "utils.h"
#include "host.h"
#include "url.h"
+#include "hash.h"
#ifndef errno
extern int errno;
#endif
-/* Host list entry */
-struct host
+/* Mapping between all known hosts to their addresses (n.n.n.n). */
+struct hash_table *host_name_address_map;
+
+/* Mapping between all known addresses (n.n.n.n) to their hosts. This
+ is the inverse of host_name_address_map. These two tables share
+ the strdup'ed strings. */
+struct hash_table *host_address_name_map;
+
+/* Mapping between auxiliary (slave) and master host names. */
+struct hash_table *host_slave_master_map;
+
+/* Utility function: like xstrdup(), but also lowercases S. */
+
+static char *
+xstrdup_lower (const char *s)
{
- /* Host's symbolical name, as encountered at the time of first
- inclusion, e.g. "fly.cc.fer.hr". */
- char *hostname;
- /* Host's "real" name, i.e. its IP address, written out in ASCII
- form of N.N.N.N, e.g. "161.53.70.130". */
- char *realname;
- /* More than one HOSTNAME can correspond to the same REALNAME. For
- our purposes, the canonical name of the host is its HOSTNAME when
- it was first encountered. This entry is said to have QUALITY. */
- int quality;
- /* Next entry in the list. */
- struct host *next;
-};
-
-static struct host *hlist;
-
-static struct host *add_hlist PARAMS ((struct host *, const char *,
- const char *, int));
+ char *copy = xstrdup (s);
+ char *p = copy;
+ for (; *p; p++)
+ *p = TOLOWER (*p);
+ return copy;
+}
/* The same as gethostbyname, but supports internet addresses of the
- form `N.N.N.N'. */
+ form `N.N.N.N'. On some systems gethostbyname() knows how to do
+ this automatically. */
struct hostent *
ngethostbyname (const char *name)
{
return hp;
}
-/* Search for HOST in the linked list L, by hostname. Return the
- entry, if found, or NULL. The search is case-insensitive. */
-static struct host *
-search_host (struct host *l, const char *host)
-{
- for (; l; l = l->next)
- if (strcasecmp (l->hostname, host) == 0)
- return l;
- return NULL;
-}
+/* Add host name HOST with the address ADDR_TEXT to the cache.
+ Normally this means that the (HOST, ADDR_TEXT) pair will be added to
+ host_name_address_map and to host_address_name_map. (It is the
+ caller's responsibility to make sure that HOST is not already in
+ host_name_address_map.)
-/* Like search_host, but searches by address. */
-static struct host *
-search_address (struct host *l, const char *address)
+ If the ADDR_TEXT has already been seen and belongs to another host,
+ HOST will be added to host_slave_master_map instead. */
+
+static void
+add_host_to_cache (const char *host, const char *addr_text)
{
- for (; l; l = l->next)
+ char *canonical_name = hash_table_get (host_address_name_map, addr_text);
+ if (canonical_name)
+ {
+ DEBUGP (("Mapping %s to %s in host_slave_master_map.\n",
+ host, canonical_name));
+ /* We've already dealt with that host under another name. */
+ hash_table_put (host_slave_master_map,
+ xstrdup_lower (host),
+ xstrdup_lower (canonical_name));
+ }
+ else
{
- int cmp = strcmp (l->realname, address);
- if (cmp == 0)
- return l;
- else if (cmp > 0)
- return NULL;
+ /* This is really the first time we're dealing with that host. */
+ char *h_copy = xstrdup_lower (host);
+ char *a_copy = xstrdup (addr_text);
+ DEBUGP (("Caching %s <-> %s\n", h_copy, a_copy));
+ hash_table_put (host_name_address_map, h_copy, a_copy);
+ hash_table_put (host_address_name_map, a_copy, h_copy);
}
- return NULL;
}
-/* Store the address of HOSTNAME, internet-style, to WHERE. First
- check for it in the host list, and (if not found), use
- ngethostbyname to get it.
+/* Store the address of HOSTNAME, internet-style (four octets in
+ network order), to WHERE. First try to get the address from the
+ cache; if it is not available, call the DNS functions and update
+ the cache.
Return 1 on successful finding of the hostname, 0 otherwise. */
int
store_hostaddress (unsigned char *where, const char *hostname)
{
- struct host *t;
unsigned long addr;
+ char *addr_text;
+ char *canonical_name;
struct hostent *hptr;
struct in_addr in;
char *inet_s;
/* If the address is of the form d.d.d.d, there will be no trouble
with it. */
addr = (unsigned long)inet_addr (hostname);
- if ((int)addr == -1)
- {
- /* If it is not of that form, try to find it in the cache. */
- t = search_host (hlist, hostname);
- if (t)
- addr = (unsigned long)inet_addr (t->realname);
- }
/* If we have the numeric address, just store it. */
if ((int)addr != -1)
{
- /* ADDR is in network byte order, meaning the code works on
- little and big endian 32-bit architectures without change.
- On big endian 64-bit architectures we need to be careful to
- copy the correct four bytes. */
- int offset = 0;
+ /* ADDR is defined to be in network byte order, meaning the code
+ works on little and big endian 32-bit architectures without
+ change. On big endian 64-bit architectures we need to be
+ careful to copy the correct four bytes. */
+ int offset;
+ have_addr:
#ifdef WORDS_BIGENDIAN
offset = sizeof (unsigned long) - 4;
+#else
+ offset = 0;
#endif
memcpy (where, (char *)&addr + offset, 4);
return 1;
}
+
+ /* By now we know that the address is not of the form d.d.d.d. Try
+ to find it in our cache of host addresses. */
+ addr_text = hash_table_get (host_name_address_map, hostname);
+ if (addr_text)
+ {
+ DEBUGP (("Found %s in host_name_address_map: %s\n",
+ hostname, addr_text));
+ addr = (unsigned long)inet_addr (addr_text);
+ goto have_addr;
+ }
+
+ /* Maybe this host is known to us under another name. If so, we'll
+ find it in host_slave_master_map, and use the master name to find
+ its address in host_name_address_map. */
+ canonical_name = hash_table_get (host_slave_master_map, hostname);
+ if (canonical_name)
+ {
+ addr_text = hash_table_get (host_name_address_map, canonical_name);
+ assert (addr_text != NULL);
+ DEBUGP (("Found %s as slave of %s -> %s\n",
+ hostname, canonical_name, addr_text));
+ addr = (unsigned long)inet_addr (addr_text);
+ goto have_addr;
+ }
+
/* Since all else has failed, let's try gethostbyname(). Note that
we use gethostbyname() rather than ngethostbyname(), because we
- *know* the address is not numerical. */
+ already know that the address is not numerical. */
hptr = gethostbyname (hostname);
if (!hptr)
return 0;
/* Copy the address of the host to socket description. */
memcpy (where, hptr->h_addr_list[0], hptr->h_length);
- /* Now that we're here, we could as well cache the hostname for
- future use, as in realhost(). First, we have to look for it by
- address to know if it's already in the cache by another name. */
+ assert (hptr->h_length == 4);
+ /* Now that we've gone through the trouble of calling
+ gethostbyname(), we can store this valuable information to the
+ cache. First, we have to look for it by address to know if it's
+ already in the cache by another name. */
/* Originally, we copied to in.s_addr, but it appears to be missing
on some systems. */
memcpy (&in, *hptr->h_addr_list, sizeof (in));
- STRDUP_ALLOCA (inet_s, inet_ntoa (in));
- t = search_address (hlist, inet_s);
- if (t) /* Found in the list, as realname. */
- {
- /* Set the default, 0 quality. */
- hlist = add_hlist (hlist, hostname, inet_s, 0);
- return 1;
- }
- /* Since this is really the first time this host is encountered,
- set quality to 1. */
- hlist = add_hlist (hlist, hostname, inet_s, 1);
+ inet_s = inet_ntoa (in);
+ add_host_to_cache (hostname, inet_s);
return 1;
}
-/* Add a host to the host list. The list is sorted by addresses. For
- equal addresses, the entries with quality should bubble towards the
- beginning of the list. */
-static struct host *
-add_hlist (struct host *l, const char *nhost, const char *nreal, int quality)
-{
- struct host *t, *old, *beg;
-
- /* The entry goes to the beginning of the list if the list is empty
- or the order requires it. */
- if (!l || (strcmp (nreal, l->realname) < 0))
- {
- t = (struct host *)xmalloc (sizeof (struct host));
- t->hostname = xstrdup (nhost);
- t->realname = xstrdup (nreal);
- t->quality = quality;
- t->next = l;
- return t;
- }
-
- beg = l;
- /* Second two one-before-the-last element. */
- while (l->next)
- {
- int cmp;
- old = l;
- l = l->next;
- cmp = strcmp (nreal, l->realname);
- if (cmp >= 0)
- continue;
- /* If the next list element is greater than s, put s between the
- current and the next list element. */
- t = (struct host *)xmalloc (sizeof (struct host));
- old->next = t;
- t->next = l;
- t->hostname = xstrdup (nhost);
- t->realname = xstrdup (nreal);
- t->quality = quality;
- return beg;
- }
- t = (struct host *)xmalloc (sizeof (struct host));
- t->hostname = xstrdup (nhost);
- t->realname = xstrdup (nreal);
- t->quality = quality;
- /* Insert the new element after the last element. */
- l->next = t;
- t->next = NULL;
- return beg;
-}
-
/* Determine the "real" name of HOST, as perceived by Wget. If HOST
is referenced by more than one name, "real" name is considered to
- be the first one encountered in the past.
-
- If the host cannot be found in the list of already dealt-with
- hosts, try with its INET address. If this fails too, add it to the
- list. The routine does not call gethostbyname twice for the same
- host if it can possibly avoid it. */
+ be the first one encountered in the past. */
char *
realhost (const char *host)
{
- struct host *l, *l_real;
struct in_addr in;
struct hostent *hptr;
- char *inet_s;
+ char *master_name;
- DEBUGP (("Checking for %s.\n", host));
- /* Look for the host, looking by the host name. */
- l = search_host (hlist, host);
- if (l && l->quality) /* Found it with quality */
+ DEBUGP (("Checking for %s in host_name_address_map.\n", host));
+ if (hash_table_exists (host_name_address_map, host))
{
- DEBUGP (("%s was already used, by that name.\n", host));
- /* Here we return l->hostname, not host, because of the possible
- case differences (e.g. jaGOR.srce.hr and jagor.srce.hr are
- the same, but we want the one that was first. */
- return xstrdup (l->hostname);
+ DEBUGP (("Found; %s was already used, by that name.\n", host));
+ return xstrdup_lower (host);
}
- else if (!l) /* Not found, with or without quality */
- {
- /* The fact that gethostbyname will get called makes it
- necessary to store it to the list, to ensure that
- gethostbyname will not be called twice for the same string.
- However, the quality argument must be set appropriately.
-
- Note that add_hlist must be called *after* the realname
- search, or the quality would be always set to 0 */
- DEBUGP (("This is the first time I hear about host %s by that name.\n",
- host));
- hptr = ngethostbyname (host);
- if (!hptr)
- return xstrdup (host);
- /* Originally, we copied to in.s_addr, but it appears to be
- missing on some systems. */
- memcpy (&in, *hptr->h_addr_list, sizeof (in));
- STRDUP_ALLOCA (inet_s, inet_ntoa (in));
- }
- else /* Found, without quality */
+
+ DEBUGP (("Checking for %s in host_slave_master_map.\n", host));
+ master_name = hash_table_get (host_slave_master_map, host);
+ if (master_name)
{
- /* This case happens when host is on the list,
- but not as first entry (the one with quality).
- Then we just get its INET address and pick
- up the first entry with quality. */
- DEBUGP (("We've dealt with host %s, but under the name %s.\n",
- host, l->realname));
- STRDUP_ALLOCA (inet_s, l->realname);
+ has_master:
+ DEBUGP (("Found; %s was already used, by the name %s.\n",
+ host, master_name));
+ return xstrdup (master_name);
}
- /* Now we certainly have the INET address. The following loop is
- guaranteed to pick either an entry with quality (because it is
- the first one), or none at all. */
- l_real = search_address (hlist, inet_s);
- if (l_real) /* Found in the list, as realname. */
+ DEBUGP (("First time I hear about %s by that name; looking it up.\n",
+ host));
+ hptr = ngethostbyname (host);
+ if (hptr)
{
- if (!l)
- /* Set the default, 0 quality. */
- hlist = add_hlist (hlist, host, inet_s, 0);
- return xstrdup (l_real->hostname);
+ char *inet_s;
+ /* Originally, we copied to in.s_addr, but it appears to be
+ missing on some systems. */
+ memcpy (&in, *hptr->h_addr_list, sizeof (in));
+ inet_s = inet_ntoa (in);
+
+ add_host_to_cache (host, inet_s);
+
+ /* add_host_to_cache() can establish a slave-master mapping. */
+ DEBUGP (("Checking again for %s in host_slave_master_map.\n", host));
+ master_name = hash_table_get (host_slave_master_map, host);
+ if (master_name)
+ goto has_master;
}
- /* Since this is really the first time this host is encountered,
- set quality to 1. */
- hlist = add_hlist (hlist, host, inet_s, 1);
- return xstrdup (host);
+
+ return xstrdup_lower (host);
}
/* Compare two hostnames (out of URL-s if the arguments are URL-s),
return _("Unknown error");
}
-/* Clean the host list. This is a separate function, so we needn't
- export HLIST and its implementation. Ha! */
void
clean_hosts (void)
{
- struct host *l = hlist;
+ /* host_name_address_map and host_address_name_map share the
+ strings. Because of that, calling free_keys_and_values once
+ suffices for both. */
+ free_keys_and_values (host_name_address_map);
+ hash_table_destroy (host_name_address_map);
+ hash_table_destroy (host_address_name_map);
+ free_keys_and_values (host_slave_master_map);
+ hash_table_destroy (host_slave_master_map);
+}
- while (l)
- {
- struct host *p = l->next;
- free (l->hostname);
- free (l->realname);
- free (l);
- l = p;
- }
- hlist = NULL;
+void
+host_init (void)
+{
+ host_name_address_map = make_string_hash_table (0);
+ host_address_name_map = make_string_hash_table (0);
+ host_slave_master_map = make_string_hash_table (0);
}
--- /dev/null
+/* HTML parser for Wget.
+ Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+This file is part of Wget.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at
+your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/* The only entry point to this module is map_html_tags(), which see. */
+
+/* TODO:
+
+ - Allow hooks for callers to process contents outside tags. This
+ is needed to implement handling <style> and <script>. The
+ taginfo structure already carries the information about where the
+ tags are, but this is not enough, because one would also want to
+ skip the comments. (The funny thing is that for <style> and
+ <script> you *don't* want to skip comments!)
+
+ - Create a test suite for regression testing. */
+
+/* HISTORY:
+
+ This is the third HTML parser written for Wget. The first one was
+ written some time during the Geturl 1.0 beta cycle, and was very
+ inefficient and buggy. It also contained some very complex code to
+ remember a list of parser states, because it was supposed to be
+ reentrant. The idea was that several parsers would be running
+ concurrently, and you'd have pass the function a unique ID string
+ (for example, the URL) by which it found the relevant parser state
+ and returned the next URL. Over-engineering at its best.
+
+ The second HTML parser was written for Wget 1.4 (the first version
+ by the name `Wget'), and was a complete rewrite. Although the new
+ parser behaved much better and made no claims of reentrancy, it
+ still shared many of the fundamental flaws of the old version -- it
+ only regarded HTML in terms of tag-attribute pairs, where the
+ attribute's value was a URL to be returned. Any other property of
+ HTML, such as <base href=...>, or strange way to specify a URL,
+ such as <meta http-equiv=Refresh content="0; URL=..."> had to be
+ crudely hacked in -- and the caller had to be aware of these hacks.
+ Like its predecessor, this parser did not support HTML comments.
+
+ After Wget 1.5.1 was released, I set out to write a third HTML
+ parser. The objectives of the new parser were to: (1) provide a
+ clean way to analyze HTML lexically, (2) separate interpretation of
+ the markup from the parsing process, (3) be as correct as possible,
+ e.g. correctly skipping comments and other SGML declarations, (4)
+ understand the most common errors in markup and skip them or be
+ relaxed towards them, and (5) be reasonably efficient (no regexps,
+ minimum copying and minimum or no heap allocation).
+
+ I believe this parser meets all of the above goals. It is
+ reasonably well structured, and could be relatively easily
+ separated from Wget and used elsewhere. While some of its
+ intrinsic properties limit its value as a general-purpose HTML
+ parser, I believe that, with minimum modifications, it could serve
+ as a backend for one.
+
+ Due to time and other constraints, this parser was not integrated
+ into Wget until the version ???. */
+
+/* DESCRIPTION:
+
+ The single entry point of this parser is map_html_tags(), which
+ works by calling a function you specify for each tag. The function
+ gets called with the pointer to a structure describing the tag and
+ its attributes. */
+
+/* To test as standalone, compile with `-DSTANDALONE -I.'. You'll
+ still need Wget headers to compile. */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#ifdef HAVE_STRING_H
+# include <string.h>
+#else
+# include <strings.h>
+#endif
+#include <assert.h>
+
+#include "wget.h"
+#include "html-parse.h"
+
+#ifdef STANDALONE
+# define xmalloc malloc
+# define xrealloc realloc
+#endif /* STANDALONE */
+
+/* Pool support. For efficiency, map_html_tags() stores temporary
+ string data to a single stack-allocated pool. If the pool proves
+ too small, additional memory is allocated/resized with
+ malloc()/realloc(). */
+
+/* A growable byte buffer whose initial storage is stack-allocated
+   and which moves to the heap only if it overflows.  */
+struct pool {
+ char *contents; /* pointer to the contents. */
+ int size; /* size of the pool. */
+ int index; /* next unoccupied position in
+ contents. */
+
+ int alloca_p; /* whether contents was allocated
+ using alloca(). */
+ char *orig_contents; /* orig_contents, allocated by
+ alloca(). this is used by
+ POOL_FREE to restore the pool to
+ the "initial" state. */
+ int orig_size; /* original size; restored, along with
+ orig_contents, by POOL_FREE. */
+};
+
+/* Initialize the pool to hold INITIAL_SIZE bytes of storage.  NOTE:
+   the initial storage is obtained through ALLOCA_ARRAY, i.e. from the
+   stack frame in which this macro is expanded, so the pool must not
+   outlive that frame.  */
+
+#define POOL_INIT(pool, initial_size) do { \
+ (pool).size = (initial_size); \
+ (pool).contents = ALLOCA_ARRAY (char, (pool).size); \
+ (pool).index = 0; \
+ (pool).alloca_p = 1; \
+ (pool).orig_contents = (pool).contents; \
+ (pool).orig_size = (pool).size; \
+} while (0)
+
+/* Grow the pool to accommodate at least INCREASE new bytes beyond the
+   current index.  If the pool already has room for INCREASE more
+   bytes, this is a no-op.  DO_REALLOC_FROM_ALLOCA migrates the
+   contents from the alloca'ed buffer to the heap on first overflow.
+   The argument is parenthesized in the expansion so that compound
+   expressions such as `size + 1' associate correctly.  */
+
+#define POOL_GROW(pool, increase) do { \
+ int PG_newsize = (pool).index + (increase); \
+ DO_REALLOC_FROM_ALLOCA ((pool).contents, (pool).size, PG_newsize, \
+ (pool).alloca_p, char); \
+} while (0)
+
+/* Append text in the range [beg, end) to POOL.  No zero-termination
+   is done.  NOTE: growing may move (pool).contents, so raw pointers
+   into the pool must not be held across appends -- use indices, as
+   map_html_tags() does.  */
+
+#define POOL_APPEND(pool, beg, end) do { \
+ const char *PA_beg = beg; \
+ int PA_size = end - PA_beg; \
+ POOL_GROW (pool, PA_size); \
+ memcpy ((pool).contents + (pool).index, PA_beg, PA_size); \
+ (pool).index += PA_size; \
+} while (0)
+
+/* The same as the above, but with zero termination.  The terminating
+   '\0' is counted in (pool).index, so successive appends produce a
+   sequence of independent C strings in the pool.  */
+
+#define POOL_APPEND_ZT(pool, beg, end) do { \
+ const char *PA_beg = beg; \
+ int PA_size = end - PA_beg; \
+ POOL_GROW (pool, PA_size + 1); \
+ memcpy ((pool).contents + (pool).index, PA_beg, PA_size); \
+ (pool).contents[(pool).index + PA_size] = '\0'; \
+ (pool).index += PA_size + 1; \
+} while (0)
+
+/* Forget old pool contents.  The allocated memory is not freed.
+   Parenthesized so the expansion is a single well-formed expression
+   regardless of the context it is used in.  */
+#define POOL_REWIND(pool) ((pool).index = 0)
+
+/* Free heap-allocated memory for contents of POOL.  This calls free()
+   if the memory was allocated through malloc.  It also restores
+   `contents' and `size' to their original, pre-malloc values.  That
+   way after POOL_FREE, the pool is fully usable, just as if it were
+   freshly initialized with POOL_INIT.  (Since alloca_p is reset to 1,
+   invoking it a second time is a harmless no-op.)  */
+
+#define POOL_FREE(pool) do { \
+ if (!(pool).alloca_p) \
+ free ((pool).contents); \
+ (pool).contents = (pool).orig_contents; \
+ (pool).size = (pool).orig_size; \
+ (pool).index = 0; \
+ (pool).alloca_p = 1; \
+} while (0)
+
+\f
+#define AP_DOWNCASE 1
+#define AP_PROCESS_ENTITIES 2
+#define AP_SKIP_BLANKS 4
+
+/* Copy the text in the range [BEG, END) to POOL, optionally
+ performing operations specified by FLAGS. FLAGS may be any
+ combination of AP_DOWNCASE, AP_PROCESS_ENTITIES and AP_SKIP_BLANKS
+ with the following meaning:
+
+ * AP_DOWNCASE -- downcase all the letters;
+
+ * AP_PROCESS_ENTITIES -- process the SGML entities and write out
+ the decoded string. Recognized entities are <, >, &, ",
+   and the numerical entities.
+
+ * AP_SKIP_BLANKS -- ignore blanks at the beginning and at the end
+ of text. */
+static void
+convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
+{
+ int old_index = pool->index;
+ int size;
+
+ /* First, skip blanks if required. We must do this before entities
+ are processed, so that blanks can still be inserted as, for
+ instance, ` '. */
+ if (flags & AP_SKIP_BLANKS)
+ {
+ while (beg < end && ISSPACE (*beg))
+ ++beg;
+ while (end > beg && ISSPACE (end[-1]))
+ --end;
+ }
+ size = end - beg;
+
+ if (flags & AP_PROCESS_ENTITIES)
+ {
+ /* Stack-allocate a copy of text, process entities and copy it
+ to the pool.  Entity expansion never lengthens the text (every
+ recognized entity is at least two source characters and decodes
+ to one), so SIZE + 1 bytes always suffice. */
+ char *local_copy = (char *)alloca (size + 1);
+ const char *from = beg;
+ char *to = local_copy;
+
+ while (from < end)
+ {
+ if (*from != '&')
+ *to++ = *from++;
+ else
+ {
+ const char *save = from;
+ int remain;
+
+ if (++from == end) goto lose;
+ remain = end - from;
+
+ if (*from == '#')
+ {
+ int numeric;
+ ++from;
+ if (from == end || !ISDIGIT (*from)) goto lose;
+ for (numeric = 0; from < end && ISDIGIT (*from); from++)
+ numeric = 10 * numeric + (*from) - '0';
+ if (from < end && ISALPHA (*from)) goto lose;
+ /* Truncate to one byte; no charset interpretation is
+ attempted here. */
+ numeric &= 0xff;
+ *to++ = numeric;
+ }
+/* Does the named entity X occur at FROM?  It must be followed by `;',
+   the end of the buffer, or a non-alphanumeric character. */
+#define FROB(x) (remain >= (sizeof (x) - 1) \
+ && !memcmp (from, x, sizeof (x) - 1) \
+ && (*(from + sizeof (x) - 1) == ';' \
+ || remain == sizeof (x) - 1 \
+ || !ISALNUM (*(from + sizeof (x) - 1))))
+ else if (FROB ("lt"))
+ *to++ = '<', from += 2;
+ else if (FROB ("gt"))
+ *to++ = '>', from += 2;
+ else if (FROB ("amp"))
+ *to++ = '&', from += 3;
+ else if (FROB ("quot"))
+ *to++ = '\"', from += 4;
+ /* We don't implement the proposed "Added Latin 1"
+ entities (except for nbsp), because it is unnecessary
+ in the context of Wget, and would require hashing to
+ work efficiently. */
+ else if (FROB ("nbsp"))
+ *to++ = 160, from += 4;
+ else
+ goto lose;
+#undef FROB
+ /* If the entity was followed by `;', we step over the
+ `;'. Otherwise, it was followed by either a
+ non-alphanumeric or EOB, in which case we do nothing. */
+ if (from < end && *from == ';')
+ ++from;
+ continue;
+
+ lose:
+ /* This was not an entity after all. Back out. */
+ from = save;
+ *to++ = *from++;
+ }
+ }
+ /* TO ends up one past the '\0', so the terminator is copied to
+ the pool as well, mirroring POOL_APPEND_ZT. */
+ *to++ = '\0';
+ POOL_APPEND (*pool, local_copy, to);
+ }
+ else
+ {
+ /* Just copy the text to the pool. */
+ POOL_APPEND_ZT (*pool, beg, end);
+ }
+
+ if (flags & AP_DOWNCASE)
+ {
+ /* The appended data is zero-terminated (see above), so the loop
+ stops at the terminator. */
+ char *p = pool->contents + old_index;
+ for (; *p; p++)
+ *p = TOLOWER (*p);
+ }
+}
+\f
+/* Check whether the text in the range [BEG, END) case-insensitively
+   matches any of the NULL-terminated strings in ARRAY.  A NULL ARRAY
+   permits everything.  Returns non-zero on a match.  */
+static int
+array_allowed (const char **array, const char *beg, const char *end)
+{
+ int length = end - beg;
+ if (array)
+ {
+ /* Since *array is NUL-terminated, strncasecmp over LENGTH bytes
+ can only succeed when LENGTH equals strlen (*array) exactly. */
+ for (; *array; array++)
+ if (length >= strlen (*array)
+ && !strncasecmp (*array, beg, length))
+ break;
+ if (!*array)
+ return 0;
+ }
+ return 1;
+}
+\f
+/* RFC1866: name [of attribute or tag] consists of letters, digits,
+ periods, or hyphens. We also allow _, for compatibility with
+ brain-damaged generators. */
+#define NAME_CHAR_P(x) (ISALNUM (x) || (x) == '.' || (x) == '-' || (x) == '_')
+
+/* States while advancing through comments. */
+#define AC_S_DONE 0
+#define AC_S_BACKOUT 1
+#define AC_S_BANG 2
+#define AC_S_DEFAULT 3
+#define AC_S_DCLNAME 4
+#define AC_S_DASH1 5
+#define AC_S_DASH2 6
+#define AC_S_COMMENT 7
+#define AC_S_DASH3 8
+#define AC_S_DASH4 9
+#define AC_S_QUOTE1 10
+#define AC_S_IN_QUOTE 11
+#define AC_S_QUOTE2 12
+
+#ifdef STANDALONE
+static int comment_backout_count;
+#endif
+
+/* Advance over an SGML declaration (the <!...> forms you find in HTML
+ documents). The function returns the location after the
+ declaration. The reason we need this is that HTML comments are
+ expressed as comments in so-called "empty declarations".
+
+ To recap: any SGML declaration may have comments associated with
+ it, e.g.
+ <!MY-DECL -- isn't this fun? -- foo bar>
+
+ An HTML comment is merely an empty declaration (<!>) with a comment
+ attached, like this:
+ <!-- some stuff here -->
+
+ Several comments may be embedded in one comment declaration:
+ <!-- have -- -- fun -->
+
+ Whitespace is allowed between and after the comments, but not
+ before the first comment.
+
+ Additionally, this function attempts to handle double quotes in
+ SGML declarations correctly. */
+static const char *
+advance_declaration (const char *beg, const char *end)
+{
+ const char *p = beg;
+ char quote_char = '\0'; /* shut up, gcc! */
+ char ch;
+ int state = AC_S_BANG;
+
+ if (beg == end)
+ return beg;
+ /* Invariant: CH holds the character being examined, and P points
+ one past it. */
+ ch = *p++;
+
+ /* It looked like a good idea to write this as a state machine, but
+ now I wonder... */
+
+ while (state != AC_S_DONE && state != AC_S_BACKOUT)
+ {
+ /* Running out of input before the declaration closes forces a
+ backout. */
+ if (p == end)
+ state = AC_S_BACKOUT;
+ switch (state)
+ {
+ case AC_S_DONE:
+ case AC_S_BACKOUT:
+ break;
+ case AC_S_BANG:
+ if (ch == '!')
+ {
+ ch = *p++;
+ state = AC_S_DEFAULT;
+ }
+ else
+ state = AC_S_BACKOUT;
+ break;
+ case AC_S_DEFAULT:
+ /* Between tokens inside the declaration. */
+ switch (ch)
+ {
+ case '-':
+ state = AC_S_DASH1;
+ break;
+ case ' ':
+ case '\t':
+ case '\r':
+ case '\n':
+ ch = *p++;
+ break;
+ case '>':
+ state = AC_S_DONE;
+ break;
+ case '\'':
+ case '\"':
+ state = AC_S_QUOTE1;
+ break;
+ default:
+ if (NAME_CHAR_P (ch))
+ state = AC_S_DCLNAME;
+ else
+ state = AC_S_BACKOUT;
+ break;
+ }
+ break;
+ case AC_S_DCLNAME:
+ if (NAME_CHAR_P (ch))
+ ch = *p++;
+ else if (ch == '-')
+ state = AC_S_DASH1;
+ else
+ state = AC_S_DEFAULT;
+ break;
+ case AC_S_QUOTE1:
+ assert (ch == '\'' || ch == '\"');
+ quote_char = ch; /* cheating -- I really don't feel like
+ introducing more different states for
+ different quote characters. */
+ ch = *p++;
+ state = AC_S_IN_QUOTE;
+ break;
+ case AC_S_IN_QUOTE:
+ if (ch == quote_char)
+ state = AC_S_QUOTE2;
+ else
+ ch = *p++;
+ break;
+ case AC_S_QUOTE2:
+ assert (ch == quote_char);
+ ch = *p++;
+ state = AC_S_DEFAULT;
+ break;
+ case AC_S_DASH1:
+ assert (ch == '-');
+ ch = *p++;
+ state = AC_S_DASH2;
+ break;
+ case AC_S_DASH2:
+ switch (ch)
+ {
+ case '-':
+ ch = *p++;
+ state = AC_S_COMMENT;
+ break;
+ default:
+ state = AC_S_BACKOUT;
+ }
+ break;
+ case AC_S_COMMENT:
+ /* Inside a `--' comment; nothing but the closing `--'
+ matters here. */
+ switch (ch)
+ {
+ case '-':
+ state = AC_S_DASH3;
+ break;
+ default:
+ ch = *p++;
+ break;
+ }
+ break;
+ case AC_S_DASH3:
+ assert (ch == '-');
+ ch = *p++;
+ state = AC_S_DASH4;
+ break;
+ case AC_S_DASH4:
+ switch (ch)
+ {
+ case '-':
+ ch = *p++;
+ state = AC_S_DEFAULT;
+ break;
+ default:
+ state = AC_S_COMMENT;
+ break;
+ }
+ break;
+ }
+ }
+
+ if (state == AC_S_BACKOUT)
+ {
+#ifdef STANDALONE
+ ++comment_backout_count;
+#endif
+ /* Not a well-formed declaration after all: treat the `!' as
+ ordinary text and let the caller resume just past BEG. */
+ return beg + 1;
+ }
+ return p;
+}
+\f
+/* Advance P (a char pointer), with the explicit intent of being able
+ to read the next character. If this is not possible, go to finish. */
+
+/* Advance P (a char pointer), with the explicit intent of being able
+   to read the next character.  If this is not possible, go to finish.
+   NOTE: the expansion references the local variable `end' and the
+   label `finish', both of which must exist in the expanding
+   function.  */
+#define ADVANCE(p) do { \
+ ++p; \
+ if (p >= end) \
+ goto finish; \
+} while (0)
+
+/* Skip whitespace, if any.  May jump to finish via ADVANCE. */
+
+#define SKIP_WS(p) do { \
+ while (ISSPACE (*p)) { \
+ ADVANCE (p); \
+ } \
+} while (0)
+
+/* Skip non-whitespace, if any.  May jump to finish via ADVANCE. */
+
+#define SKIP_NON_WS(p) do { \
+ while (!ISSPACE (*p)) { \
+ ADVANCE (p); \
+ } \
+} while (0)
+
+#ifdef STANDALONE
+/* STANDALONE statistic: number of candidate tags that turned out not
+   to be tags after all. */
+static int tag_backout_count;
+#endif
+
+/* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
+ MAPFUN will be called with two arguments: pointer to an initialized
+ struct taginfo, and CLOSURE.
+
+ ALLOWED_TAG_NAMES should be a NULL-terminated array of tag names to
+ be processed by this function. If it is NULL, all the tags are
+ allowed. The same goes for attributes and ALLOWED_ATTRIBUTE_NAMES.
+
+ (Obviously, the caller can filter out unwanted tags and attributes
+ just as well, but this is just an optimization designed to avoid
+ unnecessary copying for tags/attributes which the caller doesn't
+ want to know about. These lists are searched linearly; therefore,
+ if you're interested in a large number of tags or attributes, you'd
+ better set these to NULL and filter them out yourself with a
+ hashing process most appropriate for your application.) */
+
+void
+map_html_tags (const char *text, int size,
+ const char **allowed_tag_names,
+ const char **allowed_attribute_names,
+ void (*mapfun) (struct taginfo *, void *),
+ void *closure)
+{
+ const char *p = text;
+ const char *end = text + size;
+
+ int attr_pair_count = 8;
+ int attr_pair_alloca_p = 1;
+ struct attr_pair *pairs = ALLOCA_ARRAY (struct attr_pair, attr_pair_count);
+ struct pool pool;
+
+ if (!size)
+ return;
+
+ POOL_INIT (pool, 256);
+
+ {
+ /* Per-tag parse state; control re-enters at look_for_tag once for
+ each candidate tag found in TEXT. */
+ int nattrs, end_tag;
+ const char *tag_name_begin, *tag_name_end;
+ const char *tag_start_position;
+ int uninteresting_tag;
+
+ look_for_tag:
+ POOL_REWIND (pool);
+
+ nattrs = 0;
+ end_tag = 0;
+
+ /* Find beginning of tag. We use memchr() instead of the usual
+ looping with ADVANCE() for speed. */
+ p = memchr (p, '<', end - p);
+ if (!p)
+ goto finish;
+
+ tag_start_position = p;
+ ADVANCE (p);
+
+ /* Establish the type of the tag (start-tag, end-tag or
+ declaration). */
+ if (*p == '!')
+ {
+ /* This is an SGML declaration -- just skip it. */
+ p = advance_declaration (p, end);
+ if (p == end)
+ goto finish;
+ goto look_for_tag;
+ }
+ else if (*p == '/')
+ {
+ end_tag = 1;
+ ADVANCE (p);
+ }
+ tag_name_begin = p;
+ while (NAME_CHAR_P (*p))
+ ADVANCE (p);
+ if (p == tag_name_begin)
+ goto look_for_tag;
+ tag_name_end = p;
+ SKIP_WS (p);
+ /* An end tag may carry nothing but the name. */
+ if (end_tag && *p != '>')
+ goto backout_tag;
+
+ if (!array_allowed (allowed_tag_names, tag_name_begin, tag_name_end))
+ /* We can't just say "goto look_for_tag" here because we need
+ the loop below to properly advance over the tag's attributes. */
+ uninteresting_tag = 1;
+ else
+ {
+ uninteresting_tag = 0;
+ convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
+ }
+
+ /* Find the attributes. */
+ while (1)
+ {
+ const char *attr_name_begin, *attr_name_end;
+ const char *attr_value_begin, *attr_value_end;
+ const char *attr_raw_value_begin, *attr_raw_value_end;
+ int operation = AP_DOWNCASE; /* stupid compiler. */
+
+ SKIP_WS (p);
+
+ /* Check for end of tag definition. */
+ if (*p == '>')
+ break;
+
+ /* Establish bounds of attribute name. */
+ attr_name_begin = p; /* <foo bar ...> */
+ /* ^ */
+ while (NAME_CHAR_P (*p))
+ ADVANCE (p);
+ attr_name_end = p; /* <foo bar ...> */
+ /* ^ */
+ if (attr_name_begin == attr_name_end)
+ goto backout_tag;
+
+ /* Establish bounds of attribute value. */
+ SKIP_WS (p);
+ if (NAME_CHAR_P (*p) || *p == '>')
+ {
+ /* Minimized attribute syntax allows `=' to be omitted.
+ For example, <UL COMPACT> is a valid shorthand for <UL
+ COMPACT="compact">. Even if such attributes are not
+ useful to Wget, we need to support them, so that the
+ tags containing them can be parsed correctly. */
+ attr_raw_value_begin = attr_value_begin = attr_name_begin;
+ attr_raw_value_end = attr_value_end = attr_name_end;
+ }
+ else if (*p == '=')
+ {
+ ADVANCE (p);
+ SKIP_WS (p);
+ if (*p == '\"' || *p == '\'')
+ {
+ int newline_seen = 0;
+ char quote_char = *p;
+ attr_raw_value_begin = p;
+ ADVANCE (p);
+ attr_value_begin = p; /* <foo bar="baz"> */
+ /* ^ */
+ while (*p != quote_char)
+ {
+ if (!newline_seen && *p == '\n')
+ {
+ /* If a newline is seen within the quotes, it
+ is most likely that someone forgot to close
+ the quote. In that case, we back out to
+ the value beginning, and terminate the tag
+ at either `>' or the delimiter, whichever
+ comes first. Such a tag terminated at `>'
+ is discarded. */
+ p = attr_value_begin;
+ newline_seen = 1;
+ continue;
+ }
+ else if (newline_seen && *p == '>')
+ break;
+ ADVANCE (p);
+ }
+ attr_value_end = p; /* <foo bar="baz"> */
+ /* ^ */
+ if (*p == quote_char)
+ ADVANCE (p);
+ else
+ goto look_for_tag;
+ attr_raw_value_end = p; /* <foo bar="baz"> */
+ /* ^ */
+ /* The AP_SKIP_BLANKS part is not entirely correct,
+ because we don't want to skip blanks for all the
+ attribute values. */
+ operation = AP_PROCESS_ENTITIES | AP_SKIP_BLANKS;
+ }
+ else
+ {
+ attr_value_begin = p; /* <foo bar=baz> */
+ /* ^ */
+ /* According to SGML, a name token should consist only
+ of alphanumerics, . and -. However, this is often
+ violated by, for instance, `%' in `width=75%'.
+ We'll be liberal and allow just about anything as
+ an attribute value. */
+ while (!ISSPACE (*p) && *p != '>')
+ ADVANCE (p);
+ attr_value_end = p; /* <foo bar=baz qux=quix> */
+ /* ^ */
+ if (attr_value_begin == attr_value_end)
+ /* <foo bar=> */
+ /* ^ */
+ goto backout_tag;
+ attr_raw_value_begin = attr_value_begin;
+ attr_raw_value_end = attr_value_end;
+ operation = AP_PROCESS_ENTITIES;
+ }
+ }
+ else
+ {
+ /* We skipped the whitespace and found something that is
+ neither `=' nor the beginning of the next attribute's
+ name. Back out. */
+ goto backout_tag; /* <foo bar /... */
+ /* ^ */
+ }
+
+ /* If we're not interested in the tag, don't bother with any
+ of the attributes. */
+ if (uninteresting_tag)
+ continue;
+
+ /* If we aren't interested in the attribute, skip it. We
+ cannot do this test any sooner, because our text pointer
+ needs to correctly advance over the attribute. */
+ if (allowed_attribute_names
+ && !array_allowed (allowed_attribute_names, attr_name_begin,
+ attr_name_end))
+ continue;
+
+ /* Make room for one more attribute pair. */
+ DO_REALLOC_FROM_ALLOCA (pairs, attr_pair_count, nattrs + 1,
+ attr_pair_alloca_p, struct attr_pair);
+
+ pairs[nattrs].name_pool_index = pool.index;
+ convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
+
+ pairs[nattrs].value_pool_index = pool.index;
+ convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
+ pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
+ pairs[nattrs].value_raw_size = (attr_raw_value_end
+ - attr_raw_value_begin);
+ ++nattrs;
+ }
+
+ if (uninteresting_tag)
+ {
+ ADVANCE (p);
+ goto look_for_tag;
+ }
+
+ /* By now, we have a valid tag with a name and zero or more
+ attributes. Fill in the data and call the mapper function. */
+ {
+ int i;
+ struct taginfo taginfo;
+
+ taginfo.name = pool.contents;
+ taginfo.end_tag_p = end_tag;
+ taginfo.nattrs = nattrs;
+ /* We fill in the char pointers only now, when pool can no
+ longer get realloc'ed. If we did that above, we could get
+ hosed by reallocation. Obviously, after this point, the pool
+ may no longer be grown. */
+ for (i = 0; i < nattrs; i++)
+ {
+ pairs[i].name = pool.contents + pairs[i].name_pool_index;
+ pairs[i].value = pool.contents + pairs[i].value_pool_index;
+ }
+ taginfo.attrs = pairs;
+ taginfo.start_position = tag_start_position;
+ /* P is at the closing `>', so the end position is one past it. */
+ taginfo.end_position = p + 1;
+ /* Ta-dam! */
+ (*mapfun) (&taginfo, closure);
+ ADVANCE (p);
+ }
+ goto look_for_tag;
+
+ backout_tag:
+#ifdef STANDALONE
+ ++tag_backout_count;
+#endif
+ /* The tag wasn't really a tag. Treat its contents as ordinary
+ data characters. */
+ p = tag_start_position + 1;
+ goto look_for_tag;
+ }
+
+ finish:
+ POOL_FREE (pool);
+ if (!attr_pair_alloca_p)
+ free (pairs);
+}
+
+#undef ADVANCE
+#undef SKIP_WS
+#undef SKIP_NON_WS
+\f
+#ifdef STANDALONE
+/* Callback for the STANDALONE test driver: print the tag (prefixed
+   with `/' for end tags) and its name=value attribute pairs, and bump
+   the tag counter passed through ARG. */
+static void
+test_mapper (struct taginfo *taginfo, void *arg)
+{
+ int i;
+
+ printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);
+ for (i = 0; i < taginfo->nattrs; i++)
+ printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value);
+ putchar ('\n');
+ ++*(int *)arg;
+}
+
+int main ()
+{
+ int size = 256;
+ char *x = (char *)xmalloc (size);
+ int length = 0;
+ int read_count;
+ int tag_counter = 0;
+
+ /* Slurp all of stdin into X, doubling the buffer whenever it
+ fills up. */
+ while ((read_count = fread (x + length, 1, size - length, stdin)))
+ {
+ length += read_count;
+ size <<= 1;
+ x = (char *)xrealloc (x, size);
+ }
+
+ /* Parse everything, printing each tag via test_mapper, then dump
+ the parse statistics. */
+ map_html_tags (x, length, NULL, NULL, test_mapper, &tag_counter);
+ printf ("TAGS: %d\n", tag_counter);
+ printf ("Tag backouts: %d\n", tag_backout_count);
+ printf ("Comment backouts: %d\n", comment_backout_count);
+ return 0;
+}
+#endif /* STANDALONE */
--- /dev/null
+/* Declarations for html-parse.c.
+ Copyright (C) 1998 Free Software Foundation, Inc.
+
+This file is part of Wget.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+struct attr_pair {
+ char *name; /* attribute name */
+ char *value; /* attribute value */
+
+ /* Needed for URL conversion; the places where the value begins and
+ ends, including the quotes and everything. */
+ const char *value_raw_beginning;
+ int value_raw_size;
+
+ /* Used internally by map_html_tags. */
+ int name_pool_index, value_pool_index;
+};
+
+struct taginfo {
+ char *name; /* tag name */
+ int end_tag_p; /* whether this is an end-tag */
+ int nattrs; /* number of attributes */
+ struct attr_pair *attrs; /* attributes */
+
+ const char *start_position; /* start position of tag */
+ const char *end_position; /* end position of tag */
+};
+
+/* NOTE: taginfo.name and the attrs' name/value strings point into a
+   pool owned by map_html_tags and freed when it returns; they are
+   valid only for the duration of the mapper callback.  Copy anything
+   you need to keep. */
+void map_html_tags PARAMS ((const char *, int, const char **, const char **,
+ void (*) (struct taginfo *, void *), void *));
--- /dev/null
+/* Collect URLs from HTML source.
+ Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+This file is part of Wget.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+#include <config.h>
+
+#include <stdio.h>
+#ifdef HAVE_STRING_H
+# include <string.h>
+#else
+# include <strings.h>
+#endif
+#include <stdlib.h>
+#include <ctype.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "wget.h"
+#include "html-parse.h"
+#include "url.h"
+#include "utils.h"
+
+#ifndef errno
+extern int errno;
+#endif
+
+enum tag_category { TC_LINK, TC_SPEC };
+
+/* Here we try to categorize the known tags. Each tag has its ID and
+ category. Category TC_LINK means that one or more of its
+ attributes contain links that should be retrieved. TC_SPEC means
+ that the tag is specific in some way, and has to be handled
+ specially.
+
+ NOTE: the table must stay sorted alphabetically by name (find_tag
+ relies on it to break out of its search early), and each TAG_*
+ define must equal the index of its entry (url_tag_attr_map and the
+ TC_SPEC dispatch use the defines as indices). */
+static struct {
+ const char *name;
+ enum tag_category category;
+} known_tags[] = {
+#define TAG_A 0
+ { "a", TC_LINK },
+#define TAG_APPLET 1
+ { "applet", TC_LINK },
+#define TAG_AREA 2
+ { "area", TC_LINK },
+#define TAG_BASE 3
+ { "base", TC_SPEC },
+#define TAG_BGSOUND 4
+ { "bgsound", TC_LINK },
+#define TAG_BODY 5
+ { "body", TC_LINK },
+#define TAG_EMBED 6
+ { "embed", TC_LINK },
+#define TAG_FIG 7
+ { "fig", TC_LINK },
+#define TAG_FRAME 8
+ { "frame", TC_LINK },
+#define TAG_IFRAME 9
+ { "iframe", TC_LINK },
+#define TAG_IMG 10
+ { "img", TC_LINK },
+#define TAG_INPUT 11
+ { "input", TC_LINK },
+#define TAG_LAYER 12
+ { "layer", TC_LINK },
+#define TAG_LINK 13
+ { "link", TC_SPEC },
+#define TAG_META 14
+ { "meta", TC_SPEC },
+#define TAG_OVERLAY 15
+ { "overlay", TC_LINK },
+#define TAG_SCRIPT 16
+ { "script", TC_LINK },
+#define TAG_TABLE 17
+ { "table", TC_LINK },
+#define TAG_TD 18
+ { "td", TC_LINK },
+#define TAG_TH 19
+ { "th", TC_LINK }
+};
+
+/* Flags for specific url-attr pairs handled through TC_LINK: */
+#define AF_EXTERNAL 1
+
+/* For tags handled by TC_LINK: attributes that contain URLs to
+ download.  NOTE: entries for the same tag must be contiguous --
+ collect_tags_mapper scans a single run of matching tagids. */
+static struct {
+ int tagid;
+ const char *attr_name;
+ int flags;
+} url_tag_attr_map[] = {
+ { TAG_A, "href", AF_EXTERNAL },
+ { TAG_APPLET, "code", 0 },
+ { TAG_AREA, "href", AF_EXTERNAL },
+ { TAG_BGSOUND, "src", 0 },
+ { TAG_BODY, "background", 0 },
+ { TAG_EMBED, "src", 0 },
+ { TAG_FIG, "src", 0 },
+ { TAG_FRAME, "src", 0 },
+ { TAG_IFRAME, "src", 0 },
+ /* NOTE(review): <img href=...> is not standard HTML -- presumably
+ kept here to tolerate broken markup; confirm before removing. */
+ { TAG_IMG, "href", 0 },
+ { TAG_IMG, "lowsrc", 0 },
+ { TAG_IMG, "src", 0 },
+ { TAG_INPUT, "src", 0 },
+ { TAG_LAYER, "src", 0 },
+ { TAG_OVERLAY, "src", 0 },
+ { TAG_SCRIPT, "src", 0 },
+ { TAG_TABLE, "background", 0 },
+ { TAG_TD, "background", 0 },
+ { TAG_TH, "background", 0 }
+};
+
+/* The lists of interesting tags and attributes are built dynamically,
+ from the information above. However, some places in the code refer
+ to the attributes not mentioned here. We add them manually. */
+static const char *additional_attributes[] = {
+ "rel", /* for TAG_LINK */
+ "http-equiv", /* for TAG_META */
+ "name", /* for TAG_META */
+ "content" /* for TAG_META */
+};
+
+/* NULL-terminated lists handed to map_html_tags; built exactly once
+   by init_interesting(). */
+static const char **interesting_tags;
+static const char **interesting_attributes;
+
+/* Build the NULL-terminated lists interesting_tags and
+   interesting_attributes consulted by the HTML parser, honoring the
+   --follow-tags and --ignore-tags user settings.  Must be called
+   once, before the lists are used. */
+void
+init_interesting (void)
+{
+ /* Init the variables interesting_tags and interesting_attributes
+ that are used by the HTML parser to know which tags and
+ attributes we're interested in. We initialize this only once,
+ for performance reasons.
+
+ Here we also make sure that what we put in interesting_tags
+ matches the user's preferences as specified through --ignore-tags
+ and --follow-tags. */
+
+ {
+ int i, ind = 0;
+ int size = ARRAY_SIZE (known_tags);
+ interesting_tags = (const char **)xmalloc ((size + 1) * sizeof (char *));
+
+ for (i = 0; i < size; i++)
+ {
+ const char *name = known_tags[i].name;
+
+ /* Normally here we could say:
+ interesting_tags[i] = name;
+ But we need to respect the settings of --ignore-tags and
+ --follow-tags, so the code gets a bit hairier. */
+
+ if (opt.ignore_tags)
+ {
+ /* --ignore-tags was specified. Do not match these
+ specific tags. --ignore-tags takes precedence over
+ --follow-tags, so we process --ignore first and fall
+ through if there's no match. */
+ int j, lose = 0;
+ for (j = 0; opt.ignore_tags[j] != NULL; j++)
+ /* Loop through all the tags this user doesn't care
+ about. */
+ if (strcasecmp(opt.ignore_tags[j], name) == EQ)
+ {
+ lose = 1;
+ break;
+ }
+ if (lose)
+ continue;
+ }
+
+ if (opt.follow_tags)
+ {
+ /* --follow-tags was specified. Only match these specific
+ tags, so return FALSE if we don't match one of them. */
+ int j, win = 0;
+ for (j = 0; opt.follow_tags[j] != NULL; j++)
+ /* Loop through all the tags this user cares about. */
+ if (strcasecmp(opt.follow_tags[j], name) == EQ)
+ {
+ win = 1;
+ break;
+ }
+ if (!win)
+ continue; /* wasn't one of the explicitly
+ desired tags */
+ }
+
+ /* If we get to here, --follow-tags isn't being used or the
+ tag is among the ones that are followed, and --ignore-tags,
+ if specified, didn't include this tag, so it's an
+ "interesting" one. */
+ interesting_tags[ind++] = name;
+ }
+ interesting_tags[ind] = NULL;
+ }
+
+ /* The same for attributes, except we loop through url_tag_attr_map.
+ Here we also need to make sure that the list of attributes is
+ unique, and to include the attributes from additional_attributes. */
+ {
+ int i, ind;
+ const char **att = xmalloc ((ARRAY_SIZE (additional_attributes) + 1)
+ * sizeof (char *));
+ /* First copy the "additional" attributes. */
+ for (i = 0; i < ARRAY_SIZE (additional_attributes); i++)
+ att[i] = additional_attributes[i];
+ ind = i;
+ att[ind] = NULL;
+ for (i = 0; i < ARRAY_SIZE (url_tag_attr_map); i++)
+ {
+ int j, seen = 0;
+ const char *look_for = url_tag_attr_map[i].attr_name;
+ /* Compare against ALL of the attributes gathered so far.
+ (`j < ind', not `j < ind - 1': the latter would skip the
+ most recently added attribute and let a duplicate of it
+ slip through.) */
+ for (j = 0; j < ind; j++)
+ if (!strcmp (att[j], look_for))
+ {
+ seen = 1;
+ break;
+ }
+ if (!seen)
+ {
+ att = xrealloc (att, (ind + 2) * sizeof (*att));
+ att[ind++] = look_for;
+ att[ind] = NULL;
+ }
+ }
+ interesting_attributes = att;
+ }
+}
+
+/* Return the index of TAG_NAME in known_tags, or -1 if it is not
+   there.  Comparison is case-insensitive. */
+static int
+find_tag (const char *tag_name)
+{
+ int i;
+
+ /* This is linear search; if the number of tags grow, we can switch
+ to binary search. */
+
+ for (i = 0; i < ARRAY_SIZE (known_tags); i++)
+ {
+ int cmp = strcasecmp (known_tags[i].name, tag_name);
+ /* known_tags are sorted alphabetically, so we can
+ micro-optimize. */
+ if (cmp > 0)
+ break;
+ else if (cmp == 0)
+ return i;
+ }
+ return -1;
+}
+
+/* Find the value of attribute named NAME in the taginfo TAG.  If the
+   attribute is not present, return NULL (and *ATTRID, if given, is
+   left untouched).  If ATTRID is non-NULL, the exact identity of the
+   attribute will be returned.  Name comparison is case-insensitive. */
+static char *
+find_attr (struct taginfo *tag, const char *name, int *attrid)
+{
+ int i;
+ for (i = 0; i < tag->nattrs; i++)
+ if (!strcasecmp (tag->attrs[i].name, name))
+ {
+ if (attrid)
+ *attrid = i;
+ return tag->attrs[i].value;
+ }
+ return NULL;
+}
+
+/* State threaded through collect_tags_mapper via map_html_tags'
+   CLOSURE argument. */
+struct collect_urls_closure {
+ char *text; /* HTML text. */
+ char *base; /* Base URI of the document, possibly
+ changed through <base href=...>.
+ Heap-allocated; freed and replaced
+ when a new <base> is seen. */
+ urlpos *head, *tail; /* List of URLs */
+ const char *parent_base; /* Base of the current document. */
+ const char *document_file; /* File name of this document. */
+ int dash_p_leaf_HTML; /* Whether -p is specified, and this
+ document is the "leaf" node of the
+ HTML tree. */
+ int nofollow; /* whether NOFOLLOW was specified in a
+ <meta name=robots> tag. */
+};
+
+/* Resolve LINK_URI and append it to closure->tail. TAG and ATTRID
+ are the necessary context to store the position and size of the
+ raw attribute value, which the link converter needs later. */
+
+static void
+handle_link (struct collect_urls_closure *closure, const char *link_uri,
+ struct taginfo *tag, int attrid)
+{
+ int no_proto = !has_proto (link_uri);
+ urlpos *newel;
+
+ /* A <base href> seen in this document takes precedence over the
+ parent-supplied base. */
+ const char *base = closure->base ? closure->base : closure->parent_base;
+ char *complete_uri;
+
+ char *fragment = strrchr (link_uri, '#');
+
+ if (fragment)
+ {
+ /* Nullify the fragment identifier, i.e. everything after the
+ last occurrence of `#', inclusive. This copying is
+ relatively inefficient, but it doesn't matter because
+ fragment identifiers don't come up all that often. */
+ int hashlen = fragment - link_uri;
+ char *p = alloca (hashlen + 1);
+ memcpy (p, link_uri, hashlen);
+ p[hashlen] = '\0';
+ link_uri = p;
+ }
+
+ if (!base)
+ {
+ if (no_proto)
+ {
+ /* We have no base, and the link does not have a protocol or
+ a host attached to it. Nothing we can do. */
+ /* #### Should we print a warning here? Wget 1.5.x used to. */
+ return;
+ }
+ else
+ complete_uri = xstrdup (link_uri);
+ }
+ else
+ complete_uri = url_concat (base, link_uri);
+
+ DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
+ closure->document_file, base ? base : "(null)",
+ link_uri, complete_uri));
+
+ /* Record the resolved URL along with the exact position and extent
+ of the raw attribute value in the source text. */
+ newel = (urlpos *)xmalloc (sizeof (urlpos));
+
+ memset (newel, 0, sizeof (*newel));
+ newel->next = NULL;
+ newel->url = complete_uri;
+ newel->pos = tag->attrs[attrid].value_raw_beginning - closure->text;
+ newel->size = tag->attrs[attrid].value_raw_size;
+
+ /* A URL is relative if the host and protocol are not named, and the
+ name does not start with `/'.
+ #### This logic might need some rethinking. */
+ if (no_proto && *link_uri != '/')
+ newel->flags |= (URELATIVE | UNOPROTO);
+ else if (no_proto)
+ newel->flags |= UNOPROTO;
+
+ /* Append to the closure's singly-linked URL list. */
+ if (closure->tail)
+ {
+ closure->tail->next = newel;
+ closure->tail = newel;
+ }
+ else
+ closure->tail = closure->head = newel;
+}
+
+/* Mapper function, called by map_html_tags for each interesting tag
+   found in the document.  ARG is the `struct collect_urls_closure'
+   set up by get_urls_html.
+
+   For a tag in the TC_LINK category, every URL-bearing attribute (as
+   listed in url_tag_attr_map) is looked up and handed to handle_link.
+   Tags in the TC_SPEC category get individual treatment: <base href>
+   updates closure->base, <link href> is collected subject to the -p
+   stylesheet exception, and <meta> is examined both for
+   http-equiv=refresh redirections and for name=robots
+   follow/nofollow directives.
+
+   #### It would be nice to split this into several functions.  */
+
+static void
+collect_tags_mapper (struct taginfo *tag, void *arg)
+{
+  struct collect_urls_closure *closure = (struct collect_urls_closure *)arg;
+  int tagid = find_tag (tag->name);
+  assert (tagid != -1);
+
+  switch (known_tags[tagid].category)
+    {
+    case TC_LINK:
+      {
+        int i;
+        int size = ARRAY_SIZE (url_tag_attr_map);
+        /* Entries for a given tag are stored contiguously in
+           url_tag_attr_map; find where ours begin.  */
+        for (i = 0; i < size; i++)
+          if (url_tag_attr_map[i].tagid == tagid)
+            break;
+        /* We've found the index of url_tag_attr_map where the
+           attributes of our tags begin.  Now, look for every one of
+           them, and handle it.  */
+        for (; (i < size && url_tag_attr_map[i].tagid == tagid); i++)
+          {
+            char *attr_value;
+            int id;
+            if (closure->dash_p_leaf_HTML
+                && (url_tag_attr_map[i].flags & AF_EXTERNAL))
+              /* If we're at a -p leaf node, we don't want to retrieve
+                 links to references we know are external, such as <a
+                 href=...>.  */
+              continue;
+
+            /* This find_attr() buried in a loop may seem inefficient
+               (O(n^2)), but it's not, since the number of attributes
+               (n) we loop over is extremely small.  In the worst case
+               of IMG with all its possible attributes, n^2 will be
+               only 9.  */
+            attr_value = find_attr (tag, url_tag_attr_map[i].attr_name, &id);
+            if (attr_value)
+              handle_link (closure, attr_value, tag, id);
+          }
+      }
+      break;
+    case TC_SPEC:
+      switch (tagid)
+        {
+        case TAG_BASE:
+          {
+            /* <base href=...> changes the base against which
+               subsequent relative links are resolved.  */
+            char *newbase = find_attr (tag, "href", NULL);
+            if (!newbase)
+              break;
+            if (closure->base)
+              free (closure->base);
+            if (closure->parent_base)
+              closure->base = url_concat (closure->parent_base, newbase);
+            else
+              closure->base = xstrdup (newbase);
+          }
+          break;
+        case TAG_LINK:
+          {
+            int id;
+            char *rel = find_attr (tag, "rel", NULL);
+            char *href = find_attr (tag, "href", &id);
+            if (href)
+              {
+                /* In the normal case, all <link href=...> tags are
+                   fair game.
+
+                   In the special case of when -p is active, however,
+                   and we're at a leaf node (relative to the -l
+                   max. depth) in the HTML document tree, the only
+                   <LINK> tag we'll follow is a <LINK REL=
+                   "stylesheet">, as it's necessary for displaying
+                   this document properly.  We won't follow other
+                   <LINK> tags, like <LINK REL="home">, for instance,
+                   as they refer to external documents.  */
+                if (!closure->dash_p_leaf_HTML
+                    || (rel && !strcasecmp (rel, "stylesheet")))
+                  handle_link (closure, href, tag, id);
+              }
+          }
+          break;
+        case TAG_META:
+          /* Some pages use a META tag to specify that the page be
+             refreshed by a new page after a given number of seconds.
+             The general format for this is:
+
+             <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
+
+             So we just need to skip past the "NUMBER; URL=" garbage
+             to get to the URL.  */
+          {
+            int id;
+            char *name = find_attr (tag, "name", NULL);
+            char *http_equiv = find_attr (tag, "http-equiv", &id);
+            if (http_equiv && !strcasecmp (http_equiv, "refresh"))
+              {
+                char *refresh = find_attr (tag, "content", NULL);
+                char *p;
+                int offset;
+                /* A refresh without a content attribute carries no
+                   URL; without this check, *p below would dereference
+                   a NULL pointer.  */
+                if (!refresh)
+                  return;
+                p = refresh;
+                while (ISDIGIT (*p))
+                  ++p;
+                if (*p++ != ';')
+                  return;
+                while (ISSPACE (*p))
+                  ++p;
+                if (!(TOUPPER (*p) == 'U'
+                      && TOUPPER (*(p + 1)) == 'R'
+                      && TOUPPER (*(p + 2)) == 'L'
+                      && *(p + 3) == '='))
+                  return;
+                p += 4;
+                while (ISSPACE (*p))
+                  ++p;
+                /* Narrow the raw attribute span so link conversion
+                   rewrites only the URL, not the "NUMBER; URL="
+                   prefix.  */
+                offset = p - refresh;
+                tag->attrs[id].value_raw_beginning += offset;
+                tag->attrs[id].value_raw_size -= offset;
+                handle_link (closure, p, tag, id);
+              }
+            else if (name && !strcasecmp (name, "robots"))
+              {
+                /* Handle stuff like:
+                   <meta name="robots" content="index,nofollow"> */
+                char *content = find_attr (tag, "content", NULL);
+                if (!content)
+                  return;
+                if (!strcasecmp (content, "none"))
+                  closure->nofollow = 1;
+                else
+                  {
+                    while (*content)
+                      {
+                        /* Find the next occurrence of ',' or the end of
+                           the string.  */
+                        char *end = strchr (content, ',');
+                        if (end)
+                          ++end;
+                        else
+                          end = content + strlen (content);
+                        if (!strncasecmp (content, "nofollow", end - content))
+                          closure->nofollow = 1;
+                        content = end;
+                      }
+                  }
+              }
+          }
+          break;
+        default:
+          /* Category is TC_SPEC, but tag name is unhandled.  This
+             must not be. */
+          abort ();
+        }
+      break;
+    }
+}
+
+/* Scan FILE, an HTML document, and collect the URLs it references.
+
+   Similar to get_urls_file, but FILE is parsed as an HTML document,
+   and relative links are resolved against THIS_URL (or, if THIS_URL
+   is NULL, against opt.base_href; either may be overridden by a
+   <base href=...> tag inside the document itself).
+
+   If DASH_P_LEAF_HTML is non-zero, the document is treated as a -p
+   leaf node, and links known to lead to external documents are not
+   collected.
+
+   If META_DISALLOW_FOLLOW is non-NULL, *META_DISALLOW_FOLLOW is set
+   to non-zero when a <meta name="robots"> tag in the document forbade
+   following its links.
+
+   Returns the head of the collected urlpos list, or NULL if FILE
+   could not be read.  */
+urlpos *
+get_urls_html (const char *file, const char *this_url, int dash_p_leaf_HTML,
+               int *meta_disallow_follow)
+{
+  struct file_memory *fm;
+  struct collect_urls_closure closure;
+
+  /* Load the file. */
+  fm = read_file (file);
+  if (!fm)
+    {
+      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
+      return NULL;
+    }
+  DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
+
+  /* Prepare the closure that collect_tags_mapper fills in.  */
+  closure.text = fm->content;
+  closure.head = closure.tail = NULL;
+  closure.base = NULL;
+  closure.parent_base = this_url ? this_url : opt.base_href;
+  closure.document_file = file;
+  closure.dash_p_leaf_HTML = dash_p_leaf_HTML;
+  closure.nofollow = 0;
+
+  if (!interesting_tags)
+    init_interesting ();
+
+  /* Walk the document's tags, collecting links into CLOSURE.  */
+  map_html_tags (fm->content, fm->length, interesting_tags,
+                 interesting_attributes, collect_tags_mapper, &closure);
+
+  DEBUGP (("no-follow in %s: %d\n", file, closure.nofollow));
+  if (meta_disallow_follow)
+    *meta_disallow_follow = closure.nofollow;
+
+  FREE_MAYBE (closure.base);
+  read_file_free (fm);
+  return closure.head;
+}
return 1;
}
+/* Header-processing callback: check whether the `Connection' header
+   is set to "keep-alive" (case-insensitively).  ARG points to an int
+   flag, which is set to 1 on a match and left untouched otherwise.
+   Always returns 1.  */
+static int
+http_process_connection (const char *hdr, void *arg)
+{
+  int *flag = (int *)arg;
+  if (!strcasecmp (hdr, "Keep-Alive"))
+    *flag = 1;
+  return 1;
+}
+\f
+/* Persistent connections (pc).
+
+   At most one open connection is cached at a time, described by the
+   three variables below.  */
+
+/* Address of the cached peer, as filled in by store_hostaddress. */
+static unsigned char pc_last_host[4];
+/* Port of the cached connection; 0 means no connection is cached
+   (see invalidate_persistent). */
+static unsigned short pc_last_port;
+/* File descriptor of the cached, still-open socket. */
+static int pc_last_fd;
+
+/* Remember that the connection to HOST:PORT, open on socket FD, is
+   eligible for reuse.  Only one connection is cached, so a successful
+   call replaces any previous entry.  If HOST cannot be resolved, the
+   registration is silently skipped.
+
+   NOTE(review): if store_hostaddress fails it may already have
+   written into pc_last_host, leaving it out of sync with
+   pc_last_port/pc_last_fd -- confirm that store_hostaddress leaves
+   its output untouched on failure.  */
+static void
+register_persistent (const char *host, unsigned short port, int fd)
+{
+  /* Resolve HOST directly into pc_last_host. */
+  if (!store_hostaddress (pc_last_host, host))
+    return;
+  pc_last_port = port;
+  pc_last_fd = fd;
+}
+
+/* Forget the cached connection.  Clearing the port is sufficient:
+   persistent_available_p compares ports first, and a port of 0 never
+   matches a real request.  The socket itself is not closed here --
+   callers close it separately (see CLOSE_INVALIDATE).  */
+static void
+invalidate_persistent (void)
+{
+  pc_last_port = 0;
+}
+
+/* Return non-zero if a cached persistent connection to HOST:PORT is
+   available for reuse, zero otherwise.  Three things must hold: the
+   port matches, HOST resolves to the cached address, and the cached
+   socket is still open.  If the peer has closed the socket in the
+   meantime, the cache entry is dropped.  */
+static int
+persistent_available_p (const char *host, unsigned short port)
+{
+  unsigned char this_host[4];
+  /* The port comparison is cheapest, so do it first.  pc_last_port
+     is 0 when nothing is cached, which can never match.  */
+  if (port != pc_last_port)
+    return 0;
+  if (!store_hostaddress (this_host, host))
+    return 0;
+  if (memcmp (pc_last_host, this_host, 4))
+    return 0;
+  /* The peer may have closed the connection while it sat unused. */
+  if (!test_socket_open (pc_last_fd))
+    {
+      invalidate_persistent ();
+      return 0;
+    }
+  return 1;
+}
+
+/* The idea behind these two CLOSE macros is to distinguish between
+   two cases: one when the job we've been doing is finished, and we
+   want to close the connection and leave, and two when something is
+   seriously wrong and we're closing the connection as part of
+   cleanup.
+
+   In case of keep_alive, CLOSE_FINISH should leave the connection
+   open, while CLOSE_INVALIDATE should still close it.
+
+   The semantic difference between the flags `keep_alive' and
+   `reused_connection' is that keep_alive defines the state of HTTP:
+   whether the connection *will* be preservable.  reused_connection,
+   on the other hand, reflects the present: whether the *current*
+   connection is the result of preserving.
+
+   Both macros refer to the local variables `keep_alive' and
+   `reused_connection', which must be in scope at the point of
+   use.  */
+
+#define CLOSE_FINISH(fd) do {                   \
+  if (!keep_alive)                              \
+    {                                           \
+      CLOSE (fd);                               \
+      if (reused_connection)                    \
+        invalidate_persistent ();               \
+    }                                           \
+} while (0)
+
+#define CLOSE_INVALIDATE(fd) do {               \
+  CLOSE (fd);                                   \
+  if (reused_connection)                        \
+    invalidate_persistent ();                   \
+} while (0)
+
\f
struct http_stat
{
FILE *fp;
int auth_tried_already;
struct rbuf rbuf;
+ int keep_alive, http_keep_alive_1, http_keep_alive_2;
+ int reused_connection;
if (!(*dt & HEAD_ONLY))
/* If we're doing a GET on the URL, as opposed to just a HEAD, we need to
again:
/* We need to come back here when the initial attempt to retrieve
without authorization header fails. */
+ keep_alive = 0;
+ http_keep_alive_1 = http_keep_alive_2 = 0;
+ reused_connection = 0;
/* Initialize certain elements of struct http_stat. */
hs->len = 0L;
ou = u;
/* First: establish the connection. */
- logprintf (LOG_VERBOSE, _("Connecting to %s:%hu... "), u->host, u->port);
- err = make_connection (&sock, u->host, u->port);
- switch (err)
+ if (u->proxy || !persistent_available_p (u->host, u->port))
{
- case HOSTERR:
- logputs (LOG_VERBOSE, "\n");
- logprintf (LOG_NOTQUIET, "%s: %s.\n", u->host, herrmsg (h_errno));
- return HOSTERR;
- break;
- case CONSOCKERR:
- logputs (LOG_VERBOSE, "\n");
- logprintf (LOG_NOTQUIET, "socket: %s\n", strerror (errno));
- return CONSOCKERR;
- break;
- case CONREFUSED:
- logputs (LOG_VERBOSE, "\n");
- logprintf (LOG_NOTQUIET,
- _("Connection to %s:%hu refused.\n"), u->host, u->port);
- CLOSE (sock);
- return CONREFUSED;
- case CONERROR:
- logputs (LOG_VERBOSE, "\n");
- logprintf (LOG_NOTQUIET, "connect: %s\n", strerror (errno));
- CLOSE (sock);
- return CONERROR;
- break;
- case NOCONERROR:
- /* Everything is fine! */
- logputs (LOG_VERBOSE, _("connected!\n"));
- break;
- default:
- abort ();
- break;
- } /* switch */
+ logprintf (LOG_VERBOSE, _("Connecting to %s:%hu... "), u->host, u->port);
+ err = make_connection (&sock, u->host, u->port);
+ switch (err)
+ {
+ case HOSTERR:
+ logputs (LOG_VERBOSE, "\n");
+ logprintf (LOG_NOTQUIET, "%s: %s.\n", u->host, herrmsg (h_errno));
+ return HOSTERR;
+ break;
+ case CONSOCKERR:
+ logputs (LOG_VERBOSE, "\n");
+ logprintf (LOG_NOTQUIET, "socket: %s\n", strerror (errno));
+ return CONSOCKERR;
+ break;
+ case CONREFUSED:
+ logputs (LOG_VERBOSE, "\n");
+ logprintf (LOG_NOTQUIET,
+ _("Connection to %s:%hu refused.\n"), u->host, u->port);
+ CLOSE (sock);
+ return CONREFUSED;
+ case CONERROR:
+ logputs (LOG_VERBOSE, "\n");
+ logprintf (LOG_NOTQUIET, "connect: %s\n", strerror (errno));
+ CLOSE (sock);
+ return CONERROR;
+ break;
+ case NOCONERROR:
+ /* Everything is fine! */
+ logputs (LOG_VERBOSE, _("connected!\n"));
+ break;
+ default:
+ abort ();
+ break;
+ }
+ }
+ else
+ {
+ logprintf (LOG_VERBOSE, _("Reusing connection to %s:%hu.\n"), u->host, u->port);
+ sock = pc_last_fd;
+ reused_connection = 1;
+ }
if (u->proxy)
path = u->proxy->url;
User-Agent: %s\r\n\
Host: %s%s\r\n\
Accept: %s\r\n\
+Connection: Keep-Alive\r\n\
%s%s%s%s%s%s\r\n",
command, path, useragent, remhost,
host_port ? host_port : "",
num_written = iwrite (sock, request, strlen (request));
if (num_written < 0)
{
- logputs (LOG_VERBOSE, _("Failed writing HTTP request.\n"));
- CLOSE (sock);
+ logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"),
+ strerror (errno));
+ CLOSE_INVALIDATE (sock);
return WRITEFAILED;
}
logprintf (LOG_VERBOSE, _("%s request sent, awaiting response... "),
FREE_MAYBE (type);
FREE_MAYBE (hs->newloc);
FREE_MAYBE (all_headers);
- CLOSE (sock);
+ CLOSE_INVALIDATE (sock);
return HEOF;
}
else if (status == HG_ERROR)
FREE_MAYBE (type);
FREE_MAYBE (hs->newloc);
FREE_MAYBE (all_headers);
- CLOSE (sock);
+ CLOSE_INVALIDATE (sock);
return HERR;
}
goto done_header;
}
}
+ /* Check for the `Keep-Alive' header. */
+ if (!http_keep_alive_1)
+ {
+ if (header_process (hdr, "Keep-Alive", header_exists,
+ &http_keep_alive_1))
+ goto done_header;
+ }
+ /* Check for `Connection: Keep-Alive'. */
+ if (!http_keep_alive_2)
+ {
+ if (header_process (hdr, "Connection", http_process_connection,
+ &http_keep_alive_2))
+ goto done_header;
+ }
done_header:
free (hdr);
}
logputs (LOG_VERBOSE, "\n");
+ if (contlen != -1
+ && (http_keep_alive_1 || http_keep_alive_2))
+ keep_alive = 1;
+ if (keep_alive && !reused_connection)
+ register_persistent (u->host, u->port, sock);
+
if ((statcode == HTTP_STATUS_UNAUTHORIZED)
&& authenticate_h)
{
FREE_MAYBE (type);
type = NULL;
FREEHSTAT (*hs);
- CLOSE (sock);
+ CLOSE_FINISH (sock);
if (auth_tried_already)
{
/* If we have tried it already, then there is not point
FREE_MAYBE (type);
FREE_MAYBE (hs->newloc);
FREE_MAYBE (all_headers);
- CLOSE (sock);
+ CLOSE_INVALIDATE (sock);
return RANGEERR;
}
_("Location: %s%s\n"),
hs->newloc ? hs->newloc : _("unspecified"),
hs->newloc ? _(" [following]") : "");
- CLOSE (sock);
+ CLOSE_FINISH (sock);
FREE_MAYBE (type);
FREE_MAYBE (all_headers);
return NEWLOCATION;
hs->res = 0;
FREE_MAYBE (type);
FREE_MAYBE (all_headers);
- CLOSE (sock);
+ CLOSE_FINISH (sock);
return RETRFINISHED;
}
if (!fp)
{
logprintf (LOG_NOTQUIET, "%s: %s\n", u->local, strerror (errno));
- CLOSE (sock);
+ CLOSE_FINISH (sock);
FREE_MAYBE (all_headers);
return FOPENERR;
}
/* Get the contents of the document. */
hs->res = get_contents (sock, fp, &hs->len, hs->restval,
(contlen != -1 ? contlen : 0),
- &rbuf);
+ &rbuf, keep_alive);
hs->dltime = elapsed_time ();
{
/* Close or flush the file. We have to be careful to check for
hs->res = -2;
}
FREE_MAYBE (all_headers);
- CLOSE (sock);
+ CLOSE_FINISH (sock);
if (hs->res == -2)
return FWRITEERR;
return RETRFINISHED;
textdomain ("wget");
#endif /* HAVE_NLS */
}
+
+/* It's kosher to declare these here because their interface _has_ to
+   be void foo(void).  (They are presumably defined in url.c and
+   host.c, respectively -- verify.)  */
+void url_init PARAMS ((void));
+void host_init PARAMS ((void));
+
+/* This just calls the various initialization functions from the
+   modules that need one-time initialization.  It is invoked once at
+   startup, right after i18n_initialize.  */
+static void
+private_initialize (void)
+{
+  url_init ();
+  host_init ();
+}
\f
/* Print the usage message. */
static void
};
i18n_initialize ();
+ private_initialize ();
append_to_log = 0;
#include "ftp.h"
#include "fnmatch.h"
#include "host.h"
+#include "hash.h"
extern char *version_string;
#define ROBOTS_FILENAME "robots.txt"
-/* #### Many of these lists should really be hashtables! */
-
-/* List of downloaded URLs. */
-static urlpos *urls_downloaded;
+static struct hash_table *dl_file_url_map;
+static struct hash_table *dl_url_file_map;
/* List of HTML URLs. */
static slist *urls_html;
/* List of undesirable-to-load URLs. */
-static slist *ulist;
+static struct hash_table *undesirable_urls;
/* List of forbidden locations. */
static char **forbidden = NULL;
void
recursive_cleanup (void)
{
- free_slist (ulist);
- ulist = NULL;
+ if (undesirable_urls)
+ {
+ string_set_free (undesirable_urls);
+ undesirable_urls = NULL;
+ }
+ if (dl_file_url_map)
+ {
+ free_keys_and_values (dl_file_url_map);
+ hash_table_destroy (dl_file_url_map);
+ dl_file_url_map = NULL;
+ }
+ if (dl_url_file_map)
+ {
+ free_keys_and_values (dl_url_file_map);
+ hash_table_destroy (dl_url_file_map);
+ dl_url_file_map = NULL;
+ }
+ undesirable_urls = NULL;
free_vec (forbidden);
forbidden = NULL;
- free_slist (urls_html);
+ slist_free (urls_html);
urls_html = NULL;
- free_urlpos (urls_downloaded);
- urls_downloaded = NULL;
FREE_MAYBE (base_dir);
FREE_MAYBE (robots_host);
first_time = 1;
char *constr, *filename, *newloc;
char *canon_this_url = NULL;
int dt, inl, dash_p_leaf_HTML = FALSE;
+ int meta_disallow_follow;
int this_url_ftp; /* See below the explanation */
uerr_t err;
struct urlinfo *rurl;
/* Cache the current URL in the list. */
if (first_time)
{
- ulist = add_slist (ulist, this_url, 0);
- urls_downloaded = NULL;
+ /* These three operations need to be done only once per Wget
+ run. They should probably be at a different location. */
+ if (!undesirable_urls)
+ undesirable_urls = make_string_hash_table (0);
+ if (!dl_file_url_map)
+ dl_file_url_map = make_string_hash_table (0);
+ if (!dl_url_file_map)
+ dl_url_file_map = make_string_hash_table (0);
+
+ hash_table_clear (undesirable_urls);
+ string_set_add (undesirable_urls, this_url);
+ hash_table_clear (dl_file_url_map);
+ hash_table_clear (dl_url_file_map);
urls_html = NULL;
- /* Enter this_url to the slist, in original and "enhanced" form. */
+ /* Enter this_url to the hash table, in original and "enhanced" form. */
u = newurl ();
err = parseurl (this_url, u, 0);
if (err == URLOK)
{
- ulist = add_slist (ulist, u->url, 0);
- urls_downloaded = add_url (urls_downloaded, u->url, file);
- urls_html = add_slist (urls_html, file, NOSORT);
+ string_set_add (undesirable_urls, u->url);
+ hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (u->url));
+ hash_table_put (dl_url_file_map, xstrdup (u->url), xstrdup (file));
+ urls_html = slist_append (urls_html, file);
if (opt.no_parent)
base_dir = xstrdup (u->dir); /* Set the base dir. */
/* Set the canonical this_url to be sent as referer. This
/* Get the URL-s from an HTML file: */
url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
- 0, dash_p_leaf_HTML);
+ dash_p_leaf_HTML, &meta_disallow_follow);
+
+ if (opt.use_robots && meta_disallow_follow)
+ {
+ /* The META tag says we are not to follow this file. Respect
+ that. */
+ free_urlpos (url_list);
+ url_list = NULL;
+ }
/* Decide what to do with each of the URLs. A URL will be loaded if
it meets several requirements, discussed later. */
the list. */
/* inl is set if the URL we are working on (constr) is stored in
- ulist. Using it is crucial to avoid the incessant calls to
- in_slist, which is quite slow. */
- inl = in_slist (ulist, constr);
+ undesirable_urls. Using it is crucial to avoid unnecessary
+ repeated continuous hits to the hash table. */
+ inl = string_set_exists (undesirable_urls, constr);
/* If it is FTP, and FTP is not followed, chuck it out. */
if (!inl)
if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
{
DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
- ulist = add_slist (ulist, constr, 0);
+ string_set_add (undesirable_urls, constr);
inl = 1;
}
/* If it is absolute link and they are not followed, chuck it
if (opt.relative_only && !(cur_url->flags & URELATIVE))
{
DEBUGP (("It doesn't really look like a relative link.\n"));
- ulist = add_slist (ulist, constr, 0);
+ string_set_add (undesirable_urls, constr);
inl = 1;
}
/* If its domain is not to be accepted/looked-up, chuck it out. */
if (!accept_domain (u))
{
DEBUGP (("I don't like the smell of that domain.\n"));
- ulist = add_slist (ulist, constr, 0);
+ string_set_add (undesirable_urls, constr);
inl = 1;
}
/* Check for parent directory. */
{
/* Failing that too, kill the URL. */
DEBUGP (("Trying to escape parental guidance with no_parent on.\n"));
- ulist = add_slist (ulist, constr, 0);
+ string_set_add (undesirable_urls, constr);
inl = 1;
}
freeurl (ut, 1);
if (!accdir (u->dir, ALLABS))
{
DEBUGP (("%s (%s) is excluded/not-included.\n", constr, u->dir));
- ulist = add_slist (ulist, constr, 0);
+ string_set_add (undesirable_urls, constr);
inl = 1;
}
}
{
DEBUGP (("%s (%s) does not match acc/rej rules.\n",
constr, u->file));
- ulist = add_slist (ulist, constr, 0);
+ string_set_add (undesirable_urls, constr);
inl = 1;
}
}
}
free (constr);
constr = xstrdup (u->url);
- inl = in_slist (ulist, constr);
+ string_set_add (undesirable_urls, constr);
if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
if (!opt.spanhost && this_url && !same_host (this_url, constr))
{
DEBUGP (("This is not the same hostname as the parent's.\n"));
- ulist = add_slist (ulist, constr, 0);
+ string_set_add (undesirable_urls, constr);
inl = 1;
}
}
{
DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
ROBOTS_FILENAME));
- ulist = add_slist (ulist, constr, 0);
+ string_set_add (undesirable_urls, constr);
inl = 1;
}
}
{
DEBUGP (("I've decided to load it -> "));
/* Add it to the list of already-loaded URL-s. */
- ulist = add_slist (ulist, constr, 0);
+ string_set_add (undesirable_urls, constr);
/* Automatically followed FTPs will *not* be downloaded
recursively. */
if (u->proto == URLFTP)
{
if (dt & RETROKF)
{
- urls_downloaded = add_url (urls_downloaded, constr, filename);
+ hash_table_put (dl_file_url_map,
+ xstrdup (filename), xstrdup (constr));
+ hash_table_put (dl_url_file_map,
+ xstrdup (constr), xstrdup (filename));
/* If the URL is HTML, note it. */
if (dt & TEXTHTML)
- urls_html = add_slist (urls_html, filename, NOSORT);
+ urls_html = slist_append (urls_html, filename);
}
}
/* If there was no error, and the type is text/html, parse
/* Increment the pbuf for the appropriate size. */
}
if (opt.convert_links && !opt.delete_after)
+ /* This is merely the first pass: the links that have been
+ successfully downloaded are converted. In the second pass,
+ convert_all_links() will also convert those links that have NOT
+ been downloaded to their canonical form. */
convert_links (file, url_list);
/* Free the linked list of URL-s. */
free_urlpos (url_list);
convert_all_links (void)
{
uerr_t res;
- urlpos *l1, *l2, *urls;
+ urlpos *l1, *urls;
struct urlinfo *u;
slist *html;
- urlpos *urlhtml;
for (html = urls_html; html; html = html->next)
{
+ int meta_disallow_follow;
+ char *url;
+
DEBUGP (("Rescanning %s\n", html->string));
/* Determine the URL of the HTML file. get_urls_html will need
it. */
- for (urlhtml = urls_downloaded; urlhtml; urlhtml = urlhtml->next)
- if (!strcmp (urlhtml->local_name, html->string))
- break;
- if (urlhtml)
- DEBUGP (("It should correspond to %s.\n", urlhtml->url));
+ url = hash_table_get (dl_file_url_map, html->string);
+ if (url)
+ DEBUGP (("It should correspond to %s.\n", url));
else
DEBUGP (("I cannot find the corresponding URL.\n"));
/* Parse the HTML file... */
- urls = get_urls_html (html->string, urlhtml ? urlhtml->url : NULL, 1,
- FALSE);
+ urls = get_urls_html (html->string, url, FALSE, &meta_disallow_follow);
+ if (opt.use_robots && meta_disallow_follow)
+ {
+ /* The META tag says we are not to follow this file.
+ Respect that. */
+ free_urlpos (urls);
+ urls = NULL;
+ }
if (!urls)
continue;
for (l1 = urls; l1; l1 = l1->next)
{
+ char *local_name;
/* The URL must be in canonical form to be compared. */
u = newurl ();
res = parseurl (l1->url, u, 0);
}
/* We decide the direction of conversion according to whether
a URL was downloaded. Downloaded URLs will be converted
- ABS2REL, whereas non-downloaded will be converted REL2ABS.
- Note: not yet implemented; only ABS2REL works. */
- for (l2 = urls_downloaded; l2; l2 = l2->next)
- if (!strcmp (l2->url, u->url))
- {
- DEBUGP (("%s flagged for conversion, local %s\n",
- l2->url, l2->local_name));
- break;
- }
+ ABS2REL, whereas non-downloaded will be converted REL2ABS. */
+ local_name = hash_table_get (dl_url_file_map, u->url);
+ if (local_name)
+ DEBUGP (("%s flagged for conversion, local %s\n",
+ u->url, local_name));
/* Clear the flags. */
l1->flags &= ~ (UABS2REL | UREL2ABS);
/* Decide on the conversion direction. */
- if (l2)
+ if (local_name)
{
l1->flags |= UABS2REL;
- l1->local_name = xstrdup (l2->local_name);
+ l1->local_name = xstrdup (local_name);
}
else
{
#include "ftp.h"
#include "host.h"
#include "connect.h"
+#include "hash.h"
#ifdef WINDOWS
LARGE_INTEGER internal_time;
static int show_progress PARAMS ((long, long, enum spflags));
+#define MIN(i, j) ((i) <= (j) ? (i) : (j))
+
/* Reads the contents of file descriptor FD, until it is closed, or a
read error occurs. The data is read in 8K chunks, and stored to
stream fp, which should have been open for writing. If BUF is
from fd immediately, flush or discard the buffer. */
int
get_contents (int fd, FILE *fp, long *len, long restval, long expected,
- struct rbuf *rbuf)
+ struct rbuf *rbuf, int use_expected)
{
- int res;
+ int res = 0;
static char c[8192];
*len = restval;
*len += res;
}
}
- /* Read from fd while there is available data. */
- do
+ /* Read from fd while there is available data.
+
+ Normally, if expected is 0, it means that it is not known how
+ much data is expected. However, if use_expected is specified,
+ then expected being zero means exactly that. */
+ while (!use_expected || (*len < expected))
{
- res = iread (fd, c, sizeof (c));
+ int amount_to_read = (use_expected
+ ? MIN (expected - *len, sizeof (c))
+ : sizeof (c));
+ res = iread (fd, c, amount_to_read);
if (res > 0)
{
if (fwrite (c, sizeof (char), res, fp) < res)
}
*len += res;
}
- } while (res > 0);
+ else
+ break;
+ }
if (res < -1)
res = -1;
if (opt.verbose)
int local_use_proxy;
char *mynewloc, *proxy;
struct urlinfo *u;
- slist *redirections;
+ struct hash_table *redirections = NULL;
/* If dt is NULL, just ignore it. */
if (!dt)
if (file)
*file = NULL;
- redirections = NULL;
-
u = newurl ();
/* Parse the URL. */
result = parseurl (url, u, 0);
{
logprintf (LOG_NOTQUIET, "%s: %s.\n", url, uerrmsg (result));
freeurl (u, 1);
- free_slist (redirections);
+ if (redirections)
+ string_set_free (redirections);
free (url);
return result;
}
{
logputs (LOG_NOTQUIET, _("Could not find proxy host.\n"));
freeurl (u, 1);
- free_slist (redirections);
+ if (redirections)
+ string_set_free (redirections);
free (url);
return PROXERR;
}
else
logprintf (LOG_NOTQUIET, _("Proxy %s: Must be HTTP.\n"), proxy);
freeurl (u, 1);
- free_slist (redirections);
+ if (redirections)
+ string_set_free (redirections);
free (url);
return PROXERR;
}
logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc, uerrmsg (newloc_result));
freeurl (newloc_struct, 1);
freeurl (u, 1);
- free_slist (redirections);
+ if (redirections)
+ string_set_free (redirections);
free (url);
free (mynewloc);
return result;
free (mynewloc);
mynewloc = xstrdup (newloc_struct->url);
- /* Check for redirection to back to itself. */
- if (!strcmp (u->url, newloc_struct->url))
+ if (!redirections)
{
- logprintf (LOG_NOTQUIET, _("%s: Redirection to itself.\n"),
- mynewloc);
- freeurl (newloc_struct, 1);
- freeurl (u, 1);
- free_slist (redirections);
- free (url);
- free (mynewloc);
- return WRONGCODE;
+ redirections = make_string_hash_table (0);
+ /* Add current URL immediately so we can detect it as soon
+ as possible in case of a cycle. */
+ string_set_add (redirections, u->url);
}
/* The new location is OK. Let's check for redirection cycle by
peeking through the history of redirections. */
- if (in_slist (redirections, newloc_struct->url))
+ if (string_set_exists (redirections, newloc_struct->url))
{
logprintf (LOG_NOTQUIET, _("%s: Redirection cycle detected.\n"),
mynewloc);
freeurl (newloc_struct, 1);
freeurl (u, 1);
- free_slist (redirections);
+ if (redirections)
+ string_set_free (redirections);
free (url);
free (mynewloc);
return WRONGCODE;
}
-
- redirections = add_slist (redirections, newloc_struct->url, NOSORT);
+ string_set_add (redirections, newloc_struct->url);
free (url);
url = mynewloc;
*file = NULL;
}
freeurl (u, 1);
- free_slist (redirections);
+ if (redirections)
+ string_set_free (redirections);
if (newloc)
*newloc = url;
uerr_t status;
urlpos *url_list, *cur_url;
- /* If spider-mode is on, we do not want get_urls_html barfing
- errors on baseless links. */
- url_list = (html ? get_urls_html (file, NULL, opt.spider, FALSE)
+ url_list = (html ? get_urls_html (file, NULL, FALSE, NULL)
: get_urls_file (file));
status = RETROK; /* Suppose everything is OK. */
*count = 0; /* Reset the URL count. */
#include "rbuf.h"
-int get_contents PARAMS ((int, FILE *, long *, long, long, struct rbuf *));
+int get_contents PARAMS ((int, FILE *, long *, long, long, struct rbuf *, int));
uerr_t retrieve_url PARAMS ((const char *, char **, char **,
const char *, int *));
#include "utils.h"
#include "url.h"
#include "host.h"
-#include "html.h"
#ifndef errno
extern int errno;
#define DEFAULT_HTTP_PORT 80
#define DEFAULT_FTP_PORT 21
-/* URL separator (for findurl) */
-#define URL_SEPARATOR "!\"#'(),>`{}|<>"
+/* Table of unsafe characters.  This is initialized in
+   init_unsafe_char_table. */
-/* A list of unsafe characters for encoding, as per RFC1738. '@' and
- ':' (not listed in RFC) were added because of user/password
- encoding. */
+static char unsafe_char_table[256];
-#ifndef WINDOWS
-# define URL_UNSAFE_CHARS "<>\"#%{}|\\^~[]`@:"
-#else /* WINDOWS */
-# define URL_UNSAFE_CHARS "<>\"%{}|\\^[]`"
-#endif /* WINDOWS */
-
-#define UNSAFE_CHAR(c) ( ((unsigned char)(c) <= ' ') /* ASCII 32 */ \
- || ((unsigned char)(c) > '~') /* ASCII 127 */ \
- || strchr (URL_UNSAFE_CHARS, c))
+#define UNSAFE_CHAR(c) (unsafe_char_table[(unsigned char)(c)])
/* If S contains unsafe characters, free it and replace it with a
version that doesn't. */
return 0;
}
+/* Initialize unsafe_char_table.  Unsafe chars are:
+   - anything <= 32;
+   - stuff from rfc1738 ("<>\"#%{}|\\^~[]`");
+   - @ and :, for user/password encoding;
+   - everything over 127 (all of 128-255 is recorded in the table).  */
+void
+init_unsafe_char_table (void)
+{
+  int i;
+  for (i = 0; i < 256; i++)
+    if (i < 32 || i >= 127
+        || i == '<'
+        || i == '>'
+        || i == '\"'
+        || i == '#'
+        || i == '%'
+        || i == '{'
+        || i == '}'
+        || i == '|'
+        || i == '\\'
+        || i == '^'
+        || i == '~'
+        || i == '['
+        || i == ']'
+        || i == '`'
+        /* `@' and `:' were part of the URL_UNSAFE_CHARS macro this
+           table replaces, but had been dropped in the rewrite even
+           though the comment above still lists them.  */
+        || i == '@'
+        || i == ':')
+      unsafe_char_table[i] = 1;
+}
+
/* Returns 1 if the string contains unsafe characters, 0 otherwise. */
int
contains_unsafe (const char *s)
/* Returns 1 if the URL begins with a protocol (supported or
unsupported), 0 otherwise. */
-static int
+int
has_proto (const char *url)
{
char **s;
return res;
}
\f
-/* Find URL of format scheme:hostname[:port]/dir in a buffer. The
- buffer may contain pretty much anything; no errors are signaled. */
-static const char *
-findurl (const char *buf, int howmuch, int *count)
-{
- char **prot;
- const char *s1, *s2;
-
- for (s1 = buf; howmuch; s1++, howmuch--)
- for (prot = protostrings; *prot; prot++)
- if (howmuch <= strlen (*prot))
- continue;
- else if (!strncasecmp (*prot, s1, strlen (*prot)))
- {
- for (s2 = s1, *count = 0;
- howmuch && *s2 && *s2 >= 32 && *s2 < 127 && !ISSPACE (*s2) &&
- !strchr (URL_SEPARATOR, *s2);
- s2++, (*count)++, howmuch--);
- return s1;
- }
- return NULL;
-}
-
-/* Scans the file for signs of URL-s. Returns a vector of pointers,
- each pointer representing a URL string. The file is *not* assumed
- to be HTML. */
urlpos *
get_urls_file (const char *file)
{
- long nread;
- FILE *fp;
- char *buf;
- const char *pbuf;
- int size;
- urlpos *first, *current, *old;
-
- if (file && !HYPHENP (file))
- {
- fp = fopen (file, "rb");
- if (!fp)
- {
- logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
- return NULL;
- }
- }
- else
- fp = stdin;
- /* Load the file. */
- load_file (fp, &buf, &nread);
- if (file && !HYPHENP (file))
- fclose (fp);
- DEBUGP (("Loaded %s (size %ld).\n", file, nread));
- first = current = NULL;
- /* Fill the linked list with URLs. */
- for (pbuf = buf; (pbuf = findurl (pbuf, nread - (pbuf - buf), &size));
- pbuf += size)
- {
- /* Allocate the space. */
- old = current;
- current = (urlpos *)xmalloc (sizeof (urlpos));
- if (old)
- old->next = current;
- memset (current, 0, sizeof (*current));
- current->next = NULL;
- current->url = (char *)xmalloc (size + 1);
- memcpy (current->url, pbuf, size);
- current->url[size] = '\0';
- if (!first)
- first = current;
- }
- /* Free the buffer. */
- free (buf);
-
- return first;
-}
-
-/* Similar to get_urls_file, but for HTML files. FILE is scanned as
- an HTML document using htmlfindurl(), which see. get_urls_html()
- constructs the HTML-s from the relative href-s.
+ struct file_memory *fm;
+ urlpos *head, *tail;
+ const char *text, *text_end;
- If SILENT is non-zero, do not barf on baseless relative links. */
-urlpos *
-get_urls_html (const char *file, const char *this_url, int silent,
- int dash_p_leaf_HTML)
-{
- long nread;
- FILE *fp;
- char *orig_buf;
- const char *buf;
- int step, first_time;
- urlpos *first, *current, *old;
-
- if (file && !HYPHENP (file))
+ /* Load the file. */
+ fm = read_file (file);
+ if (!fm)
{
- fp = fopen (file, "rb");
- if (!fp)
- {
- logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
- return NULL;
- }
+ logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
+ return NULL;
}
- else
- fp = stdin;
- /* Load the file. */
- load_file (fp, &orig_buf, &nread);
- if (file && !HYPHENP (file))
- fclose (fp);
- DEBUGP (("Loaded HTML file %s (size %ld).\n", file, nread));
- first = current = NULL;
- first_time = 1;
- /* Iterate over the URLs in BUF, picked by htmlfindurl(). */
- for (buf = orig_buf;
- (buf = htmlfindurl (buf, nread - (buf - orig_buf), &step, first_time,
- dash_p_leaf_HTML));
- buf += step)
+ DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
+ head = tail = NULL;
+ text = fm->content;
+ text_end = fm->content + fm->length;
+ while (text < text_end)
{
- int i, no_proto;
- int size = step;
- const char *pbuf = buf;
- char *constr, *base;
- const char *cbase;
- char *needs_freeing, *url_data;
-
- first_time = 0;
-
- /* A frequent phenomenon that needs to be handled are pages
- generated by brain-damaged HTML generators, which refer to to
- URI-s as <a href="<spaces>URI<spaces>">. We simply ignore
- any spaces at the beginning or at the end of the string.
- This is probably not strictly correct, but that's what the
- browsers do, so we may follow. May the authors of "WYSIWYG"
- HTML tools burn in hell for the damage they've inflicted! */
- while ((pbuf < buf + step) && ISSPACE (*pbuf))
- {
- ++pbuf;
- --size;
- }
- while (size && ISSPACE (pbuf[size - 1]))
- --size;
- if (!size)
- break;
-
- /* It would be nice if we could avoid allocating memory in this
- loop, but I don't see an easy way. To process the entities,
- we need to either copy the data, or change it destructively.
- I choose the former.
-
- We have two pointers: needs_freeing and url_data, because the
- code below does thing like url_data += <something>, and we
- want to pass the original string to free(). */
- needs_freeing = url_data = html_decode_entities (pbuf, pbuf + size);
- size = strlen (url_data);
-
- for (i = 0; protostrings[i]; i++)
- {
- if (!strncasecmp (protostrings[i], url_data,
- MINVAL (strlen (protostrings[i]), size)))
- break;
- }
- /* Check for http:RELATIVE_URI. See below for details. */
- if (protostrings[i]
- && !(strncasecmp (url_data, "http:", 5) == 0
- && strncasecmp (url_data, "http://", 7) != 0))
- {
- no_proto = 0;
- }
+ const char *line_beg = text;
+ const char *line_end = memchr (text, '\n', text_end - text);
+ if (!line_end)
+ line_end = text_end;
else
+ ++line_end;
+ text = line_end;
+ while (line_beg < line_end
+ && ISSPACE (*line_beg))
+ ++line_beg;
+ while (line_end > line_beg + 1
+ && ISSPACE (*(line_end - 1)))
+ --line_end;
+ if (line_end > line_beg)
{
- no_proto = 1;
- /* This is for extremely brain-damaged pages that refer to
- relative URI-s as <a href="http:URL">. Just strip off the
- silly leading "http:" (as well as any leading blanks
- before it). */
- if ((size > 5) && !strncasecmp ("http:", url_data, 5))
- url_data += 5, size -= 5;
- }
- if (!no_proto)
- {
- for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
- {
- if (!strncasecmp (sup_protos[i].name, url_data,
- MINVAL (strlen (sup_protos[i].name), size)))
- break;
- }
- /* Do *not* accept a non-supported protocol. */
- if (i == ARRAY_SIZE (sup_protos))
- {
- free (needs_freeing);
- continue;
- }
- }
- if (no_proto)
- {
- /* First, construct the base, which can be relative itself.
-
- Criteria for creating the base are:
- 1) html_base created by <base href="...">
- 2) current URL
- 3) base provided from the command line */
- cbase = html_base ();
- if (!cbase)
- cbase = this_url;
- if (!cbase)
- cbase = opt.base_href;
- if (!cbase) /* Error condition -- a baseless
- relative link. */
- {
- if (!opt.quiet && !silent)
- {
- /* Use malloc, not alloca because this is called in
- a loop. */
- char *temp = (char *)malloc (size + 1);
- strncpy (temp, url_data, size);
- temp[size] = '\0';
- logprintf (LOG_NOTQUIET,
- _("Error (%s): Link %s without a base provided.\n"),
- file, temp);
- free (temp);
- }
- free (needs_freeing);
- continue;
- }
- if (this_url)
- base = construct (this_url, cbase, strlen (cbase),
- !has_proto (cbase));
+ urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
+ memset (entry, 0, sizeof (*entry));
+ entry->next = NULL;
+ entry->url = strdupdelim (line_beg, line_end);
+ if (!head)
+ head = entry;
else
- {
- /* Base must now be absolute, with host name and
- protocol. */
- if (!has_proto (cbase))
- {
- logprintf (LOG_NOTQUIET, _("\
-Error (%s): Base %s relative, without referer URL.\n"),
- file, cbase);
- free (needs_freeing);
- continue;
- }
- base = xstrdup (cbase);
- }
- constr = construct (base, url_data, size, no_proto);
- free (base);
- }
- else /* has proto */
- {
- constr = (char *)xmalloc (size + 1);
- strncpy (constr, url_data, size);
- constr[size] = '\0';
+ tail->next = entry;
+ tail = entry;
}
-#ifdef DEBUG
- if (opt.debug)
- {
- char *tmp;
- const char *tmp2;
-
- tmp2 = html_base ();
- /* Use malloc, not alloca because this is called in a loop. */
- tmp = (char *)xmalloc (size + 1);
- strncpy (tmp, url_data, size);
- tmp[size] = '\0';
- logprintf (LOG_ALWAYS,
- "file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
- file, this_url ? this_url : "(null)",
- tmp2 ? tmp2 : "(null)", tmp, constr);
- free (tmp);
- }
-#endif
-
- /* Allocate the space. */
- old = current;
- current = (urlpos *)xmalloc (sizeof (urlpos));
- if (old)
- old->next = current;
- if (!first)
- first = current;
- /* Fill the values. */
- memset (current, 0, sizeof (*current));
- current->next = NULL;
- current->url = constr;
- current->size = step;
- current->pos = buf - orig_buf;
- /* A URL is relative if the host and protocol are not named,
- and the name does not start with `/'. */
- if (no_proto && *url_data != '/')
- current->flags |= (URELATIVE | UNOPROTO);
- else if (no_proto)
- current->flags |= UNOPROTO;
- free (needs_freeing);
}
- free (orig_buf);
-
- return first;
+ read_file_free (fm);
+ return head;
}
\f
/* Free the linked list of urlpos. */
return !sufmatch (no_proxy, host);
}
\f
+static void write_backup_file PARAMS ((const char *, downloaded_file_t));
+
/* Change the links in an HTML document. Accepts a structure that
defines the positions of all the links. */
void
convert_links (const char *file, urlpos *l)
{
+ struct file_memory *fm;
FILE *fp;
- char *buf, *p, *p2;
+ char *p;
downloaded_file_t downloaded_file_return;
- long size;
+
+ {
+ /* First we do a "dry run": go through the list L and see whether
+ any URL needs to be converted in the first place. If not, just
+ leave the file alone. */
+ int count = 0;
+ urlpos *dry = l;
+ for (dry = l; dry; dry = dry->next)
+ if (dry->flags & (UABS2REL | UREL2ABS))
+ ++count;
+ if (!count)
+ {
+ logprintf (LOG_VERBOSE, _("Nothing to do while converting %s.\n"),
+ file);
+ return;
+ }
+ }
logprintf (LOG_VERBOSE, _("Converting %s... "), file);
- /* Read from the file.... */
- fp = fopen (file, "rb");
- if (!fp)
+
+ fm = read_file (file);
+ if (!fm)
{
logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
file, strerror (errno));
return;
}
- /* ...to a buffer. */
- load_file (fp, &buf, &size);
- fclose (fp);
-
- downloaded_file_return = downloaded_file(CHECK_FOR_FILE, file);
+ downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
if (opt.backup_converted && downloaded_file_return)
- /* Rather than just writing over the original .html file with the converted
- version, save the former to *.orig. Note we only do this for files we've
- _successfully_ downloaded, so we don't clobber .orig files sitting around
- from previous invocations. */
- {
- /* Construct the backup filename as the original name plus ".orig". */
- size_t filename_len = strlen(file);
- char* filename_plus_orig_suffix;
- boolean already_wrote_backup_file = FALSE;
- slist* converted_file_ptr;
- static slist* converted_files = NULL;
-
- if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
- {
- /* Just write "orig" over "html". We need to do it this way because
- when we're checking to see if we've downloaded the file before (to
- see if we can skip downloading it), we don't know if it's a
- text/html file. Therefore we don't know yet at that stage that -E
- is going to cause us to tack on ".html", so we need to compare
- vs. the original URL plus ".orig", not the original URL plus
- ".html.orig". */
- filename_plus_orig_suffix = xmalloc(filename_len + 1);
- strcpy(filename_plus_orig_suffix, file);
- strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
- }
- else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
- {
- /* Append ".orig" to the name. */
- filename_plus_orig_suffix = xmalloc(filename_len + sizeof(".orig"));
- strcpy(filename_plus_orig_suffix, file);
- strcpy(filename_plus_orig_suffix + filename_len, ".orig");
- }
-
- /* We can get called twice on the same URL thanks to the
- convert_all_links() call in main(). If we write the .orig file each
- time in such a case, it'll end up containing the first-pass conversion,
- not the original file. So, see if we've already been called on this
- file. */
- converted_file_ptr = converted_files;
- while (converted_file_ptr != NULL)
- if (strcmp(converted_file_ptr->string, file) == 0)
- {
- already_wrote_backup_file = TRUE;
- break;
- }
- else
- converted_file_ptr = converted_file_ptr->next;
-
- if (!already_wrote_backup_file)
- {
- /* Rename <file> to <file>.orig before former gets written over. */
- if (rename(file, filename_plus_orig_suffix) != 0)
- logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
- file, filename_plus_orig_suffix, strerror (errno));
-
- /* Remember that we've already written a .orig backup for this file.
- Note that we never free this memory since we need it till the
- convert_all_links() call, which is one of the last things the
- program does before terminating. BTW, I'm not sure if it would be
- safe to just set 'converted_file_ptr->string' to 'file' below,
- rather than making a copy of the string... Another note is that I
- thought I could just add a field to the urlpos structure saying
- that we'd written a .orig file for this URL, but that didn't work,
- so I had to make this separate list. */
- converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
- converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
- converted_file_ptr->next = converted_files;
- converted_files = converted_file_ptr;
- }
+ write_backup_file (file, downloaded_file_return);
- free(filename_plus_orig_suffix);
+ /* Before opening the file for writing, unlink the file. This is
+ important if the data in FM is mmaped. In such case, nulling the
+ file, which is what fopen() below does, would make us read all
+ zeroes from the mmaped region. */
+ if (unlink (file) < 0 && errno != ENOENT)
+ {
+ logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
+ file, strerror (errno));
+ read_file_free (fm);
+ return;
}
/* Now open the file for writing. */
fp = fopen (file, "wb");
{
logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
file, strerror (errno));
- free (buf);
+ read_file_free (fm);
return;
}
- /* Presumably we have to loop through multiple URLs here (even though we're
- only talking about a single local file) because of the -O option. */
- for (p = buf; l; l = l->next)
+ /* Here we loop through all the URLs in file, replacing those of
+ them that are downloaded with relative references. */
+ p = fm->content;
+ for (; l; l = l->next)
{
- if (l->pos >= size)
+ char *url_start = fm->content + l->pos;
+ if (l->pos >= fm->length)
{
DEBUGP (("Something strange is going on. Please investigate."));
break;
}
- /* If the URL already is relative or it is not to be converted
- for some other reason (e.g. because of not having been
- downloaded in the first place), skip it. */
- if ((l->flags & URELATIVE) || !(l->flags & UABS2REL))
+ /* If the URL is not to be converted, skip it. */
+ if (!(l->flags & (UABS2REL | UREL2ABS)))
{
DEBUGP (("Skipping %s at position %d (flags %d).\n", l->url,
l->pos, l->flags));
continue;
}
- /* Else, reach the position of the offending URL, echoing
- everything up to it to the outfile. */
- for (p2 = buf + l->pos; p < p2; p++)
- putc (*p, fp);
+
+ /* Echo the file contents, up to the offending URL's opening
+ quote, to the outfile. */
+ fwrite (p, 1, url_start - p, fp);
+ p = url_start;
if (l->flags & UABS2REL)
- /* Convert absolute URL to relative. */
{
+ /* Convert absolute URL to relative. */
char *newname = construct_relative (file, l->local_name);
- fprintf (fp, "%s", newname);
+ putc (*p, fp); /* quoting char */
+ fputs (newname, fp);
+ p += l->size - 1;
+ putc (*p, fp); /* close quote */
+ ++p;
DEBUGP (("ABS2REL: %s to %s at position %d in %s.\n",
l->url, newname, l->pos, file));
free (newname);
}
- p += l->size;
+ else if (l->flags & UREL2ABS)
+ {
+ /* Convert the link to absolute URL. */
+ char *newlink = l->url;
+ putc (*p, fp); /* quoting char */
+ fputs (newlink, fp);
+ p += l->size - 1;
+ putc (*p, fp); /* close quote */
+ ++p;
+ DEBUGP (("REL2ABS: <something> to %s at position %d in %s.\n",
+ newlink, l->pos, file));
+ }
}
/* Output the rest of the file. */
- if (p - buf < size)
- {
- for (p2 = buf + size; p < p2; p++)
- putc (*p, fp);
- }
+ if (p - fm->content < fm->length)
+ fwrite (p, 1, fm->length - (p - fm->content), fp);
fclose (fp);
- free (buf);
+ read_file_free (fm);
logputs (LOG_VERBOSE, _("done.\n"));
}
return t;
}
+static void
+write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
+{
+ /* Rather than just writing over the original .html file with the
+ converted version, save the former to *.orig. Note we only do
+ this for files we've _successfully_ downloaded, so we don't
+ clobber .orig files sitting around from previous invocations. */
+
+ /* Construct the backup filename as the original name plus ".orig". */
+ size_t filename_len = strlen(file);
+ char* filename_plus_orig_suffix;
+ boolean already_wrote_backup_file = FALSE;
+ slist* converted_file_ptr;
+ static slist* converted_files = NULL;
+
+ if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
+ {
+ /* Just write "orig" over "html". We need to do it this way
+ because when we're checking to see if we've downloaded the
+ file before (to see if we can skip downloading it), we don't
+ know if it's a text/html file. Therefore we don't know yet
+ at that stage that -E is going to cause us to tack on
+ ".html", so we need to compare vs. the original URL plus
+ ".orig", not the original URL plus ".html.orig". */
+ filename_plus_orig_suffix = alloca (filename_len + 1);
+ strcpy(filename_plus_orig_suffix, file);
+ strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
+ }
+ else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
+ {
+ /* Append ".orig" to the name. */
+ filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
+ strcpy(filename_plus_orig_suffix, file);
+ strcpy(filename_plus_orig_suffix + filename_len, ".orig");
+ }
+
+ /* We can get called twice on the same URL thanks to the
+ convert_all_links() call in main(). If we write the .orig file
+ each time in such a case, it'll end up containing the first-pass
+ conversion, not the original file. So, see if we've already been
+ called on this file. */
+ converted_file_ptr = converted_files;
+ while (converted_file_ptr != NULL)
+ if (strcmp(converted_file_ptr->string, file) == 0)
+ {
+ already_wrote_backup_file = TRUE;
+ break;
+ }
+ else
+ converted_file_ptr = converted_file_ptr->next;
+
+ if (!already_wrote_backup_file)
+ {
+ /* Rename <file> to <file>.orig before former gets written over. */
+ if (rename(file, filename_plus_orig_suffix) != 0)
+ logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
+ file, filename_plus_orig_suffix, strerror (errno));
+
+ /* Remember that we've already written a .orig backup for this file.
+ Note that we never free this memory since we need it till the
+ convert_all_links() call, which is one of the last things the
+ program does before terminating. BTW, I'm not sure if it would be
+ safe to just set 'converted_file_ptr->string' to 'file' below,
+ rather than making a copy of the string... Another note is that I
+ thought I could just add a field to the urlpos structure saying
+ that we'd written a .orig file for this URL, but that didn't work,
+ so I had to make this separate list. */
+ converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
+ converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
+ converted_file_ptr->next = converted_files;
+ converted_files = converted_file_ptr;
+ }
+}
/* Remembers which files have been downloaded. In the standard case, should be
called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
return FILE_NOT_ALREADY_DOWNLOADED;
}
}
+\f
+/* Initialization of static stuff. */
+void
+url_init (void)
+{
+ init_unsafe_char_table ();
+}
void freeurl PARAMS ((struct urlinfo *, int));
uerr_t urlproto PARAMS ((const char *));
int skip_proto PARAMS ((const char *));
+int has_proto PARAMS ((const char *));
int skip_uname PARAMS ((const char *));
uerr_t parseurl PARAMS ((const char *, struct urlinfo *, int));
int url_equal PARAMS ((const char *, const char *));
urlpos *get_urls_file PARAMS ((const char *));
-urlpos *get_urls_html PARAMS ((const char *, const char *, int, int));
+urlpos *get_urls_html PARAMS ((const char *, const char *, int, int *));
void free_urlpos PARAMS ((urlpos *));
char *url_concat PARAMS ((const char *, const char *));
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif
+#ifdef HAVE_MMAP
+# include <sys/mman.h>
+#endif
#ifdef HAVE_PWD_H
# include <pwd.h>
#endif
#ifdef NeXT
# include <libc.h> /* for access() */
#endif
+#include <fcntl.h>
#include <assert.h>
#include "wget.h"
#include "utils.h"
#include "fnmatch.h"
+#include "hash.h"
#ifndef errno
extern int errno;
line = xrealloc (line, length + 1);
return line;
}
+\f
+/* Read FILE into memory. A pointer to `struct file_memory' is
+ returned; use struct element `content' to access file contents, and
+ the element `length' to know the file length. `content' is *not*
+ zero-terminated, and you should *not* read or write beyond the [0,
+ length) range of characters.
-/* Load file pointed to by FP to memory and return the malloc-ed
- buffer with the contents. *NREAD will contain the number of read
- bytes. The file is loaded in chunks, allocated exponentially,
- starting with FILE_BUFFER_SIZE bytes. */
-void
-load_file (FILE *fp, char **buf, long *nread)
-{
- long bufsize;
+ After you are done with the file contents, call read_file_free to
+ release the memory.
+
+ Depending on the operating system and the type of file that is
+ being read, read_file() either mmap's the file into memory, or
+ reads the file into the core using read().
- bufsize = 512;
- *nread = 0;
- *buf = NULL;
- while (!feof (fp) && !ferror (fp))
+ If file is named "-", fileno(stdin) is used for reading instead.
+ If you want to read from a real file named "-", use "./-" instead. */
+
+struct file_memory *
+read_file (const char *file)
+{
+ int fd;
+ struct file_memory *fm;
+ long size;
+ int inhibit_close = 0;
+
+ /* Some magic in the finest tradition of Perl and its kin: if FILE
+ is "-", just use stdin. */
+ if (HYPHENP (file))
{
- *buf = (char *)xrealloc (*buf, bufsize + *nread);
- *nread += fread (*buf + *nread, sizeof (char), bufsize, fp);
- bufsize <<= 1;
+ fd = fileno (stdin);
+ inhibit_close = 1;
+ /* Note that we don't inhibit mmap() in this case. If stdin is
+ redirected from a regular file, mmap() will still work. */
+ }
+ else
+ fd = open (file, O_RDONLY);
+ if (fd < 0)
+ return NULL;
+ fm = xmalloc (sizeof (struct file_memory));
+
+#ifdef HAVE_MMAP
+ {
+ struct stat buf;
+ if (fstat (fd, &buf) < 0)
+ goto mmap_lose;
+ fm->length = buf.st_size;
+ /* NOTE: As far as I know, the callers of this function never
+ modify the file text. Relying on this would enable us to
+ specify PROT_READ and MAP_SHARED for a marginal gain in
+ efficiency, but at some cost to generality. */
+ fm->content = mmap (NULL, fm->length, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE, fd, 0);
+ if (fm->content == MAP_FAILED)
+ goto mmap_lose;
+ if (!inhibit_close)
+ close (fd);
+
+ fm->mmap_p = 1;
+ return fm;
+ }
+
+ mmap_lose:
+ /* The most common reason why mmap() fails is that FD does not point
+ to a plain file. However, it's also possible that mmap() doesn't
+ work for a particular type of file. Therefore, whenever mmap()
+ fails, we just fall back to the regular method. */
+#endif /* HAVE_MMAP */
+
+ fm->length = 0;
+ size = 512; /* number of bytes fm->content can
+ hold at any given time. */
+ fm->content = xmalloc (size);
+ while (1)
+ {
+ long nread;
+ if (fm->length > size / 2)
+ {
+ /* #### I'm not sure whether the whole exponential-growth
+ thing makes sense with kernel read. On Linux at least,
+ read() refuses to read more than 4K from a file at a
+ single chunk anyway. But other Unixes might optimize it
+ better, and it doesn't *hurt* anything, so I'm leaving
+ it. */
+
+ /* Normally, we grow SIZE exponentially to make the number
+ of calls to read() and realloc() logarithmic in relation
+ to file size. However, read() can read an amount of data
+ smaller than requested, and it would be unreasonable to
+ double SIZE every time *something* was read. Therefore,
+ we double SIZE only when the length exceeds half of the
+ entire allocated size. */
+ size <<= 1;
+ fm->content = xrealloc (fm->content, size);
+ }
+ nread = read (fd, fm->content + fm->length, size - fm->length);
+ if (nread > 0)
+ /* Successful read. */
+ fm->length += nread;
+ else if (nread < 0)
+ /* Error. */
+ goto lose;
+ else
+ /* EOF */
+ break;
}
- /* #### No indication of encountered error?? */
+ if (!inhibit_close)
+ close (fd);
+ if (size > fm->length && fm->length != 0)
+ /* Due to exponential growth of fm->content, the allocated region
+ might be much larger than what is actually needed. */
+ fm->content = xrealloc (fm->content, fm->length);
+ fm->mmap_p = 0;
+ return fm;
+
+ lose:
+ if (!inhibit_close)
+ close (fd);
+ free (fm->content);
+ free (fm);
+ return NULL;
}
+/* Release the resources held by FM. Specifically, this calls
+ munmap() or free() on fm->content, depending on whether
+ malloc/read were used to read in the file. It also frees the
+ memory needed to hold the FM structure itself. */
+
+void
+read_file_free (struct file_memory *fm)
+{
+#ifdef HAVE_MMAP
+ if (fm->mmap_p)
+ {
+ munmap (fm->content, fm->length);
+ }
+ else
+#endif
+ {
+ free (fm->content);
+ }
+ free (fm);
+}
+\f
/* Free the pointers in a NULL-terminated vector of pointers, then
free the pointer itself. */
void
return v1;
}
-/* A set of simple-minded routines to store and search for strings in
- a linked list. You may add a string to the slist, and peek whether
- it's still in there at any time later. */
+/* A set of simple-minded routines to store strings in a linked list.
+ This used to also be used for searching, but now we have hash
+ tables for that. */
-/* Add an element to the list. If flags is NOSORT, the list will not
- be sorted. */
+/* Append an element to the list. */
slist *
-add_slist (slist *l, const char *s, int flags)
+slist_append (slist *l, const char *s)
{
- slist *t, *old, *beg;
- int cmp;
+ slist *newel = (slist *)xmalloc (sizeof (slist));
+ slist *beg = l;
- if (flags & NOSORT)
- {
- if (!l)
- {
- t = (slist *)xmalloc (sizeof (slist));
- t->string = xstrdup (s);
- t->next = NULL;
- return t;
- }
- beg = l;
- /* Find the last element. */
- while (l->next)
- l = l->next;
- t = (slist *)xmalloc (sizeof (slist));
- l->next = t;
- t->string = xstrdup (s);
- t->next = NULL;
- return beg;
- }
- /* Empty list or changing the first element. */
- if (!l || (cmp = strcmp (l->string, s)) > 0)
- {
- t = (slist *)xmalloc (sizeof (slist));
- t->string = xstrdup (s);
- t->next = l;
- return t;
- }
+ newel->string = xstrdup (s);
+ newel->next = NULL;
- beg = l;
- if (cmp == 0)
- return beg;
-
- /* Second two one-before-the-last element. */
+ if (!l)
+ return newel;
+ /* Find the last element. */
while (l->next)
- {
- old = l;
- l = l->next;
- cmp = strcmp (s, l->string);
- if (cmp == 0) /* no repeating in the list */
- return beg;
- else if (cmp > 0)
- continue;
- /* If the next list element is greater than s, put s between the
- current and the next list element. */
- t = (slist *)xmalloc (sizeof (slist));
- old->next = t;
- t->next = l;
- t->string = xstrdup (s);
- return beg;
- }
- t = (slist *)xmalloc (sizeof (slist));
- t->string = xstrdup (s);
- /* Insert the new element after the last element. */
- l->next = t;
- t->next = NULL;
+ l = l->next;
+ l->next = newel;
return beg;
}
/* Is there a specific entry in the list? */
int
-in_slist (slist *l, const char *s)
+slist_contains (slist *l, const char *s)
{
- int cmp;
-
- while (l)
- {
- cmp = strcmp (l->string, s);
- if (cmp == 0)
- return 1;
- else if (cmp > 0) /* the list is ordered! */
- return 0;
- l = l->next;
- }
+ for (; l; l = l->next)
+ if (!strcmp (l->string, s))
+ return 1;
return 0;
}
/* Free the whole slist. */
void
-free_slist (slist *l)
+slist_free (slist *l)
{
slist *n;
l = n;
}
}
+\f
+/* Sometimes it's useful to create "sets" of strings, i.e. special
+ hash tables where you want to store strings as keys and merely
+ query for their existence. Here is a set of utility routines that
+ makes that transparent. */
+
+void
+string_set_add (struct hash_table *ht, const char *s)
+{
+ /* We use "1" as value. It provides us a useful and clear arbitrary
+ value, and it consumes no memory -- the pointers to the same
+ string "1" will be shared by all the key-value pairs in the hash
+ table. */
+ hash_table_put (ht, xstrdup (s), "1");
+}
+
+int
+string_set_exists (struct hash_table *ht, const char *s)
+{
+ return hash_table_exists (ht, s);
+}
+
+static int
+string_set_free_mapper (void *key, void *value_ignored, void *arg_ignored)
+{
+ free (key);
+ return 0;
+}
+
+void
+string_set_free (struct hash_table *ht)
+{
+ hash_table_map (ht, string_set_free_mapper, NULL);
+ hash_table_destroy (ht);
+}
+
+static int
+free_keys_and_values_mapper (void *key, void *value, void *arg_ignored)
+{
+ free (key);
+ free (value);
+ return 0;
+}
+
+/* Another utility function: call free() on all keys and values of HT. */
+
+void
+free_keys_and_values (struct hash_table *ht)
+{
+ hash_table_map (ht, free_keys_and_values_mapper, NULL);
+}
+
\f
/* Engine for legible and legible_long_long; this function works on
strings. */
#ifndef UTILS_H
#define UTILS_H
-/* Flags for slist. */
-enum {
- NOSORT = 1
-};
-
enum accd {
ALLABS = 1
};
struct _slist *next;
} slist;
+struct hash_table;
+
+struct file_memory {
+ char *content;
+ long length;
+ int mmap_p;
+};
+
char *time_str PARAMS ((time_t *));
const char *uerrmsg PARAMS ((uerr_t));
char *suffix PARAMS ((const char *s));
char *read_whole_line PARAMS ((FILE *));
-void load_file PARAMS ((FILE *, char **, long *));
+struct file_memory *read_file PARAMS ((const char *));
+void read_file_free PARAMS ((struct file_memory *));
void free_vec PARAMS ((char **));
char **merge_vecs PARAMS ((char **, char **));
-slist *add_slist PARAMS ((slist *, const char *, int));
-int in_slist PARAMS ((slist *, const char *));
-void free_slist PARAMS ((slist *));
+slist *slist_append PARAMS ((slist *, const char *));
+int slist_contains PARAMS ((slist *, const char *));
+void slist_free PARAMS ((slist *));
+
+void string_set_add PARAMS ((struct hash_table *, const char *));
+int string_set_exists PARAMS ((struct hash_table *, const char *));
+void string_set_free PARAMS ((struct hash_table *));
+void free_keys_and_values PARAMS ((struct hash_table *));
char *legible PARAMS ((long));
char *legible_very_long PARAMS ((VERY_LONG_TYPE));
/* Print X if debugging is enabled; a no-op otherwise. */
#ifdef DEBUG
-# define DEBUGP(x) do { debug_logprintf x; } while (0)
+# define DEBUGP(x) do { if (opt.debug) { debug_logprintf x; } } while (0)
#else /* not DEBUG */
# define DEBUGP(x) DO_NOTHING
#endif /* not DEBUG */