From: hniksic Date: Sun, 19 Nov 2000 20:50:10 +0000 (-0800) Subject: [svn] A bunch of new features: X-Git-Tag: v1.13~2375 X-Git-Url: http://sjero.net/git/?p=wget;a=commitdiff_plain;h=b0b1c815c15e49c9172f59428810713097a65e37 [svn] A bunch of new features: - use mmap() to read whole files in core instead of allocating memory and read'ing it. - use a new, more general, HTML parser (html-parse.c) and interface to it from Wget (html-url.c). - respect (easy with the new HTML parser). - use hash tables instead of linked lists in places where the lists were used to facilitate mappings. - rewrite the code in host.c to be more readable and faster (hash tables instead of home-grown lists.) - make convert_links properly convert partial URLs to complete ones for those URLs that have *not* been downloaded. - use HTTP persistent connections where available. very simple-minded, caches the last connection to the server. Published in . --- diff --git a/ChangeLog b/ChangeLog index 46905616..e873628f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2000-11-10 Hrvoje Niksic + + * configure.in: Test for MMAP. + 2000-11-16 Hrvoje Niksic * windows/config.h.ms: snprintf and vsnprintf exist under Windows. diff --git a/TODO b/TODO index d8319039..c0fecefc 100644 --- a/TODO +++ b/TODO @@ -49,15 +49,6 @@ changes. * Make `-k' check for files that were downloaded in the past and convert links to them in newly-downloaded documents. -* -k should convert relative references to absolute if not downloaded. - -* -k should convert "hostless absolute" URLs, like . - However, Brian McMahon wants the old incorrect behavior to still - be available as an option, as he depends on it to allow mirrors of his site to - send CGI queries to his original site, but still get graphics off of the - mirror site. Perhaps this would be better dealt with by adding an option to - tell -k not to convert certain URL patterns? - * Add option to clobber existing file names (no `.N' suffixes). * Introduce a concept of "boolean" options. For instance, every @@ -85,9 +76,6 @@ changes. * Allow size limit to files (perhaps with an option to download oversize files up through the limit or not at all, to get more functionality than [u]limit. -* Recognize HTML comments correctly. Add more options for handling - bogus HTML found all over the 'net. - * Implement breadth-first retrieval. * Download to .in* when mirroring. diff --git a/configure b/configure index f7e130a2..de78c984 100755 --- a/configure +++ b/configure @@ -2040,15 +2040,281 @@ EOF fi +for ac_hdr in unistd.h +do +ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'` +echo $ac_n "checking for $ac_hdr""... $ac_c" 1>&6 +echo "configure:2048: checking for $ac_hdr" >&5 +if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +EOF +ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" +{ (eval echo configure:2058: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` +if test -z "$ac_err"; then + rm -rf conftest* + eval "ac_cv_header_$ac_safe=yes" +else + echo "$ac_err" >&5 + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_header_$ac_safe=no" +fi +rm -f conftest* +fi +if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_hdr=HAVE_`echo $ac_hdr | sed 'y%abcdefghijklmnopqrstuvwxyz./-%ABCDEFGHIJKLMNOPQRSTUVWXYZ___%'` + cat >> confdefs.h <&6 +fi +done + +for ac_func in getpagesize +do +echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 +echo "configure:2087: checking for $ac_func" >&5 +if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +/* Override any gcc2 internal prototype to avoid an error. */ +/* We use char because int might match the return type of a gcc2 + builtin and then its argument prototype would still apply. */ +char $ac_func(); + +int main() { + +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. */ +#if defined (__stub_$ac_func) || defined (__stub___$ac_func) +choke me +#else +$ac_func(); +#endif + +; return 0; } +EOF +if { (eval echo configure:2115: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_func_$ac_func=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_func_$ac_func=no" +fi +rm -f conftest* +fi + +if eval "test \"`echo '$ac_cv_func_'$ac_func`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_func=HAVE_`echo $ac_func | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'` + cat >> confdefs.h <&6 +fi +done + +echo $ac_n "checking for working mmap""... $ac_c" 1>&6 +echo "configure:2140: checking for working mmap" >&5 +if eval "test \"`echo '$''{'ac_cv_func_mmap_fixed_mapped'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test "$cross_compiling" = yes; then + ac_cv_func_mmap_fixed_mapped=no +else + cat > conftest.$ac_ext < +#include +#include + +/* This mess was copied from the GNU getpagesize.h. */ +#ifndef HAVE_GETPAGESIZE +# ifdef HAVE_UNISTD_H +# include +# endif + +/* Assume that all systems that can run configure have sys/param.h. */ +# ifndef HAVE_SYS_PARAM_H +# define HAVE_SYS_PARAM_H 1 +# endif + +# ifdef _SC_PAGESIZE +# define getpagesize() sysconf(_SC_PAGESIZE) +# else /* no _SC_PAGESIZE */ +# ifdef HAVE_SYS_PARAM_H +# include +# ifdef EXEC_PAGESIZE +# define getpagesize() EXEC_PAGESIZE +# else /* no EXEC_PAGESIZE */ +# ifdef NBPG +# define getpagesize() NBPG * CLSIZE +# ifndef CLSIZE +# define CLSIZE 1 +# endif /* no CLSIZE */ +# else /* no NBPG */ +# ifdef NBPC +# define getpagesize() NBPC +# else /* no NBPC */ +# ifdef PAGESIZE +# define getpagesize() PAGESIZE +# endif /* PAGESIZE */ +# endif /* no NBPC */ +# endif /* no NBPG */ +# endif /* no EXEC_PAGESIZE */ +# else /* no HAVE_SYS_PARAM_H */ +# define getpagesize() 8192 /* punt totally */ +# endif /* no HAVE_SYS_PARAM_H */ +# endif /* no _SC_PAGESIZE */ + +#endif /* no HAVE_GETPAGESIZE */ + +#ifdef __cplusplus +extern "C" { void *malloc(unsigned); } +#else +char *malloc(); +#endif + +int +main() +{ + char *data, *data2, *data3; + int i, pagesize; + int fd; + + pagesize = getpagesize(); + + /* + * First, make a file with some known garbage in it. + */ + data = malloc(pagesize); + if (!data) + exit(1); + for (i = 0; i < pagesize; ++i) + *(data + i) = rand(); + umask(0); + fd = creat("conftestmmap", 0600); + if (fd < 0) + exit(1); + if (write(fd, data, pagesize) != pagesize) + exit(1); + close(fd); + + /* + * Next, try to mmap the file at a fixed address which + * already has something else allocated at it. If we can, + * also make sure that we see the same garbage. + */ + fd = open("conftestmmap", O_RDWR); + if (fd < 0) + exit(1); + data2 = malloc(2 * pagesize); + if (!data2) + exit(1); + data2 += (pagesize - ((int) data2 & (pagesize - 1))) & (pagesize - 1); + if (data2 != mmap(data2, pagesize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_FIXED, fd, 0L)) + exit(1); + for (i = 0; i < pagesize; ++i) + if (*(data + i) != *(data2 + i)) + exit(1); + + /* + * Finally, make sure that changes to the mapped area + * do not percolate back to the file as seen by read(). + * (This is a bug on some variants of i386 svr4.0.) + */ + for (i = 0; i < pagesize; ++i) + *(data2 + i) = *(data2 + i) + 1; + data3 = malloc(pagesize); + if (!data3) + exit(1); + if (read(fd, data3, pagesize) != pagesize) + exit(1); + for (i = 0; i < pagesize; ++i) + if (*(data + i) != *(data3 + i)) + exit(1); + close(fd); + unlink("conftestmmap"); + exit(0); +} + +EOF +if { (eval echo configure:2288: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null +then + ac_cv_func_mmap_fixed_mapped=yes +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -fr conftest* + ac_cv_func_mmap_fixed_mapped=no +fi +rm -fr conftest* +fi + +fi + +echo "$ac_t""$ac_cv_func_mmap_fixed_mapped" 1>&6 +if test $ac_cv_func_mmap_fixed_mapped = yes; then + cat >> confdefs.h <<\EOF +#define HAVE_MMAP 1 +EOF + +fi + for ac_func in strdup strstr strcasecmp strncasecmp do echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 -echo "configure:2047: checking for $ac_func" >&5 +echo "configure:2313: checking for $ac_func" >&5 if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 else cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then +if { (eval echo configure:2341: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then rm -rf conftest* eval "ac_cv_func_$ac_func=yes" else @@ -2098,12 +2364,12 @@ done for ac_func in gettimeofday mktime strptime do echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 -echo "configure:2102: checking for $ac_func" >&5 +echo "configure:2368: checking for $ac_func" >&5 if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 else cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then +if { (eval echo configure:2396: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then rm -rf conftest* eval "ac_cv_func_$ac_func=yes" else @@ -2153,12 +2419,12 @@ done for ac_func in strerror snprintf vsnprintf select signal symlink access isatty do echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 -echo "configure:2157: checking for $ac_func" >&5 +echo "configure:2423: checking for $ac_func" >&5 if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 else cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then +if { (eval echo configure:2451: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then rm -rf conftest* eval "ac_cv_func_$ac_func=yes" else @@ -2208,12 +2474,12 @@ done for ac_func in uname gethostname do echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 -echo "configure:2212: checking for $ac_func" >&5 +echo "configure:2478: checking for $ac_func" >&5 if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 else cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then +if { (eval echo configure:2506: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then rm -rf conftest* eval "ac_cv_func_$ac_func=yes" else @@ -2264,12 +2530,12 @@ done for ac_func in gethostbyname do echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 -echo "configure:2268: checking for $ac_func" >&5 +echo "configure:2534: checking for $ac_func" >&5 if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 else cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then +if { (eval echo configure:2562: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then rm -rf conftest* eval "ac_cv_func_$ac_func=yes" else @@ -2314,7 +2580,7 @@ EOF else echo "$ac_t""no" 1>&6 echo $ac_n "checking for gethostbyname in -lnsl""... $ac_c" 1>&6 -echo "configure:2318: checking for gethostbyname in -lnsl" >&5 +echo "configure:2584: checking for gethostbyname in -lnsl" >&5 ac_lib_var=`echo nsl'_'gethostbyname | sed 'y%./+-%__p_%'` if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 @@ -2322,7 +2588,7 @@ else ac_save_LIBS="$LIBS" LIBS="-lnsl $LIBS" cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then +if { (eval echo configure:2603: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then rm -rf conftest* eval "ac_cv_lib_$ac_lib_var=yes" else @@ -2367,7 +2633,7 @@ done echo $ac_n "checking for socket in -lsocket""... $ac_c" 1>&6 -echo "configure:2371: checking for socket in -lsocket" >&5 +echo "configure:2637: checking for socket in -lsocket" >&5 ac_lib_var=`echo socket'_'socket | sed 'y%./+-%__p_%'` if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 @@ -2375,7 +2641,7 @@ else ac_save_LIBS="$LIBS" LIBS="-lsocket $LIBS" cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then +if { (eval echo configure:2656: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then rm -rf conftest* eval "ac_cv_lib_$ac_lib_var=yes" else @@ -2417,7 +2683,7 @@ fi if test "x${with_socks}" = xyes then echo $ac_n "checking for main in -lresolv""... $ac_c" 1>&6 -echo "configure:2421: checking for main in -lresolv" >&5 +echo "configure:2687: checking for main in -lresolv" >&5 ac_lib_var=`echo resolv'_'main | sed 'y%./+-%__p_%'` if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 @@ -2425,14 +2691,14 @@ else ac_save_LIBS="$LIBS" LIBS="-lresolv $LIBS" cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then +if { (eval echo configure:2702: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then rm -rf conftest* eval "ac_cv_lib_$ac_lib_var=yes" else @@ -2460,7 +2726,7 @@ else fi echo $ac_n "checking for Rconnect in -lsocks""... $ac_c" 1>&6 -echo "configure:2464: checking for Rconnect in -lsocks" >&5 +echo "configure:2730: checking for Rconnect in -lsocks" >&5 ac_lib_var=`echo socks'_'Rconnect | sed 'y%./+-%__p_%'` if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 @@ -2468,7 +2734,7 @@ else ac_save_LIBS="$LIBS" LIBS="-lsocks $LIBS" cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then +if { (eval echo configure:2749: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then rm -rf conftest* eval "ac_cv_lib_$ac_lib_var=yes" else @@ -2511,7 +2777,7 @@ fi ALL_LINGUAS="cs de hr it no pl pt_BR ru" echo $ac_n "checking whether NLS is requested""... $ac_c" 1>&6 -echo "configure:2515: checking whether NLS is requested" >&5 +echo "configure:2781: checking whether NLS is requested" >&5 # Check whether --enable-nls or --disable-nls was given. if test "${enable_nls+set}" = set; then enableval="$enable_nls" @@ -2528,7 +2794,7 @@ fi # Extract the first word of "msgfmt", so it can be a program name with args. set dummy msgfmt; ac_word=$2 echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 -echo "configure:2532: checking for $ac_word" >&5 +echo "configure:2798: checking for $ac_word" >&5 if eval "test \"`echo '$''{'ac_cv_path_MSGFMT'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 else @@ -2562,7 +2828,7 @@ fi # Extract the first word of "xgettext", so it can be a program name with args. set dummy xgettext; ac_word=$2 echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 -echo "configure:2566: checking for $ac_word" >&5 +echo "configure:2832: checking for $ac_word" >&5 if eval "test \"`echo '$''{'ac_cv_path_XGETTEXT'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 else @@ -2597,7 +2863,7 @@ fi # Extract the first word of "gmsgfmt", so it can be a program name with args. set dummy gmsgfmt; ac_word=$2 echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 -echo "configure:2601: checking for $ac_word" >&5 +echo "configure:2867: checking for $ac_word" >&5 if eval "test \"`echo '$''{'ac_cv_path_GMSGFMT'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 else @@ -2647,17 +2913,17 @@ fi do ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'` echo $ac_n "checking for $ac_hdr""... $ac_c" 1>&6 -echo "configure:2651: checking for $ac_hdr" >&5 +echo "configure:2917: checking for $ac_hdr" >&5 if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 else cat > conftest.$ac_ext < EOF ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" -{ (eval echo configure:2661: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +{ (eval echo configure:2927: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` if test -z "$ac_err"; then rm -rf conftest* @@ -2687,12 +2953,12 @@ done for ac_func in gettext do echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 -echo "configure:2691: checking for $ac_func" >&5 +echo "configure:2957: checking for $ac_func" >&5 if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 else cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then +if { (eval echo configure:2985: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then rm -rf conftest* eval "ac_cv_func_$ac_func=yes" else @@ -2737,7 +3003,7 @@ EOF else echo "$ac_t""no" 1>&6 echo $ac_n "checking for gettext in -lintl""... $ac_c" 1>&6 -echo "configure:2741: checking for gettext in -lintl" >&5 +echo "configure:3007: checking for gettext in -lintl" >&5 ac_lib_var=`echo intl'_'gettext | sed 'y%./+-%__p_%'` if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 @@ -2745,7 +3011,7 @@ else ac_save_LIBS="$LIBS" LIBS="-lintl $LIBS" cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then +if { (eval echo configure:3026: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then rm -rf conftest* eval "ac_cv_lib_$ac_lib_var=yes" else @@ -2824,7 +3090,7 @@ do # Extract the first word of "$ac_prog", so it can be a program name with args. set dummy $ac_prog; ac_word=$2 echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 -echo "configure:2828: checking for $ac_word" >&5 +echo "configure:3094: checking for $ac_word" >&5 if eval "test \"`echo '$''{'ac_cv_prog_MAKEINFO'+set}'`\" = set"; then echo $ac_n "(cached) $ac_c" 1>&6 else diff --git a/configure.in b/configure.in index 474e5d57..4f4440b0 100644 --- a/configure.in +++ b/configure.in @@ -160,6 +160,7 @@ dnl dnl Checks for library functions. dnl AC_FUNC_ALLOCA +AC_FUNC_MMAP AC_CHECK_FUNCS(strdup strstr strcasecmp strncasecmp) AC_CHECK_FUNCS(gettimeofday mktime strptime) AC_CHECK_FUNCS(strerror snprintf vsnprintf select signal symlink access isatty) diff --git a/doc/ChangeLog b/doc/ChangeLog index bddc4b5d..110102a6 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,3 +1,8 @@ +2000-11-15 Hrvoje Niksic + + * wget.texi (Robots): Document that we now support the meta tag + exclusion. + 2000-11-16 Hrvoje Niksic * wget.texi: Use --- consistently. diff --git a/doc/wget.texi b/doc/wget.texi index eb4d00c9..1accbb94 100644 --- a/doc/wget.texi +++ b/doc/wget.texi @@ -2548,8 +2548,8 @@ this: This is explained in some detail at @url{http://info.webcrawler.com/mak/projects/robots/meta-user.html}. -Unfortunately, Wget does not support this method of robot exclusion yet, -but it will be implemented in the next release. +Wget supports this method of robot exclusion in addition to the usual +@file{/robots.txt} exclusion. @node Security Considerations, Contributors, Robots, Appendices @section Security Considerations diff --git a/po/cs.gmo b/po/cs.gmo index 5511ce5c..c276436c 100644 Binary files a/po/cs.gmo and b/po/cs.gmo differ diff --git a/po/de.gmo b/po/de.gmo index 6dc32778..63f01f82 100644 Binary files a/po/de.gmo and b/po/de.gmo differ diff --git a/po/hr.gmo b/po/hr.gmo index 039855ad..70c9d001 100644 Binary files a/po/hr.gmo and b/po/hr.gmo differ diff --git a/po/it.gmo b/po/it.gmo index 6e9269b1..f70efae4 100644 Binary files a/po/it.gmo and b/po/it.gmo differ diff --git a/po/no.gmo b/po/no.gmo index a150d4e0..c54d1a07 100644 Binary files a/po/no.gmo and b/po/no.gmo differ diff --git a/po/pl.gmo b/po/pl.gmo index 6308a0e4..96e50064 100644 Binary files a/po/pl.gmo and b/po/pl.gmo differ diff --git a/po/pt_BR.gmo b/po/pt_BR.gmo index 917f90f8..447bdf17 100644 Binary files a/po/pt_BR.gmo and b/po/pt_BR.gmo differ diff --git a/po/ru.gmo b/po/ru.gmo index 6019b4af..df7cd87d 100644 Binary files a/po/ru.gmo and b/po/ru.gmo differ diff --git a/src/ChangeLog b/src/ChangeLog index 29f99623..acbc0331 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,117 @@ +2000-11-19 Hrvoje Niksic + + * retr.c (get_contents): If use_expected, make sure that the + appropriate amount of data is being read. + + * http.c (gethttp): Check for both `Keep-Alive: ...' and + `Connection: Keep-Alive'. + + * wget.h (DEBUGP): Call debug_logprintf only if opt.debug is + turned on. + +2000-11-19 Hrvoje Niksic + + * http.c (connection_available_p): Use it. + + * connect.c (test_socket_open): New function. + + * http.c (gethttp): Support persistent connections. Based on the + ideas, and partly on code, by Sam Horrocks . + (register_persistent): New function. + (connection_available_p): Ditto. + (invalidate_connection): Ditto. + +2000-11-19 Hrvoje Niksic + + * url.c (convert_links): Handle UREL2ABS case. + + * recur.c (recursive_retrieve): Instead of the list + urls_downloaded, use hash tables dl_file_url_map and + dl_url_file_map. + (convert_all_links): Use them to retrieve data. + + * host.c (clean_hosts): Free the hash tables. + + * main.c (private_initialize): Call host_init(). + + * host.c (store_hostaddress): Use a saner, hash table-based data + model. + (realhost): Ditto. + (host_init): Initialize the hash tables. + +2000-11-18 Hrvoje Niksic + + * utils.c (slist_append): Eviscerate NOSORT. Hash tables are now + used for what the sorted slists used to be used for. + (slist_contains): Don't rely on the list being sorted. + (slist_append): Simplify the code. + + * recur.c (recursive_cleanup): Use free_string_set. + + * utils.c (string_set_add, string_set_exists, string_set_free): + New functions for easier freeing of hash tables whose keys are + strdup'ed strings. + + * recur.c (recursive_retrieve): Use the hash table functions for + storing undesirable URLs. + + * hash.c: New file. + +2000-11-17 Hrvoje Niksic + + * main.c (private_initialize): Call url_init. + (main): Call private_initialize. + + * url.c (unsafe_char_table): New table. + (UNSAFE_CHAR): Use it. + (init_unsafe_char_table): New function. + (url_init): New function; call init_unsafe_char_table. + +2000-11-15 Hrvoje Niksic + + * html-url.c (handle_link): Handle HTML fragment identifiers. + + * recur.c (recursive_retrieve): If norobot info is respected and + the file is specified not to be followed by robots, respect that. + + * html-url.c (collect_tags_mapper): Handle . For us the important cases are where X is NONE or + where X contains NOFOLLOW. + (get_urls_html): Propagate that information to the caller. + +2000-11-13 Hrvoje Niksic + + * url.c (convert_links): Unlink the file we might be reading from + before writing to it. + (convert_links): Use alloca instead of malloc for + filename_plus_orig_suffix. + +2000-11-10 Hrvoje Niksic + + * url.c (get_urls_file): Ditto. + (convert_links): Ditto. + + * html-url.c (get_urls_html): Use read_file() instead of + load_file(). + + * utils.c (read_file): New function, instead of the old + load_file(). + (read_file_free): Ditto. + + * url.c (findurl): Search only for the supported protocols. + (convert_links): Use fwrite() when writing out a region of + characters. + +2000-11-10 Hrvoje Niksic + + * ftp-ls.c: Move html_quote_string and ftp_index here. + + * url.c: Remove get_urls_html, since that's now in html-url.c. + + * html-url.c: New file. + + * html-parse.c: New file. + 2000-11-16 Hrvoje Niksic * mswindows.h: Define snprintf and vsnprintf to _snprintf and diff --git a/src/Makefile.in b/src/Makefile.in index e3b433b0..bfe9868d 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -57,9 +57,10 @@ MD5_OBJ = @MD5_OBJ@ OPIE_OBJ = @OPIE_OBJ@ OBJ = $(ALLOCA) cmpt$o connect$o fnmatch$o ftp$o ftp-basic$o \ - ftp-ls$o $(OPIE_OBJ) getopt$o headers$o host$o html$o \ - http$o init$o log$o main$o $(MD5_OBJ) netrc$o rbuf$o \ - recur$o retr$o snprintf$o url$o utils$o version$o + ftp-ls$o $(OPIE_OBJ) getopt$o hash$o headers$o host$o \ + html-parse$o html-url$o http$o init$o log$o main$o \ + $(MD5_OBJ) netrc$o rbuf$o recur$o retr$o snprintf$o \ + url$o utils$o version$o .SUFFIXES: .SUFFIXES: .c .o ._c ._o @@ -133,26 +134,31 @@ TAGS: *.c *.h # DO NOT DELETE THIS LINE -- make depend depends on it. -cmpt$o: config.h wget.h sysdep.h options.h -connect$o: config.h wget.h sysdep.h options.h connect.h host.h -fnmatch$o: config.h wget.h sysdep.h options.h fnmatch.h -ftp-basic$o: config.h wget.h sysdep.h options.h utils.h rbuf.h connect.h host.h -ftp-ls$o: config.h wget.h sysdep.h options.h utils.h ftp.h rbuf.h -ftp-opie$o: config.h wget.h sysdep.h options.h md5.h -ftp$o: config.h wget.h sysdep.h options.h utils.h url.h rbuf.h retr.h ftp.h html.h connect.h host.h fnmatch.h netrc.h -getopt$o: wget.h sysdep.h options.h -headers$o: config.h wget.h sysdep.h options.h connect.h rbuf.h headers.h -host$o: config.h wget.h sysdep.h options.h utils.h host.h url.h -html$o: config.h wget.h sysdep.h options.h url.h utils.h ftp.h rbuf.h html.h -http$o: config.h wget.h sysdep.h options.h utils.h url.h host.h rbuf.h retr.h headers.h connect.h fnmatch.h netrc.h -init$o: config.h wget.h sysdep.h options.h utils.h init.h host.h recur.h netrc.h -log$o: config.h wget.h sysdep.h options.h utils.h -main$o: config.h wget.h sysdep.h options.h utils.h getopt.h init.h retr.h rbuf.h recur.h host.h -md5$o: wget.h sysdep.h options.h md5.h -mswindows$o: config.h winsock.h wget.h sysdep.h options.h url.h -netrc$o: wget.h sysdep.h options.h utils.h netrc.h init.h -rbuf$o: config.h wget.h sysdep.h options.h rbuf.h connect.h -recur$o: config.h wget.h sysdep.h options.h url.h recur.h utils.h retr.h rbuf.h ftp.h fnmatch.h host.h -retr$o: config.h wget.h sysdep.h options.h utils.h retr.h rbuf.h url.h recur.h ftp.h host.h connect.h -url$o: config.h wget.h sysdep.h options.h utils.h url.h host.h html.h -utils$o: config.h wget.h sysdep.h options.h utils.h fnmatch.h +cmpt$o: wget.h +connect$o: wget.h connect.h host.h +fnmatch$o: wget.h fnmatch.h +ftp-basic$o: wget.h utils.h rbuf.h connect.h host.h +ftp-ls$o: wget.h utils.h ftp.h url.h +ftp-opie$o: wget.h md5.h +ftp$o: wget.h utils.h url.h rbuf.h retr.h ftp.h connect.h host.h fnmatch.h netrc.h +getopt$o: wget.h getopt.h +hash$o: wget.h utils.h hash.h +headers$o: wget.h connect.h rbuf.h headers.h +host$o: wget.h utils.h host.h url.h hash.h +html-parse$o: wget.h html-parse.h +html-url$o: wget.h html-parse.h url.h utils.h +html$o: wget.h url.h utils.h ftp.h +http$o: wget.h utils.h url.h host.h rbuf.h retr.h headers.h connect.h fnmatch.h netrc.h md5.h +init$o: wget.h utils.h init.h host.h recur.h netrc.h +log$o: wget.h utils.h +main$o: wget.h utils.h getopt.h init.h retr.h recur.h host.h +md5$o: wget.h md5.h +mswindows$o: wget.h url.h +netrc$o: wget.h utils.h netrc.h init.h +rbuf$o: wget.h rbuf.h connect.h +recur$o: wget.h url.h recur.h utils.h retr.h ftp.h fnmatch.h host.h hash.h +retr$o: wget.h utils.h retr.h url.h recur.h ftp.h host.h connect.h hash.h +snprintf$o: +url$o: wget.h utils.h url.h host.h +utils$o: wget.h utils.h fnmatch.h hash.h +version$o: diff --git a/src/config.h.in b/src/config.h.in index 2038acde..ed200e32 100644 --- a/src/config.h.in +++ b/src/config.h.in @@ -101,6 +101,9 @@ char *alloca (); /* Define if you have the uname function. */ #undef HAVE_UNAME +/* Define if you have a working version of mmap. */ +#undef HAVE_MMAP + /* Define if you have the gethostname function. */ #undef HAVE_GETHOSTNAME diff --git a/src/connect.c b/src/connect.c index 28ce2043..feb2bb52 100644 --- a/src/connect.c +++ b/src/connect.c @@ -107,6 +107,37 @@ make_connection (int *sock, char *hostname, unsigned short port) return NOCONERROR; } +int +test_socket_open (int sock) +{ +#ifdef HAVE_SELECT + fd_set check_set; + struct timeval to; + + /* Check if we still have a valid (non-EOF) connection. From Andrew + * Maholski's code in the Unix Socket FAQ. */ + + FD_ZERO (&check_set); + FD_SET (sock, &check_set); + + /* Wait one microsecond */ + to.tv_sec = 0; + to.tv_usec = 1; + + /* If we get a timeout, then that means still connected */ + if (select (sock + 1, &check_set, NULL, NULL, &to) == 0) + { + /* Connection is valid (not EOF), so continue */ + return 1; + } + else + return 0; +#else + /* Without select, it's hard to know for sure. */ + return 1; +#endif +} + /* Bind the local port PORT. This does all the necessary work, which is creating a socket, setting SO_REUSEADDR option on it, then calling bind() and listen(). If *PORT is 0, a random port is diff --git a/src/ftp-ls.c b/src/ftp-ls.c index 16a7f7d6..884cf3d8 100644 --- a/src/ftp-ls.c +++ b/src/ftp-ls.c @@ -36,6 +36,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include "wget.h" #include "utils.h" #include "ftp.h" +#include "url.h" /* Converts symbolic permissions to number-style ones, e.g. string rwxr-xr-x to 755. For now, it knows nothing of @@ -388,3 +389,175 @@ ftp_parse_ls (const char *file) { return ftp_parse_unix_ls (file); } + +/* Stuff for creating FTP index. */ + +/* The function returns the pointer to the malloc-ed quoted version of + string s. It will recognize and quote numeric and special graphic + entities, as per RFC1866: + + `&' -> `&' + `<' -> `<' + `>' -> `>' + `"' -> `"' + + No other entities are recognized or replaced. */ +static char * +html_quote_string (const char *s) +{ + const char *b = s; + char *p, *res; + int i; + + /* Pass through the string, and count the new size. */ + for (i = 0; *s; s++, i++) + { + if (*s == '&') + i += 4; /* `amp;' */ + else if (*s == '<' || *s == '>') + i += 3; /* `lt;' and `gt;' */ + else if (*s == '\"') + i += 5; /* `quot;' */ + } + res = (char *)xmalloc (i + 1); + s = b; + for (p = res; *s; s++) + { + switch (*s) + { + case '&': + *p++ = '&'; + *p++ = 'a'; + *p++ = 'm'; + *p++ = 'p'; + *p++ = ';'; + break; + case '<': case '>': + *p++ = '&'; + *p++ = (*s == '<' ? 'l' : 'g'); + *p++ = 't'; + *p++ = ';'; + break; + case '\"': + *p++ = '&'; + *p++ = 'q'; + *p++ = 'u'; + *p++ = 'o'; + *p++ = 't'; + *p++ = ';'; + break; + default: + *p++ = *s; + } + } + *p = '\0'; + return res; +} + +/* The function creates an HTML index containing references to given + directories and files on the appropriate host. The references are + FTP. */ +uerr_t +ftp_index (const char *file, struct urlinfo *u, struct fileinfo *f) +{ + FILE *fp; + char *upwd; + char *htclfile; /* HTML-clean file name */ + + if (!opt.dfp) + { + fp = fopen (file, "wb"); + if (!fp) + { + logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); + return FOPENERR; + } + } + else + fp = opt.dfp; + if (u->user) + { + char *tmpu, *tmpp; /* temporary, clean user and passwd */ + + tmpu = CLEANDUP (u->user); + tmpp = u->passwd ? CLEANDUP (u->passwd) : NULL; + upwd = (char *)xmalloc (strlen (tmpu) + + (tmpp ? (1 + strlen (tmpp)) : 0) + 2); + sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? tmpp : ""); + free (tmpu); + FREE_MAYBE (tmpp); + } + else + upwd = xstrdup (""); + fprintf (fp, "\n"); + fprintf (fp, "\n\n"); + fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port); + fprintf (fp, "\n\n\n

"); + fprintf (fp, _("Index of /%s on %s:%d"), u->dir, u->host, u->port); + fprintf (fp, "

\n
\n
\n");
+  while (f)
+    {
+      fprintf (fp, "  ");
+      if (f->tstamp != -1)
+	{
+	  /* #### Should we translate the months? */
+	  static char *months[] = {
+	    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
+	    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
+	  };
+	  struct tm *ptm = localtime ((time_t *)&f->tstamp);
+
+	  fprintf (fp, "%d %s %02d ", ptm->tm_year + 1900, months[ptm->tm_mon],
+		  ptm->tm_mday);
+	  if (ptm->tm_hour)
+	    fprintf (fp, "%02d:%02d  ", ptm->tm_hour, ptm->tm_min);
+	  else
+	    fprintf (fp, "       ");
+	}
+      else
+	fprintf (fp, _("time unknown       "));
+      switch (f->type)
+	{
+	case FT_PLAINFILE:
+	  fprintf (fp, _("File        "));
+	  break;
+	case FT_DIRECTORY:
+	  fprintf (fp, _("Directory   "));
+	  break;
+	case FT_SYMLINK:
+	  fprintf (fp, _("Link        "));
+	  break;
+	default:
+	  fprintf (fp, _("Not sure    "));
+	  break;
+	}
+      htclfile = html_quote_string (f->name);
+      fprintf (fp, "host, u->port);
+      if (*u->dir != '/')
+	putc ('/', fp);
+      fprintf (fp, "%s", u->dir);
+      if (*u->dir)
+	putc ('/', fp);
+      fprintf (fp, "%s", htclfile);
+      if (f->type == FT_DIRECTORY)
+	putc ('/', fp);
+      fprintf (fp, "\">%s", htclfile);
+      if (f->type == FT_DIRECTORY)
+	putc ('/', fp);
+      fprintf (fp, " ");
+      if (f->type == FT_PLAINFILE)
+	fprintf (fp, _(" (%s bytes)"), legible (f->size));
+      else if (f->type == FT_SYMLINK)
+	fprintf (fp, "-> %s", f->linkto ? f->linkto : "(nil)");
+      putc ('\n', fp);
+      free (htclfile);
+      f = f->next;
+    }
+  fprintf (fp, "
\n\n\n"); + free (upwd); + if (!opt.dfp) + fclose (fp); + else + fflush (fp); + return FTPOK; +} diff --git a/src/ftp.c b/src/ftp.c index 4c26cf7d..aa283cf8 100644 --- a/src/ftp.c +++ b/src/ftp.c @@ -40,7 +40,6 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include "rbuf.h" #include "retr.h" #include "ftp.h" -#include "html.h" #include "connect.h" #include "host.h" #include "fnmatch.h" @@ -722,7 +721,7 @@ Error in server response, closing control connection.\n")); } reset_timer (); /* Get the contents of the document. */ - res = get_contents (dtsock, fp, len, restval, expected_bytes, &con->rbuf); + res = get_contents (dtsock, fp, len, restval, expected_bytes, &con->rbuf, 0); con->dltime = elapsed_time (); tms = time_str (NULL); tmrate = rate (*len - restval, con->dltime); diff --git a/src/ftp.h b/src/ftp.h index c2e6d44c..064e6354 100644 --- a/src/ftp.h +++ b/src/ftp.h @@ -92,4 +92,6 @@ typedef struct struct fileinfo *ftp_parse_ls PARAMS ((const char *)); uerr_t ftp_loop PARAMS ((struct urlinfo *, int *)); +uerr_t ftp_index (const char *, struct urlinfo *, struct fileinfo *); + #endif /* FTP_H */ diff --git a/src/hash.c b/src/hash.c new file mode 100644 index 00000000..e54fb33a --- /dev/null +++ b/src/hash.c @@ -0,0 +1,403 @@ +/* Hash tables. + Copyright (C) 2000 Free Software Foundation, Inc. + +This file is part of Wget. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include + +#include "wget.h" +#include "utils.h" + +#include "hash.h" + +#ifdef STANDALONE +# define xmalloc malloc +# define xrealloc realloc +#endif + +/* This file implements simple hash tables based on linear probing. + The hash table stores key-value pairs in a contiguous array. Both + key and value are void pointers that the hash and test functions + know how to handle. + + Although Knuth & co. recommend double hashing over linear probing, + we use the latter because it accesses array elements sequentially + in case of a collision, yielding in better cache behaviour and + ultimately in better speed. To avoid collision problems with + linear probing, we make sure that the table grows as soon as the + fullness/size ratio exceeds 75%. */ + +struct ht_pair { + void *key; + void *value; +}; + +struct hash_table { + unsigned long (*hash_function) (const void *); + int (*test_function) (const void *, const void *); + + int size; /* size of the array */ + int fullness; /* number of non-empty fields */ + int count; /* number of non-empty, non-deleted + fields. */ + + struct ht_pair *pairs; +}; + +#define ENTRY_DELETED ((void *)0xdeadbeef) + +#define DELETED_ENTRY_P(ptr) ((ptr) == ENTRY_DELETED) +#define EMPTY_ENTRY_P(ptr) ((ptr) == NULL) + +/* Find a prime near, but greather than or equal to SIZE. */ + +int +prime_size (int size) +{ + static const unsigned long primes [] = { + 19, 29, 41, 59, 79, 107, 149, 197, 263, 347, 457, 599, 787, 1031, + 1361, 1777, 2333, 3037, 3967, 5167, 6719, 8737, 11369, 14783, + 19219, 24989, 32491, 42257, 54941, 71429, 92861, 120721, 156941, + 204047, 265271, 344857, 448321, 582821, 757693, 985003, 1280519, + 1664681, 2164111, 2813353, 3657361, 4754591, 6180989, 8035301, + 10445899, 13579681, 17653589, 22949669, 29834603, 38784989, + 50420551, 65546729, 85210757, 110774011, 144006217, 187208107, + 243370577, 316381771, 411296309, 534685237, 695090819, 903618083, + 1174703521, 1527114613, 1985248999, 2580823717UL, 3355070839UL + }; + int i; + for (i = 0; i < ARRAY_SIZE (primes); i++) + if (primes[i] >= size) + return primes[i]; + /* huh? */ + return size; +} + +/* Create a hash table of INITIAL_SIZE with hash function + HASH_FUNCTION and test function TEST_FUNCTION. If you wish to + start out with a "small" table which will be regrown as needed, + specify 0 as INITIAL_SIZE. */ + +struct hash_table * +hash_table_new (int initial_size, + unsigned long (*hash_function) (const void *), + int (*test_function) (const void *, const void *)) +{ + struct hash_table *ht + = (struct hash_table *)xmalloc (sizeof (struct hash_table)); + ht->hash_function = hash_function; + ht->test_function = test_function; + ht->size = prime_size (initial_size); + ht->fullness = 0; + ht->count = 0; + ht->pairs = xmalloc (ht->size * sizeof (struct ht_pair)); + memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair)); + return ht; +} + +/* Free the data associated with hash table HT. */ + +void +hash_table_destroy (struct hash_table *ht) +{ + free (ht->pairs); + free (ht); +} + +/* Get the value that corresponds to the key KEY in the hash table HT. + If no value is found, return NULL. Note that NULL is a legal value + for value; if you are storing NULLs in your hash table, you can use + hash_table_exists to be sure that a (possibly NULL) value exists in + the table. */ + +void * +hash_table_get (struct hash_table *ht, const void *key) +{ + int location = ht->hash_function (key) % ht->size; + while (1) + { + struct ht_pair *the_pair = ht->pairs + location; + if (EMPTY_ENTRY_P (the_pair->key)) + return NULL; + else if (DELETED_ENTRY_P (the_pair->key) + || !ht->test_function (key, the_pair->key)) + { + ++location; + if (location == ht->size) + location = 0; + } + else + return the_pair->value; + } +} + +/* Return 1 if KEY exists in HT, 0 otherwise. */ + +int +hash_table_exists (struct hash_table *ht, const void *key) +{ + int location = ht->hash_function (key) % ht->size; + while (1) + { + struct ht_pair *the_pair = ht->pairs + location; + if (EMPTY_ENTRY_P (the_pair->key)) + return 0; + else if (DELETED_ENTRY_P (the_pair->key) + || !ht->test_function (key, the_pair->key)) + { + ++location; + if (location == ht->size) + location = 0; + } + else + return 1; + } +} + +#define MAX(i, j) (((i) >= (j)) ? (i) : (j)) + +/* Grow hash table HT as necessary, and rehash all the key-value + pairs. */ + +static void +grow_hash_table (struct hash_table *ht) +{ + int i; + struct ht_pair *old_pairs = ht->pairs; + int old_count = ht->count; /* for assert() below */ + int old_size = ht->size; + + /* Normally, the idea is to double ht->size (and round it to next + prime) on each regrow: + + ht->size = prime_size (ht->size * 2); + + But it is possible that the table has large fullness because of + the many deleted entries. If that is the case, we don't want to + blindly grow the table; we just want to rehash it. For that + reason, we use ht->count as the relevant parameter. MAX is used + only because we don't want to actually shrink the table. (But + maybe that's wrong.) */ + + int needed_size = prime_size (ht->count * 2); + ht->size = MAX (old_size, needed_size); + + ht->pairs = xmalloc (ht->size * sizeof (struct ht_pair)); + memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair)); + + /* Need to reset these two; hash_table_put will reinitialize them. */ + ht->fullness = 0; + ht->count = 0; + for (i = 0; i < old_size; i++) + { + struct ht_pair *the_pair = old_pairs + i; + if (!EMPTY_ENTRY_P (the_pair->key) + && !DELETED_ENTRY_P (the_pair->key)) + hash_table_put (ht, the_pair->key, the_pair->value); + } + assert (ht->count == old_count); + free (old_pairs); +} + +/* Put VALUE in the hash table HT under the key KEY. This regrows the + table if necessary. */ + +void +hash_table_put (struct hash_table *ht, const void *key, void *value) +{ + int location = ht->hash_function (key) % ht->size; + while (1) + { + struct ht_pair *the_pair = ht->pairs + location; + if (EMPTY_ENTRY_P (the_pair->key)) + { + ++ht->fullness; + ++ht->count; + just_insert: + the_pair->key = (void *)key; /* const? */ + the_pair->value = value; + break; + } + else if (DELETED_ENTRY_P (the_pair->key)) + { + /* We're replacing a deleteed entry, so ht->count gets + increased, but ht->fullness remains unchanged. */ + ++ht->count; + goto just_insert; + } + else if (ht->test_function (key, the_pair->key)) + { + /* We're replacing an existing entry, so ht->count and + ht->fullness remain unchanged. */ + goto just_insert; + } + else + { + ++location; + if (location == ht->size) + location = 0; + } + } + if (ht->fullness * 4 > ht->size * 3) + /* When fullness exceeds 75% of size, regrow the table. */ + grow_hash_table (ht); +} + +/* Remove KEY from HT. */ + +int +hash_table_remove (struct hash_table *ht, const void *key) +{ + int location = ht->hash_function (key) % ht->size; + while (1) + { + struct ht_pair *the_pair = ht->pairs + location; + if (EMPTY_ENTRY_P (the_pair->key)) + return 0; + else if (DELETED_ENTRY_P (the_pair->key) + || !ht->test_function (key, the_pair->key)) + { + ++location; + if (location == ht->size) + location = 0; + } + else + { + /* We don't really remove an entry from the hash table: we + just mark it as deleted. This is because there may be + other entries located after this entry whose hash number + points to a location before this entry. (Example: keys + A, B and C have the same hash. If you were to really + *delete* B from the table, C could no longer be found.) + + As an optimization, it might be worthwhile to check + whether the immediately preceding entry is empty and, if + so, really delete the pair (set it to empty and decrease + the fullness along with the count). I *think* it should + be safe. */ + the_pair->key = ENTRY_DELETED; + --ht->count; + return 1; + } + } +} + +void +hash_table_clear (struct hash_table *ht) +{ + memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair)); + ht->fullness = 0; + ht->count = 0; +} + +void +hash_table_map (struct hash_table *ht, + int (*mapfun) (void *, void *, void *), + void *closure) +{ + int i; + for (i = 0; i < ht->size; i++) + { + struct ht_pair *the_pair = ht->pairs + i; + if (!EMPTY_ENTRY_P (the_pair->key) + && !DELETED_ENTRY_P (the_pair->key)) + if (mapfun (the_pair->key, the_pair->value, closure)) + return; + } +} + +/* Support for hash tables whose keys are strings. */ + +/* supposedly from the Dragon Book P436. */ +unsigned long +string_hash (const void *sv) +{ + unsigned int h = 0; + unsigned const char *x = (unsigned const char *) sv; + + while (*x) + { + unsigned int g; + h = (h << 4) + *x++; + if ((g = h & 0xf0000000) != 0) + h = (h ^ (g >> 24)) ^ g; + } + + return h; +} + +int +string_cmp (const void *s1, const void *s2) +{ + return !strcmp ((const char *)s1, (const char *)s2); +} + +struct hash_table * +make_string_hash_table (int initial_size) +{ + return hash_table_new (initial_size, string_hash, string_cmp); +} + + +#ifdef STANDALONE + +#include +#include + +int +print_hash_table_mapper (const void *key, void *value, void *count) +{ + ++*(int *)count; + printf ("%s: %s\n", (const char *)key, (char *)value); + return 0; +} + +void +print_hash (struct hash_table *sht) +{ + int debug_count = 0; + hash_table_map (sht, print_hash_table_mapper, &debug_count); + assert (debug_count == sht->count); +} + +int +main (void) +{ + struct hash_table *ht = make_string_hash_table (0); + char line[80]; + while ((fgets (line, sizeof (line), stdin))) + { + int len = strlen (line); + if (len <= 1) + continue; + line[--len] = '\0'; + hash_table_put (ht, strdup (line), "here I am!"); + if (len % 2) + hash_table_remove (ht, line); + } + print_hash (ht); +#if 0 + printf ("%d %d %d\n", ht->count, ht->fullness, ht->size); +#endif + return 0; +} +#endif diff --git a/src/hash.h b/src/hash.h new file mode 100644 index 00000000..ab3136aa --- /dev/null +++ b/src/hash.h @@ -0,0 +1,50 @@ +/* Hash table declarations. + Copyright (C) 2000 Free Software Foundation, Inc. + +This file is part of Wget. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* From XEmacs, and hence from Dragon book. */ + +#define GOOD_HASH 65599 /* prime number just over 2^16; Dragon book, p. 435 */ +#define HASH2(a,b) (GOOD_HASH * (a) + (b)) +#define HASH3(a,b,c) (GOOD_HASH * HASH2 (a,b) + (c)) +#define HASH4(a,b,c,d) (GOOD_HASH * HASH3 (a,b,c) + (d)) +#define HASH5(a,b,c,d,e) (GOOD_HASH * HASH4 (a,b,c,d) + (e)) +#define HASH6(a,b,c,d,e,f) (GOOD_HASH * HASH5 (a,b,c,d,e) + (f)) +#define HASH7(a,b,c,d,e,f,g) (GOOD_HASH * HASH6 (a,b,c,d,e,f) + (g)) +#define HASH8(a,b,c,d,e,f,g,h) (GOOD_HASH * HASH7 (a,b,c,d,e,f,g) + (h)) +#define HASH9(a,b,c,d,e,f,g,h,i) (GOOD_HASH * HASH8 (a,b,c,d,e,f,g,h) + (i)) + +struct hash_table; + +struct hash_table *hash_table_new PARAMS ((int, + unsigned long (*) (const void *), + int (*) (const void *, + const void *))); +void hash_table_destroy PARAMS ((struct hash_table *)); +void *hash_table_get PARAMS ((struct hash_table *, const void *)); +int hash_table_exists PARAMS ((struct hash_table *, const void *)); +void hash_table_put PARAMS ((struct hash_table *, const void *, void *)); +int hash_table_remove PARAMS ((struct hash_table *, const void *)); +void hash_table_clear PARAMS ((struct hash_table *)); +void hash_table_map PARAMS ((struct hash_table *, + int (*) (void *, void *, void *), + void *)); + +unsigned long string_hash PARAMS ((const void *)); +int string_cmp PARAMS ((const void *, const void *)); +struct hash_table *make_string_hash_table PARAMS ((int)); diff --git a/src/headers.c b/src/headers.c index 6b1a670f..521073df 100644 --- a/src/headers.c +++ b/src/headers.c @@ -165,6 +165,14 @@ header_strdup (const char *header, void *closure) return 1; } +/* Write the value 1 into the integer pointed to by CLOSURE. */ +int +header_exists (const char *header, void *closure) +{ + *(int *)closure = 1; + return 1; +} + /* Skip LWS (linear white space), if present. Returns number of characters to skip. */ int diff --git a/src/headers.h b/src/headers.h index cc66e49b..5f85c6eb 100644 --- a/src/headers.h +++ b/src/headers.h @@ -31,5 +31,6 @@ int header_process PARAMS ((const char *, const char *, int header_extract_number PARAMS ((const char *, void *)); int header_strdup PARAMS ((const char *, void *)); +int header_exists PARAMS ((const char *, void *)); int skip_lws PARAMS ((const char *)); diff --git a/src/host.c b/src/host.c index 3fa1bb84..eeb4940d 100644 --- a/src/host.c +++ b/src/host.c @@ -1,5 +1,5 @@ /* Dealing with host names. - Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc. + Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc. This file is part of Wget. @@ -48,35 +48,38 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include "utils.h" #include "host.h" #include "url.h" +#include "hash.h" #ifndef errno extern int errno; #endif -/* Host list entry */ -struct host +/* Mapping between all known hosts to their addresses (n.n.n.n). */ +struct hash_table *host_name_address_map; + +/* Mapping between all known addresses (n.n.n.n) to their hosts. This + is the inverse of host_name_address_map. These two tables share + the strdup'ed strings. */ +struct hash_table *host_address_name_map; + +/* Mapping between auxilliary (slave) and master host names. */ +struct hash_table *host_slave_master_map; + +/* Utility function: like xstrdup(), but also lowercases S. */ + +static char * +xstrdup_lower (const char *s) { - /* Host's symbolical name, as encountered at the time of first - inclusion, e.g. "fly.cc.fer.hr". */ - char *hostname; - /* Host's "real" name, i.e. its IP address, written out in ASCII - form of N.N.N.N, e.g. "161.53.70.130". */ - char *realname; - /* More than one HOSTNAME can correspond to the same REALNAME. For - our purposes, the canonical name of the host is its HOSTNAME when - it was first encountered. This entry is said to have QUALITY. */ - int quality; - /* Next entry in the list. */ - struct host *next; -}; - -static struct host *hlist; - -static struct host *add_hlist PARAMS ((struct host *, const char *, - const char *, int)); + char *copy = xstrdup (s); + char *p = copy; + for (; *p; p++) + *p = TOLOWER (*p); + return copy; +} /* The same as gethostbyname, but supports internet addresses of the - form `N.N.N.N'. */ + form `N.N.N.N'. On some systems gethostbyname() knows how to do + this automatically. */ struct hostent * ngethostbyname (const char *name) { @@ -91,42 +94,51 @@ ngethostbyname (const char *name) return hp; } -/* Search for HOST in the linked list L, by hostname. Return the - entry, if found, or NULL. The search is case-insensitive. */ -static struct host * -search_host (struct host *l, const char *host) -{ - for (; l; l = l->next) - if (strcasecmp (l->hostname, host) == 0) - return l; - return NULL; -} +/* Add host name HOST with the address ADDR_TEXT to the cache. + Normally this means that the (HOST, ADDR_TEXT) pair will be to + host_name_address_map and to host_address_name_map. (It is the + caller's responsibility to make sure that HOST is not already in + host_name_address_map.) -/* Like search_host, but searches by address. */ -static struct host * -search_address (struct host *l, const char *address) + If the ADDR_TEXT has already been seen and belongs to another host, + HOST will be added to host_slave_master_map instead. */ + +static void +add_host_to_cache (const char *host, const char *addr_text) { - for (; l; l = l->next) + char *canonical_name = hash_table_get (host_address_name_map, addr_text); + if (canonical_name) + { + DEBUGP (("Mapping %s to %s in host_slave_master_map.\n", + host, canonical_name)); + /* We've already dealt with that host under another name. */ + hash_table_put (host_slave_master_map, + xstrdup_lower (host), + xstrdup_lower (canonical_name)); + } + else { - int cmp = strcmp (l->realname, address); - if (cmp == 0) - return l; - else if (cmp > 0) - return NULL; + /* This is really the first time we're dealing with that host. */ + char *h_copy = xstrdup_lower (host); + char *a_copy = xstrdup (addr_text); + DEBUGP (("Caching %s <-> %s\n", h_copy, a_copy)); + hash_table_put (host_name_address_map, h_copy, a_copy); + hash_table_put (host_address_name_map, a_copy, h_copy); } - return NULL; } -/* Store the address of HOSTNAME, internet-style, to WHERE. First - check for it in the host list, and (if not found), use - ngethostbyname to get it. +/* Store the address of HOSTNAME, internet-style (four octets in + network order), to WHERE. First try to get the address from the + cache; if it is not available, call the DNS functions and update + the cache. Return 1 on successful finding of the hostname, 0 otherwise. */ int store_hostaddress (unsigned char *where, const char *hostname) { - struct host *t; unsigned long addr; + char *addr_text; + char *canonical_name; struct hostent *hptr; struct in_addr in; char *inet_s; @@ -134,178 +146,119 @@ store_hostaddress (unsigned char *where, const char *hostname) /* If the address is of the form d.d.d.d, there will be no trouble with it. */ addr = (unsigned long)inet_addr (hostname); - if ((int)addr == -1) - { - /* If it is not of that form, try to find it in the cache. */ - t = search_host (hlist, hostname); - if (t) - addr = (unsigned long)inet_addr (t->realname); - } /* If we have the numeric address, just store it. */ if ((int)addr != -1) { - /* ADDR is in network byte order, meaning the code works on - little and big endian 32-bit architectures without change. - On big endian 64-bit architectures we need to be careful to - copy the correct four bytes. */ - int offset = 0; + /* ADDR is defined to be in network byte order, meaning the code + works on little and big endian 32-bit architectures without + change. On big endian 64-bit architectures we need to be + careful to copy the correct four bytes. */ + int offset; + have_addr: #ifdef WORDS_BIGENDIAN offset = sizeof (unsigned long) - 4; +#else + offset = 0; #endif memcpy (where, (char *)&addr + offset, 4); return 1; } + + /* By now we know that the address is not of the form d.d.d.d. Try + to find it in our cache of host addresses. */ + addr_text = hash_table_get (host_name_address_map, hostname); + if (addr_text) + { + DEBUGP (("Found %s in host_name_address_map: %s\n", + hostname, addr_text)); + addr = (unsigned long)inet_addr (addr_text); + goto have_addr; + } + + /* Maybe this host is known to us under another name. If so, we'll + find it in host_slave_master_map, and use the master name to find + its address in host_name_address_map. */ + canonical_name = hash_table_get (host_slave_master_map, hostname); + if (canonical_name) + { + addr_text = hash_table_get (host_name_address_map, canonical_name); + assert (addr_text != NULL); + DEBUGP (("Found %s as slave of %s -> %s\n", + hostname, canonical_name, addr_text)); + addr = (unsigned long)inet_addr (addr_text); + goto have_addr; + } + /* Since all else has failed, let's try gethostbyname(). Note that we use gethostbyname() rather than ngethostbyname(), because we - *know* the address is not numerical. */ + already know that the address is not numerical. */ hptr = gethostbyname (hostname); if (!hptr) return 0; /* Copy the address of the host to socket description. */ memcpy (where, hptr->h_addr_list[0], hptr->h_length); - /* Now that we're here, we could as well cache the hostname for - future use, as in realhost(). First, we have to look for it by - address to know if it's already in the cache by another name. */ + assert (hptr->h_length == 4); + /* Now that we've gone through the truoble of calling + gethostbyname(), we can store this valuable information to the + cache. First, we have to look for it by address to know if it's + already in the cache by another name. */ /* Originally, we copied to in.s_addr, but it appears to be missing on some systems. */ memcpy (&in, *hptr->h_addr_list, sizeof (in)); - STRDUP_ALLOCA (inet_s, inet_ntoa (in)); - t = search_address (hlist, inet_s); - if (t) /* Found in the list, as realname. */ - { - /* Set the default, 0 quality. */ - hlist = add_hlist (hlist, hostname, inet_s, 0); - return 1; - } - /* Since this is really the first time this host is encountered, - set quality to 1. */ - hlist = add_hlist (hlist, hostname, inet_s, 1); + inet_s = inet_ntoa (in); + add_host_to_cache (hostname, inet_s); return 1; } -/* Add a host to the host list. The list is sorted by addresses. For - equal addresses, the entries with quality should bubble towards the - beginning of the list. */ -static struct host * -add_hlist (struct host *l, const char *nhost, const char *nreal, int quality) -{ - struct host *t, *old, *beg; - - /* The entry goes to the beginning of the list if the list is empty - or the order requires it. */ - if (!l || (strcmp (nreal, l->realname) < 0)) - { - t = (struct host *)xmalloc (sizeof (struct host)); - t->hostname = xstrdup (nhost); - t->realname = xstrdup (nreal); - t->quality = quality; - t->next = l; - return t; - } - - beg = l; - /* Second two one-before-the-last element. */ - while (l->next) - { - int cmp; - old = l; - l = l->next; - cmp = strcmp (nreal, l->realname); - if (cmp >= 0) - continue; - /* If the next list element is greater than s, put s between the - current and the next list element. */ - t = (struct host *)xmalloc (sizeof (struct host)); - old->next = t; - t->next = l; - t->hostname = xstrdup (nhost); - t->realname = xstrdup (nreal); - t->quality = quality; - return beg; - } - t = (struct host *)xmalloc (sizeof (struct host)); - t->hostname = xstrdup (nhost); - t->realname = xstrdup (nreal); - t->quality = quality; - /* Insert the new element after the last element. */ - l->next = t; - t->next = NULL; - return beg; -} - /* Determine the "real" name of HOST, as perceived by Wget. If HOST is referenced by more than one name, "real" name is considered to - be the first one encountered in the past. - - If the host cannot be found in the list of already dealt-with - hosts, try with its INET address. If this fails too, add it to the - list. The routine does not call gethostbyname twice for the same - host if it can possibly avoid it. */ + be the first one encountered in the past. */ char * realhost (const char *host) { - struct host *l, *l_real; struct in_addr in; struct hostent *hptr; - char *inet_s; + char *master_name; - DEBUGP (("Checking for %s.\n", host)); - /* Look for the host, looking by the host name. */ - l = search_host (hlist, host); - if (l && l->quality) /* Found it with quality */ + DEBUGP (("Checking for %s in host_name_address_map.\n", host)); + if (hash_table_exists (host_name_address_map, host)) { - DEBUGP (("%s was already used, by that name.\n", host)); - /* Here we return l->hostname, not host, because of the possible - case differences (e.g. jaGOR.srce.hr and jagor.srce.hr are - the same, but we want the one that was first. */ - return xstrdup (l->hostname); + DEBUGP (("Found; %s was already used, by that name.\n", host)); + return xstrdup_lower (host); } - else if (!l) /* Not found, with or without quality */ - { - /* The fact that gethostbyname will get called makes it - necessary to store it to the list, to ensure that - gethostbyname will not be called twice for the same string. - However, the quality argument must be set appropriately. - - Note that add_hlist must be called *after* the realname - search, or the quality would be always set to 0 */ - DEBUGP (("This is the first time I hear about host %s by that name.\n", - host)); - hptr = ngethostbyname (host); - if (!hptr) - return xstrdup (host); - /* Originally, we copied to in.s_addr, but it appears to be - missing on some systems. */ - memcpy (&in, *hptr->h_addr_list, sizeof (in)); - STRDUP_ALLOCA (inet_s, inet_ntoa (in)); - } - else /* Found, without quality */ + + DEBUGP (("Checking for %s in host_slave_master_map.\n", host)); + master_name = hash_table_get (host_slave_master_map, host); + if (master_name) { - /* This case happens when host is on the list, - but not as first entry (the one with quality). - Then we just get its INET address and pick - up the first entry with quality. */ - DEBUGP (("We've dealt with host %s, but under the name %s.\n", - host, l->realname)); - STRDUP_ALLOCA (inet_s, l->realname); + has_master: + DEBUGP (("Found; %s was already used, by the name %s.\n", + host, master_name)); + return xstrdup (master_name); } - /* Now we certainly have the INET address. The following loop is - guaranteed to pick either an entry with quality (because it is - the first one), or none at all. */ - l_real = search_address (hlist, inet_s); - if (l_real) /* Found in the list, as realname. */ + DEBUGP (("First time I hear about %s by that name; looking it up.\n", + host)); + hptr = ngethostbyname (host); + if (hptr) { - if (!l) - /* Set the default, 0 quality. */ - hlist = add_hlist (hlist, host, inet_s, 0); - return xstrdup (l_real->hostname); + char *inet_s; + /* Originally, we copied to in.s_addr, but it appears to be + missing on some systems. */ + memcpy (&in, *hptr->h_addr_list, sizeof (in)); + inet_s = inet_ntoa (in); + + add_host_to_cache (host, inet_s); + + /* add_host_to_cache() can establish a slave-master mapping. */ + DEBUGP (("Checking again for %s in host_slave_master_map.\n", host)); + master_name = hash_table_get (host_slave_master_map, host); + if (master_name) + goto has_master; } - /* Since this is really the first time this host is encountered, - set quality to 1. */ - hlist = add_hlist (hlist, host, inet_s, 1); - return xstrdup (host); + + return xstrdup_lower (host); } /* Compare two hostnames (out of URL-s if the arguments are URL-s), @@ -547,20 +500,23 @@ herrmsg (int error) return _("Unknown error"); } -/* Clean the host list. This is a separate function, so we needn't - export HLIST and its implementation. Ha! */ void clean_hosts (void) { - struct host *l = hlist; + /* host_name_address_map and host_address_name_map share the + strings. Because of that, calling free_keys_and_values once + suffices for both. */ + free_keys_and_values (host_name_address_map); + hash_table_destroy (host_name_address_map); + hash_table_destroy (host_address_name_map); + free_keys_and_values (host_slave_master_map); + hash_table_destroy (host_slave_master_map); +} - while (l) - { - struct host *p = l->next; - free (l->hostname); - free (l->realname); - free (l); - l = p; - } - hlist = NULL; +void +host_init (void) +{ + host_name_address_map = make_string_hash_table (0); + host_address_name_map = make_string_hash_table (0); + host_slave_master_map = make_string_hash_table (0); } diff --git a/src/html-parse.c b/src/html-parse.c new file mode 100644 index 00000000..b5efa7f2 --- /dev/null +++ b/src/html-parse.c @@ -0,0 +1,856 @@ +/* HTML parser for Wget. + Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +This file is part of Wget. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at +your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* The only entry point to this module is map_html_tags(), which see. */ + +/* TODO: + + - Allow hooks for callers to process contents outside tags. This + is needed to implement handling