From c06d32a3098aec29f0172247bd01e345f111aa6e Mon Sep 17 00:00:00 2001 From: hniksic Date: Sun, 21 Sep 2003 05:02:57 -0700 Subject: [PATCH] [svn] Treat xhtml files just like regular html. By Matthew J. Mellon. --- doc/wget.texi | 37 ++++++++++++++++++++----------------- src/ChangeLog | 5 +++++ src/http.c | 5 ++++- src/wget.h | 3 ++- 4 files changed, 31 insertions(+), 19 deletions(-) diff --git a/doc/wget.texi b/doc/wget.texi index 641d2840..91691c99 100644 --- a/doc/wget.texi +++ b/doc/wget.texi @@ -143,9 +143,9 @@ which can be a great hindrance when transferring a lot of data. @c man end @end ignore @c man begin DESCRIPTION -Wget can follow links in @sc{html} pages and create local versions of -remote web sites, fully recreating the directory structure of the -original site. This is sometimes referred to as ``recursive +Wget can follow links in @sc{html} and @sc{xhtml} pages and create local +versions of remote web sites, fully recreating the directory structure of +the original site. This is sometimes referred to as ``recursive downloading.'' While doing that, Wget respects the Robot Exclusion Standard (@file{/robots.txt}). Wget can be instructed to convert the links in downloaded @sc{html} files to the local files for offline @@ -944,23 +944,24 @@ current directory). @cindex .html extension @item -E @itemx --html-extension -If a file of type @samp{text/html} is downloaded and the URL does not -end with the regexp @samp{\.[Hh][Tt][Mm][Ll]?}, this option will cause -the suffix @samp{.html} to be appended to the local filename. This is -useful, for instance, when you're mirroring a remote site that uses -@samp{.asp} pages, but you want the mirrored pages to be viewable on -your stock Apache server. Another good use for this is when you're -downloading the output of CGIs. 
A URL like -@samp{http://site.com/article.cgi?25} will be saved as +If a file of type @samp{application/xhtml+xml} or @samp{text/html} is +downloaded and the URL does not end with the regexp +@samp{\.[Hh][Tt][Mm][Ll]?}, this option will cause the suffix @samp{.html} +to be appended to the local filename. This is useful, for instance, when +you're mirroring a remote site that uses @samp{.asp} pages, but you want +the mirrored pages to be viewable on your stock Apache server. Another +good use for this is when you're downloading the output of CGIs. A URL +like @samp{http://site.com/article.cgi?25} will be saved as @file{article.cgi?25.html}. Note that filenames changed in this way will be re-downloaded every time you re-mirror a site, because Wget can't tell that the local @file{@var{X}.html} file corresponds to remote URL @samp{@var{X}} (since it doesn't yet know that the URL produces output of type -@samp{text/html}. To prevent this re-downloading, you must use -@samp{-k} and @samp{-K} so that the original version of the file will be -saved as @file{@var{X}.orig} (@pxref{Recursive Retrieval Options}). +@samp{text/html} or @samp{application/xhtml+xml}). To prevent this +re-downloading, you must use @samp{-k} and @samp{-K} so that the original +version of the file will be saved as @file{@var{X}.orig} (@pxref{Recursive +Retrieval Options}). @cindex http user @cindex http password @@ -1524,7 +1525,8 @@ With @sc{http} @sc{url}s, Wget retrieves and parses the @sc{html} from the given @sc{url}, documents, retrieving the files the @sc{html} document was referring to, through markups like @code{href}, or @code{src}. If the freshly downloaded file is also of type -@code{text/html}, it will be parsed and followed further. +@code{text/html} or @code{application/xhtml+xml}, it will be parsed and +followed further. Recursive retrieval of @sc{http} and @sc{html} content is @dfn{breadth-first}. 
This means that Wget first downloads the requested @@ -2229,7 +2231,8 @@ Turn globbing on/off---the same as @samp{-g}. Define an additional header, like @samp{--header}. @item html_extension = on/off -Add a @samp{.html} extension to @samp{text/html} files without it, like +Add a @samp{.html} extension to @samp{text/html} or +@samp{application/xhtml+xml} files without it, like @samp{-E}. @item http_passwd = @var{string} @@ -2658,7 +2661,7 @@ But you've also noticed that local viewing doesn't work all that well when HTML files are saved under extensions other than @samp{.html}, perhaps because they were served as @file{index.cgi}. So you'd like Wget to rename all the files served with content-type @samp{text/html} -to @file{@var{name}.html}. +or @samp{application/xhtml+xml} to @file{@var{name}.html}. @example wget --mirror --convert-links --backup-converted \ diff --git a/src/ChangeLog b/src/ChangeLog index 2a531b7a..1a0c9883 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,8 @@ +2003-09-21 Matthew J. Mellon + + * http.c (gethttp): Recognize content-type "application/xhtml+xml" + as what Wget considers "text/html". + 2003-09-21 Hrvoje Niksic * connect.c (connect_with_timeout): Made timeout type double. diff --git a/src/http.c b/src/http.c index 31e0bfdc..13a8364e 100644 --- a/src/http.c +++ b/src/http.c @@ -82,6 +82,7 @@ static int cookies_loaded_p; struct cookie_jar *wget_cookie_jar; #define TEXTHTML_S "text/html" +#define TEXTXHTML_S "application/xhtml+xml" #define HTTP_ACCEPT "*/*" /* Some status code validation macros: */ @@ -1323,7 +1324,9 @@ Accept: %s\r\n\ /* If content-type is not given, assume text/html. This is because of the multitude of broken CGI's that "forget" to generate the content-type. 
*/ - if (!type || 0 == strncasecmp (type, TEXTHTML_S, strlen (TEXTHTML_S))) + if (!type || + 0 == strncasecmp (type, TEXTHTML_S, strlen (TEXTHTML_S)) || + 0 == strncasecmp (type, TEXTXHTML_S, strlen (TEXTXHTML_S))) *dt |= TEXTHTML; else *dt &= ~TEXTHTML; diff --git a/src/wget.h b/src/wget.h index 8851b3e0..1c06db05 100644 --- a/src/wget.h +++ b/src/wget.h @@ -299,7 +299,8 @@ extern const char *exec_name; /* Document type ("dt") flags */ enum { - TEXTHTML = 0x0001, /* document is of type text/html */ + TEXTHTML = 0x0001, /* document is of type text/html + or application/xhtml+xml */ RETROKF = 0x0002, /* retrieval was OK */ HEAD_ONLY = 0x0004, /* only send the HEAD request */ SEND_NOCACHE = 0x0008, /* send Pragma: no-cache directive */ -- 2.39.2