From a0d0f332d5f230e40fe7fff8fc76839c4f4704ce Mon Sep 17 00:00:00 2001 From: Micah Cowan Date: Tue, 22 Apr 2008 00:15:48 -0700 Subject: [PATCH] Ted Mielczarek's CSS wonder-patch, applied against the source from around the time the patch was written. --- configure.in | 3 + src/Makefile.in | 26 +++-- src/convert.c | 100 +++++++++++------ src/convert.h | 4 + src/css-tokens.h | 66 ++++++++++++ src/css-url.c | 273 +++++++++++++++++++++++++++++++++++++++++++++++ src/css-url.h | 36 +++++++ src/css.lex | 137 ++++++++++++++++++++++++ src/html-parse.c | 128 ++++++++++++++++++++++ src/html-parse.h | 3 + src/html-url.c | 117 +++++++++++++------- src/html-url.h | 51 +++++++++ src/http.c | 81 +++++++++----- src/recur.c | 62 ++++++++--- src/recur.h | 5 - src/retr.c | 3 + src/wget.h | 3 +- 17 files changed, 970 insertions(+), 128 deletions(-) create mode 100644 src/css-tokens.h create mode 100644 src/css-url.c create mode 100644 src/css-url.h create mode 100644 src/css.lex create mode 100644 src/html-url.h diff --git a/configure.in b/configure.in index 9f735391..2dc71771 100644 --- a/configure.in +++ b/configure.in @@ -115,6 +115,9 @@ test -z "$CC" && cc_specified=yes AC_PROG_CC AC_AIX +YYTEXT_POINTER=1 +AC_PROG_LEX + dnl Turn on optimization by default. Specifically: dnl dnl if the user hasn't specified CFLAGS, then diff --git a/src/Makefile.in b/src/Makefile.in index bcacd7dd..75fe22c2 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -54,6 +54,7 @@ CFLAGS = @CFLAGS@ LDFLAGS = @LDFLAGS@ LIBS = @LIBS@ @LIBSSL@ @LIBGNUTLS@ exeext = @exeext@ +LEX = @LEX@ INCLUDES = -I. -I$(srcdir) @@ -72,12 +73,12 @@ NTLM_OBJ = @NTLM_OBJ@ SSL_OBJ = @SSL_OBJ@ GETOPT_OBJ = @GETOPT_OBJ@ -OBJ = $(ALLOCA) cmpt.o connect.o convert.o cookies.o \ - ftp.o ftp-basic.o ftp-ls.o $(OPIE_OBJ) $(GETOPT_OBJ) hash.o \ - host.o html-parse.o html-url.o http.o $(NTLM_OBJ) init.o \ - log.o main.o $(MD5_OBJ) netrc.o progress.o ptimer.o recur.o \ - res.o retr.o safe-ctype.o snprintf.o spider.o $(SSL_OBJ) \ - url.o utils.o version.o xmalloc.o +OBJ = $(ALLOCA) cmpt.o connect.o convert.o cookies.o css-url.o \ + ftp.o ftp-basic.o ftp-ls.o $(OPIE_OBJ) $(GETOPT_OBJ) hash.o \ + host.o html-parse.o html-url.o http.o $(NTLM_OBJ) init.o \ + lex.yy.o log.o main.o $(MD5_OBJ) netrc.o progress.o ptimer.o \ + recur.o res.o retr.o safe-ctype.o snprintf.o spider.o \ + $(SSL_OBJ) url.o utils.o version.o xmalloc.o .SUFFIXES: .SUFFIXES: .c .o @@ -90,16 +91,19 @@ OBJ = $(ALLOCA) cmpt.o connect.o convert.o cookies.o \ wget$(exeext): $(OBJ) $(LINK) $(OBJ) $(LIBS) +lex.yy.c: css.lex + $(LEX) $< + # We make object files depend on every header. Rather than attempt to # track dependencies, everything gets recompiled when a header # changes. With a program of Wget's size this doesn't waste much # time, and it's a lot safer than attempting to get all the # dependencies right. 
-$(OBJ): config-post.h config.h connect.h convert.h cookies.h ftp.h \ - gen-md5.h getopt.h gnu-md5.h hash.h host.h html-parse.h \ - http-ntlm.h init.h log.h mswindows.h netrc.h options.h \ - progress.h ptimer.h recur.h res.h retr.h safe-ctype.h \ +$(OBJ): config-post.h config.h connect.h convert.h cookies.h css-url.h \ + ftp.h gen-md5.h getopt.h gnu-md5.h hash.h host.h html-parse.h \ + http-ntlm.h init.h log.h mswindows.h netrc.h options.h \ + progress.h ptimer.h recur.h res.h retr.h safe-ctype.h \ spider.h ssl.h sysdep.h url.h utils.h wget.h xmalloc.h # @@ -122,7 +126,7 @@ uninstall.bin: # clean: - $(RM) *.o wget$(exeext) *~ *.bak core core.[0-9]* + $(RM) *.o wget$(exeext) *~ *.bak core core.[0-9]* lex.yy.c distclean: clean $(RM) Makefile config.h diff --git a/src/convert.c b/src/convert.c index 4274bc5b..7b38550b 100644 --- a/src/convert.c +++ b/src/convert.c @@ -46,50 +46,37 @@ so, delete this exception statement from your version. */ #include "hash.h" #include "ptimer.h" #include "res.h" +#include "html-url.h" +#include "css-url.h" static struct hash_table *dl_file_url_map; struct hash_table *dl_url_file_map; -/* Set of HTML files downloaded in this Wget run, used for link +/* Set of HTML/CSS files downloaded in this Wget run, used for link conversion after Wget is done. */ struct hash_table *downloaded_html_set; +struct hash_table *downloaded_css_set; static void convert_links (const char *, struct urlpos *); -/* This function is called when the retrieval is done to convert the - links that have been downloaded. It has to be called at the end of - the retrieval, because only then does Wget know conclusively which - URLs have been downloaded, and which not, so it can tell which - direction to convert to. - - The "direction" means that the URLs to the files that have been - downloaded get converted to the relative URL which will point to - that file. And the other URLs get converted to the remote URL on - the server. - - All the downloaded HTMLs are kept in downloaded_html_files, and - downloaded URLs in urls_downloaded. All the information is - extracted from these two lists. */ void -convert_all_links (void) +convert_links_in_hashtable (struct hash_table *downloaded_set, + int is_css, + int *file_count) { int i; - double secs; - int file_count = 0; - - struct ptimer *timer = ptimer_new (); int cnt; char **file_array; cnt = 0; - if (downloaded_html_set) - cnt = hash_table_count (downloaded_html_set); + if (downloaded_set) + cnt = hash_table_count (downloaded_set); if (cnt == 0) return; file_array = alloca_array (char *, cnt); - string_set_to_array (downloaded_html_set, file_array); + string_set_to_array (downloaded_set, file_array); for (i = 0; i < cnt; i++) { @@ -97,7 +84,7 @@ convert_all_links (void) char *url; char *file = file_array[i]; - /* Determine the URL of the HTML file. get_urls_html will need + /* Determine the URL of the file. get_urls_{html,css} will need it. */ url = hash_table_get (dl_file_url_map, file); if (!url) @@ -108,8 +95,9 @@ convert_all_links (void) DEBUGP (("Scanning %s (from %s)\n", file, url)); - /* Parse the HTML file... */ - urls = get_urls_html (file, url, NULL); + /* Parse the file... */ + urls = is_css ? get_urls_css_file (file, url) : + get_urls_html (file, url, NULL); /* We don't respect meta_disallow_follow here because, even if the file is not followed, we might still want to convert the @@ -161,11 +149,38 @@ convert_all_links (void) /* Convert the links in the file. */ convert_links (file, urls); - ++file_count; + ++*file_count; /* Free the data. 
*/ free_urlpos (urls); } +} + +/* This function is called when the retrieval is done to convert the + links that have been downloaded. It has to be called at the end of + the retrieval, because only then does Wget know conclusively which + URLs have been downloaded, and which not, so it can tell which + direction to convert to. + + The "direction" means that the URLs to the files that have been + downloaded get converted to the relative URL which will point to + that file. And the other URLs get converted to the remote URL on + the server. + + All the downloaded HTMLs are kept in downloaded_html_files, and + downloaded URLs in urls_downloaded. All the information is + extracted from these two lists. */ + +void +convert_all_links (void) +{ + double secs; + int file_count = 0; + + struct ptimer *timer = ptimer_new (); + + convert_links_in_hashtable (downloaded_html_set, 0, &file_count); + convert_links_in_hashtable (downloaded_css_set, 1, &file_count); secs = ptimer_measure (timer); ptimer_destroy (timer); @@ -174,13 +189,14 @@ convert_all_links (void) } static void write_backup_file (const char *, downloaded_file_t); +static const char *replace_plain (const char*, int, FILE*, const char *); static const char *replace_attr (const char *, int, FILE *, const char *); static const char *replace_attr_refresh_hack (const char *, int, FILE *, const char *, int); static char *local_quote_string (const char *); static char *construct_relative (const char *, const char *); -/* Change the links in one HTML file. LINKS is a list of links in the +/* Change the links in one file. LINKS is a list of links in the document, along with their positions and the desired direction of the conversion. */ static void @@ -277,7 +293,9 @@ convert_links (const char *file, struct urlpos *links) char *newname = construct_relative (file, link->local_name); char *quoted_newname = local_quote_string (newname); - if (!link->link_refresh_p) + if (link->link_css_p) + p = replace_plain (p, link->size, fp, quoted_newname); + else if (!link->link_refresh_p) p = replace_attr (p, link->size, fp, quoted_newname); else p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname, @@ -296,7 +314,9 @@ convert_links (const char *file, struct urlpos *links) char *newlink = link->url->url; char *quoted_newlink = html_quote_string (newlink); - if (!link->link_refresh_p) + if (link->link_css_p) + p = replace_plain (p, link->size, fp, quoted_newlink); + else if (!link->link_refresh_p) p = replace_attr (p, link->size, fp, quoted_newlink); else p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink, @@ -406,6 +426,7 @@ write_backup_file (const char *file, downloaded_file_t downloaded_file_return) size_t filename_len = strlen (file); char* filename_plus_orig_suffix; + /* TODO: hack this to work with css files */ if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED) { /* Just write "orig" over "html". We need to do it this way @@ -465,6 +486,15 @@ write_backup_file (const char *file, downloaded_file_t downloaded_file_return) static bool find_fragment (const char *, int, const char **, const char **); +/* Replace a string with NEW_TEXT. Ignore quoting. */ +static const char * +replace_plain (const char *p, int size, FILE *fp, const char *new_text) +{ + fputs (new_text, fp); + p += size; + return p; +} + /* Replace an attribute's original text with NEW_TEXT. 
 */

 static const char *
@@ -832,6 +862,16 @@ register_html (const char *url, const char *file)
   string_set_add (downloaded_html_set, file);
 }

+/* Register that FILE is a CSS file that has been downloaded. */
+
+void
+register_css (const char *url, const char *file)
+{
+  if (!downloaded_css_set)
+    downloaded_css_set = make_string_hash_table (0);
+  string_set_add (downloaded_css_set, file);
+}
+
 static void downloaded_files_free (void);

 /* Cleanup the data structures associated with this file. */
diff --git a/src/convert.h b/src/convert.h
index 11d6a5f1..1cd05f36 100644
--- a/src/convert.h
+++ b/src/convert.h
@@ -33,6 +33,7 @@ so, delete this exception statement from your version. */
 struct hash_table;              /* forward decl */
 extern struct hash_table *dl_url_file_map;
 extern struct hash_table *downloaded_html_set;
+extern struct hash_table *downloaded_css_set;

 enum convert_options {
   CO_NOCONVERT = 0,             /* don't convert this URL */
@@ -64,7 +65,9 @@ struct urlpos {
   unsigned int link_complete_p  :1; /* the link was complete (had host name) */
   unsigned int link_base_p      :1; /* the url came from <base href=...> */
   unsigned int link_inline_p    :1; /* needed to render the page */
+  unsigned int link_css_p       :1; /* the url came from CSS */
   unsigned int link_expect_html :1; /* expected to contain HTML */
+  unsigned int link_expect_css  :1; /* expected to contain CSS */

   unsigned int link_refresh_p   :1; /* link was received from
                                        <meta http-equiv=refresh content=...> */
@@ -98,6 +101,7 @@ downloaded_file_t downloaded_file (downloaded_file_t, const char *);
 void register_download (const char *, const char *);
 void register_redirection (const char *, const char *);
 void register_html (const char *, const char *);
+void register_css (const char *, const char *);
 void register_delete_file (const char *);
 void convert_all_links (void);
 void convert_cleanup (void);
diff --git a/src/css-tokens.h b/src/css-tokens.h
new file mode 100644
index 00000000..4feef42a
--- /dev/null
+++ b/src/css-tokens.h
@@ -0,0 +1,66 @@
+/* Declarations for css.lex
+   Copyright (C) 2006 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables.  You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL".  If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so.  If you do not wish to do
+so, delete this exception statement from your version.  */
+
+#ifndef CSS_TOKENS_H
+#define CSS_TOKENS_H
+
+enum {
+  CSSEOF,
+  S,
+  CDO,
+  CDC,
+  INCLUDES,
+  DASHMATCH,
+  LBRACE,
+  PLUS,
+  GREATER,
+  COMMA,
+  STRING,
+  INVALID,
+  IDENT,
+  HASH,
+  IMPORT_SYM,
+  PAGE_SYM,
+  MEDIA_SYM,
+  CHARSET_SYM,
+  IMPORTANT_SYM,
+  EMS,
+  EXS,
+  LENGTH,
+  ANGLE,
+  TIME,
+  FREQ,
+  DIMENSION,
+  PERCENTAGE,
+  NUMBER,
+  URI,
+  FUNCTION
+} css_tokens;
+
+#endif /* CSS_TOKENS_H */
diff --git a/src/css-url.c b/src/css-url.c
new file mode 100644
index 00000000..42c8fc3e
--- /dev/null
+++ b/src/css-url.c
@@ -0,0 +1,273 @@
+/* Collect URLs from CSS source.
+   Copyright (C) 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables.  You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL".  If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so.  If you do not wish to do
+so, delete this exception statement from your version. */
+
+/*
+  Note that this is not an actual CSS parser, but just a lexical
+  scanner with a tiny bit more smarts bolted on top.  A full parser
+  is somewhat overkill for this job.  The only things we're interested
+  in are @import rules and url() tokens, so it's easy enough to
+  grab those without truly understanding the input.  The only downside
+  to this is that we might be coerced into downloading files that
+  a browser would ignore.  That might merit some more investigation.
+ */
+
+#include <config.h>
+
+#include <stdio.h>
+#ifdef HAVE_STRING_H
+# include <string.h>
+#else
+# include <strings.h>
+#endif
+#include <stdlib.h>
+#include <ctype.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "wget.h"
+#include "utils.h"
+#include "convert.h"
+#include "html-url.h"
+#include "css-tokens.h"
+
+/* from lex.yy.c */
+extern char *yytext;
+extern int yyleng;
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+extern YY_BUFFER_STATE yy_scan_bytes (const char *bytes,int len );
+extern int yylex (void);
+
+#if 1
+const char *token_names[] = {
+  "CSSEOF",
+  "S",
+  "CDO",
+  "CDC",
+  "INCLUDES",
+  "DASHMATCH",
+  "LBRACE",
+  "PLUS",
+  "GREATER",
+  "COMMA",
+  "STRING",
+  "INVALID",
+  "IDENT",
+  "HASH",
+  "IMPORT_SYM",
+  "PAGE_SYM",
+  "MEDIA_SYM",
+  "CHARSET_SYM",
+  "IMPORTANT_SYM",
+  "EMS",
+  "EXS",
+  "LENGTH",
+  "ANGLE",
+  "TIME",
+  "FREQ",
+  "DIMENSION",
+  "PERCENTAGE",
+  "NUMBER",
+  "URI",
+  "FUNCTION"
+};
+#endif
+
+/*
+  Given a detected URI token, get only the URI specified within.
+  Also adjust the starting position and length of the string.
+
+  A URI can be specified with or without quotes, and the quotes
+  can be single or double quotes.
In addition there can be + whitespace after the opening parenthesis and before the closing + parenthesis. +*/ +char * +get_uri_string (const char *at, int *pos, int *length) +{ + char *uri; + /*char buf[1024]; + strncpy(buf,at + *pos, *length); + buf[*length] = '\0'; + DEBUGP (("get_uri_string: \"%s\"\n", buf));*/ + + if (0 != strncasecmp (at + *pos, "url(", 4)) + return NULL; + + *pos += 4; + *length -= 5; /* url() */ + /* skip leading space */ + while (isspace (at[*pos])) + { + (*pos)++; + (*length)--; + } + /* skip trailing space */ + while (isspace (at[*pos + *length - 1])) + { + (*length)--; + } + /* trim off quotes */ + if (at[*pos] == '\'' || at[*pos] == '"') + { + (*pos)++; + *length -= 2; + } + + uri = xmalloc (*length + 1); + if (uri) + { + strncpy (uri, at + *pos, *length); + uri[*length] = '\0'; + } + + return uri; +} + +void +get_urls_css (struct map_context *ctx, int offset, int buf_length) +{ + int token; + /*char tmp[2048];*/ + int buffer_pos = 0; + int pos, length; + char *uri; + + /* + strncpy(tmp,ctx->text + offset, buf_length); + tmp[buf_length] = '\0'; + DEBUGP (("get_urls_css: \"%s\"\n", tmp)); + */ + + /* tell flex to scan from this buffer */ + yy_scan_bytes (ctx->text + offset, buf_length); + + while((token = yylex()) != CSSEOF) + { + /*DEBUGP (("%s ", token_names[token]));*/ + /* @import "foo.css" + or @import url(foo.css) + */ + if(token == IMPORT_SYM) + { + do { + buffer_pos += yyleng; + } while((token = yylex()) == S); + + /*DEBUGP (("%s ", token_names[token]));*/ + + if (token == STRING || token == URI) + { + /*DEBUGP (("Got URI "));*/ + pos = buffer_pos + offset; + length = yyleng; + + if (token == URI) + { + uri = get_uri_string (ctx->text, &pos, &length); + } + else + { + /* cut out quote characters */ + pos++; + length -= 2; + uri = xmalloc (length + 1); + strncpy (uri, yytext + 1, length); + uri[length] = '\0'; + } + + if (uri) + { + struct urlpos *up = append_url (uri, pos, length, ctx); + DEBUGP (("Found @import: [%s] at %d [%s]\n", yytext, buffer_pos, uri)); + + if (up) + { + up->link_inline_p = 1; + up->link_css_p = 1; + up->link_expect_css = 1; + } + + xfree(uri); + } + } + } + /* background-image: url(foo.png) + note that we don't care what + property this is actually on. + */ + else if(token == URI) + { + pos = buffer_pos + offset; + length = yyleng; + uri = get_uri_string (ctx->text, &pos, &length); + + if (uri) + { + struct urlpos *up = append_url (uri, pos, length, ctx); + DEBUGP (("Found URI: [%s] at %d [%s]\n", yytext, buffer_pos, uri)); + if (up) + { + up->link_inline_p = 1; + up->link_css_p = 1; + } + + xfree (uri); + } + } + buffer_pos += yyleng; + } + DEBUGP (("\n")); +} + +struct urlpos * +get_urls_css_file (const char *file, const char *url) +{ + struct file_memory *fm; + struct map_context ctx; + + /* Load the file. */ + fm = read_file (file); + if (!fm) + { + logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); + return NULL; + } + DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); + + ctx.text = fm->content; + ctx.head = ctx.tail = NULL; + ctx.base = NULL; + ctx.parent_base = url ? url : opt.base_href; + ctx.document_file = file; + ctx.nofollow = 0; + + get_urls_css (&ctx, 0, fm->length); + read_file_free (fm); + return ctx.head; +} diff --git a/src/css-url.h b/src/css-url.h new file mode 100644 index 00000000..772e2fd7 --- /dev/null +++ b/src/css-url.h @@ -0,0 +1,36 @@ +/* Declarations for css-url.c. + Copyright (C) 2006 Free Software Foundation, Inc. + +This file is part of GNU Wget. 
+ +GNU Wget is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +GNU Wget is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with Wget; if not, write to the Free Software +Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +In addition, as a special exception, the Free Software Foundation +gives permission to link the code of its release of Wget with the +OpenSSL project's "OpenSSL" library (or with modified versions of it +that use the same license as the "OpenSSL" library), and distribute +the linked executables. You must obey the GNU General Public License +in all respects for all of the code used other than "OpenSSL". If you +modify this file, you may extend this exception to your version of the +file, but you are not obligated to do so. If you do not wish to do +so, delete this exception statement from your version. */ + +#ifndef CSS_URL_H +#define CSS_URL_H + +void get_urls_css (struct map_context *, int, int); +struct urlpos *get_urls_css_file (const char *, const char *); + +#endif /* CSS_URL_H */ diff --git a/src/css.lex b/src/css.lex new file mode 100644 index 00000000..8d1477a4 --- /dev/null +++ b/src/css.lex @@ -0,0 +1,137 @@ +%option case-insensitive +%option noyywrap +%option never-interactive + +%{ +/* Lex source for CSS tokenizing. + Taken from http://www.w3.org/TR/CSS21/grammar.html#q2 + Copyright (C) 2006 Free Software Foundation, Inc. + +This file is part of GNU Wget. + +GNU Wget is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +GNU Wget is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with Wget; if not, write to the Free Software +Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +In addition, as a special exception, the Free Software Foundation +gives permission to link the code of its release of Wget with the +OpenSSL project's "OpenSSL" library (or with modified versions of it +that use the same license as the "OpenSSL" library), and distribute +the linked executables. You must obey the GNU General Public License +in all respects for all of the code used other than "OpenSSL". If you +modify this file, you may extend this exception to your version of the +file, but you are not obligated to do so. If you do not wish to do +so, delete this exception statement from your version. */ + +#include "css-tokens.h" + +/* {s}+\/\*[^*]*\*+([^/*][^*]*\*+)*\/ {unput(' '); } */ +/*replace by space*/ +%} + +h [0-9a-f] +nonascii [\200-\377] +unicode \\{h}{1,6}(\r\n|[ \t\r\n\f])? 
+escape {unicode}|\\[^\r\n\f0-9a-f]
+nmstart [_a-z]|{nonascii}|{escape}
+nmchar [_a-z0-9-]|{nonascii}|{escape}
+string1 \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
+string2 \'([^\n\r\f\\']|\\{nl}|{escape})*\'
+invalid1 \"([^\n\r\f\\"]|\\{nl}|{escape})*
+invalid2 \'([^\n\r\f\\']|\\{nl}|{escape})*
+
+comment \/\*[^*]*\*+([^/*][^*]*\*+)*\/
+ident -?{nmstart}{nmchar}*
+name {nmchar}+
+num [0-9]+|[0-9]*"."[0-9]+
+string {string1}|{string2}
+invalid {invalid1}|{invalid2}
+url ([!#$%&*-~]|{nonascii}|{escape})*
+s [ \t\r\n\f]
+w ({s}|{comment})*
+nl \n|\r\n|\r|\f
+
+A a|\\0{0,4}(41|61)(\r\n|[ \t\r\n\f])?
+C c|\\0{0,4}(43|63)(\r\n|[ \t\r\n\f])?
+D d|\\0{0,4}(44|64)(\r\n|[ \t\r\n\f])?
+E e|\\0{0,4}(45|65)(\r\n|[ \t\r\n\f])?
+G g|\\0{0,4}(47|67)(\r\n|[ \t\r\n\f])?|\\g
+H h|\\0{0,4}(48|68)(\r\n|[ \t\r\n\f])?|\\h
+I i|\\0{0,4}(49|69)(\r\n|[ \t\r\n\f])?|\\i
+K k|\\0{0,4}(4b|6b)(\r\n|[ \t\r\n\f])?|\\k
+M m|\\0{0,4}(4d|6d)(\r\n|[ \t\r\n\f])?|\\m
+N n|\\0{0,4}(4e|6e)(\r\n|[ \t\r\n\f])?|\\n
+P p|\\0{0,4}(50|70)(\r\n|[ \t\r\n\f])?|\\p
+R r|\\0{0,4}(52|72)(\r\n|[ \t\r\n\f])?|\\r
+S s|\\0{0,4}(53|73)(\r\n|[ \t\r\n\f])?|\\s
+T t|\\0{0,4}(54|74)(\r\n|[ \t\r\n\f])?|\\t
+X x|\\0{0,4}(58|78)(\r\n|[ \t\r\n\f])?|\\x
+Z z|\\0{0,4}(5a|7a)(\r\n|[ \t\r\n\f])?|\\z
+
+%%
+
+{s} {return S;}
+
+\/\*[^*]*\*+([^/*][^*]*\*+)*\/ {return S;} /* ignore comments */
+
+"<!--" {return CDO;}
+"-->" {return CDC;}
+"~=" {return INCLUDES;}
+"|=" {return DASHMATCH;}
+
+{w}"{" {return LBRACE;}
+{w}"+" {return PLUS;}
+{w}">" {return GREATER;}
+{w}"," {return COMMA;}
+
+{string} {return STRING;}
+{invalid} {return INVALID; /* unclosed string */}
+
+{ident} {return IDENT;}
+
+"#"{name} {return HASH;}
+
+"@import" {return IMPORT_SYM;}
+"@page" {return PAGE_SYM;}
+"@media" {return MEDIA_SYM;}
+"@charset " {return CHARSET_SYM;}
+
+"!"{w}"important" {return IMPORTANT_SYM;}
+
+{num}{E}{M} {return EMS;}
+{num}{E}{X} {return EXS;}
+{num}{P}{X} {return LENGTH;}
+{num}{C}{M} {return LENGTH;}
+{num}{M}{M} {return LENGTH;}
+{num}{I}{N} {return LENGTH;}
+{num}{P}{T} {return LENGTH;}
+{num}{P}{C} {return LENGTH;}
+{num}{D}{E}{G} {return ANGLE;}
+{num}{R}{A}{D} {return ANGLE;}
+{num}{G}{R}{A}{D} {return ANGLE;}
+{num}{M}{S} {return TIME;}
+{num}{S} {return TIME;}
+{num}{H}{Z} {return FREQ;}
+{num}{K}{H}{Z} {return FREQ;}
+{num}{ident} {return DIMENSION;}
+
+{num}% {return PERCENTAGE;}
+{num} {return NUMBER;}
+
+"url("{w}{string}{w}")" {return URI;}
+"url("{w}{url}{w}")" {return URI;}
+{ident}"(" {return FUNCTION;}
+
+. {return *yytext;}
+
+%%
diff --git a/src/html-parse.c b/src/html-parse.c
index 10cc3697..8254c6dc 100644
--- a/src/html-parse.c
+++ b/src/html-parse.c
@@ -271,6 +271,94 @@ struct pool {
    to "<foo".  */

+struct tagstack_item {
+  const char *tagname_begin;
+  const char *tagname_end;
+  const char *contents_begin;
+  struct tagstack_item *prev;
+  struct tagstack_item *next;
+};
+
+struct tagstack_item *
+tagstack_push (struct tagstack_item **head, struct tagstack_item **tail)
+{
+  struct tagstack_item *ts = xmalloc (sizeof (struct tagstack_item));
+  if (*head == NULL)
+    {
+      *head = *tail = ts;
+      ts->prev = ts->next = NULL;
+    }
+  else
+    {
+      (*tail)->next = ts;
+      ts->prev = *tail;
+      *tail = ts;
+      ts->next = NULL;
+    }
+
+  return ts;
+}
+
+/* remove ts and everything after it from the stack */
+void
+tagstack_pop (struct tagstack_item **head, struct tagstack_item **tail,
+              struct tagstack_item *ts)
+{
+  if (*head == NULL)
+    return;
+
+  if (ts == *tail)
+    {
+      if (ts == *head)
+        {
+          xfree (ts);
+          *head = *tail = NULL;
+        }
+      else
+        {
+          ts->prev->next = NULL;
+          *tail = ts->prev;
+          xfree (ts);
+        }
+    }
+  else
+    {
+      if (ts == *head)
+        {
+          *head = NULL;
+        }
+      *tail = ts->prev;
+
+      if (ts->prev)
+        {
+          ts->prev->next = NULL;
+        }
+      while (ts)
+        {
+          struct tagstack_item *p = ts->next;
+          xfree (ts);
+          ts = p;
+        }
+    }
+}
+
+struct tagstack_item *
+tagstack_find (struct tagstack_item *tail, const char *tagname_begin,
+               const char *tagname_end)
+{
+  int len = tagname_end - tagname_begin;
+  while (tail)
+    {
+      if (len == (tail->tagname_end - tail->tagname_begin))
+        {
+          if (0 == strncasecmp (tail->tagname_begin, tagname_begin, len))
+            return tail;
+        }
+      tail = tail->prev;
+    }
+  return NULL;
+}
+
 /* Decode the HTML character entity at *PTR, considering END to be end
    of buffer.  It is assumed that the "&" character that marks the
    beginning of the entity has been seen at *PTR-1.  If a recognized
@@ -756,6 +844,9 @@ map_html_tags (const char *text, int size,
   bool attr_pair_resized = false;
   struct attr_pair *pairs = attr_pair_initial_storage;

+  struct tagstack_item *head = NULL;
+  struct tagstack_item *tail = NULL;
+
   if (!size)
     return;

@@ -822,6 +913,18 @@ map_html_tags (const char *text, int size,
         goto look_for_tag;
       tag_name_end = p;
       SKIP_WS (p);
+
+      if (!end_tag)
+        {
+          struct tagstack_item *ts = tagstack_push (&head, &tail);
+          if (ts)
+            {
+              ts->tagname_begin = tag_name_begin;
+              ts->tagname_end = tag_name_end;
+              ts->contents_begin = NULL;
+            }
+        }
+
       if (end_tag && *p != '>')
         goto backout_tag;

@@ -983,6 +1086,11 @@ map_html_tags (const char *text, int size,
           ++nattrs;
         }

+      if (!end_tag && tail && (tail->tagname_begin == tag_name_begin))
+        {
+          tail->contents_begin = p+1;
+        }
+
       if (uninteresting_tag)
         {
           ADVANCE (p);
@@ -994,6 +1102,7 @@ map_html_tags (const char *text, int size,
         {
           int i;
           struct taginfo taginfo;
+          struct tagstack_item *ts = NULL;

           taginfo.name = pool.contents;
           taginfo.end_tag_p = end_tag;
@@ -1010,6 +1119,23 @@ map_html_tags (const char *text, int size,
           taginfo.attrs = pairs;
           taginfo.start_position = tag_start_position;
           taginfo.end_position = p + 1;
+          taginfo.contents_begin = NULL;
+          taginfo.contents_end = NULL;
+
+          if (end_tag)
+            {
+              ts = tagstack_find (tail, tag_name_begin, tag_name_end);
+              if (ts)
+                {
+                  if (ts->contents_begin)
+                    {
+                      taginfo.contents_begin = ts->contents_begin;
+                      taginfo.contents_end = tag_start_position;
+                    }
+                  tagstack_pop (&head, &tail, ts);
+                }
+            }
+
           mapfun (&taginfo, maparg);
           ADVANCE (p);
         }
@@ -1029,6 +1155,8 @@ map_html_tags (const char *text, int size,
   POOL_FREE (&pool);
   if (attr_pair_resized)
     xfree (pairs);
+  /* pop any tag stack that's left */
+  tagstack_pop (&head, &tail, head);
 }

 #undef ADVANCE
diff --git a/src/html-parse.h b/src/html-parse.h
index 05a82483..371a4f86 100644
--- a/src/html-parse.h
+++ b/src/html-parse.h
@@ -51,6 +51,9 @@ struct taginfo {

   const char *start_position;   /* start position of tag */
   const char *end_position;     /* end position of tag */
+
+  const char *contents_begin;   /* delimiters of tag contents */
+  const char *contents_end;     /* only valid if end_tag_p */
 };

 struct hash_table;              /* forward declaration */
diff --git a/src/html-url.c b/src/html-url.c
index 0f2a0725..ebf8494d 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -41,9 +41,9 @@ so, delete this exception statement from your version. */
 #include "utils.h"
 #include "hash.h"
 #include "convert.h"
-#include "recur.h"              /* declaration of get_urls_html */
-
-struct map_context;
+#include "recur.h"
+#include "html-url.h"
+#include "css-url.h"

 typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);

@@ -163,11 +163,12 @@
    from the information above.  However, some places in the code refer
    to the attributes not mentioned here.  We add them manually. */
 static const char *additional_attributes[] = {
-  "rel",                        /* used by tag_handle_link */
-  "http-equiv",                 /* used by tag_handle_meta */
-  "name",                       /* used by tag_handle_meta */
-  "content",                    /* used by tag_handle_meta */
-  "action"                      /* used by tag_handle_form */
+  "rel",                        /* used by tag_handle_link */
+  "http-equiv",                 /* used by tag_handle_meta */
+  "name",                       /* used by tag_handle_meta */
+  "content",                    /* used by tag_handle_meta */
+  "action",                     /* used by tag_handle_form */
+  "style"                       /* used by check_style_attr */
 };

 static struct hash_table *interesting_tags;
@@ -246,28 +247,20 @@ find_attr (struct taginfo *tag, const char *name, int *attrind)
   return NULL;
 }

-struct map_context {
-  char *text;                   /* HTML text. */
-  char *base;                   /* Base URI of the document, possibly
-                                   changed through <base href=...>. */
-  const char *parent_base;      /* Base of the current document. */
-  const char *document_file;    /* File name of this document. */
-  bool nofollow;                /* whether NOFOLLOW was specified in a
-                                   <meta name=robots> tag. */
-
-  struct urlpos *head, *tail;   /* List of URLs that is being
-                                   built. */
-};
+/* used for calls to append_url */
+#define ATTR_POS(tag, attrind, ctx) \
+ (tag->attrs[attrind].value_raw_beginning - ctx->text)
+#define ATTR_SIZE(tag, attrind) \
+ (tag->attrs[attrind].value_raw_size)

 /* Append LINK_URI to the urlpos structure that is being built.

-   LINK_URI will be merged with the current document base.  TAG and
-   ATTRIND are the necessary context to store the position and
-   size. */
+   LINK_URI will be merged with the current document base.
+*/

-static struct urlpos *
-append_url (const char *link_uri,
-            struct taginfo *tag, int attrind, struct map_context *ctx)
+struct urlpos *
+append_url (const char *link_uri, int position, int size,
+            struct map_context *ctx)
 {
   int link_has_scheme = url_has_scheme (link_uri);
   struct urlpos *newel;
@@ -325,8 +318,8 @@ append_url (const char *link_uri,

   newel = xnew0 (struct urlpos);
   newel->url = url;
-  newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
-  newel->size = tag->attrs[attrind].value_raw_size;
+  newel->pos = position;
+  newel->size = size;

   /* A URL is relative if the host is not named, and the name does not
      start with `/'. */
@@ -346,6 +339,18 @@
   return newel;
 }

+static void
+check_style_attr (struct taginfo *tag, struct map_context *ctx)
+{
+  int attrind;
+  char *style = find_attr (tag, "style", &attrind);
+  if (!style)
+    return;
+
+  /* raw pos and raw size include the quotes, hence the +1 -2 */
+  get_urls_css (ctx, ATTR_POS(tag,attrind,ctx)+1, ATTR_SIZE(tag,attrind)-2);
+}
+
 /* All the tag_* functions are called from collect_tags_mapper, as
    specified by KNOWN_TAGS.  */
@@ -393,7 +398,8 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
           if (0 == strcasecmp (tag->attrs[attrind].name,
                                tag_url_attributes[i].attr_name))
             {
-              struct urlpos *up = append_url (link, tag, attrind, ctx);
+              struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
+                                              ATTR_SIZE(tag,attrind), ctx);
               if (up)
                 {
                   int flags = tag_url_attributes[i].flags;
@@ -418,7 +424,8 @@ tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
   if (!newbase)
     return;

-  base_urlpos = append_url (newbase, tag, attrind, ctx);
+  base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
+                            ATTR_SIZE(tag,attrind), ctx);
   if (!base_urlpos)
     return;
   base_urlpos->ignore_when_downloading = 1;
@@ -439,9 +446,11 @@ tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
 {
   int attrind;
   char *action = find_attr (tag, "action", &attrind);
+
   if (action)
     {
-      struct urlpos *up = append_url (action, tag, attrind, ctx);
+      struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
+                                      ATTR_SIZE(tag,attrind), ctx);
       if (up)
         up->ignore_when_downloading = 1;
     }
@@ -464,14 +473,23 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
    */
   if (href)
     {
-      struct urlpos *up = append_url (href, tag, attrind, ctx);
+      struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
+                                      ATTR_SIZE(tag,attrind), ctx);
       if (up)
         {
           char *rel = find_attr (tag, "rel", NULL);
-          if (rel
-              && (0 == strcasecmp (rel, "stylesheet")
-                  || 0 == strcasecmp (rel, "shortcut icon")))
-            up->link_inline_p = 1;
+          if (rel)
+            {
+              if (0 == strcasecmp (rel, "stylesheet"))
+                {
+                  up->link_inline_p = 1;
+                  up->link_expect_css = 1;
+                }
+              else if (0 == strcasecmp (rel, "shortcut icon"))
+                {
+                  up->link_inline_p = 1;
+                }
+            }
           else
             /* The external ones usually point to HTML pages, such as
                <link rel="contents" href="index.html">.  */
@@ -525,7 +543,8 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
       while (ISSPACE (*p))
         ++p;

-      entry = append_url (p, tag, attrind, ctx);
+      entry = append_url (p, ATTR_POS(tag,attrind,ctx),
+                          ATTR_SIZE(tag,attrind), ctx);
       if (entry)
         {
           entry->link_refresh_p = 1;
@@ -570,11 +589,26 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
   struct map_context *ctx = (struct map_context *)arg;

   /* Find the tag in our table of tags.  This must not fail because
-     map_html_tags only returns tags found in interesting_tags. */
+     map_html_tags only returns tags found in interesting_tags.
+
+     I've changed this for now, I'm passing NULL as interesting_tags
+     to map_html_tags.  This way we can check all tags for a style
+     attribute.
+   */
+ */ struct known_tag *t = hash_table_get (interesting_tags, tag->name); - assert (t != NULL); - t->handler (t->tagid, tag, ctx); + if (t != NULL) + t->handler (t->tagid, tag, ctx); + + check_style_attr (tag, ctx); + + if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) && + tag->contents_begin && tag->contents_end) + { + /* parse contents */ + get_urls_css (ctx, tag->contents_begin - ctx->text, + tag->contents_end - tag->contents_begin); + } } /* Analyze HTML tags FILE and construct a list of URLs referenced from @@ -618,8 +652,9 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow) if (opt.strict_comments) flags |= MHT_STRICT_COMMENTS; + /* the NULL here used to be interesting_tags */ map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags, - interesting_tags, interesting_attributes); + NULL, interesting_attributes); DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow)); if (meta_disallow_follow) diff --git a/src/html-url.h b/src/html-url.h new file mode 100644 index 00000000..a94f0db6 --- /dev/null +++ b/src/html-url.h @@ -0,0 +1,51 @@ +/* Declarations for html-url.c. + Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc. + +This file is part of GNU Wget. + +GNU Wget is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +GNU Wget is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with Wget; if not, write to the Free Software +Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +In addition, as a special exception, the Free Software Foundation +gives permission to link the code of its release of Wget with the +OpenSSL project's "OpenSSL" library (or with modified versions of it +that use the same license as the "OpenSSL" library), and distribute +the linked executables. You must obey the GNU General Public License +in all respects for all of the code used other than "OpenSSL". If you +modify this file, you may extend this exception to your version of the +file, but you are not obligated to do so. If you do not wish to do +so, delete this exception statement from your version. */ + +#ifndef HTML_URL_H +#define HTML_URL_H + +struct map_context { + char *text; /* HTML text. */ + char *base; /* Base URI of the document, possibly + changed through . */ + const char *parent_base; /* Base of the current document. */ + const char *document_file; /* File name of this document. */ + bool nofollow; /* whether NOFOLLOW was specified in a + tag. */ + + struct urlpos *head, *tail; /* List of URLs that is being + built. 
+};
+
+struct urlpos *get_urls_file (const char *);
+struct urlpos *get_urls_html (const char *, const char *, bool *);
+struct urlpos *append_url (const char *, int, int, struct map_context *);
+void free_urlpos (struct urlpos *);
+
+#endif /* HTML_URL_H */
diff --git a/src/http.c b/src/http.c
index 99a059e5..d3f6704f 100644
--- a/src/http.c
+++ b/src/http.c
@@ -77,6 +77,7 @@ static struct cookie_jar *wget_cookie_jar;

 #define TEXTHTML_S "text/html"
 #define TEXTXHTML_S "application/xhtml+xml"
+#define TEXTCSS_S "text/css"

 /* Some status code validation macros: */
 #define H_20X(x)        (((x) >= 200) && ((x) < 300))
@@ -1235,6 +1236,7 @@ static char *create_authorization_line (const char *, const char *,
                                         const char *, bool *);
 static char *basic_authentication_encode (const char *, const char *);
 static bool known_authentication_scheme_p (const char *, const char *);
+static void ensure_extension (struct http_stat *, const char *, int *);
 static void load_cookies (void);

 #define BEGINS_WITH(line, string_constant)                              \
@@ -2017,34 +2019,25 @@ File `%s' already there; not retrieving.\n\n"), hs->local_file);
   else
     *dt &= ~TEXTHTML;

-  if (opt.html_extension && (*dt & TEXTHTML))
-    /* -E / --html-extension / html_extension = on was specified, and this is a
-       text/html file.  If some case-insensitive variation on ".htm[l]" isn't
-       already the file's suffix, tack on ".html". */
-    {
-      char *last_period_in_local_filename = strrchr (hs->local_file, '.');
+  if (type &&
+      0 == strncasecmp (type, TEXTCSS_S, strlen (TEXTCSS_S)))
+    *dt |= TEXTCSS;
+  else
+    *dt &= ~TEXTCSS;

-      if (last_period_in_local_filename == NULL
-          || !(0 == strcasecmp (last_period_in_local_filename, ".htm")
-               || 0 == strcasecmp (last_period_in_local_filename, ".html")))
+  if (opt.html_extension)
+    {
+      if (*dt & TEXTHTML)
+        /* -E / --html-extension / html_extension = on was specified,
+           and this is a text/html file.  If some case-insensitive
+           variation on ".htm[l]" isn't already the file's suffix,
+           tack on ".html". */
         {
-          int local_filename_len = strlen (hs->local_file);
-          /* Resize the local file, allowing for ".html" preceded by
-             optional ".NUMBER". */
-          hs->local_file = xrealloc (hs->local_file,
-                                     local_filename_len + 24 + sizeof (".html"));
-          strcpy(hs->local_file + local_filename_len, ".html");
-          /* If clobbering is not allowed and the file, as named,
-             exists, tack on ".NUMBER.html" instead. */
-          if (!ALLOW_CLOBBER && file_exists_p (hs->local_file))
-            {
-              int ext_num = 1;
-              do
-                sprintf (hs->local_file + local_filename_len,
-                         ".%d.html", ext_num++);
-              while (file_exists_p (hs->local_file));
-            }
-          *dt |= ADDED_HTML_EXTENSION;
+          ensure_extension (hs, ".html", dt);
+        }
+      else if (*dt & TEXTCSS)
+        {
+          ensure_extension (hs, ".css", dt);
         }
     }

@@ -3018,6 +3011,42 @@ http_cleanup (void)
     cookie_jar_delete (wget_cookie_jar);
 }

+void
+ensure_extension (struct http_stat *hs, const char *ext, int *dt)
+{
+  char *last_period_in_local_filename = strrchr (hs->local_file, '.');
+  char shortext[8];
+  int len = strlen (ext);
+  if (len == 5)
+    {
+      strncpy (shortext, ext, len - 1);
+      shortext[len - 1] = '\0';
+    }
+
+  if (last_period_in_local_filename == NULL
+      || !(0 == strcasecmp (last_period_in_local_filename, shortext)
+           || 0 == strcasecmp (last_period_in_local_filename, ext)))
+    {
+      int local_filename_len = strlen (hs->local_file);
+      /* Resize the local file, allowing for ".html" preceded by
+         optional ".NUMBER".
*/ + hs->local_file = xrealloc (hs->local_file, + local_filename_len + 24 + len); + strcpy (hs->local_file + local_filename_len, ext); + /* If clobbering is not allowed and the file, as named, + exists, tack on ".NUMBER.html" instead. */ + if (!ALLOW_CLOBBER && file_exists_p (hs->local_file)) + { + int ext_num = 1; + do + sprintf (hs->local_file + local_filename_len, + ".%d%s", ext_num++, ext); + while (file_exists_p (hs->local_file)); + } + *dt |= ADDED_HTML_EXTENSION; + } +} + #ifdef TESTING diff --git a/src/recur.c b/src/recur.c index 980fc49d..024073ce 100644 --- a/src/recur.c +++ b/src/recur.c @@ -48,8 +48,10 @@ so, delete this exception statement from your version. */ #include "hash.h" #include "res.h" #include "convert.h" +#include "html-url.h" +#include "css-url.h" #include "spider.h" - + /* Functions for maintaining the URL queue. */ struct queue_element { @@ -58,7 +60,8 @@ struct queue_element { int depth; /* the depth */ bool html_allowed; /* whether the document is allowed to be treated as HTML. */ - + bool css_allowed; /* whether the document is allowed to + be treated as CSS. */ struct queue_element *next; /* next element in queue */ }; @@ -91,13 +94,15 @@ url_queue_delete (struct url_queue *queue) static void url_enqueue (struct url_queue *queue, - const char *url, const char *referer, int depth, bool html_allowed) + const char *url, const char *referer, int depth, + bool html_allowed, bool css_allowed) { struct queue_element *qel = xnew (struct queue_element); qel->url = url; qel->referer = referer; qel->depth = depth; qel->html_allowed = html_allowed; + qel->css_allowed = css_allowed; qel->next = NULL; ++queue->count; @@ -121,7 +126,7 @@ url_enqueue (struct url_queue *queue, static bool url_dequeue (struct url_queue *queue, const char **url, const char **referer, int *depth, - bool *html_allowed) + bool *html_allowed, bool *css_allowed) { struct queue_element *qel = queue->head; @@ -136,6 +141,7 @@ url_dequeue (struct url_queue *queue, *referer = qel->referer; *depth = qel->depth; *html_allowed = qel->html_allowed; + *css_allowed = qel->css_allowed; --queue->count; @@ -200,7 +206,7 @@ retrieve_tree (const char *start_url) /* Enqueue the starting URL. Use start_url_parsed->url rather than just URL so we enqueue the canonical form of the URL. */ - url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true); + url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false); string_set_add (blacklist, start_url_parsed->url); while (1) @@ -208,7 +214,8 @@ retrieve_tree (const char *start_url) bool descend = false; char *url, *referer, *file = NULL; int depth; - bool html_allowed; + bool html_allowed, css_allowed; + bool is_css = false; bool dash_p_leaf_HTML = false; if (opt.quota && total_downloaded_bytes > opt.quota) @@ -220,7 +227,7 @@ retrieve_tree (const char *start_url) if (!url_dequeue (queue, (const char **)&url, (const char **)&referer, - &depth, &html_allowed)) + &depth, &html_allowed, &css_allowed)) break; /* ...and download it. Note that this download is in most cases @@ -238,10 +245,21 @@ retrieve_tree (const char *start_url) DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n", url, file)); + /* this sucks, needs to be combined! 
+       */
       if (html_allowed
           && downloaded_html_set
           && string_set_contains (downloaded_html_set, file))
-        descend = true;
+        {
+          descend = true;
+          is_css = false;
+        }
+      if (css_allowed
+          && downloaded_css_set
+          && string_set_contains (downloaded_css_set, file))
+        {
+          descend = true;
+          is_css = true;
+        }
     }
   else
     {
@@ -252,7 +270,21 @@ retrieve_tree (const char *start_url)

       if (html_allowed && file && status == RETROK
           && (dt & RETROKF) && (dt & TEXTHTML))
-        descend = true;
+        {
+          descend = true;
+          is_css = false;
+        }
+
+      /* a little different, css_allowed can override content type
+         lots of web servers serve css with an incorrect content type
+       */
+      if (file && status == RETROK
+          && (dt & RETROKF) &&
+          ((dt & TEXTCSS) || css_allowed))
+        {
+          descend = true;
+          is_css = true;
+        }

       if (redirected)
         {
@@ -306,14 +338,15 @@
             }
         }

-      /* If the downloaded document was HTML, parse it and enqueue the
+      /* If the downloaded document was HTML or CSS, parse it and enqueue the
         links it contains. */

       if (descend)
        {
          bool meta_disallow_follow = false;
          struct urlpos *children
-           = get_urls_html (file, url, &meta_disallow_follow);
+           = is_css ? get_urls_css_file (file, url) :
+                      get_urls_html (file, url, &meta_disallow_follow);

          if (opt.use_robots && meta_disallow_follow)
            {
@@ -338,7 +371,8 @@
                {
                  url_enqueue (queue, xstrdup (child->url->url),
                               xstrdup (url), depth + 1,
-                              child->link_expect_html);
+                              child->link_expect_html,
+                              child->link_expect_css);
                  /* We blacklist the URL we have enqueued, because we
                     don't want to enqueue (and hence download) the
                     same URL twice. */
@@ -385,9 +419,9 @@
   {
     char *d1, *d2;
     int d3;
-    bool d4;
+    bool d4, d5;
     while (url_dequeue (queue,
-                        (const char **)&d1, (const char **)&d2, &d3, &d4))
+                        (const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
       {
         xfree (d1);
         xfree_null (d2);
diff --git a/src/recur.h b/src/recur.h
index b7e3c2e1..ed985467 100644
--- a/src/recur.h
+++ b/src/recur.h
@@ -43,9 +43,4 @@ struct urlpos;
 void recursive_cleanup (void);
 uerr_t retrieve_tree (const char *);

-/* These are really in html-url.c. */
-struct urlpos *get_urls_file (const char *);
-struct urlpos *get_urls_html (const char *, const char *, bool *);
-void free_urlpos (struct urlpos *);
-
 #endif /* RECUR_H */
diff --git a/src/retr.c b/src/retr.c
index a2d462a8..245eb129 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -51,6 +51,7 @@ so, delete this exception statement from your version. */
 #include "hash.h"
 #include "convert.h"
 #include "ptimer.h"
+#include "html-url.h"

 /* Total size of downloaded files.  Used to enforce quota. */
 SUM_SIZE_INT total_downloaded_bytes;
@@ -784,6 +785,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
             register_redirection (origurl, u->url);
           if (*dt & TEXTHTML)
             register_html (u->url, local_file);
+          if (*dt & TEXTCSS)
+            register_css (u->url, local_file);
         }
     }

diff --git a/src/wget.h b/src/wget.h
index 740d2a9d..c6dd19c1 100644
--- a/src/wget.h
+++ b/src/wget.h
@@ -304,7 +304,8 @@ enum
   HEAD_ONLY            = 0x0004, /* only send the HEAD request */
   SEND_NOCACHE         = 0x0008, /* send Pragma: no-cache directive */
   ACCEPTRANGES         = 0x0010, /* Accept-ranges header was found */
-  ADDED_HTML_EXTENSION = 0x0020  /* added ".html" extension due to -E */
+  ADDED_HTML_EXTENSION = 0x0020, /* added ".html" extension due to -E */
+  TEXTCSS              = 0x0040  /* document is of type text/css */
 };

 /* Universal error type -- used almost everywhere.  Error reporting of
-- 
2.39.2
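
What the new scanner boils down to, in miniature: find a "url(" token, trim optional whitespace and quotes inside the parentheses, and treat whatever remains as a link. The standalone sketch below illustrates that trimming rule; css_find_uri is a hypothetical helper written for this note, not code from the patch, and unlike the real flex scanner it ignores escape sequences and a ")" hidden inside a quoted string:

#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>

/* Locate the URI inside a url(...) token starting at P, mirroring the
   trimming that get_uri_string performs: skip "url(", drop surrounding
   whitespace, then drop one pair of single or double quotes.  Returns
   a pointer into P's buffer and stores the URI length in *LEN. */
static const char *
css_find_uri (const char *p, size_t *len)
{
  const char *end;
  if (strncasecmp (p, "url(", 4) != 0)
    return NULL;
  p += 4;
  end = strchr (p, ')');
  if (!end)
    return NULL;
  while (p < end && isspace ((unsigned char) *p))
    p++;
  while (end > p && isspace ((unsigned char) end[-1]))
    end--;
  if (end - p >= 2 && (*p == '"' || *p == '\''))
    {
      p++;
      end--;
    }
  *len = (size_t) (end - p);
  return p;
}

int
main (void)
{
  const char *css =
    "@import url( 'print.css' );\n"
    "body { background: url(img/bg.png); }\n";
  const char *scan = css;
  size_t len;
  while ((scan = strstr (scan, "url(")) != NULL)
    {
      const char *uri = css_find_uri (scan, &len);
      if (uri)
        printf ("found URI: %.*s\n", (int) len, uri);
      scan += 4;
    }
  return 0;
}

The patch's get_uri_string does the same trimming, but expressed as adjustments to a position and length within the document buffer, so that convert_links can later rewrite the exact byte range in place.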
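
The html-parse.c tag stack exists so that inline style sheets can reach that scanner: when an end tag is seen, contents_begin/contents_end delimit the bytes between the '>' of the matching opening tag and the '<' of the end tag, and collect_tags_mapper hands that span to get_urls_css. Reduced to a sketch, assuming a well-formed document with a single style element:

#include <stdio.h>
#include <string.h>

int
main (void)
{
  const char *html =
    "<html><style>body { background: url(bg.png); }</style></html>";
  const char *open_tag = strstr (html, "<style");
  const char *open_end = open_tag ? strchr (open_tag, '>') : NULL;
  const char *close_begin = strstr (html, "</style>");

  /* in the patch's terms: contents_begin = open_end + 1,
     contents_end = close_begin */
  if (open_end && close_begin && open_end < close_begin)
    printf ("inline sheet: %.*s\n",
            (int) (close_begin - (open_end + 1)), open_end + 1);
  return 0;
}

The real implementation needs an actual stack because tags nest and may be left unclosed; popping everything above the matched item is what keeps a stray unclosed tag from poisoning the rest of the scan.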
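
ensure_extension generalizes the old -E renaming rule: leave the file name alone when its suffix already matches the requested extension (or the short ".htm" form when the target is ".html"), otherwise append the extension, and fall back to ".NUMBER"-style names when clobbering is disallowed. A standalone sketch of just the naming decision, with hypothetical file names:

#include <stdio.h>
#include <string.h>
#include <strings.h>

/* Print what a saved file would be renamed to under the suffix rule. */
static void
show (const char *name, const char *ext)
{
  const char *dot = strrchr (name, '.');
  char shortext[8];
  size_t len = strlen (ext);

  shortext[0] = '\0';
  if (len == 5)                 /* ".html" also accepts ".htm" */
    {
      memcpy (shortext, ext, len - 1);
      shortext[len - 1] = '\0';
    }

  if (dot && (0 == strcasecmp (dot, ext)
              || (shortext[0] != '\0' && 0 == strcasecmp (dot, shortext))))
    printf ("%-10s %s: kept as-is\n", name, ext);
  else
    printf ("%-10s %s: becomes %s%s\n", name, ext, name, ext);
}

int
main (void)
{
  show ("index", ".html");     /* becomes index.html */
  show ("page.htm", ".html");  /* kept: ".htm" already matches */
  show ("style.php", ".css");  /* becomes style.php.css */
  show ("main.css", ".css");   /* kept */
  return 0;
}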