Ted Mielczarek's CSS wonder-patch, applied against the source from around the time...

author Micah Cowan <micah@cowan.name>

Tue, 22 Apr 2008 07:15:48 +0000 (00:15 -0700)

committer Micah Cowan <micah@cowan.name>

Tue, 22 Apr 2008 07:15:48 +0000 (00:15 -0700)
author Micah Cowan <micah@cowan.name>
Tue, 22 Apr 2008 07:15:48 +0000 (00:15 -0700)
committer Micah Cowan <micah@cowan.name>
Tue, 22 Apr 2008 07:15:48 +0000 (00:15 -0700)
diff --git a/configure.in b/configure.in

index 9f735391c4c0202b293fe50232facf43d67823f3..2dc7177145fe0c0805c953f2aa7387f23df59c57 100644 (file)
--- a/configure.in
+++ b/configure.in
@@ -115,6 +115,9 @@ test -z "$CC" && cc_specified=yes
  AC_PROG_CC
  AC_AIX
  
+YYTEXT_POINTER=1
+AC_PROG_LEX
+
  dnl Turn on optimization by default.  Specifically:
  dnl
  dnl if the user hasn't specified CFLAGS, then
diff --git a/src/Makefile.in b/src/Makefile.in

index bcacd7ddc794f31f6571482a6d6fb09356c1bea9..75fe22c2343b0fb12edfd5beeba20c4f73c55cd5 100644 (file)
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -54,6 +54,7 @@ CFLAGS   = @CFLAGS@
  LDFLAGS  = @LDFLAGS@ 
  LIBS     = @LIBS@ @LIBSSL@ @LIBGNUTLS@
  exeext   = @exeext@
+LEX      = @LEX@
  
  INCLUDES = -I. -I$(srcdir)
  
@@ -72,12 +73,12 @@ NTLM_OBJ   = @NTLM_OBJ@
  SSL_OBJ    = @SSL_OBJ@
  GETOPT_OBJ = @GETOPT_OBJ@
  
-OBJ = $(ALLOCA) cmpt.o connect.o convert.o cookies.o              \
-      ftp.o ftp-basic.o ftp-ls.o $(OPIE_OBJ) $(GETOPT_OBJ) hash.o \
-      host.o html-parse.o html-url.o http.o $(NTLM_OBJ) init.o    \
-      log.o main.o $(MD5_OBJ) netrc.o progress.o ptimer.o recur.o \
-      res.o retr.o safe-ctype.o snprintf.o spider.o $(SSL_OBJ)    \
-      url.o utils.o version.o xmalloc.o
+OBJ = $(ALLOCA) cmpt.o connect.o convert.o cookies.o css-url.o     \
+      ftp.o ftp-basic.o ftp-ls.o $(OPIE_OBJ) $(GETOPT_OBJ) hash.o  \
+      host.o html-parse.o html-url.o http.o $(NTLM_OBJ) init.o     \
+      lex.yy.o log.o main.o $(MD5_OBJ) netrc.o progress.o ptimer.o \
+      recur.o res.o retr.o safe-ctype.o snprintf.o spider.o        \
+      $(SSL_OBJ) url.o utils.o version.o xmalloc.o
  
  .SUFFIXES:
  .SUFFIXES: .c .o
@@ -90,16 +91,19 @@ OBJ = $(ALLOCA) cmpt.o connect.o convert.o cookies.o              \
  wget$(exeext): $(OBJ)
         $(LINK) $(OBJ) $(LIBS)
  
+lex.yy.c: css.lex
+       $(LEX) $<
+
  # We make object files depend on every header.  Rather than attempt to
  # track dependencies, everything gets recompiled when a header
  # changes.  With a program of Wget's size this doesn't waste much
  # time, and it's a lot safer than attempting to get all the
  # dependencies right.
  
-$(OBJ): config-post.h config.h connect.h convert.h cookies.h ftp.h \
-        gen-md5.h getopt.h gnu-md5.h hash.h host.h html-parse.h    \
-        http-ntlm.h init.h log.h mswindows.h netrc.h options.h     \
-        progress.h ptimer.h recur.h res.h retr.h safe-ctype.h      \
+$(OBJ): config-post.h config.h connect.h convert.h cookies.h css-url.h \
+        ftp.h gen-md5.h getopt.h gnu-md5.h hash.h host.h html-parse.h  \
+        http-ntlm.h init.h log.h mswindows.h netrc.h options.h         \
+        progress.h ptimer.h recur.h res.h retr.h safe-ctype.h          \
          spider.h ssl.h sysdep.h url.h utils.h wget.h xmalloc.h
  
  #
@@ -122,7 +126,7 @@ uninstall.bin:
  #
  
  clean:
-       $(RM) *.o wget$(exeext) *~ *.bak core core.[0-9]*
+       $(RM) *.o wget$(exeext) *~ *.bak core core.[0-9]* lex.yy.c
  
  distclean: clean
         $(RM) Makefile config.h
diff --git a/src/convert.c b/src/convert.c

index 4274bc5b1650873c39e51578d4ee9f9f328b177a..7b38550be5ee803ba839a8223373c8a8b5a66fe7 100644 (file)
--- a/src/convert.c
+++ b/src/convert.c
@@ -46,50 +46,37 @@ so, delete this exception statement from your version.  */
  #include "hash.h"
  #include "ptimer.h"
  #include "res.h"
+#include "html-url.h"
+#include "css-url.h"
  
  static struct hash_table *dl_file_url_map;
  struct hash_table *dl_url_file_map;
  
-/* Set of HTML files downloaded in this Wget run, used for link
+/* Set of HTML/CSS files downloaded in this Wget run, used for link
     conversion after Wget is done.  */
  struct hash_table *downloaded_html_set;
+struct hash_table *downloaded_css_set;
  
  static void convert_links (const char *, struct urlpos *);
  
-/* This function is called when the retrieval is done to convert the
-   links that have been downloaded.  It has to be called at the end of
-   the retrieval, because only then does Wget know conclusively which
-   URLs have been downloaded, and which not, so it can tell which
-   direction to convert to.
-
-   The "direction" means that the URLs to the files that have been
-   downloaded get converted to the relative URL which will point to
-   that file.  And the other URLs get converted to the remote URL on
-   the server.
-
-   All the downloaded HTMLs are kept in downloaded_html_files, and
-   downloaded URLs in urls_downloaded.  All the information is
-   extracted from these two lists.  */
  
  void
-convert_all_links (void)
+convert_links_in_hashtable (struct hash_table *downloaded_set,
+                            int is_css,
+                            int *file_count)
  {
    int i;
-  double secs;
-  int file_count = 0;
-
-  struct ptimer *timer = ptimer_new ();
  
    int cnt;
    char **file_array;
  
    cnt = 0;
-  if (downloaded_html_set)
-    cnt = hash_table_count (downloaded_html_set);
+  if (downloaded_set)
+    cnt = hash_table_count (downloaded_set);
    if (cnt == 0)
      return;
    file_array = alloca_array (char *, cnt);
-  string_set_to_array (downloaded_html_set, file_array);
+  string_set_to_array (downloaded_set, file_array);
  
    for (i = 0; i < cnt; i++)
      {
@@ -97,7 +84,7 @@ convert_all_links (void)
        char *url;
        char *file = file_array[i];
  
-      /* Determine the URL of the HTML file.  get_urls_html will need
+      /* Determine the URL of the file.  get_urls_{html,css} will need
           it.  */
        url = hash_table_get (dl_file_url_map, file);
        if (!url)
@@ -108,8 +95,9 @@ convert_all_links (void)
  
        DEBUGP (("Scanning %s (from %s)\n", file, url));
  
-      /* Parse the HTML file...  */
-      urls = get_urls_html (file, url, NULL);
+      /* Parse the file...  */
+      urls = is_css ? get_urls_css_file (file, url) :
+                      get_urls_html (file, url, NULL);
  
        /* We don't respect meta_disallow_follow here because, even if
           the file is not followed, we might still want to convert the
@@ -161,11 +149,38 @@ convert_all_links (void)
  
        /* Convert the links in the file.  */
        convert_links (file, urls);
-      ++file_count;
+      ++*file_count;
  
        /* Free the data.  */
        free_urlpos (urls);
      }
+}
+
+/* This function is called when the retrieval is done to convert the
+   links that have been downloaded.  It has to be called at the end of
+   the retrieval, because only then does Wget know conclusively which
+   URLs have been downloaded, and which not, so it can tell which
+   direction to convert to.
+
+   The "direction" means that the URLs to the files that have been
+   downloaded get converted to the relative URL which will point to
+   that file.  And the other URLs get converted to the remote URL on
+   the server.
+
+   The downloaded HTML and CSS files are kept in downloaded_html_set
+   and downloaded_css_set, and the file-to-URL mapping in
+   dl_file_url_map.  All the information is extracted from these.  */
+
+void
+convert_all_links (void)
+{
+  double secs;
+  int file_count = 0;
+
+  struct ptimer *timer = ptimer_new ();
+
+  convert_links_in_hashtable (downloaded_html_set, 0, &file_count);
+  convert_links_in_hashtable (downloaded_css_set, 1, &file_count);
  
    secs = ptimer_measure (timer);
    ptimer_destroy (timer);
@@ -174,13 +189,14 @@ convert_all_links (void)
  }
  
  static void write_backup_file (const char *, downloaded_file_t);
+static const char *replace_plain (const char*, int, FILE*, const char *);
  static const char *replace_attr (const char *, int, FILE *, const char *);
  static const char *replace_attr_refresh_hack (const char *, int, FILE *,
                                                const char *, int);
  static char *local_quote_string (const char *);
  static char *construct_relative (const char *, const char *);
  
-/* Change the links in one HTML file.  LINKS is a list of links in the
+/* Change the links in one file.  LINKS is a list of links in the
     document, along with their positions and the desired direction of
     the conversion.  */
  static void
@@ -277,7 +293,9 @@ convert_links (const char *file, struct urlpos *links)
              char *newname = construct_relative (file, link->local_name);
              char *quoted_newname = local_quote_string (newname);
  
-            if (!link->link_refresh_p)
+            if (link->link_css_p)
+              p = replace_plain (p, link->size, fp, quoted_newname);
+            else if (!link->link_refresh_p)
                p = replace_attr (p, link->size, fp, quoted_newname);
              else
                p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
@@ -296,7 +314,9 @@ convert_links (const char *file, struct urlpos *links)
              char *newlink = link->url->url;
              char *quoted_newlink = html_quote_string (newlink);
  
-            if (!link->link_refresh_p)
+            if (link->link_css_p)
+              p = replace_plain (p, link->size, fp, quoted_newlink);
+            else if (!link->link_refresh_p)
                p = replace_attr (p, link->size, fp, quoted_newlink);
              else
                p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
@@ -406,6 +426,7 @@ write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
    size_t         filename_len = strlen (file);
    char*          filename_plus_orig_suffix;
  
+  /* TODO: hack this to work with css files */
    if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
      {
        /* Just write "orig" over "html".  We need to do it this way
@@ -465,6 +486,15 @@ write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
  
  static bool find_fragment (const char *, int, const char **, const char **);
  
+/* Write NEW_TEXT to FP in place of the SIZE bytes of original text
+   that start at P.  No quote characters are added around NEW_TEXT.
+   Returns the position in the source just past the replaced text.  */
+static const char *
+replace_plain (const char *p, int size, FILE *fp, const char *new_text)
+{
+  const char *after_original = p + size;
+
+  fputs (new_text, fp);
+  return after_original;
+}
+
  /* Replace an attribute's original text with NEW_TEXT. */
  
  static const char *
@@ -832,6 +862,16 @@ register_html (const char *url, const char *file)
    string_set_add (downloaded_html_set, file);
  }
  
+/* Record FILE in the set of downloaded CSS files, creating the set
+   on first use.  URL is accepted but not used by this function.  */
+
+void
+register_css (const char *url, const char *file)
+{
+  if (downloaded_css_set == NULL)
+    downloaded_css_set = make_string_hash_table (0);
+
+  string_set_add (downloaded_css_set, file);
+}
+
  static void downloaded_files_free (void);
  
  /* Cleanup the data structures associated with this file.  */
diff --git a/src/convert.h b/src/convert.h

index 11d6a5f149ba3e453da4678dcff1b5a5b7a2621f..1cd05f36389ecccab2330b2418ac4119d31ecb4c 100644 (file)
--- a/src/convert.h
+++ b/src/convert.h
@@ -33,6 +33,7 @@ so, delete this exception statement from your version.  */
  struct hash_table;             /* forward decl */
  extern struct hash_table *dl_url_file_map;
  extern struct hash_table *downloaded_html_set;
+extern struct hash_table *downloaded_css_set;
  
  enum convert_options {
    CO_NOCONVERT = 0,            /* don't convert this URL */
@@ -64,7 +65,9 @@ struct urlpos {
    unsigned int link_complete_p :1; /* the link was complete (had host name) */
    unsigned int link_base_p     :1; /* the url came from <base href=...> */
    unsigned int link_inline_p   :1; /* needed to render the page */
+  unsigned int link_css_p      :1; /* the url came from CSS */
    unsigned int link_expect_html        :1; /* expected to contain HTML */
+  unsigned int link_expect_css :1; /* expected to contain CSS */
  
    unsigned int link_refresh_p  :1; /* link was received from
                                        <meta http-equiv=refresh content=...> */
@@ -98,6 +101,7 @@ downloaded_file_t downloaded_file (downloaded_file_t, const char *);
  void register_download (const char *, const char *);
  void register_redirection (const char *, const char *);
  void register_html (const char *, const char *);
+void register_css (const char *, const char *);
  void register_delete_file (const char *);
  void convert_all_links (void);
  void convert_cleanup (void);
diff --git a/src/css-tokens.h b/src/css-tokens.h

new file mode 100644 (file)

index 0000000..4feef42
--- /dev/null
+++ b/src/css-tokens.h
@@ -0,0 +1,66 @@
+/* Declarations for css.lex
+   Copyright (C) 2006 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables.  You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL".  If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so.  If you do not wish to do
+so, delete this exception statement from your version.  */
+
+#ifndef CSS_TOKENS_H
+#define CSS_TOKENS_H
+
+/* Token codes returned by yylex() in the scanner built from css.lex.
+   CSSEOF comes first so that it has the value 0; css-url.c stops
+   scanning when yylex() returns CSSEOF.  The token_names[] table in
+   css-url.c lists the same names in the same order, so keep the two
+   in sync.
+
+   Only the type is declared here.  Naming an object after the closing
+   brace would define a variable in every file that includes this
+   header, which fails to link when common symbols are disabled.  */
+enum css_tokens {
+  CSSEOF,
+  S,
+  CDO,
+  CDC,
+  INCLUDES,
+  DASHMATCH,
+  LBRACE,
+  PLUS,
+  GREATER,
+  COMMA,
+  STRING,
+  INVALID,
+  IDENT,
+  HASH,
+  IMPORT_SYM,
+  PAGE_SYM,
+  MEDIA_SYM,
+  CHARSET_SYM,
+  IMPORTANT_SYM,
+  EMS,
+  EXS,
+  LENGTH,
+  ANGLE,
+  TIME,
+  FREQ,
+  DIMENSION,
+  PERCENTAGE,
+  NUMBER,
+  URI,
+  FUNCTION
+};
+
+#endif /* CSS_TOKENS_H */
diff --git a/src/css-url.c b/src/css-url.c

new file mode 100644 (file)

index 0000000..42c8fc3
--- /dev/null
+++ b/src/css-url.c
@@ -0,0 +1,273 @@
+/* Collect URLs from CSS source.
+   Copyright (C) 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables.  You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL".  If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so.  If you do not wish to do
+so, delete this exception statement from your version.  */
+
+/*
+  Note that this is not an actual CSS parser, but just a lexical
+  scanner with a tiny bit more smarts bolted on top.  A full parser
+  is somewhat overkill for this job.  The only things we're interested
+  in are @import rules and url() tokens, so it's easy enough to
+  grab those without truly understanding the input.  The only downside
+  to this is that we might be coerced into downloading files that
+  a browser would ignore.  That might merit some more investigation.
+ */
+
+#include <config.h>
+
+#include <stdio.h>
+#ifdef HAVE_STRING_H
+# include <string.h>
+#else
+# include <strings.h>
+#endif
+#include <stdlib.h>
+#include <ctype.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "wget.h"
+#include "utils.h"
+#include "convert.h"
+#include "html-url.h"
+#include "css-tokens.h"
+
+/* from lex.yy.c */
+extern char *yytext;
+extern int yyleng;
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+extern YY_BUFFER_STATE yy_scan_bytes (const char *bytes,int len  );
+extern int yylex (void);
+
+/* Printable names for the token codes declared in css-tokens.h,
+   indexed by token value.  The order of the entries must match the
+   order of that enum.
+   NOTE(review): this looks like a debugging aid only -- confirm
+   before relying on it elsewhere or dropping the #if.  */
+#if 1
+const char *token_names[] = {
+  "CSSEOF",
+  "S",
+  "CDO",
+  "CDC",
+  "INCLUDES",
+  "DASHMATCH",
+  "LBRACE",
+  "PLUS",
+  "GREATER",
+  "COMMA",
+  "STRING",
+  "INVALID",
+  "IDENT",
+  "HASH",
+  "IMPORT_SYM",
+  "PAGE_SYM",
+  "MEDIA_SYM",
+  "CHARSET_SYM",
+  "IMPORTANT_SYM",
+  "EMS",
+  "EXS",
+  "LENGTH",
+  "ANGLE",
+  "TIME",
+  "FREQ",
+  "DIMENSION",
+  "PERCENTAGE",
+  "NUMBER",
+  "URI",
+  "FUNCTION"
+};
+#endif
+
+/*
+  Given a detected URI token, get only the URI specified within.
+  Also adjust the starting position and length of the string.
+
+  AT is the buffer holding the token.  On entry *POS is the offset of
+  the token within AT and *LENGTH is the length of the whole token;
+  on return they describe the bare URI (without "url(", ")",
+  surrounding whitespace or quotes).  *LENGTH is never left negative.
+
+  Returns a newly allocated, NUL-terminated copy of the URI, or NULL
+  if the text at *POS does not start with "url(".
+
+  A URI can be specified with or without quotes, and the quotes
+  can be single or double quotes.  In addition there can be
+  whitespace after the opening parenthesis and before the closing
+  parenthesis.
+*/
+char *
+get_uri_string (const char *at, int *pos, int *length)
+{
+  char *uri;
+
+  if (0 != strncasecmp (at + *pos, "url(", 4))
+    return NULL;
+
+  *pos += 4;
+  *length -= 5; /* url() */
+  if (*length < 0)
+    *length = 0;
+
+  /* skip leading space; the cast keeps isspace() defined for bytes
+     with the high bit set */
+  while (*length > 0 && isspace ((unsigned char) at[*pos]))
+    {
+      (*pos)++;
+      (*length)--;
+    }
+  /* skip trailing space; stop at zero so that an all-blank token such
+     as "url( )" cannot push the length negative */
+  while (*length > 0 && isspace ((unsigned char) at[*pos + *length - 1]))
+    {
+      (*length)--;
+    }
+  /* trim off quotes, but only when both of them fit */
+  if (*length >= 2 && (at[*pos] == '\'' || at[*pos] == '"'))
+    {
+      (*pos)++;
+      *length -= 2;
+    }
+
+  uri = xmalloc (*length + 1);
+  if (uri)
+    {
+      memcpy (uri, at + *pos, *length);
+      uri[*length] = '\0';
+    }
+
+  return uri;
+}
+
+/* Scan BUF_LENGTH bytes of CSS found at CTX->text + OFFSET and pass
+   every URL found to append_url, together with its position
+   (relative to CTX->text) and its length.
+
+   Two constructs are recognized: an @import rule followed by a
+   string or a url() token, and a url() token anywhere else.  Links
+   from both are marked link_inline_p and link_css_p; @import targets
+   are additionally marked link_expect_css.  */
+void
+get_urls_css (struct map_context *ctx, int offset, int buf_length)
+{
+  /* Declared locally so that this function does not depend on the
+     list of scanner externs at the top of the file.  */
+  extern void yy_delete_buffer (YY_BUFFER_STATE b);
+  YY_BUFFER_STATE scan_buffer;
+  int token;
+  int buffer_pos = 0;
+  int pos, length;
+  char *uri;
+
+  /* tell flex to scan from this buffer */
+  scan_buffer = yy_scan_bytes (ctx->text + offset, buf_length);
+
+  while((token = yylex()) != CSSEOF)
+    {
+      /*DEBUGP (("%s ", token_names[token]));*/
+      /* @import "foo.css"
+         or @import url(foo.css)
+      */
+      if(token == IMPORT_SYM)
+        {
+          /* Step over the @import keyword and any whitespace after it. */
+          do {
+            buffer_pos += yyleng;
+          } while((token = yylex()) == S);
+
+          /*DEBUGP (("%s ", token_names[token]));*/
+
+          /* The input ended right after @import.  The scanner has
+             already reported end of input, so don't ask it again.  */
+          if (token == CSSEOF)
+            break;
+
+          if (token == STRING || token == URI)
+            {
+              /*DEBUGP (("Got URI "));*/
+              pos = buffer_pos + offset;
+              length = yyleng;
+
+              if (token == URI)
+                {
+                  uri = get_uri_string (ctx->text, &pos, &length);
+                }
+              else
+                {
+                  /* cut out quote characters */
+                  pos++;
+                  length -= 2;
+                  uri = xmalloc (length + 1);
+                  strncpy (uri, yytext + 1, length);
+                  uri[length] = '\0';
+                }
+
+              if (uri)
+                {
+                  struct urlpos *up = append_url (uri, pos, length, ctx);
+                  DEBUGP (("Found @import: [%s] at %d [%s]\n", yytext, buffer_pos, uri));
+
+                  if (up)
+                    {
+                      up->link_inline_p = 1;
+                      up->link_css_p = 1;
+                      up->link_expect_css = 1;
+                    }
+
+                  xfree(uri);
+                }
+            }
+        }
+      /* background-image: url(foo.png)
+         note that we don't care what
+         property this is actually on.
+      */
+      else if(token == URI)
+        {
+          pos = buffer_pos + offset;
+          length = yyleng;
+          uri = get_uri_string (ctx->text, &pos, &length);
+
+          if (uri)
+            {
+              struct urlpos *up = append_url (uri, pos, length, ctx);
+              DEBUGP (("Found URI: [%s] at %d [%s]\n", yytext, buffer_pos, uri));
+              if (up)
+                {
+                  up->link_inline_p = 1;
+                  up->link_css_p = 1;
+                }
+
+              xfree (uri);
+            }
+        }
+      /* Account for the token just handled before fetching the next. */
+      buffer_pos += yyleng;
+    }
+
+  /* yy_scan_bytes allocated a scanner buffer for this call; release
+     it so that repeated calls do not leak.  */
+  yy_delete_buffer (scan_buffer);
+  DEBUGP (("\n"));
+}
+
+/* Read FILE into memory, scan its whole contents with get_urls_css
+   and return the resulting list head.  URL is used as the parent
+   base when it is non-NULL; otherwise opt.base_href is used.
+   Returns NULL, after logging the error, if FILE cannot be read.  */
+struct urlpos *
+get_urls_css_file (const char *file, const char *url)
+{
+  struct map_context ctx;
+  struct file_memory *fm = read_file (file);
+
+  if (fm == NULL)
+    {
+      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
+      return NULL;
+    }
+  DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
+
+  /* Describe the document being scanned.  */
+  ctx.document_file = file;
+  ctx.text = fm->content;
+  ctx.base = NULL;
+  if (url)
+    ctx.parent_base = url;
+  else
+    ctx.parent_base = opt.base_href;
+  ctx.nofollow = 0;
+
+  /* Start with an empty result list.  */
+  ctx.tail = NULL;
+  ctx.head = NULL;
+
+  get_urls_css (&ctx, 0, fm->length);
+  read_file_free (fm);
+  return ctx.head;
+}
diff --git a/src/css-url.h b/src/css-url.h

new file mode 100644 (file)

index 0000000..772e2fd
--- /dev/null
+++ b/src/css-url.h
@@ -0,0 +1,36 @@
+/* Declarations for css-url.c.
+   Copyright (C) 2006 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables.  You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL".  If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so.  If you do not wish to do
+so, delete this exception statement from your version.  */
+
+#ifndef CSS_URL_H
+#define CSS_URL_H
+
+/* Forward declarations, so that this header can be included without
+   first including the headers that define these structures.  Without
+   them, a struct first named inside a prototype's parameter list is
+   local to that prototype.  */
+struct map_context;
+struct urlpos;
+
+/* Scan BUF_LENGTH bytes of CSS located OFFSET bytes into the text of
+   CTX, recording the URLs found in CTX.  */
+void get_urls_css (struct map_context *ctx, int offset, int buf_length);
+
+/* Scan the CSS file FILE, which was retrieved from URL, and return
+   the list of URLs found in it.  */
+struct urlpos *get_urls_css_file (const char *file, const char *url);
+
+#endif /* CSS_URL_H */
diff --git a/src/css.lex b/src/css.lex

new file mode 100644 (file)

index 0000000..8d1477a
--- /dev/null
+++ b/src/css.lex
@@ -0,0 +1,137 @@
+%option case-insensitive
+%option noyywrap
+%option never-interactive
+
+%{
+/* Lex source for CSS tokenizing.
+   Taken from http://www.w3.org/TR/CSS21/grammar.html#q2
+   Copyright (C) 2006 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables.  You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL".  If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so.  If you do not wish to do
+so, delete this exception statement from your version.  */
+
+#include "css-tokens.h"
+
+/* {s}+\/\*[^*]*\*+([^/*][^*]*\*+)*\/      {unput(' '); } */
+/*replace by space*/
+%}
+
+h               [0-9a-f]
+nonascii        [\200-\377]
+unicode         \\{h}{1,6}(\r\n|[ \t\r\n\f])?
+escape          {unicode}|\\[^\r\n\f0-9a-f]
+nmstart         [_a-z]|{nonascii}|{escape}
+nmchar          [_a-z0-9-]|{nonascii}|{escape}
+string1         \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
+string2         \'([^\n\r\f\\']|\\{nl}|{escape})*\'
+invalid1        \"([^\n\r\f\\"]|\\{nl}|{escape})*
+invalid2        \'([^\n\r\f\\']|\\{nl}|{escape})*
+
+comment         \/\*[^*]*\*+([^/*][^*]*\*+)*\/
+ident           -?{nmstart}{nmchar}*
+name            {nmchar}+
+num             [0-9]+|[0-9]*"."[0-9]+
+string          {string1}|{string2}
+invalid         {invalid1}|{invalid2}
+url             ([!#$%&*-~]|{nonascii}|{escape})*
+s               [ \t\r\n\f]
+w               ({s}|{comment})*
+nl              \n|\r\n|\r|\f
+
+A               a|\\0{0,4}(41|61)(\r\n|[ \t\r\n\f])?
+C               c|\\0{0,4}(43|63)(\r\n|[ \t\r\n\f])?
+D               d|\\0{0,4}(44|64)(\r\n|[ \t\r\n\f])?
+E               e|\\0{0,4}(45|65)(\r\n|[ \t\r\n\f])?
+G               g|\\0{0,4}(47|67)(\r\n|[ \t\r\n\f])?|\\g
+H               h|\\0{0,4}(48|68)(\r\n|[ \t\r\n\f])?|\\h
+I               i|\\0{0,4}(49|69)(\r\n|[ \t\r\n\f])?|\\i
+K               k|\\0{0,4}(4b|6b)(\r\n|[ \t\r\n\f])?|\\k
+M               m|\\0{0,4}(4d|6d)(\r\n|[ \t\r\n\f])?|\\m
+N               n|\\0{0,4}(4e|6e)(\r\n|[ \t\r\n\f])?|\\n
+P               p|\\0{0,4}(50|70)(\r\n|[ \t\r\n\f])?|\\p
+R               r|\\0{0,4}(52|72)(\r\n|[ \t\r\n\f])?|\\r
+S               s|\\0{0,4}(53|73)(\r\n|[ \t\r\n\f])?|\\s
+T               t|\\0{0,4}(54|74)(\r\n|[ \t\r\n\f])?|\\t
+X               x|\\0{0,4}(58|78)(\r\n|[ \t\r\n\f])?|\\x
+Z               z|\\0{0,4}(5a|7a)(\r\n|[ \t\r\n\f])?|\\z
+
+%%
+
+{s}                     {return S;}
+
+\/\*[^*]*\*+([^/*][^*]*\*+)*\/          {return S;} /* ignore comments */
+
+"<!--"          {return CDO;}
+"-->"                   {return CDC;}
+"~="                    {return INCLUDES;}
+"|="                    {return DASHMATCH;}
+
+{w}"{"                  {return LBRACE;}
+{w}"+"                  {return PLUS;}
+{w}">"                  {return GREATER;}
+{w}","                  {return COMMA;}
+
+{string}                {return STRING;}
+{invalid}               {return INVALID; /* unclosed string */}
+
+{ident}                 {return IDENT;}
+
+"#"{name}               {return HASH;}
+
+"@import"               {return IMPORT_SYM;}
+"@page"                 {return PAGE_SYM;}
+"@media"                {return MEDIA_SYM;}
+"@charset "             {return CHARSET_SYM;}
+
+"!"{w}"important"       {return IMPORTANT_SYM;}
+
+{num}{E}{M}             {return EMS;}
+{num}{E}{X}             {return EXS;}
+{num}{P}{X}             {return LENGTH;}
+{num}{C}{M}             {return LENGTH;}
+{num}{M}{M}             {return LENGTH;}
+{num}{I}{N}             {return LENGTH;}
+{num}{P}{T}             {return LENGTH;}
+{num}{P}{C}             {return LENGTH;}
+{num}{D}{E}{G}          {return ANGLE;}
+{num}{R}{A}{D}          {return ANGLE;}
+{num}{G}{R}{A}{D}       {return ANGLE;}
+{num}{M}{S}             {return TIME;}
+{num}{S}                {return TIME;}
+{num}{H}{Z}             {return FREQ;}
+{num}{K}{H}{Z}          {return FREQ;}
+{num}{ident}            {return DIMENSION;}
+
+{num}%                  {return PERCENTAGE;}
+{num}                   {return NUMBER;}
+
+"url("{w}{string}{w}")" {return URI;}
+"url("{w}{url}{w}")"    {return URI;}
+{ident}"("              {return FUNCTION;}
+
+.                       {return *yytext; /* Any other single character is returned as its own character code.  NOTE(review): the named token codes in css-tokens.h start at 0, so a control character that reaches this rule yields a value that may equal a named token -- confirm whether callers need to tell them apart. */}
+
+%%
diff --git a/src/html-parse.c b/src/html-parse.c

index 10cc36974ee4da20aa0902fe8b79ca3cde80a08a..8254c6dc15d416d42909aec750028b697bf0557a 100644 (file)
--- a/src/html-parse.c
+++ b/src/html-parse.c
@@ -271,6 +271,94 @@ struct pool {
     to "<foo", but "&lt,foo" to "<,foo".  */
  #define SKIP_SEMI(p, inc) (p += inc, p < end && *p == ';' ? ++p : p)
  
+/* One entry in the list of start tags seen so far by map_html_tags.
+   Entries are chained in both directions; new entries are added at
+   the tail.  The name pointers refer into the text being parsed and
+   are not owned by the entry.  */
+struct tagstack_item {
+  const char *tagname_begin;    /* start of the tag name */
+  const char *tagname_end;      /* end of the tag name; tagname_end -
+                                   tagname_begin is its length */
+  const char *contents_begin;   /* where the tag's contents start, or
+                                   NULL if not (yet) known */
+  struct tagstack_item *prev;   /* entry added before this one, or NULL */
+  struct tagstack_item *next;   /* entry added after this one, or NULL */
+};
+
+/* Allocate a new item and link it in as the last element of the list
+   described by *HEAD and *TAIL, updating both as needed.  Returns the
+   new item; only its prev/next links are initialized, the remaining
+   fields are left for the caller to fill in.  */
+struct tagstack_item *
+tagstack_push (struct tagstack_item **head, struct tagstack_item **tail)
+{
+  struct tagstack_item *item = xmalloc(sizeof(struct tagstack_item));
+
+  item->next = NULL;
+  if (*head != NULL)
+    {
+      /* Non-empty list: hang the new item off the current tail.  */
+      item->prev = *tail;
+      (*tail)->next = item;
+    }
+  else
+    {
+      /* Empty list: the new item is the head as well.  */
+      item->prev = NULL;
+      *head = item;
+    }
+  *tail = item;
+
+  return item;
+}
+
+/* Remove TS and everything after it from the list described by *HEAD
+   and *TAIL, freeing the removed items.  The item before TS, if any,
+   becomes the new tail; if TS was the head the list becomes empty.
+   Does nothing when the list is already empty.  */
+void
+tagstack_pop (struct tagstack_item **head, struct tagstack_item **tail,
+              struct tagstack_item *ts)
+{
+  struct tagstack_item *doomed;
+
+  if (*head == NULL)
+    return;
+
+  /* Cut the list just before TS.  */
+  if (ts == *head)
+    *head = NULL;
+  *tail = ts->prev;
+  if (ts->prev)
+    ts->prev->next = NULL;
+
+  /* Free TS and every item that follows it.  */
+  for (doomed = ts; doomed != NULL; )
+    {
+      struct tagstack_item *following = doomed->next;
+      xfree (doomed);
+      doomed = following;
+    }
+}
+
+/* Walk from TAIL back through the prev links and return the first
+   item whose tag name has the same length as, and compares equal
+   (ignoring case) to, the name delimited by TAGNAME_BEGIN and
+   TAGNAME_END.  Returns NULL if there is no such item.  */
+struct tagstack_item *
+tagstack_find (struct tagstack_item *tail, const char *tagname_begin,
+               const char *tagname_end)
+{
+  struct tagstack_item *item;
+  int wanted_len = tagname_end - tagname_begin;
+
+  for (item = tail; item != NULL; item = item->prev)
+    {
+      /* Names of different length cannot match.  */
+      if (wanted_len != (item->tagname_end - item->tagname_begin))
+        continue;
+      if (0 == strncasecmp (item->tagname_begin, tagname_begin, wanted_len))
+        return item;
+    }
+  return NULL;
+}
+
  /* Decode the HTML character entity at *PTR, considering END to be end
     of buffer.  It is assumed that the "&" character that marks the
     beginning of the entity has been seen at *PTR-1.  If a recognized
@@ -756,6 +844,9 @@ map_html_tags (const char *text, int size,
    bool attr_pair_resized = false;
    struct attr_pair *pairs = attr_pair_initial_storage;
  
+  struct tagstack_item *head = NULL;
+  struct tagstack_item *tail = NULL;
+
    if (!size)
      return;
  
@@ -822,6 +913,18 @@ map_html_tags (const char *text, int size,
        goto look_for_tag;
      tag_name_end = p;
      SKIP_WS (p);
+
+    if (!end_tag)
+      {
+        struct tagstack_item *ts = tagstack_push (&head, &tail);
+        if (ts)
+          {
+            ts->tagname_begin  = tag_name_begin;
+            ts->tagname_end    = tag_name_end;
+            ts->contents_begin = NULL;
+          }
+      }
+
      if (end_tag && *p != '>')
        goto backout_tag;
  
@@ -983,6 +1086,11 @@ map_html_tags (const char *text, int size,
         ++nattrs;
        }
  
+    if (!end_tag && tail && (tail->tagname_begin == tag_name_begin))
+      {
+        tail->contents_begin = p+1;
+      }
+
      if (uninteresting_tag)
        {
         ADVANCE (p);
@@ -994,6 +1102,7 @@ map_html_tags (const char *text, int size,
      {
        int i;
        struct taginfo taginfo;
+      struct tagstack_item *ts = NULL;
  
        taginfo.name      = pool.contents;
        taginfo.end_tag_p = end_tag;
@@ -1010,6 +1119,23 @@ map_html_tags (const char *text, int size,
        taginfo.attrs = pairs;
        taginfo.start_position = tag_start_position;
        taginfo.end_position   = p + 1;
+      taginfo.contents_begin = NULL;
+      taginfo.contents_end = NULL;
+
+      if (end_tag)
+        {
+          ts = tagstack_find (tail, tag_name_begin, tag_name_end);
+          if (ts)
+            {
+              if (ts->contents_begin)
+                {
+                  taginfo.contents_begin = ts->contents_begin;
+                  taginfo.contents_end   = tag_start_position;
+                }
+              tagstack_pop (&head, &tail, ts);
+            }
+        }
+
        mapfun (&taginfo, maparg);
        ADVANCE (p);
      }
@@ -1029,6 +1155,8 @@ map_html_tags (const char *text, int size,
    POOL_FREE (&pool);
    if (attr_pair_resized)
      xfree (pairs);
+  /* pop any tag stack that's left */
+  tagstack_pop (&head, &tail, head);
  }
  
  #undef ADVANCE
diff --git a/src/html-parse.h b/src/html-parse.h

index 05a8248323cafe53837bdbe7a6076bbd9f8ccd28..371a4f86a61987163a1a21019c8687f2521e01a7 100644 (file)
--- a/src/html-parse.h
+++ b/src/html-parse.h
@@ -51,6 +51,9 @@ struct taginfo {
  
    const char *start_position;  /* start position of tag */
    const char *end_position;    /* end position of tag */
+
+  const char *contents_begin;   /* delimiters of tag contents */
+  const char *contents_end;     /* only valid if end_tag_p */
  };
  
  struct hash_table;             /* forward declaration */
diff --git a/src/html-url.c b/src/html-url.c

index 0f2a07250f72123ddfaef37227df32e86311e51b..ebf8494db99b29f3e6941e70e96b3dbe8e8fc6b0 100644 (file)
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -41,9 +41,9 @@ so, delete this exception statement from your version.  */
  #include "utils.h"
  #include "hash.h"
  #include "convert.h"
-#include "recur.h"             /* declaration of get_urls_html */
-
-struct map_context;
+#include "recur.h"
+#include "html-url.h"
+#include "css-url.h"
  
  typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
  
@@ -163,11 +163,12 @@ static struct {
     from the information above.  However, some places in the code refer
     to the attributes not mentioned here.  We add them manually.  */
  static const char *additional_attributes[] = {
-  "rel",                       /* used by tag_handle_link */
-  "http-equiv",                        /* used by tag_handle_meta */
-  "name",                      /* used by tag_handle_meta */
-  "content",                   /* used by tag_handle_meta */
-  "action"                     /* used by tag_handle_form */
+  "rel",                       /* used by tag_handle_link  */
+  "http-equiv",                        /* used by tag_handle_meta  */
+  "name",                      /* used by tag_handle_meta  */
+  "content",                   /* used by tag_handle_meta  */
+  "action",                    /* used by tag_handle_form  */
+  "style"                      /* used by check_style_attr */
  };
  
  static struct hash_table *interesting_tags;
@@ -246,28 +247,20 @@ find_attr (struct taginfo *tag, const char *name, int *attrind)
    return NULL;
  }
  
-struct map_context {
-  char *text;                  /* HTML text. */
-  char *base;                  /* Base URI of the document, possibly
-                                  changed through <base href=...>. */
-  const char *parent_base;     /* Base of the current document. */
-  const char *document_file;   /* File name of this document. */
-  bool nofollow;               /* whether NOFOLLOW was specified in a
-                                   <meta name=robots> tag. */
-
-  struct urlpos *head, *tail;  /* List of URLs that is being
-                                  built. */
-};
+/* used for calls to append_url */
+#define ATTR_POS(tag, attrind, ctx) \
+ (tag->attrs[attrind].value_raw_beginning - ctx->text)
+#define ATTR_SIZE(tag, attrind) \
+ (tag->attrs[attrind].value_raw_size)
  
  /* Append LINK_URI to the urlpos structure that is being built.
  
-   LINK_URI will be merged with the current document base.  TAG and
-   ATTRIND are the necessary context to store the position and
-   size.  */
+   LINK_URI will be merged with the current document base.
+*/
  
-static struct urlpos *
-append_url (const char *link_uri,
-           struct taginfo *tag, int attrind, struct map_context *ctx)
+struct urlpos *
+append_url (const char *link_uri, int position, int size,
+            struct map_context *ctx)
  {
    int link_has_scheme = url_has_scheme (link_uri);
    struct urlpos *newel;
@@ -325,8 +318,8 @@ append_url (const char *link_uri,
  
    newel = xnew0 (struct urlpos);
    newel->url = url;
-  newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
-  newel->size = tag->attrs[attrind].value_raw_size;
+  newel->pos = position;
+  newel->size = size;
  
    /* A URL is relative if the host is not named, and the name does not
       start with `/'.  */
@@ -346,6 +339,18 @@ append_url (const char *link_uri,
    return newel;
  }
  \f
+/* Look for a "style" attribute on TAG and, when one is present, hand
+   its contents to get_urls_css as a (position, size) pair measured
+   from CTX->text.
+
+   The raw attribute span includes the surrounding quotes when the
+   value is quoted; those are skipped so that only the CSS itself is
+   passed on.  A span that does not begin with a quote character is
+   passed through whole rather than losing its first and last
+   characters, and an empty value is ignored.  */
+static void
+check_style_attr (struct taginfo *tag, struct map_context *ctx)
+{
+  int attrind;
+  int raw_start;
+  int raw_len;
+  char *style = find_attr (tag, "style", &attrind);
+  if (!style)
+    return;
+
+  raw_start = ATTR_POS (tag, attrind, ctx);
+  raw_len   = ATTR_SIZE (tag, attrind);
+
+  /* Only strip delimiters that are really there: +1 for the opening
+     quote, -2 for both quotes.  */
+  if (raw_len > 0
+      && (ctx->text[raw_start] == '"' || ctx->text[raw_start] == '\''))
+    {
+      raw_start += 1;
+      raw_len   -= 2;
+    }
+
+  /* Nothing left to parse (e.g. style="").  */
+  if (raw_len <= 0)
+    return;
+
+  get_urls_css (ctx, raw_start, raw_len);
+}
+
  /* All the tag_* functions are called from collect_tags_mapper, as
     specified by KNOWN_TAGS.  */
  
@@ -393,7 +398,8 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
           if (0 == strcasecmp (tag->attrs[attrind].name,
                                tag_url_attributes[i].attr_name))
             {
-             struct urlpos *up = append_url (link, tag, attrind, ctx);
+             struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
+                                              ATTR_SIZE(tag,attrind), ctx);
               if (up)
                 {
                   int flags = tag_url_attributes[i].flags;
@@ -418,7 +424,8 @@ tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
    if (!newbase)
      return;
  
-  base_urlpos = append_url (newbase, tag, attrind, ctx);
+  base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
+                            ATTR_SIZE(tag,attrind), ctx);
    if (!base_urlpos)
      return;
    base_urlpos->ignore_when_downloading = 1;
@@ -439,9 +446,11 @@ tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
  {
    int attrind;
    char *action = find_attr (tag, "action", &attrind);
+
    if (action)
      {
-      struct urlpos *up = append_url (action, tag, attrind, ctx);
+      struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
+                                      ATTR_SIZE(tag,attrind), ctx);
        if (up)
         up->ignore_when_downloading = 1;
      }
@@ -464,14 +473,23 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
    */
    if (href)
      {
-      struct urlpos *up = append_url (href, tag, attrind, ctx);
+      struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
+                                      ATTR_SIZE(tag,attrind), ctx);
        if (up)
         {
           char *rel = find_attr (tag, "rel", NULL);
-         if (rel
-             && (0 == strcasecmp (rel, "stylesheet")
-                 || 0 == strcasecmp (rel, "shortcut icon")))
-           up->link_inline_p = 1;
+         if (rel)
+            {
+             if (0 == strcasecmp (rel, "stylesheet"))
+                {
+                  up->link_inline_p = 1;
+                  up->link_expect_css = 1;
+                }
+             else if (0 == strcasecmp (rel, "shortcut icon"))
+                {
+                  up->link_inline_p = 1;
+                }
+            }
           else
             /* The external ones usually point to HTML pages, such as
                <link rel="next" href="..."> */
@@ -525,7 +543,8 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
        while (ISSPACE (*p))
         ++p;
  
-      entry = append_url (p, tag, attrind, ctx);
+      entry = append_url (p, ATTR_POS(tag,attrind,ctx),
+                          ATTR_SIZE(tag,attrind), ctx);
        if (entry)
         {
           entry->link_refresh_p = 1;
@@ -570,11 +589,26 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
    struct map_context *ctx = (struct map_context *)arg;
  
    /* Find the tag in our table of tags.  This must not fail because
-     map_html_tags only returns tags found in interesting_tags.  */
+     map_html_tags only returns tags found in interesting_tags.
+     
+     I've changed this for now, I'm passing NULL as interesting_tags
+     to map_html_tags.  This way we can check all tags for a style
+     attribute.
+  */
    struct known_tag *t = hash_table_get (interesting_tags, tag->name);
-  assert (t != NULL);
  
-  t->handler (t->tagid, tag, ctx);
+  if (t != NULL)
+    t->handler (t->tagid, tag, ctx);
+
+  check_style_attr (tag, ctx);
+
+  if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) &&
+      tag->contents_begin && tag->contents_end)
+  {
+    /* parse contents */
+    get_urls_css (ctx, tag->contents_begin - ctx->text,
+                  tag->contents_end - tag->contents_begin);
+  }
  }
  \f
  /* Analyze HTML tags FILE and construct a list of URLs referenced from
@@ -618,8 +652,9 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
    if (opt.strict_comments)
      flags |= MHT_STRICT_COMMENTS;
  
+  /* the NULL here used to be interesting_tags */
    map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
-                interesting_tags, interesting_attributes);
+                NULL, interesting_attributes);
  
    DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
    if (meta_disallow_follow)
diff --git a/src/html-url.h b/src/html-url.h

new file mode 100644 (file)

index 0000000..a94f0db
--- /dev/null
+++ b/src/html-url.h
@@ -0,0 +1,51 @@
+/* Declarations for html-url.c.
+   Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables.  You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL".  If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so.  If you do not wish to do
+so, delete this exception statement from your version.  */
+
+#ifndef HTML_URL_H
+#define HTML_URL_H
+
+struct map_context {          /* State handed to the tag handlers,
+                                  append_url and get_urls_css while
+                                  one document is being scanned. */
+  char *text;                  /* HTML text.  URL positions are
+                                  stored as offsets from this
+                                  pointer. */
+  char *base;                  /* Base URI of the document, possibly
+                                  changed through <base href=...>. */
+  const char *parent_base;     /* Base of the current document. */
+  const char *document_file;   /* File name of this document. */
+  bool nofollow;               /* whether NOFOLLOW was specified in a
+                                   <meta name=robots> tag. */
+
+  struct urlpos *head, *tail;  /* List of URLs that is being
+                                  built. */
+};
+
+struct urlpos *get_urls_file (const char *);
+struct urlpos *get_urls_html (const char *, const char *, bool *);
+struct urlpos *append_url (const char *, int, int, struct map_context *);
+void free_urlpos (struct urlpos *);
+
+#endif /* HTML_URL_H */
diff --git a/src/http.c b/src/http.c

index 99a059e57e2c70a6070f047cfa4cad02e9d78d06..d3f6704f87f7af033cf8239aa1101b94cb73ea5d 100644 (file)
--- a/src/http.c
+++ b/src/http.c
@@ -77,6 +77,7 @@ static struct cookie_jar *wget_cookie_jar;
  
  #define TEXTHTML_S "text/html"
  #define TEXTXHTML_S "application/xhtml+xml"
+#define TEXTCSS_S "text/css"
  
  /* Some status code validation macros: */
  #define H_20X(x)        (((x) >= 200) && ((x) < 300))
@@ -1235,6 +1236,7 @@ static char *create_authorization_line (const char *, const char *,
                                          const char *, bool *);
  static char *basic_authentication_encode (const char *, const char *);
  static bool known_authentication_scheme_p (const char *, const char *);
+static void ensure_extension (struct http_stat *, const char *, int *);
  static void load_cookies (void);
  
  #define BEGINS_WITH(line, string_constant)                               \
@@ -2017,34 +2019,25 @@ File `%s' already there; not retrieving.\n\n"), hs->local_file);
    else
      *dt &= ~TEXTHTML;
  
-  if (opt.html_extension && (*dt & TEXTHTML))
-    /* -E / --html-extension / html_extension = on was specified, and this is a
-       text/html file.  If some case-insensitive variation on ".htm[l]" isn't
-       already the file's suffix, tack on ".html". */
-    {
-      char *last_period_in_local_filename = strrchr (hs->local_file, '.');
+  if (type &&
+      0 == strncasecmp (type, TEXTCSS_S, strlen (TEXTCSS_S)))
+    *dt |= TEXTCSS;
+  else
+    *dt &= ~TEXTCSS;
  
-      if (last_period_in_local_filename == NULL
-          || !(0 == strcasecmp (last_period_in_local_filename, ".htm")
-               || 0 == strcasecmp (last_period_in_local_filename, ".html")))
+  if (opt.html_extension)
+    {
+      if (*dt & TEXTHTML)
+        /* -E / --html-extension / html_extension = on was specified,
+           and this is a text/html file.  If some case-insensitive
+           variation on ".htm[l]" isn't already the file's suffix,
+           tack on ".html". */
          {
-          int local_filename_len = strlen (hs->local_file);
-          /* Resize the local file, allowing for ".html" preceded by
-             optional ".NUMBER".  */
-          hs->local_file = xrealloc (hs->local_file,
-                                     local_filename_len + 24 + sizeof (".html"));
-          strcpy(hs->local_file + local_filename_len, ".html");
-          /* If clobbering is not allowed and the file, as named,
-             exists, tack on ".NUMBER.html" instead. */
-          if (!ALLOW_CLOBBER && file_exists_p (hs->local_file))
-            {
-              int ext_num = 1;
-              do
-                sprintf (hs->local_file + local_filename_len,
-                         ".%d.html", ext_num++);
-              while (file_exists_p (hs->local_file));
-            }
-          *dt |= ADDED_HTML_EXTENSION;
+          ensure_extension (hs, ".html", dt);
+        }
+      else if (*dt & TEXTCSS)
+        {
+          ensure_extension (hs, ".css", dt);
          }
      }
  
@@ -3018,6 +3011,42 @@ http_cleanup (void)
      cookie_jar_delete (wget_cookie_jar);
  }
  
+/* Make sure HS->local_file ends in EXT (for example ".html" or
+   ".css"), appending EXT when it does not.
+
+   The comparison against the existing suffix ignores case.  For a
+   five-character EXT such as ".html", the four-character short form
+   (".htm") is accepted as well; other extensions have no short form.
+
+   When EXT is appended and clobbering is not allowed, ".NUMBER" is
+   inserted before EXT until the name does not refer to an existing
+   file.  HS->local_file may be reallocated.  ADDED_HTML_EXTENSION is
+   set in *DT whenever the name was changed, whatever EXT is.  */
+static void
+ensure_extension (struct http_stat *hs, const char *ext, int *dt)
+{
+  char *last_period_in_local_filename = strrchr (hs->local_file, '.');
+  char shortext[8];
+  int len = strlen (ext);
+  int local_filename_len;
+  bool have_shortext = false;
+
+  if (len == 5)
+    {
+      /* Drop only the final character: ".html" -> ".htm".  */
+      memcpy (shortext, ext, len - 1);
+      shortext[len - 1] = '\0';
+      have_shortext = true;
+    }
+
+  /* Already carries the wanted suffix (or its short form): nothing to
+     do.  SHORTEXT is consulted only when it was actually filled in.  */
+  if (last_period_in_local_filename != NULL
+      && (0 == strcasecmp (last_period_in_local_filename, ext)
+          || (have_shortext
+              && 0 == strcasecmp (last_period_in_local_filename, shortext))))
+    return;
+
+  local_filename_len = strlen (hs->local_file);
+  /* Resize the local file, allowing for EXT preceded by optional
+     ".NUMBER", plus the terminating NUL.  */
+  hs->local_file = xrealloc (hs->local_file,
+                             local_filename_len + 24 + len + 1);
+  strcpy (hs->local_file + local_filename_len, ext);
+  /* If clobbering is not allowed and the file, as named, exists,
+     tack on ".NUMBER" followed by EXT instead. */
+  if (!ALLOW_CLOBBER && file_exists_p (hs->local_file))
+    {
+      int ext_num = 1;
+      do
+        sprintf (hs->local_file + local_filename_len,
+                 ".%d%s", ext_num++, ext);
+      while (file_exists_p (hs->local_file));
+    }
+  *dt |= ADDED_HTML_EXTENSION;
+}
+
  
  #ifdef TESTING
  
diff --git a/src/recur.c b/src/recur.c

index 980fc49d9636e96b62774cb05b6ed5a026056f12..024073ce3cf43aa900e0a3a90f2a7892bd6ef76a 100644 (file)
--- a/src/recur.c
+++ b/src/recur.c
@@ -48,8 +48,10 @@ so, delete this exception statement from your version.  */
  #include "hash.h"
  #include "res.h"
  #include "convert.h"
+#include "html-url.h"
+#include "css-url.h"
  #include "spider.h"
-\f
+
  /* Functions for maintaining the URL queue.  */
  
  struct queue_element {
@@ -58,7 +60,8 @@ struct queue_element {
    int depth;                   /* the depth */
    bool html_allowed;           /* whether the document is allowed to
                                    be treated as HTML. */
-
+  bool css_allowed;            /* whether the document is allowed to
+                                  be treated as CSS. */
    struct queue_element *next;  /* next element in queue */
  };
  
@@ -91,13 +94,15 @@ url_queue_delete (struct url_queue *queue)
  
  static void
  url_enqueue (struct url_queue *queue,
-            const char *url, const char *referer, int depth, bool html_allowed)
+            const char *url, const char *referer, int depth,
+             bool html_allowed, bool css_allowed)
  {
    struct queue_element *qel = xnew (struct queue_element);
    qel->url = url;
    qel->referer = referer;
    qel->depth = depth;
    qel->html_allowed = html_allowed;
+  qel->css_allowed = css_allowed;
    qel->next = NULL;
  
    ++queue->count;
@@ -121,7 +126,7 @@ url_enqueue (struct url_queue *queue,
  static bool
  url_dequeue (struct url_queue *queue,
              const char **url, const char **referer, int *depth,
-            bool *html_allowed)
+            bool *html_allowed, bool *css_allowed)
  {
    struct queue_element *qel = queue->head;
  
@@ -136,6 +141,7 @@ url_dequeue (struct url_queue *queue,
    *referer = qel->referer;
    *depth = qel->depth;
    *html_allowed = qel->html_allowed;
+  *css_allowed = qel->css_allowed;
  
    --queue->count;
  
@@ -200,7 +206,7 @@ retrieve_tree (const char *start_url)
  
    /* Enqueue the starting URL.  Use start_url_parsed->url rather than
       just URL so we enqueue the canonical form of the URL.  */
-  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true);
+  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false);
    string_set_add (blacklist, start_url_parsed->url);
  
    while (1)
@@ -208,7 +214,8 @@ retrieve_tree (const char *start_url)
        bool descend = false;
        char *url, *referer, *file = NULL;
        int depth;
-      bool html_allowed;
+      bool html_allowed, css_allowed;
+      bool is_css = false;
        bool dash_p_leaf_HTML = false;
  
        if (opt.quota && total_downloaded_bytes > opt.quota)
@@ -220,7 +227,7 @@ retrieve_tree (const char *start_url)
  
        if (!url_dequeue (queue,
                         (const char **)&url, (const char **)&referer,
-                       &depth, &html_allowed))
+                       &depth, &html_allowed, &css_allowed))
         break;
  
        /* ...and download it.  Note that this download is in most cases
@@ -238,10 +245,21 @@ retrieve_tree (const char *start_url)
           DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
                    url, file));
  
+          /* this sucks, needs to be combined! */
           if (html_allowed
               && downloaded_html_set
               && string_set_contains (downloaded_html_set, file))
-           descend = true;
+            {
+              descend = true;
+              is_css = false;
+            }
+          if (css_allowed
+              && downloaded_css_set
+              && string_set_contains (downloaded_css_set, file))
+            {
+              descend = 1;
+              is_css = true;
+            }
         }
        else
         {
@@ -252,7 +270,21 @@ retrieve_tree (const char *start_url)
  
           if (html_allowed && file && status == RETROK
               && (dt & RETROKF) && (dt & TEXTHTML))
-           descend = true;
+            {
+              descend = true;
+              is_css = false;
+            }
+
+          /* a little different, css_allowed can override content type
+             lots of web servers serve css with an incorrect content type
+          */
+          if (file && status == RETROK
+              && (dt & RETROKF) &&
+              ((dt & TEXTCSS) || css_allowed))
+            {
+              descend = true;
+              is_css = false;
+            }
  
           if (redirected)
             {
@@ -306,14 +338,15 @@ retrieve_tree (const char *start_url)
             }
         }
  
-      /* If the downloaded document was HTML, parse it and enqueue the
+      /* If the downloaded document was HTML or CSS, parse it and enqueue the
          links it contains. */
  
        if (descend)
         {
           bool meta_disallow_follow = false;
           struct urlpos *children
-           = get_urls_html (file, url, &meta_disallow_follow);
+           = is_css ? get_urls_css_file (file, url) :
+                       get_urls_html (file, url, &meta_disallow_follow);
  
           if (opt.use_robots && meta_disallow_follow)
             {
@@ -338,7 +371,8 @@ retrieve_tree (const char *start_url)
                     {
                       url_enqueue (queue, xstrdup (child->url->url),
                                    xstrdup (url), depth + 1,
-                                  child->link_expect_html);
+                                  child->link_expect_html,
+                                  child->link_expect_css);
                       /* We blacklist the URL we have enqueued, because we
                          don't want to enqueue (and hence download) the
                          same URL twice.  */
@@ -385,9 +419,9 @@ retrieve_tree (const char *start_url)
    {
      char *d1, *d2;
      int d3;
-    bool d4;
+    bool d4, d5;
      while (url_dequeue (queue,
-                       (const char **)&d1, (const char **)&d2, &d3, &d4))
+                       (const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
        {
         xfree (d1);
         xfree_null (d2);
diff --git a/src/recur.h b/src/recur.h

index b7e3c2e114dd512860aaf50e4b1647c62d8562bf..ed9854676dc8bdbf7012bcb64b670ef830552512 100644 (file)
--- a/src/recur.h
+++ b/src/recur.h
@@ -43,9 +43,4 @@ struct urlpos;
  void recursive_cleanup (void);
  uerr_t retrieve_tree (const char *);
  
-/* These are really in html-url.c. */
-struct urlpos *get_urls_file (const char *);
-struct urlpos *get_urls_html (const char *, const char *, bool *);
-void free_urlpos (struct urlpos *);
-
  #endif /* RECUR_H */
diff --git a/src/retr.c b/src/retr.c

index a2d462a87b34a206c6875e678b11d16fcd7b4de2..245eb129f40d5f49644d0bb69de1e7489f54cf1d 100644 (file)
--- a/src/retr.c
+++ b/src/retr.c
@@ -51,6 +51,7 @@ so, delete this exception statement from your version.  */
  #include "hash.h"
  #include "convert.h"
  #include "ptimer.h"
+#include "html-url.h"
  
  /* Total size of downloaded files.  Used to enforce quota.  */
  SUM_SIZE_INT total_downloaded_bytes;
@@ -784,6 +785,8 @@ retrieve_url (const char *origurl, char **file, char **newloc,
             register_redirection (origurl, u->url);
           if (*dt & TEXTHTML)
             register_html (u->url, local_file);
+         if (*dt & TEXTCSS)
+           register_css (u->url, local_file);
         }
      }
  
diff --git a/src/wget.h b/src/wget.h

index 740d2a9d51a44b905d04ee8bdb082964b3cf4108..c6dd19c10eec07540236219afc990b99206312b8 100644 (file)
--- a/src/wget.h
+++ b/src/wget.h
@@ -304,7 +304,8 @@ enum
    HEAD_ONLY            = 0x0004,       /* only send the HEAD request */
    SEND_NOCACHE         = 0x0008,       /* send Pragma: no-cache directive */
    ACCEPTRANGES         = 0x0010,       /* Accept-ranges header was found */
-  ADDED_HTML_EXTENSION = 0x0020         /* added ".html" extension due to -E */
+  ADDED_HTML_EXTENSION = 0x0020,        /* added ".html" extension due to -E */
+  TEXTCSS              = 0x0040                /* document is of type text/css */
  };
  
  /* Universal error type -- used almost everywhere.  Error reporting of
author	Micah Cowan <micah@cowan.name>
	Tue, 22 Apr 2008 07:15:48 +0000 (00:15 -0700)
committer	Micah Cowan <micah@cowan.name>
	Tue, 22 Apr 2008 07:15:48 +0000 (00:15 -0700)
configure.in		patch \| blob \| history
src/Makefile.in		patch \| blob \| history
src/convert.c		patch \| blob \| history
src/convert.h		patch \| blob \| history
src/css-tokens.h	[new file with mode: 0644]	patch \| blob
src/css-url.c	[new file with mode: 0644]	patch \| blob
src/css-url.h	[new file with mode: 0644]	patch \| blob
src/css.lex	[new file with mode: 0644]	patch \| blob
src/html-parse.c		patch \| blob \| history
src/html-parse.h		patch \| blob \| history
src/html-url.c		patch \| blob \| history
src/html-url.h	[new file with mode: 0644]	patch \| blob
src/http.c		patch \| blob \| history
src/recur.c		patch \| blob \| history
src/recur.h		patch \| blob \| history
src/retr.c		patch \| blob \| history
src/wget.h		patch \| blob \| history