Ted Mielczarek's CSS wonder-patch, applied against the source from around the time...

[wget] / src / css-url.c
diff --git a/src/css-url.c b/src/css-url.c

new file mode 100644 (file)

index 0000000..42c8fc3
--- /dev/null
+++ b/src/css-url.c
@@ -0,0 +1,273 @@
+/* Collect URLs from CSS source.
+   Copyright (C) 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables.  You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL".  If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so.  If you do not wish to do
+so, delete this exception statement from your version.  */
+
+/*
+  Note that this is not an actual CSS parser, but just a lexical
+  scanner with a tiny bit more smarts bolted on top.  A full parser
+  is somewhat overkill for this job.  The only things we're interested
+  in are @import rules and url() tokens, so it's easy enough to
+  grab those without truly understanding the input.  The only downside
+  to this is that we might be coerced into downloading files that
+  a browser would ignore.  That might merit some more investigation.
+ */
+
+#include <config.h>
+
+#include <stdio.h>
+#ifdef HAVE_STRING_H
+# include <string.h>
+#else
+# include <strings.h>
+#endif
+#include <stdlib.h>
+#include <ctype.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "wget.h"
+#include "utils.h"
+#include "convert.h"
+#include "html-url.h"
+#include "css-tokens.h"
+
+/* Scanner state and entry points defined in the flex-generated lex.yy.c.  */
+extern char *yytext;   /* text of the token most recently returned by yylex */
+extern int yyleng;     /* length of yytext */
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+extern YY_BUFFER_STATE yy_scan_bytes (const char *bytes,int len  );
+extern int yylex (void);       /* returns the next token code; CSSEOF ends the scan loop */
+
+#if 1 /* Debugging aid: printable names for the scanner's tokens.  */
+const char *token_names[] = { /* NOTE(review): presumably indexed by token code (css-tokens.h) -- confirm order */
+  "CSSEOF",
+  "S",
+  "CDO",
+  "CDC",
+  "INCLUDES",
+  "DASHMATCH",
+  "LBRACE",
+  "PLUS",
+  "GREATER",
+  "COMMA",
+  "STRING",
+  "INVALID",
+  "IDENT",
+  "HASH",
+  "IMPORT_SYM",
+  "PAGE_SYM",
+  "MEDIA_SYM",
+  "CHARSET_SYM",
+  "IMPORTANT_SYM",
+  "EMS",
+  "EXS",
+  "LENGTH",
+  "ANGLE",
+  "TIME",
+  "FREQ",
+  "DIMENSION",
+  "PERCENTAGE",
+  "NUMBER",
+  "URI",
+  "FUNCTION"
+};
+#endif /* token_names */
+
+/*
+  Given a detected URI token, get only the URI specified within.
+  Also adjust the starting position and length of the string.
+
+  A URI can be specified with or without quotes, and the quotes
+  can be single or double quotes.  In addition there can be
+  whitespace after the opening parenthesis and before the closing
+  parenthesis.
+
+  On entry AT + *POS is the token and *LENGTH its full length.  On
+  success *POS and *LENGTH describe the bare URI and an xmalloc'd,
+  NUL-terminated copy is returned for the caller to free; a blank
+  body such as "url( )" gives "" with *LENGTH == 0.  Returns NULL,
+  with both untouched, if the token is shorter than "url()" or
+  does not start with "url(" (any letter case).
+*/
+char *
+get_uri_string (const char *at, int *pos, int *length)
+{
+  char *uri;
+  int start = *pos + 4;
+  int len = *length - 5; /* url() */
+
+  if (len < 0 || 0 != strncasecmp (at + *pos, "url(", 4))
+    return NULL;
+
+  /* Skip surrounding space within the token; isspace needs unsigned char.  */
+  while (len > 0 && isspace ((unsigned char) at[start]))
+    {
+      start++;
+      len--;
+    }
+  while (len > 0 && isspace ((unsigned char) at[start + len - 1]))
+    len--;
+
+  /* Trim off quotes, but only when both of them fit.  */
+  if (len >= 2 && (at[start] == '\'' || at[start] == '"'))
+    {
+      start++;
+      len -= 2;
+    }
+
+  uri = xmalloc (len + 1);
+  memcpy (uri, at + start, len);
+  uri[len] = '\0';
+
+  *pos = start;
+  *length = len;
+  return uri;
+}
+
+/*
+  Scan BUF_LENGTH bytes of CSS starting at CTX->text + OFFSET and pass
+  every URL found to append_url.  Two constructs are recognized:
+  @import followed by a string or url() token (marked link_inline_p,
+  link_css_p and link_expect_css), and any other url() token (marked
+  link_inline_p and link_css_p).  The positions handed to append_url
+  are offsets into CTX->text, not into the scanned region.
+*/
+void
+get_urls_css (struct map_context *ctx, int offset, int buf_length)
+{
+  extern void yy_delete_buffer (YY_BUFFER_STATE b); /* from lex.yy.c */
+  YY_BUFFER_STATE scan_buffer;
+  int token;
+  int buffer_pos = 0;   /* bytes of the region consumed before this token */
+  int pos, length;
+  char *uri;
+
+  /* tell flex to scan from this buffer */
+  scan_buffer = yy_scan_bytes (ctx->text + offset, buf_length);
+
+  while((token = yylex()) != CSSEOF)
+    {
+      /* @import "foo.css"
+         or @import url(foo.css)
+      */
+      if(token == IMPORT_SYM)
+        {
+          do {
+            buffer_pos += yyleng;
+          } while((token = yylex()) == S);
+
+          /* Input ended right after @import.  Leave the loop here:
+             flex leaves further yylex calls after EOF undefined.  */
+          if (token == CSSEOF)
+            break;
+
+          if (token == STRING || token == URI)
+            {
+              pos = buffer_pos + offset;
+              length = yyleng;
+
+              if (token == URI)
+                uri = get_uri_string (ctx->text, &pos, &length);
+              else
+                {
+                  /* cut out quote characters */
+                  pos++;
+                  length -= 2;
+                  uri = xmalloc (length + 1);
+                  memcpy (uri, yytext + 1, length);
+                  uri[length] = '\0';
+                }
+
+              if (uri)
+                {
+                  struct urlpos *up = append_url (uri, pos, length, ctx);
+                  DEBUGP (("Found @import: [%s] at %d [%s]\n", yytext, buffer_pos, uri));
+                  if (up)
+                    {
+                      up->link_inline_p = 1;
+                      up->link_css_p = 1;
+                      up->link_expect_css = 1;
+                    }
+                  xfree(uri);
+                }
+            }
+        }
+      /* background-image: url(foo.png)
+         note that we don't care what property this is actually on.  */
+      else if(token == URI)
+        {
+          pos = buffer_pos + offset;
+          length = yyleng;
+          uri = get_uri_string (ctx->text, &pos, &length);
+
+          if (uri)
+            {
+              struct urlpos *up = append_url (uri, pos, length, ctx);
+              DEBUGP (("Found URI: [%s] at %d [%s]\n", yytext, buffer_pos, uri));
+              if (up)
+                {
+                  up->link_inline_p = 1;
+                  up->link_css_p = 1;
+                }
+              xfree (uri);
+            }
+        }
+      buffer_pos += yyleng;
+    }
+  DEBUGP (("\n"));
+
+  /* yy_scan_bytes allocated this buffer; give it back to flex.  */
+  yy_delete_buffer (scan_buffer);
+}
+
+/* Read the CSS file FILE, scan it with get_urls_css and return
+   ctx.head as left by the scan, or NULL if FILE cannot be read.
+   URL, or opt.base_href when URL is NULL, becomes the parent base.  */
+struct urlpos *
+get_urls_css_file (const char *file, const char *url)
+{
+  struct map_context ctx;
+  struct file_memory *loaded = read_file (file);
+
+  if (loaded == NULL)
+    {
+      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
+      return NULL;
+    }
+  DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (loaded->length)));
+
+  ctx.document_file = file;
+  ctx.text = loaded->content;
+  ctx.head = ctx.tail = NULL;
+  ctx.base = NULL;
+  ctx.nofollow = 0;
+  ctx.parent_base = url ? url : opt.base_href;
+  get_urls_css (&ctx, 0, loaded->length);
+  read_file_free (loaded);
+  return ctx.head;
+}