--- /dev/null
+/* Collect URLs from CSS source.
+ Copyright (C) 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables. You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL". If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so. If you do not wish to do
+so, delete this exception statement from your version. */
+
+/*
+ Note that this is not an actual CSS parser, but just a lexical
+ scanner with a tiny bit more smarts bolted on top. A full parser
+ is somewhat overkill for this job. The only things we're interested
+ in are @import rules and url() tokens, so it's easy enough to
+ grab those without truly understanding the input. The only downside
+ to this is that we might be coerced into downloading files that
+ a browser would ignore. That might merit some more investigation.
+ */
+
+#include <config.h>
+
+#include <stdio.h>
+#ifdef HAVE_STRING_H
+# include <string.h>
+#else
+# include <strings.h>
+#endif
+#include <stdlib.h>
+#include <ctype.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "wget.h"
+#include "utils.h"
+#include "convert.h"
+#include "html-url.h"
+#include "css-tokens.h"
+
+/* from lex.yy.c */
+/* Hand-written declarations for the flex-generated CSS scanner; they
+   must be kept in agreement with the generated lex.yy.c. */
+
+/* Text of the token most recently returned by yylex. */
+extern char *yytext;
+/* Length of that token; used below to track the scan position. */
+extern int yyleng;
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+/* Makes the scanner read its input from the LEN bytes at BYTES. */
+extern YY_BUFFER_STATE yy_scan_bytes (const char *bytes,int len );
+/* Returns the code of the next token (compared against the constants
+   such as CSSEOF, IMPORT_SYM, S, STRING and URI below). */
+extern int yylex (void);
+
+/* Printable names for the scanner's token codes, intended for debug
+   output (index with the value returned by yylex).
+   NOTE(review): the entries are presumably listed in the same order as
+   the token constants in css-tokens.h -- confirm before relying on it. */
+#if 1
+const char *token_names[] = {
+ "CSSEOF",
+ "S",
+ "CDO",
+ "CDC",
+ "INCLUDES",
+ "DASHMATCH",
+ "LBRACE",
+ "PLUS",
+ "GREATER",
+ "COMMA",
+ "STRING",
+ "INVALID",
+ "IDENT",
+ "HASH",
+ "IMPORT_SYM",
+ "PAGE_SYM",
+ "MEDIA_SYM",
+ "CHARSET_SYM",
+ "IMPORTANT_SYM",
+ "EMS",
+ "EXS",
+ "LENGTH",
+ "ANGLE",
+ "TIME",
+ "FREQ",
+ "DIMENSION",
+ "PERCENTAGE",
+ "NUMBER",
+ "URI",
+ "FUNCTION"
+};
+#endif
+
+/*
+  Given a detected URI token, get only the URI specified within.
+  Also adjust the starting position and length of the string.
+
+  AT is the buffer containing the token.  On entry *POS is the offset
+  of the token within AT and *LENGTH is the length of the whole token,
+  including the leading "url(" and the closing ")".  On return *POS
+  and *LENGTH describe only the URI text itself.
+
+  Returns a newly allocated, NUL-terminated copy of the URI text which
+  the caller must release, or NULL (leaving *POS and *LENGTH untouched)
+  if the token is too short or does not begin with "url(" (compared
+  case-insensitively).
+
+  A URI can be specified with or without quotes, and the quotes
+  can be single or double quotes.  In addition there can be
+  whitespace after the opening parenthesis and before the closing
+  parenthesis.
+*/
+char *
+get_uri_string (const char *at, int *pos, int *length)
+{
+  int start = *pos;
+  int len = *length;
+  char *uri;
+
+  /* The shortest possible token is "url()": four bytes of prefix plus
+     the closing parenthesis.  */
+  if (len < 5 || 0 != strncasecmp (at + start, "url(", 4))
+    return NULL;
+
+  start += 4;
+  len -= 5;                     /* url() */
+
+  /* Skip leading space.  The length check keeps a whitespace-only
+     "url(   )" from driving LEN negative; the cast keeps isspace
+     defined for bytes with the high bit set.  */
+  while (len > 0 && isspace ((unsigned char) at[start]))
+    {
+      start++;
+      len--;
+    }
+  /* Skip trailing space, never stepping back past START.  */
+  while (len > 0 && isspace ((unsigned char) at[start + len - 1]))
+    len--;
+
+  /* Trim off quotes, but only when there really is a matching pair.  */
+  if (len >= 2
+      && (at[start] == '\'' || at[start] == '"')
+      && at[start + len - 1] == at[start])
+    {
+      start++;
+      len -= 2;
+    }
+
+  *pos = start;
+  *length = len;
+
+  uri = xmalloc (len + 1);
+  if (uri)
+    {
+      memcpy (uri, at + start, len);
+      uri[len] = '\0';
+    }
+
+  return uri;
+}
+
+/*
+  Scan BUF_LENGTH bytes of CSS starting at CTX->text + OFFSET and hand
+  every referenced URL to append_url.
+
+  Two constructs are recognised:
+    - the target of an @import rule, written either as a quoted string
+      or as url(...); the resulting entry is additionally flagged with
+      link_expect_css;
+    - any other url(...) token, regardless of the property it appears
+      on.
+  Every entry returned by append_url is flagged link_inline_p and
+  link_css_p.  The position passed to append_url is relative to
+  CTX->text and, together with the length, covers only the URL text
+  (without "url(", quotes or surrounding whitespace).
+*/
+void
+get_urls_css (struct map_context *ctx, int offset, int buf_length)
+{
+  /* Provided by the flex-generated scanner; declared locally so the
+     scan buffer created below can be released again.  */
+  extern void yy_delete_buffer (YY_BUFFER_STATE b);
+
+  YY_BUFFER_STATE scan_buffer;
+  int token;
+  int buffer_pos = 0;           /* offset of the current token in the
+                                   scanned region */
+  int pos, length;
+  char *uri;
+
+  /* tell flex to scan from this buffer */
+  scan_buffer = yy_scan_bytes (ctx->text + offset, buf_length);
+
+  while ((token = yylex ()) != CSSEOF)
+    {
+      /* @import "foo.css"
+         or @import url(foo.css)
+      */
+      if (token == IMPORT_SYM)
+        {
+          /* Step over the @import keyword and any whitespace tokens
+             that follow it.  */
+          do
+            {
+              buffer_pos += yyleng;
+            }
+          while ((token = yylex ()) == S);
+
+          /* The input ended right after "@import"; do not call the
+             scanner again once it has reported end of input.  */
+          if (token == CSSEOF)
+            break;
+
+          if (token == STRING || token == URI)
+            {
+              pos = buffer_pos + offset;
+              length = yyleng;
+              uri = NULL;
+
+              if (token == URI)
+                {
+                  uri = get_uri_string (ctx->text, &pos, &length);
+                }
+              else if (length >= 2)
+                {
+                  /* cut out quote characters */
+                  pos++;
+                  length -= 2;
+                  uri = xmalloc (length + 1);
+                  memcpy (uri, yytext + 1, length);
+                  uri[length] = '\0';
+                }
+
+              if (uri)
+                {
+                  struct urlpos *up = append_url (uri, pos, length, ctx);
+                  DEBUGP (("Found @import: [%s] at %d [%s]\n", yytext, buffer_pos, uri));
+
+                  if (up)
+                    {
+                      up->link_inline_p = 1;
+                      up->link_css_p = 1;
+                      up->link_expect_css = 1;
+                    }
+
+                  xfree (uri);
+                }
+            }
+        }
+      /* background-image: url(foo.png)
+         note that we don't care what
+         property this is actually on.
+      */
+      else if (token == URI)
+        {
+          pos = buffer_pos + offset;
+          length = yyleng;
+          uri = get_uri_string (ctx->text, &pos, &length);
+
+          if (uri)
+            {
+              struct urlpos *up = append_url (uri, pos, length, ctx);
+              DEBUGP (("Found URI: [%s] at %d [%s]\n", yytext, buffer_pos, uri));
+              if (up)
+                {
+                  up->link_inline_p = 1;
+                  up->link_css_p = 1;
+                }
+
+              xfree (uri);
+            }
+        }
+      /* Account for the token just examined.  */
+      buffer_pos += yyleng;
+    }
+
+  /* yy_scan_bytes allocated a scanner buffer for this region; release
+     it so repeated calls do not leak.  */
+  yy_delete_buffer (scan_buffer);
+  DEBUGP (("\n"));
+}
+
+/* Load the CSS file FILE and collect the URLs it refers to.
+
+   URL, when non-NULL, is recorded as the parent base of the scanning
+   context; otherwise opt.base_href is used.  Returns the head of the
+   list accumulated in the context by get_urls_css, or NULL (after
+   logging the error) when FILE cannot be read.  */
+struct urlpos *
+get_urls_css_file (const char *file, const char *url)
+{
+  struct map_context ctx;
+  struct file_memory *fm = read_file (file);
+
+  if (fm == NULL)
+    {
+      /* Could not load the file: report why and give up.  */
+      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
+      return NULL;
+    }
+  DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
+
+  /* Set up a scanning context covering the whole file.  */
+  ctx.text = fm->content;
+  ctx.head = NULL;
+  ctx.tail = NULL;
+  ctx.base = NULL;
+  if (url)
+    ctx.parent_base = url;
+  else
+    ctx.parent_base = opt.base_href;
+  ctx.document_file = file;
+  ctx.nofollow = 0;
+
+  get_urls_css (&ctx, 0, fm->length);
+
+  /* The file contents are released before the collected list is
+     returned to the caller.  */
+  read_file_free (fm);
+  return ctx.head;
+}