X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Fcss-url.c;fp=src%2Fcss-url.c;h=42c8fc3e7e9747bd8be5beec605af236d575698f;hp=0000000000000000000000000000000000000000;hb=a0d0f332d5f230e40fe7fff8fc76839c4f4704ce;hpb=3f51773542ff65075706f08088b91af6bf9e278e diff --git a/src/css-url.c b/src/css-url.c new file mode 100644 index 00000000..42c8fc3e --- /dev/null +++ b/src/css-url.c @@ -0,0 +1,273 @@ +/* Collect URLs from CSS source. + Copyright (C) 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc. + +This file is part of GNU Wget. + +GNU Wget is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + +GNU Wget is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with Wget; if not, write to the Free Software +Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +In addition, as a special exception, the Free Software Foundation +gives permission to link the code of its release of Wget with the +OpenSSL project's "OpenSSL" library (or with modified versions of it +that use the same license as the "OpenSSL" library), and distribute +the linked executables. You must obey the GNU General Public License +in all respects for all of the code used other than "OpenSSL". If you +modify this file, you may extend this exception to your version of the +file, but you are not obligated to do so. If you do not wish to do +so, delete this exception statement from your version. */ + +/* + Note that this is not an actual CSS parser, but just a lexical + scanner with a tiny bit more smarts bolted on top. A full parser + is somewhat overkill for this job. The only things we're interested + in are @import rules and url() tokens, so it's easy enough to + grab those without truly understanding the input. The only downside + to this is that we might be coerced into downloading files that + a browser would ignore. That might merit some more investigation. + */ + +#include + +#include +#ifdef HAVE_STRING_H +# include +#else +# include +#endif +#include +#include +#include +#include + +#include "wget.h" +#include "utils.h" +#include "convert.h" +#include "html-url.h" +#include "css-tokens.h" + +/* from lex.yy.c */ +extern char *yytext; +extern int yyleng; +typedef struct yy_buffer_state *YY_BUFFER_STATE; +extern YY_BUFFER_STATE yy_scan_bytes (const char *bytes,int len ); +extern int yylex (void); + +#if 1 +const char *token_names[] = { + "CSSEOF", + "S", + "CDO", + "CDC", + "INCLUDES", + "DASHMATCH", + "LBRACE", + "PLUS", + "GREATER", + "COMMA", + "STRING", + "INVALID", + "IDENT", + "HASH", + "IMPORT_SYM", + "PAGE_SYM", + "MEDIA_SYM", + "CHARSET_SYM", + "IMPORTANT_SYM", + "EMS", + "EXS", + "LENGTH", + "ANGLE", + "TIME", + "FREQ", + "DIMENSION", + "PERCENTAGE", + "NUMBER", + "URI", + "FUNCTION" +}; +#endif + +/* + Given a detected URI token, get only the URI specified within. + Also adjust the starting position and length of the string. + + A URI can be specified with or without quotes, and the quotes + can be single or double quotes. In addition there can be + whitespace after the opening parenthesis and before the closing + parenthesis. +*/ +char * +get_uri_string (const char *at, int *pos, int *length) +{ + char *uri; + /*char buf[1024]; + strncpy(buf,at + *pos, *length); + buf[*length] = '\0'; + DEBUGP (("get_uri_string: \"%s\"\n", buf));*/ + + if (0 != strncasecmp (at + *pos, "url(", 4)) + return NULL; + + *pos += 4; + *length -= 5; /* url() */ + /* skip leading space */ + while (isspace (at[*pos])) + { + (*pos)++; + (*length)--; + } + /* skip trailing space */ + while (isspace (at[*pos + *length - 1])) + { + (*length)--; + } + /* trim off quotes */ + if (at[*pos] == '\'' || at[*pos] == '"') + { + (*pos)++; + *length -= 2; + } + + uri = xmalloc (*length + 1); + if (uri) + { + strncpy (uri, at + *pos, *length); + uri[*length] = '\0'; + } + + return uri; +} + +void +get_urls_css (struct map_context *ctx, int offset, int buf_length) +{ + int token; + /*char tmp[2048];*/ + int buffer_pos = 0; + int pos, length; + char *uri; + + /* + strncpy(tmp,ctx->text + offset, buf_length); + tmp[buf_length] = '\0'; + DEBUGP (("get_urls_css: \"%s\"\n", tmp)); + */ + + /* tell flex to scan from this buffer */ + yy_scan_bytes (ctx->text + offset, buf_length); + + while((token = yylex()) != CSSEOF) + { + /*DEBUGP (("%s ", token_names[token]));*/ + /* @import "foo.css" + or @import url(foo.css) + */ + if(token == IMPORT_SYM) + { + do { + buffer_pos += yyleng; + } while((token = yylex()) == S); + + /*DEBUGP (("%s ", token_names[token]));*/ + + if (token == STRING || token == URI) + { + /*DEBUGP (("Got URI "));*/ + pos = buffer_pos + offset; + length = yyleng; + + if (token == URI) + { + uri = get_uri_string (ctx->text, &pos, &length); + } + else + { + /* cut out quote characters */ + pos++; + length -= 2; + uri = xmalloc (length + 1); + strncpy (uri, yytext + 1, length); + uri[length] = '\0'; + } + + if (uri) + { + struct urlpos *up = append_url (uri, pos, length, ctx); + DEBUGP (("Found @import: [%s] at %d [%s]\n", yytext, buffer_pos, uri)); + + if (up) + { + up->link_inline_p = 1; + up->link_css_p = 1; + up->link_expect_css = 1; + } + + xfree(uri); + } + } + } + /* background-image: url(foo.png) + note that we don't care what + property this is actually on. + */ + else if(token == URI) + { + pos = buffer_pos + offset; + length = yyleng; + uri = get_uri_string (ctx->text, &pos, &length); + + if (uri) + { + struct urlpos *up = append_url (uri, pos, length, ctx); + DEBUGP (("Found URI: [%s] at %d [%s]\n", yytext, buffer_pos, uri)); + if (up) + { + up->link_inline_p = 1; + up->link_css_p = 1; + } + + xfree (uri); + } + } + buffer_pos += yyleng; + } + DEBUGP (("\n")); +} + +struct urlpos * +get_urls_css_file (const char *file, const char *url) +{ + struct file_memory *fm; + struct map_context ctx; + + /* Load the file. */ + fm = read_file (file); + if (!fm) + { + logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); + return NULL; + } + DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); + + ctx.text = fm->content; + ctx.head = ctx.tail = NULL; + ctx.base = NULL; + ctx.parent_base = url ? url : opt.base_href; + ctx.document_file = file; + ctx.nofollow = 0; + + get_urls_css (&ctx, 0, fm->length); + read_file_free (fm); + return ctx.head; +}