X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Fhtml-parse.c;h=e454860b4506c72302e51ef60031fe8326a0ed66;hb=7c802e58d3e45e3a21d99c8d24dc5be806ecf174;hp=75ee031fa698d1703a43cb06adf448d482a52a7f;hpb=e559249a48cdf13099529e5418e699e27b38dbd6;p=wget diff --git a/src/html-parse.c b/src/html-parse.c index 75ee031f..e454860b 100644 --- a/src/html-parse.c +++ b/src/html-parse.c @@ -1,21 +1,31 @@ /* HTML parser for Wget. - Copyright (C) 1998, 2000 Free Software Foundation, Inc. + Copyright (C) 1998, 2000, 2003 Free Software Foundation, Inc. -This file is part of Wget. +This file is part of GNU Wget. -This program is free software; you can redistribute it and/or modify +GNU Wget is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. -This program is distributed in the hope that it will be useful, +GNU Wget is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +along with Wget; if not, write to the Free Software +Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +In addition, as a special exception, the Free Software Foundation +gives permission to link the code of its release of Wget with the +OpenSSL project's "OpenSSL" library (or with modified versions of it +that use the same license as the "OpenSSL" library), and distribute +the linked executables. You must obey the GNU General Public License +in all respects for all of the code used other than "OpenSSL". If you +modify this file, you may extend this exception to your version of the +file, but you are not obligated to do so. If you do not wish to do +so, delete this exception statement from your version. */ /* The only entry point to this module is map_html_tags(), which see. */ @@ -83,6 +93,10 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include +#ifdef STANDALONE +# define I_REALLY_WANT_CTYPE_MACROS +#endif + #include #include #ifdef HAVE_STRING_H @@ -99,12 +113,24 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ # define xmalloc malloc # define xrealloc realloc # define xfree free + +# define ISSPACE(x) isspace (x) +# define ISDIGIT(x) isdigit (x) +# define ISALPHA(x) isalpha (x) +# define ISALNUM(x) isalnum (x) +# define TOLOWER(x) tolower (x) #endif /* STANDALONE */ -/* Pool support. For efficiency, map_html_tags() stores temporary - string data to a single stack-allocated pool. If the pool proves - too small, additional memory is allocated/resized with - malloc()/realloc(). */ +/* Pool support. A pool is a resizable chunk of memory. It is first + allocated on the stack, and moved to the heap if it needs to be + larger than originally expected. map_html_tags() uses it to store + the zero-terminated names and values of tags and attributes. + + Thus taginfo->name, and attr->name and attr->value for each + attribute, do not point into separately allocated areas, but into + different parts of the pool, separated only by terminating zeros. + This ensures minimum amount of allocation and, for most tags, no + allocation because the entire pool is kept on the stack. */ struct pool { char *contents; /* pointer to the contents. */ @@ -318,34 +344,31 @@ array_allowed (const char **array, const char *beg, const char *end) return 1; } -/* RFC1866: name [of attribute or tag] consists of letters, digits, - periods, or hyphens. We also allow _, for compatibility with - brain-damaged generators. */ -#define NAME_CHAR_P(x) (ISALNUM (x) || (x) == '.' || (x) == '-' || (x) == '_') - -/* States while advancing through comments. */ -#define AC_S_DONE 0 -#define AC_S_BACKOUT 1 -#define AC_S_BANG 2 -#define AC_S_DEFAULT 3 -#define AC_S_DCLNAME 4 -#define AC_S_DASH1 5 -#define AC_S_DASH2 6 -#define AC_S_COMMENT 7 -#define AC_S_DASH3 8 -#define AC_S_DASH4 9 -#define AC_S_QUOTE1 10 -#define AC_S_IN_QUOTE 11 -#define AC_S_QUOTE2 12 +/* Originally we used to adhere to rfc 1866 here, and allowed only + letters, digits, periods, and hyphens as names (of tags or + attributes). However, this broke too many pages which used + proprietary or strange attributes, e.g. . + + So now we allow any character except: + * whitespace + * 8-bit and control chars + * characters that clearly cannot be part of name: + '=', '>', '/'. + + This only affects attribute and tag names; attribute values allow + an even greater variety of characters. */ + +#define NAME_CHAR_P(x) ((x) > 32 && (x) < 127 \ + && (x) != '=' && (x) != '>' && (x) != '/') #ifdef STANDALONE static int comment_backout_count; #endif -/* Advance over an SGML declaration (the forms you find in HTML - documents). The function returns the location after the - declaration. The reason we need this is that HTML comments are - expressed as comments in so-called "empty declarations". +/* Advance over an SGML declaration, such as . In + strict comments mode, this is used for skipping over comments as + well. To recap: any SGML declaration may have comments associated with it, e.g. @@ -359,17 +382,31 @@ static int comment_backout_count; Whitespace is allowed between and after the comments, but not - before the first comment. + before the first comment. Additionally, this function attempts to + handle double quotes in SGML declarations correctly. */ - Additionally, this function attempts to handle double quotes in - SGML declarations correctly. */ static const char * advance_declaration (const char *beg, const char *end) { const char *p = beg; char quote_char = '\0'; /* shut up, gcc! */ char ch; - int state = AC_S_BANG; + + enum { + AC_S_DONE, + AC_S_BACKOUT, + AC_S_BANG, + AC_S_DEFAULT, + AC_S_DCLNAME, + AC_S_DASH1, + AC_S_DASH2, + AC_S_COMMENT, + AC_S_DASH3, + AC_S_DASH4, + AC_S_QUOTE1, + AC_S_IN_QUOTE, + AC_S_QUOTE2, + } state = AC_S_BANG; if (beg == end) return beg; @@ -424,15 +461,17 @@ advance_declaration (const char *beg, const char *end) } break; case AC_S_DCLNAME: - if (NAME_CHAR_P (ch)) - ch = *p++; - else if (ch == '-') + if (ch == '-') state = AC_S_DASH1; + else if (NAME_CHAR_P (ch)) + ch = *p++; else state = AC_S_DEFAULT; break; case AC_S_QUOTE1: - assert (ch == '\'' || ch == '"'); + /* We must use 0x22 because broken assert macros choke on + '"' and '\"'. */ + assert (ch == '\'' || ch == 0x22); quote_char = ch; /* cheating -- I really don't feel like introducing more different states for different quote characters. */ @@ -506,6 +545,55 @@ advance_declaration (const char *beg, const char *end) } return p; } + +/* Find the first occurrence of the substring "-->" in [BEG, END) and + return the pointer to the character after the substring. If the + substring is not found, return NULL. */ + +static const char * +find_comment_end (const char *beg, const char *end) +{ + /* Open-coded Boyer-Moore search for "-->". Examine the third char; + if it's not '>' or '-', advance by three characters. Otherwise, + look at the preceding characters and try to find a match. */ + + const char *p = beg - 1; + + while ((p += 3) < end) + switch (p[0]) + { + case '>': + if (p[-1] == '-' && p[-2] == '-') + return p + 1; + break; + case '-': + at_dash: + if (p[-1] == '-') + { + at_dash_dash: + if (++p == end) return NULL; + switch (p[0]) + { + case '>': return p + 1; + case '-': goto at_dash_dash; + } + } + else + { + if ((p += 2) >= end) return NULL; + switch (p[0]) + { + case '>': + if (p[-1] == '-') + return p + 1; + break; + case '-': + goto at_dash; + } + } + } + return NULL; +} /* Advance P (a char pointer), with the explicit intent of being able to read the next character. If this is not possible, go to finish. */ @@ -597,8 +685,26 @@ map_html_tags (const char *text, int size, declaration). */ if (*p == '!') { - /* This is an SGML declaration -- just skip it. */ - p = advance_declaration (p, end); + if (!opt.strict_comments + && p < end + 3 && p[1] == '-' && p[2] == '-') + { + /* If strict comments are not enforced and if we know + we're looking at a comment, simply look for the + terminating "-->". Non-strict is the default because + it works in other browsers and most HTML writers can't + be bothered with getting the comments right. */ + const char *comment_end = find_comment_end (p + 3, end); + if (comment_end) + p = comment_end; + } + else + { + /* Either in strict comment mode or looking at a non-empty + declaration. Real declarations are much less likely to + be misused the way comments are, so advance over them + properly regardless of strictness. */ + p = advance_declaration (p, end); + } if (p == end) goto finish; goto look_for_tag; @@ -638,6 +744,19 @@ map_html_tags (const char *text, int size, SKIP_WS (p); + if (*p == '/') + { + /* A slash at this point means the tag is about to be + closed. This is legal in XML and has been popularized + in HTML via XHTML. */ + /* */ + /* ^ */ + ADVANCE (p); + SKIP_WS (p); + if (*p != '>') + goto backout_tag; + } + /* Check for end of tag definition. */ if (*p == '>') break; @@ -654,7 +773,7 @@ map_html_tags (const char *text, int size, /* Establish bounds of attribute value. */ SKIP_WS (p); - if (NAME_CHAR_P (*p) || *p == '>') + if (NAME_CHAR_P (*p) || *p == '/' || *p == '>') { /* Minimized attribute syntax allows `=' to be omitted. For example,
    is a valid shorthand for