X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Fhtml-parse.c;h=e454860b4506c72302e51ef60031fe8326a0ed66;hb=7c802e58d3e45e3a21d99c8d24dc5be806ecf174;hp=e10c4855466de3fef61c6fce2d6763bf937f03c9;hpb=90cdb82942f5c904a933e6f9b05e6f046df0dd4c;p=wget diff --git a/src/html-parse.c b/src/html-parse.c index e10c4855..e454860b 100644 --- a/src/html-parse.c +++ b/src/html-parse.c @@ -1,5 +1,5 @@ /* HTML parser for Wget. - Copyright (C) 1998, 2000 Free Software Foundation, Inc. + Copyright (C) 1998, 2000, 2003 Free Software Foundation, Inc. This file is part of GNU Wget. @@ -15,7 +15,17 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Wget; if not, write to the Free Software -Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +In addition, as a special exception, the Free Software Foundation +gives permission to link the code of its release of Wget with the +OpenSSL project's "OpenSSL" library (or with modified versions of it +that use the same license as the "OpenSSL" library), and distribute +the linked executables. You must obey the GNU General Public License +in all respects for all of the code used other than "OpenSSL". If you +modify this file, you may extend this exception to your version of the +file, but you are not obligated to do so. If you do not wish to do +so, delete this exception statement from your version. */ /* The only entry point to this module is map_html_tags(), which see. */ @@ -83,6 +93,10 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include +#ifdef STANDALONE +# define I_REALLY_WANT_CTYPE_MACROS +#endif + #include #include #ifdef HAVE_STRING_H @@ -99,12 +113,24 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/
 # define xmalloc malloc
 # define xrealloc realloc
 # define xfree free
+/* <ctype.h> takes unsigned char values; a plain char may be negative.  */
+# define ISSPACE(x) isspace ((unsigned char) (x))
+# define ISDIGIT(x) isdigit ((unsigned char) (x))
+# define ISALPHA(x) isalpha ((unsigned char) (x))
+# define ISALNUM(x) isalnum ((unsigned char) (x))
+# define TOLOWER(x) tolower ((unsigned char) (x))
 #endif /* STANDALONE */
 
-/* Pool support.  For efficiency, map_html_tags() stores temporary
-   string data to a single stack-allocated pool.  If the pool proves
-   too small, additional memory is allocated/resized with
-   malloc()/realloc().  */
+/* Pool support.  A pool is a resizable chunk of memory.  It is first
+   allocated on the stack, and moved to the heap if it needs to be
+   larger than originally expected.  map_html_tags() uses it to store
+   the zero-terminated names and values of tags and attributes.
+
+   Thus taginfo->name, and attr->name and attr->value for each
+   attribute, do not point into separately allocated areas, but into
+   different parts of the pool, separated only by terminating zeros.
+   This ensures minimum amount of allocation and, for most tags, no
+   allocation because the entire pool is kept on the stack.  */
 
 struct pool {
   char *contents;               /* pointer to the contents. */
@@ -318,34 +344,31 @@ array_allowed (const char **array, const char *beg, const char *end)
   return 1;
 }
 
-/* RFC1866: name [of attribute or tag] consists of letters, digits,
-   periods, or hyphens.  We also allow _, for compatibility with
-   brain-damaged generators.  */
-#define NAME_CHAR_P(x) (ISALNUM (x) || (x) == '.' || (x) == '-' || (x) == '_')
-
-/* States while advancing through comments. */
-#define AC_S_DONE       0
-#define AC_S_BACKOUT    1
-#define AC_S_BANG       2
-#define AC_S_DEFAULT    3
-#define AC_S_DCLNAME    4
-#define AC_S_DASH1      5
-#define AC_S_DASH2      6
-#define AC_S_COMMENT    7
-#define AC_S_DASH3      8
-#define AC_S_DASH4      9
-#define AC_S_QUOTE1     10
-#define AC_S_IN_QUOTE   11
-#define AC_S_QUOTE2     12
+/* Originally we used to adhere to rfc 1866 here, and allowed only
+   letters, digits, periods, and hyphens as names (of tags or
+   attributes).  
However, this broke too many pages which used
+   proprietary or strange attributes, e.g. <img src="a.gif"
+   v:shapes="whatever">.
+
+   So now we allow any character except:
+     * whitespace
+     * 8-bit and control chars
+     * characters that clearly cannot be part of name:
+       '=', '>', '/'.
+
+   This only affects attribute and tag names; attribute values allow
+   an even greater variety of characters.  */
+
+#define NAME_CHAR_P(x) ((x) > 32 && (x) < 127                           \
+                        && (x) != '=' && (x) != '>' && (x) != '/')
 
 #ifdef STANDALONE
 static int comment_backout_count;
 #endif
 
-/* Advance over an SGML declaration (the forms you find in HTML
-   documents).  The function returns the location after the
-   declaration.  The reason we need this is that HTML comments are
-   expressed as comments in so-called "empty declarations".
+/* Advance over an SGML declaration, such as <!DOCTYPE ...>.  In
+   strict comments mode, this is used for skipping over comments as
+   well.
 
    To recap: any SGML declaration may have comments associated with
    it, e.g.
@@ -359,17 +382,31 @@ static int comment_backout_count;
        <!-- have -- -- fun -->
 
    Whitespace is allowed between and after the comments, but not
-   before the first comment.
+   before the first comment.  Additionally, this function attempts to
+   handle double quotes in SGML declarations correctly.  */
 
-   Additionally, this function attempts to handle double quotes in
-   SGML declarations correctly.  */
 static const char *
 advance_declaration (const char *beg, const char *end)
 {
   const char *p = beg;
   char quote_char = '\0';       /* shut up, gcc! 
*/
   char ch;
-  int state = AC_S_BANG;
+
+  enum {
+    AC_S_DONE,
+    AC_S_BACKOUT,
+    AC_S_BANG,
+    AC_S_DEFAULT,
+    AC_S_DCLNAME,
+    AC_S_DASH1,
+    AC_S_DASH2,
+    AC_S_COMMENT,
+    AC_S_DASH3,
+    AC_S_DASH4,
+    AC_S_QUOTE1,
+    AC_S_IN_QUOTE,
+    AC_S_QUOTE2,
+  } state = AC_S_BANG;
 
   if (beg == end)
     return beg;
@@ -424,10 +461,10 @@ advance_declaration (const char *beg, const char *end)
             }
           break;
         case AC_S_DCLNAME:
-          if (NAME_CHAR_P (ch))
-            ch = *p++;
-          else if (ch == '-')
+          if (ch == '-')
             state = AC_S_DASH1;
+          else if (NAME_CHAR_P (ch))
+            ch = *p++;
           else
             state = AC_S_DEFAULT;
           break;
@@ -508,6 +545,55 @@ advance_declaration (const char *beg, const char *end)
     }
   return p;
 }
+
+/* Find the first occurrence of the substring "-->" in [BEG, END) and
+   return the pointer to the character after the substring.  If the
+   substring is not found, return NULL.  */
+
+static const char *
+find_comment_end (const char *beg, const char *end)
+{
+  /* Open-coded Boyer-Moore search for "-->".  Examine the third char;
+     if it's not '>' or '-', advance by three characters.  Otherwise,
+     look at the preceding characters and try to find a match.  */
+
+  const char *p = beg - 1;      /* so that the first probe is beg + 2 */
+
+  while ((p += 3) < end)
+    switch (p[0])
+      {
+      case '>':                 /* maybe the '>' that closes "-->" */
+        if (p[-1] == '-' && p[-2] == '-')
+          return p + 1;
+        break;
+      case '-':
+      at_dash:                  /* invariant: p[0] == '-' */
+        if (p[-1] == '-')
+          {
+          at_dash_dash:         /* invariant: p[-1] == '-' && p[0] == '-' */
+            if (++p == end) return NULL;
+            switch (p[0])
+              {
+              case '>': return p + 1;
+              case '-': goto at_dash_dash;      /* "---": keep sliding */
+              }
+          }
+        else
+          {
+            if ((p += 2) >= end) return NULL;   /* lone '-': probe 2 ahead */
+            switch (p[0])
+              {
+              case '>':
+                if (p[-1] == '-')       /* p[-2] is already known to be '-' */
+                  return p + 1;
+                break;
+              case '-':
+                goto at_dash;
+              }
+          }
+      }                         /* no match: back to the stride-3 scan */
+  return NULL;
+}
 
 /* Advance P (a char pointer), with the explicit intent of being able
    to read the next character.  If this is not possible, go to finish.  */
@@ -599,8 +685,26 @@ map_html_tags (const char *text, int size,
          declaration).  */
       if (*p == '!')
         {
-          /* This is an SGML declaration -- just skip it.  
*/
-          p = advance_declaration (p, end);
+          if (!opt.strict_comments
+              && end - p > 3 && p[1] == '-' && p[2] == '-')
+            {
+              /* If strict comments are not enforced and we know we're
+                 looking at a comment, just look for the terminating
+                 "-->" (non-strict is the default: browsers accept it
+                 and HTML writers rarely get comments right).  END - P
+                 is checked first so p[1] and p[2] stay in bounds.  */
+              const char *comment_end = find_comment_end (p + 3, end);
+              if (comment_end)
+                p = comment_end;
+            }
+          else
+            {
+              /* Either in strict comment mode or looking at a non-empty
+                 declaration.  Real declarations are much less likely to
+                 be misused the way comments are, so advance over them
+                 properly regardless of strictness.  */
+              p = advance_declaration (p, end);
+            }
           if (p == end)
             goto finish;
           goto look_for_tag;