X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Fhtml-parse.c;h=20791cd83450272210c17a129ecd6904aec86231;hp=243ead236524d1465fcae4a79226190ea58b88a4;hb=HEAD;hpb=c2c71c32cf7e75336bb45fc299658910e0f9f8af diff --git a/src/html-parse.c b/src/html-parse.c index 243ead23..20791cd8 100644 --- a/src/html-parse.c +++ b/src/html-parse.c @@ -1,6 +1,6 @@ /* HTML parser for Wget. Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, - 2007 Free Software Foundation, Inc. + 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. This file is part of GNU Wget. @@ -17,15 +17,16 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Wget. If not, see . -In addition, as a special exception, the Free Software Foundation -gives permission to link the code of its release of Wget with the -OpenSSL project's "OpenSSL" library (or with modified versions of it -that use the same license as the "OpenSSL" library), and distribute -the linked executables. You must obey the GNU General Public License -in all respects for all of the code used other than "OpenSSL". If you -modify this file, you may extend this exception to your version of the -file, but you are not obligated to do so. If you do not wish to do -so, delete this exception statement from your version. */ +Additional permission under GNU GPL version 3 section 7 + +If you modify this program, or any covered work, by linking or +combining it with the OpenSSL project's OpenSSL library (or a +modified version of that library), containing parts covered by the +terms of the OpenSSL or SSLeay licenses, the Free Software Foundation +grants you additional permission to convey the resulting work. +Corresponding Source for a non-source form of such a combination +shall include the source code for the parts of OpenSSL used as well +as that of the covered work. */ /* The only entry point to this module is map_html_tags(), which see. */ @@ -88,7 +89,7 @@ so, delete this exception statement from your version. */ /* To test as standalone, compile with `-DSTANDALONE -I.'. You'll still need Wget headers to compile. */ -#include +#include "wget.h" #ifdef STANDALONE # define I_REALLY_WANT_CTYPE_MACROS @@ -99,7 +100,7 @@ so, delete this exception statement from your version. */ #include #include -#include "wget.h" +#include "utils.h" #include "html-parse.h" #ifdef STANDALONE @@ -110,21 +111,21 @@ so, delete this exception statement from your version. */ # define xrealloc realloc # define xfree free -# undef ISSPACE -# undef ISDIGIT -# undef ISXDIGIT -# undef ISALPHA -# undef ISALNUM -# undef TOLOWER -# undef TOUPPER - -# define ISSPACE(x) isspace (x) -# define ISDIGIT(x) isdigit (x) -# define ISXDIGIT(x) isxdigit (x) -# define ISALPHA(x) isalpha (x) -# define ISALNUM(x) isalnum (x) -# define TOLOWER(x) tolower (x) -# define TOUPPER(x) toupper (x) +# undef c_isspace +# undef c_isdigit +# undef c_isxdigit +# undef c_isalpha +# undef c_isalnum +# undef c_tolower +# undef c_toupper + +# define c_isspace(x) isspace (x) +# define c_isdigit(x) isdigit (x) +# define c_isxdigit(x) isxdigit (x) +# define c_isalpha(x) isalpha (x) +# define c_isalnum(x) isalnum (x) +# define c_tolower(x) tolower (x) +# define c_toupper(x) toupper (x) struct hash_table { int dummy; @@ -258,7 +259,7 @@ struct pool { However, "<foo" will work, as will "<!foo", "<", etc. In other words an entity needs to be terminated by either a non-alphanumeric or the end of string. */ -#define FITS(p, n) (p + n == end || (p + n < end && !ISALNUM (p[n]))) +#define FITS(p, n) (p + n == end || (p + n < end && !c_isalnum (p[n]))) /* Macros that test entity names by returning true if P is followed by the specified characters. */ @@ -271,6 +272,94 @@ struct pool { to "prev = ts->next = NULL; + } + else + { + (*tail)->next = ts; + ts->prev = *tail; + *tail = ts; + ts->next = NULL; + } + + return ts; +} + +/* remove ts and everything after it from the stack */ +static void +tagstack_pop (struct tagstack_item **head, struct tagstack_item **tail, + struct tagstack_item *ts) +{ + if (*head == NULL) + return; + + if (ts == *tail) + { + if (ts == *head) + { + xfree (ts); + *head = *tail = NULL; + } + else + { + ts->prev->next = NULL; + *tail = ts->prev; + xfree (ts); + } + } + else + { + if (ts == *head) + { + *head = NULL; + } + *tail = ts->prev; + + if (ts->prev) + { + ts->prev->next = NULL; + } + while (ts) + { + struct tagstack_item *p = ts->next; + xfree (ts); + ts = p; + } + } +} + +static struct tagstack_item * +tagstack_find (struct tagstack_item *tail, const char *tagname_begin, + const char *tagname_end) +{ + int len = tagname_end - tagname_begin; + while (tail) + { + if (len == (tail->tagname_end - tail->tagname_begin)) + { + if (0 == strncasecmp (tail->tagname_begin, tagname_begin, len)) + return tail; + } + tail = tail->prev; + } + return NULL; +} + /* Decode the HTML character entity at *PTR, considering END to be end of buffer. It is assumed that the "&" character that marks the beginning of the entity has been seen at *PTR-1. If a recognized @@ -296,10 +385,10 @@ decode_entity (const char **ptr, const char *end) int digits = 0; value = 0; if (*p == 'x') - for (++p; value < 256 && p < end && ISXDIGIT (*p); p++, digits++) + for (++p; value < 256 && p < end && c_isxdigit (*p); p++, digits++) value = (value << 4) + XDIGIT_TO_NUM (*p); else - for (; value < 256 && p < end && ISDIGIT (*p); p++, digits++) + for (; value < 256 && p < end && c_isdigit (*p); p++, digits++) value = (value * 10) + (*p - '0'); if (!digits) return -1; @@ -368,9 +457,9 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags ` '. */ if (flags & AP_TRIM_BLANKS) { - while (beg < end && ISSPACE (*beg)) + while (beg < end && c_isspace (*beg)) ++beg; - while (end > beg && ISSPACE (end[-1])) + while (end > beg && c_isspace (end[-1])) --end; } @@ -425,7 +514,7 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags { char *p = pool->contents + old_tail; for (; *p; p++) - *p = TOLOWER (*p); + *p = c_tolower (*p); } } @@ -439,13 +528,14 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags * whitespace * 8-bit and control chars * characters that clearly cannot be part of name: - '=', '>', '/'. + '=', '<', '>', '/'. This only affects attribute and tag names; attribute values allow an even greater variety of characters. */ #define NAME_CHAR_P(x) ((x) > 32 && (x) < 127 \ - && (x) != '=' && (x) != '>' && (x) != '/') + && (x) != '=' && (x) != '<' && (x) != '>' \ + && (x) != '/') #ifdef STANDALONE static int comment_backout_count; @@ -530,6 +620,7 @@ advance_declaration (const char *beg, const char *end) case '\n': ch = *p++; break; + case '<': case '>': state = AC_S_DONE; break; @@ -705,7 +796,7 @@ name_allowed (const struct hash_table *ht, const char *b, const char *e) /* Skip whitespace, if any. */ #define SKIP_WS(p) do { \ - while (ISSPACE (*p)) { \ + while (c_isspace (*p)) { \ ADVANCE (p); \ } \ } while (0) @@ -713,7 +804,7 @@ name_allowed (const struct hash_table *ht, const char *b, const char *e) /* Skip non-whitespace, if any. */ #define SKIP_NON_WS(p) do { \ - while (!ISSPACE (*p)) { \ + while (!c_isspace (*p)) { \ ADVANCE (p); \ } \ } while (0) @@ -756,6 +847,9 @@ map_html_tags (const char *text, int size, bool attr_pair_resized = false; struct attr_pair *pairs = attr_pair_initial_storage; + struct tagstack_item *head = NULL; + struct tagstack_item *tail = NULL; + if (!size) return; @@ -822,7 +916,19 @@ map_html_tags (const char *text, int size, goto look_for_tag; tag_name_end = p; SKIP_WS (p); - if (end_tag && *p != '>') + + if (!end_tag) + { + struct tagstack_item *ts = tagstack_push (&head, &tail); + if (ts) + { + ts->tagname_begin = tag_name_begin; + ts->tagname_end = tag_name_end; + ts->contents_begin = NULL; + } + } + + if (end_tag && *p != '>' && *p != '<') goto backout_tag; if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end)) @@ -854,12 +960,12 @@ map_html_tags (const char *text, int size, /* ^ */ ADVANCE (p); SKIP_WS (p); - if (*p != '>') + if (*p != '<' && *p != '>') goto backout_tag; } /* Check for end of tag definition. */ - if (*p == '>') + if (*p == '<' || *p == '>') break; /* Establish bounds of attribute name. */ @@ -874,7 +980,8 @@ map_html_tags (const char *text, int size, /* Establish bounds of attribute value. */ SKIP_WS (p); - if (NAME_CHAR_P (*p) || *p == '/' || *p == '>') + + if (NAME_CHAR_P (*p) || *p == '/' || *p == '<' || *p == '>') { /* Minimized attribute syntax allows `=' to be omitted. For example,
    is a valid shorthand for
      ') + else if (newline_seen && (*p == '<' || *p == '>')) break; ADVANCE (p); } @@ -936,7 +1043,7 @@ map_html_tags (const char *text, int size, violated by, for instance, `%' in `width=75%'. We'll be liberal and allow just about anything as an attribute value. */ - while (!ISSPACE (*p) && *p != '>') + while (!c_isspace (*p) && *p != '<' && *p != '>') ADVANCE (p); attr_value_end = p; /* */ /* ^ */ @@ -983,6 +1090,11 @@ map_html_tags (const char *text, int size, ++nattrs; } + if (!end_tag && tail && (tail->tagname_begin == tag_name_begin)) + { + tail->contents_begin = p+1; + } + if (uninteresting_tag) { ADVANCE (p); @@ -994,6 +1106,7 @@ map_html_tags (const char *text, int size, { int i; struct taginfo taginfo; + struct tagstack_item *ts = NULL; taginfo.name = pool.contents; taginfo.end_tag_p = end_tag; @@ -1010,8 +1123,26 @@ map_html_tags (const char *text, int size, taginfo.attrs = pairs; taginfo.start_position = tag_start_position; taginfo.end_position = p + 1; + taginfo.contents_begin = NULL; + taginfo.contents_end = NULL; + + if (end_tag) + { + ts = tagstack_find (tail, tag_name_begin, tag_name_end); + if (ts) + { + if (ts->contents_begin) + { + taginfo.contents_begin = ts->contents_begin; + taginfo.contents_end = tag_start_position; + } + tagstack_pop (&head, &tail, ts); + } + } + mapfun (&taginfo, maparg); - ADVANCE (p); + if (*p != '<') + ADVANCE (p); } goto look_for_tag; @@ -1029,6 +1160,8 @@ map_html_tags (const char *text, int size, POOL_FREE (&pool); if (attr_pair_resized) xfree (pairs); + /* pop any tag stack that's left */ + tagstack_pop (&head, &tail, head); } #undef ADVANCE