X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Fhtml-parse.c;h=20791cd83450272210c17a129ecd6904aec86231;hp=fdf5b99bc791c8c420633be631903b8ab3f1be64;hb=HEAD;hpb=ec84142901fc685d7a08267fc0be8962e468968d diff --git a/src/html-parse.c b/src/html-parse.c index fdf5b99b..20791cd8 100644 --- a/src/html-parse.c +++ b/src/html-parse.c @@ -1,6 +1,6 @@ /* HTML parser for Wget. Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, - 2007, 2008 Free Software Foundation, Inc. + 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. This file is part of GNU Wget. @@ -272,6 +272,94 @@ struct pool { to "prev = ts->next = NULL; + } + else + { + (*tail)->next = ts; + ts->prev = *tail; + *tail = ts; + ts->next = NULL; + } + + return ts; +} + +/* remove ts and everything after it from the stack */ +static void +tagstack_pop (struct tagstack_item **head, struct tagstack_item **tail, + struct tagstack_item *ts) +{ + if (*head == NULL) + return; + + if (ts == *tail) + { + if (ts == *head) + { + xfree (ts); + *head = *tail = NULL; + } + else + { + ts->prev->next = NULL; + *tail = ts->prev; + xfree (ts); + } + } + else + { + if (ts == *head) + { + *head = NULL; + } + *tail = ts->prev; + + if (ts->prev) + { + ts->prev->next = NULL; + } + while (ts) + { + struct tagstack_item *p = ts->next; + xfree (ts); + ts = p; + } + } +} + +static struct tagstack_item * +tagstack_find (struct tagstack_item *tail, const char *tagname_begin, + const char *tagname_end) +{ + int len = tagname_end - tagname_begin; + while (tail) + { + if (len == (tail->tagname_end - tail->tagname_begin)) + { + if (0 == strncasecmp (tail->tagname_begin, tagname_begin, len)) + return tail; + } + tail = tail->prev; + } + return NULL; +} + /* Decode the HTML character entity at *PTR, considering END to be end of buffer. It is assumed that the "&" character that marks the beginning of the entity has been seen at *PTR-1. If a recognized @@ -440,13 +528,14 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags * whitespace * 8-bit and control chars * characters that clearly cannot be part of name: - '=', '>', '/'. + '=', '<', '>', '/'. This only affects attribute and tag names; attribute values allow an even greater variety of characters. */ #define NAME_CHAR_P(x) ((x) > 32 && (x) < 127 \ - && (x) != '=' && (x) != '>' && (x) != '/') + && (x) != '=' && (x) != '<' && (x) != '>' \ + && (x) != '/') #ifdef STANDALONE static int comment_backout_count; @@ -531,6 +620,7 @@ advance_declaration (const char *beg, const char *end) case '\n': ch = *p++; break; + case '<': case '>': state = AC_S_DONE; break; @@ -757,6 +847,9 @@ map_html_tags (const char *text, int size, bool attr_pair_resized = false; struct attr_pair *pairs = attr_pair_initial_storage; + struct tagstack_item *head = NULL; + struct tagstack_item *tail = NULL; + if (!size) return; @@ -823,7 +916,19 @@ map_html_tags (const char *text, int size, goto look_for_tag; tag_name_end = p; SKIP_WS (p); - if (end_tag && *p != '>') + + if (!end_tag) + { + struct tagstack_item *ts = tagstack_push (&head, &tail); + if (ts) + { + ts->tagname_begin = tag_name_begin; + ts->tagname_end = tag_name_end; + ts->contents_begin = NULL; + } + } + + if (end_tag && *p != '>' && *p != '<') goto backout_tag; if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end)) @@ -855,12 +960,12 @@ map_html_tags (const char *text, int size, /* ^ */ ADVANCE (p); SKIP_WS (p); - if (*p != '>') + if (*p != '<' && *p != '>') goto backout_tag; } /* Check for end of tag definition. */ - if (*p == '>') + if (*p == '<' || *p == '>') break; /* Establish bounds of attribute name. */ @@ -875,7 +980,8 @@ map_html_tags (const char *text, int size, /* Establish bounds of attribute value. */ SKIP_WS (p); - if (NAME_CHAR_P (*p) || *p == '/' || *p == '>') + + if (NAME_CHAR_P (*p) || *p == '/' || *p == '<' || *p == '>') { /* Minimized attribute syntax allows `=' to be omitted. For example,