X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Fhtml-parse.c;h=692e0de4b8c463d0eaa36bd4d3d34f7f42147f41;hp=55d5d9156265623e415b2cf31f2af750ced6881d;hb=d763f8bf6d6e13ce006ffab616cc8a77e747a633;hpb=462e643a7e31676eceda23e634241f7b4d2cd7bb diff --git a/src/html-parse.c b/src/html-parse.c index 55d5d915..692e0de4 100644 --- a/src/html-parse.c +++ b/src/html-parse.c @@ -1,6 +1,6 @@ /* HTML parser for Wget. Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, - 2007 Free Software Foundation, Inc. + 2007, 2008 Free Software Foundation, Inc. This file is part of GNU Wget. @@ -100,6 +100,7 @@ as that of the covered work. */ #include #include +#include "utils.h" #include "html-parse.h" #ifdef STANDALONE @@ -271,6 +272,94 @@ struct pool { to "prev = ts->next = NULL; + } + else + { + (*tail)->next = ts; + ts->prev = *tail; + *tail = ts; + ts->next = NULL; + } + + return ts; +} + +/* remove ts and everything after it from the stack */ +void +tagstack_pop (struct tagstack_item **head, struct tagstack_item **tail, + struct tagstack_item *ts) +{ + if (*head == NULL) + return; + + if (ts == *tail) + { + if (ts == *head) + { + xfree (ts); + *head = *tail = NULL; + } + else + { + ts->prev->next = NULL; + *tail = ts->prev; + xfree (ts); + } + } + else + { + if (ts == *head) + { + *head = NULL; + } + *tail = ts->prev; + + if (ts->prev) + { + ts->prev->next = NULL; + } + while (ts) + { + struct tagstack_item *p = ts->next; + xfree (ts); + ts = p; + } + } +} + +struct tagstack_item * +tagstack_find (struct tagstack_item *tail, const char *tagname_begin, + const char *tagname_end) +{ + int len = tagname_end - tagname_begin; + while (tail) + { + if (len == (tail->tagname_end - tail->tagname_begin)) + { + if (0 == strncasecmp (tail->tagname_begin, tagname_begin, len)) + return tail; + } + tail = tail->prev; + } + return NULL; +} + /* Decode the HTML character entity at *PTR, considering END to be end of buffer. It is assumed that the "&" character that marks the beginning of the entity has been seen at *PTR-1. If a recognized @@ -756,6 +845,9 @@ map_html_tags (const char *text, int size, bool attr_pair_resized = false; struct attr_pair *pairs = attr_pair_initial_storage; + struct tagstack_item *head = NULL; + struct tagstack_item *tail = NULL; + if (!size) return; @@ -822,6 +914,18 @@ map_html_tags (const char *text, int size, goto look_for_tag; tag_name_end = p; SKIP_WS (p); + + if (!end_tag) + { + struct tagstack_item *ts = tagstack_push (&head, &tail); + if (ts) + { + ts->tagname_begin = tag_name_begin; + ts->tagname_end = tag_name_end; + ts->contents_begin = NULL; + } + } + if (end_tag && *p != '>') goto backout_tag; @@ -983,6 +1087,11 @@ map_html_tags (const char *text, int size, ++nattrs; } + if (!end_tag && tail && (tail->tagname_begin == tag_name_begin)) + { + tail->contents_begin = p+1; + } + if (uninteresting_tag) { ADVANCE (p); @@ -994,6 +1103,7 @@ map_html_tags (const char *text, int size, { int i; struct taginfo taginfo; + struct tagstack_item *ts = NULL; taginfo.name = pool.contents; taginfo.end_tag_p = end_tag; @@ -1010,6 +1120,23 @@ map_html_tags (const char *text, int size, taginfo.attrs = pairs; taginfo.start_position = tag_start_position; taginfo.end_position = p + 1; + taginfo.contents_begin = NULL; + taginfo.contents_end = NULL; + + if (end_tag) + { + ts = tagstack_find (tail, tag_name_begin, tag_name_end); + if (ts) + { + if (ts->contents_begin) + { + taginfo.contents_begin = ts->contents_begin; + taginfo.contents_end = tag_start_position; + } + tagstack_pop (&head, &tail, ts); + } + } + mapfun (&taginfo, maparg); ADVANCE (p); } @@ -1029,6 +1156,8 @@ map_html_tags (const char *text, int size, POOL_FREE (&pool); if (attr_pair_resized) xfree (pairs); + /* pop any tag stack that's left */ + tagstack_pop (&head, &tail, head); } #undef ADVANCE