/* HTML parser for Wget.
Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
- 2007, 2008 Free Software Foundation, Inc.
+ 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
This file is part of GNU Wget.
#include <string.h>
#include <assert.h>
+#include "utils.h"
#include "html-parse.h"
#ifdef STANDALONE
to "<foo", but "<,foo" to "<,foo". */
#define SKIP_SEMI(p, inc) (p += inc, p < end && *p == ';' ? ++p : p)
+struct tagstack_item {
+ const char *tagname_begin;
+ const char *tagname_end;
+ const char *contents_begin;
+ struct tagstack_item *prev;
+ struct tagstack_item *next;
+};
+
+struct tagstack_item *
+tagstack_push (struct tagstack_item **head, struct tagstack_item **tail)
+{
+ struct tagstack_item *ts = xmalloc(sizeof(struct tagstack_item));
+ if (*head == NULL)
+ {
+ *head = *tail = ts;
+ ts->prev = ts->next = NULL;
+ }
+ else
+ {
+ (*tail)->next = ts;
+ ts->prev = *tail;
+ *tail = ts;
+ ts->next = NULL;
+ }
+
+ return ts;
+}
+
+/* remove ts and everything after it from the stack */
+void
+tagstack_pop (struct tagstack_item **head, struct tagstack_item **tail,
+ struct tagstack_item *ts)
+{
+ if (*head == NULL)
+ return;
+
+ if (ts == *tail)
+ {
+ if (ts == *head)
+ {
+ xfree (ts);
+ *head = *tail = NULL;
+ }
+ else
+ {
+ ts->prev->next = NULL;
+ *tail = ts->prev;
+ xfree (ts);
+ }
+ }
+ else
+ {
+ if (ts == *head)
+ {
+ *head = NULL;
+ }
+ *tail = ts->prev;
+
+ if (ts->prev)
+ {
+ ts->prev->next = NULL;
+ }
+ while (ts)
+ {
+ struct tagstack_item *p = ts->next;
+ xfree (ts);
+ ts = p;
+ }
+ }
+}
+
+struct tagstack_item *
+tagstack_find (struct tagstack_item *tail, const char *tagname_begin,
+ const char *tagname_end)
+{
+ int len = tagname_end - tagname_begin;
+ while (tail)
+ {
+ if (len == (tail->tagname_end - tail->tagname_begin))
+ {
+ if (0 == strncasecmp (tail->tagname_begin, tagname_begin, len))
+ return tail;
+ }
+ tail = tail->prev;
+ }
+ return NULL;
+}
+
/* Decode the HTML character entity at *PTR, considering END to be end
of buffer. It is assumed that the "&" character that marks the
beginning of the entity has been seen at *PTR-1. If a recognized
* whitespace
* 8-bit and control chars
* characters that clearly cannot be part of name:
- '=', '>', '/'.
+ '=', '<', '>', '/'.
This only affects attribute and tag names; attribute values allow
an even greater variety of characters. */
#define NAME_CHAR_P(x) ((x) > 32 && (x) < 127 \
- && (x) != '=' && (x) != '>' && (x) != '/')
+ && (x) != '=' && (x) != '<' && (x) != '>' \
+ && (x) != '/')
#ifdef STANDALONE
static int comment_backout_count;
case '\n':
ch = *p++;
break;
+ case '<':
case '>':
state = AC_S_DONE;
break;
bool attr_pair_resized = false;
struct attr_pair *pairs = attr_pair_initial_storage;
+ struct tagstack_item *head = NULL;
+ struct tagstack_item *tail = NULL;
+
if (!size)
return;
goto look_for_tag;
tag_name_end = p;
SKIP_WS (p);
- if (end_tag && *p != '>')
+
+ if (!end_tag)
+ {
+ struct tagstack_item *ts = tagstack_push (&head, &tail);
+ if (ts)
+ {
+ ts->tagname_begin = tag_name_begin;
+ ts->tagname_end = tag_name_end;
+ ts->contents_begin = NULL;
+ }
+ }
+
+ if (end_tag && *p != '>' && *p != '<')
goto backout_tag;
if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))
/* ^ */
ADVANCE (p);
SKIP_WS (p);
- if (*p != '>')
+ if (*p != '<' && *p != '>')
goto backout_tag;
}
/* Check for end of tag definition. */
- if (*p == '>')
+ if (*p == '<' || *p == '>')
break;
/* Establish bounds of attribute name. */
/* Establish bounds of attribute value. */
SKIP_WS (p);
- if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
+
+ if (NAME_CHAR_P (*p) || *p == '/' || *p == '<' || *p == '>')
{
/* Minimized attribute syntax allows `=' to be omitted.
For example, <UL COMPACT> is a valid shorthand for <UL
newline_seen = true;
continue;
}
- else if (newline_seen && *p == '>')
+ else if (newline_seen && (*p == '<' || *p == '>'))
break;
ADVANCE (p);
}
violated by, for instance, `%' in `width=75%'.
We'll be liberal and allow just about anything as
an attribute value. */
- while (!c_isspace (*p) && *p != '>')
+ while (!c_isspace (*p) && *p != '<' && *p != '>')
ADVANCE (p);
attr_value_end = p; /* <foo bar=baz qux=quix> */
/* ^ */
++nattrs;
}
+ if (!end_tag && tail && (tail->tagname_begin == tag_name_begin))
+ {
+ tail->contents_begin = p+1;
+ }
+
if (uninteresting_tag)
{
ADVANCE (p);
{
int i;
struct taginfo taginfo;
+ struct tagstack_item *ts = NULL;
taginfo.name = pool.contents;
taginfo.end_tag_p = end_tag;
taginfo.attrs = pairs;
taginfo.start_position = tag_start_position;
taginfo.end_position = p + 1;
+ taginfo.contents_begin = NULL;
+ taginfo.contents_end = NULL;
+
+ if (end_tag)
+ {
+ ts = tagstack_find (tail, tag_name_begin, tag_name_end);
+ if (ts)
+ {
+ if (ts->contents_begin)
+ {
+ taginfo.contents_begin = ts->contents_begin;
+ taginfo.contents_end = tag_start_position;
+ }
+ tagstack_pop (&head, &tail, ts);
+ }
+ }
+
mapfun (&taginfo, maparg);
- ADVANCE (p);
+ if (*p != '<')
+ ADVANCE (p);
}
goto look_for_tag;
POOL_FREE (&pool);
if (attr_pair_resized)
xfree (pairs);
+ /* pop any tag stack that's left */
+ tagstack_pop (&head, &tail, head);
}
#undef ADVANCE