/* HTML parser for Wget.
Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
- 2007 Free Software Foundation, Inc.
+ 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
This file is part of GNU Wget.
You should have received a copy of the GNU General Public License
along with Wget. If not, see <http://www.gnu.org/licenses/>.
-In addition, as a special exception, the Free Software Foundation
-gives permission to link the code of its release of Wget with the
-OpenSSL project's "OpenSSL" library (or with modified versions of it
-that use the same license as the "OpenSSL" library), and distribute
-the linked executables. You must obey the GNU General Public License
-in all respects for all of the code used other than "OpenSSL". If you
-modify this file, you may extend this exception to your version of the
-file, but you are not obligated to do so. If you do not wish to do
-so, delete this exception statement from your version. */
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work. */
/* The only entry point to this module is map_html_tags(), which see. */
/* To test as standalone, compile with `-DSTANDALONE -I.'. You'll
still need Wget headers to compile. */
-#include <config.h>
+#include "wget.h"
#ifdef STANDALONE
# define I_REALLY_WANT_CTYPE_MACROS
#include <string.h>
#include <assert.h>
-#include "wget.h"
+#include "utils.h"
#include "html-parse.h"
#ifdef STANDALONE
# define xrealloc realloc
# define xfree free
-# undef ISSPACE
-# undef ISDIGIT
-# undef ISXDIGIT
-# undef ISALPHA
-# undef ISALNUM
-# undef TOLOWER
-# undef TOUPPER
-
-# define ISSPACE(x) isspace (x)
-# define ISDIGIT(x) isdigit (x)
-# define ISXDIGIT(x) isxdigit (x)
-# define ISALPHA(x) isalpha (x)
-# define ISALNUM(x) isalnum (x)
-# define TOLOWER(x) tolower (x)
-# define TOUPPER(x) toupper (x)
+# undef c_isspace
+# undef c_isdigit
+# undef c_isxdigit
+# undef c_isalpha
+# undef c_isalnum
+# undef c_tolower
+# undef c_toupper
+
+# define c_isspace(x) isspace (x)
+# define c_isdigit(x) isdigit (x)
+# define c_isxdigit(x) isxdigit (x)
+# define c_isalpha(x) isalpha (x)
+# define c_isalnum(x) isalnum (x)
+# define c_tolower(x) tolower (x)
+# define c_toupper(x) toupper (x)
struct hash_table {
int dummy;
However, "&lt;foo" will work, as will "&lt!foo", "&lt", etc. In
other words an entity needs to be terminated by either a
non-alphanumeric or the end of string. */
-#define FITS(p, n) (p + n == end || (p + n < end && !ISALNUM (p[n])))
+#define FITS(p, n) (p + n == end || (p + n < end && !c_isalnum (p[n])))
/* Macros that test entity names by returning true if P is followed by
the specified characters. */
to "<foo", but "&lt,foo" to "<,foo". */
#define SKIP_SEMI(p, inc) (p += inc, p < end && *p == ';' ? ++p : p)
+/* One entry in the stack of tags that have been opened but not yet
+ closed. Entries are chained into a doubly linked list; the user of
+ the stack keeps a pointer to the first (oldest) and to the last
+ (newest) entry. The character pointers are not owned by the entry:
+ tagstack_pop releases only the entry itself. */
+struct tagstack_item {
+ /* Bounds of the tag name. tagstack_find takes
+ tagname_end - tagname_begin as the length of the name. */
+ const char *tagname_begin;
+ const char *tagname_end;
+ /* Where the contents of the tag start, or NULL while that is not
+ yet known. */
+ const char *contents_begin;
+ /* Older neighbour; NULL for the first entry. */
+ struct tagstack_item *prev;
+ /* Newer neighbour; NULL for the last entry. */
+ struct tagstack_item *next;
+};
+
+/* Allocate a new entry and append it as the newest element of the
+   stack described by *HEAD (oldest entry) and *TAIL (newest entry).
+   Both pointers are updated as needed.  Only the PREV and NEXT links
+   of the new entry are set here; its tagname_begin, tagname_end and
+   contents_begin members are left for the caller to fill in.
+   Returns the new entry.  */
+static struct tagstack_item *
+tagstack_push (struct tagstack_item **head, struct tagstack_item **tail)
+{
+  struct tagstack_item *item = xmalloc (sizeof *item);
+
+  /* The new entry always ends up on top, so nothing follows it.  */
+  item->next = NULL;
+
+  if (*head != NULL)
+    {
+      /* Non-empty stack: hang the entry off the current top.  */
+      (*tail)->next = item;
+      item->prev = *tail;
+    }
+  else
+    {
+      /* Empty stack: the entry is the bottom as well as the top.  */
+      item->prev = NULL;
+      *head = item;
+    }
+  *tail = item;
+
+  return item;
+}
+
+/* Remove TS, together with every entry pushed after it, from the
+   stack described by *HEAD (oldest entry) and *TAIL (newest entry),
+   and free the removed entries.  *TAIL is moved back to the entry
+   preceding TS; both *HEAD and *TAIL become NULL when TS was the
+   oldest entry.  Does nothing if the stack is empty.  TS must be an
+   entry of this stack.  */
+static void
+tagstack_pop (struct tagstack_item **head, struct tagstack_item **tail,
+              struct tagstack_item *ts)
+{
+  if (*head == NULL)
+    return;
+
+  if (ts != *tail)
+    {
+      /* TS lies below the top: cut the list just before TS, then
+         release TS and everything stacked on top of it.  */
+      struct tagstack_item *below = ts->prev;
+
+      if (ts == *head)
+        *head = NULL;
+      *tail = below;
+      if (below)
+        below->next = NULL;
+
+      while (ts)
+        {
+          struct tagstack_item *above = ts->next;
+          xfree (ts);
+          ts = above;
+        }
+      return;
+    }
+
+  /* TS is the top of the stack, so it is the only entry to drop.  */
+  if (ts != *head)
+    {
+      ts->prev->next = NULL;
+      *tail = ts->prev;
+      xfree (ts);
+    }
+  else
+    {
+      /* TS was the last remaining entry; the stack is now empty.  */
+      xfree (ts);
+      *head = *tail = NULL;
+    }
+}
+
+/* Search the stack from its newest entry TAIL towards the oldest one
+   for an entry whose tag name has the same length as, and compares
+   equal ignoring case to, the name delimited by TAGNAME_BEGIN and
+   TAGNAME_END.  Returns the first such entry encountered, i.e. the
+   most recently pushed match, or NULL if there is none.  */
+static struct tagstack_item *
+tagstack_find (struct tagstack_item *tail, const char *tagname_begin,
+               const char *tagname_end)
+{
+  struct tagstack_item *item;
+  int wanted = tagname_end - tagname_begin;
+
+  for (item = tail; item; item = item->prev)
+    {
+      /* Names of different length cannot match; skip the compare.  */
+      if (wanted != (item->tagname_end - item->tagname_begin))
+        continue;
+
+      if (strncasecmp (item->tagname_begin, tagname_begin, wanted) == 0)
+        return item;
+    }
+
+  return NULL;
+}
+
/* Decode the HTML character entity at *PTR, considering END to be end
of buffer. It is assumed that the "&" character that marks the
beginning of the entity has been seen at *PTR-1. If a recognized
int digits = 0;
value = 0;
if (*p == 'x')
- for (++p; value < 256 && p < end && ISXDIGIT (*p); p++, digits++)
+ for (++p; value < 256 && p < end && c_isxdigit (*p); p++, digits++)
value = (value << 4) + XDIGIT_TO_NUM (*p);
else
- for (; value < 256 && p < end && ISDIGIT (*p); p++, digits++)
+ for (; value < 256 && p < end && c_isdigit (*p); p++, digits++)
value = (value * 10) + (*p - '0');
if (!digits)
return -1;
` '. */
if (flags & AP_TRIM_BLANKS)
{
- while (beg < end && ISSPACE (*beg))
+ while (beg < end && c_isspace (*beg))
++beg;
- while (end > beg && ISSPACE (end[-1]))
+ while (end > beg && c_isspace (end[-1]))
--end;
}
{
char *p = pool->contents + old_tail;
for (; *p; p++)
- *p = TOLOWER (*p);
+ *p = c_tolower (*p);
}
}
\f
* whitespace
* 8-bit and control chars
* characters that clearly cannot be part of name:
- '=', '>', '/'.
+ '=', '<', '>', '/'.
This only affects attribute and tag names; attribute values allow
an even greater variety of characters. */
#define NAME_CHAR_P(x) ((x) > 32 && (x) < 127 \
- && (x) != '=' && (x) != '>' && (x) != '/')
+ && (x) != '=' && (x) != '<' && (x) != '>' \
+ && (x) != '/')
#ifdef STANDALONE
static int comment_backout_count;
case '\n':
ch = *p++;
break;
+ case '<':
case '>':
state = AC_S_DONE;
break;
/* Skip whitespace, if any. */
#define SKIP_WS(p) do { \
- while (ISSPACE (*p)) { \
+ while (c_isspace (*p)) { \
ADVANCE (p); \
} \
} while (0)
/* Skip non-whitespace, if any. */
#define SKIP_NON_WS(p) do { \
- while (!ISSPACE (*p)) { \
+ while (!c_isspace (*p)) { \
ADVANCE (p); \
} \
} while (0)
bool attr_pair_resized = false;
struct attr_pair *pairs = attr_pair_initial_storage;
+ struct tagstack_item *head = NULL;
+ struct tagstack_item *tail = NULL;
+
if (!size)
return;
goto look_for_tag;
tag_name_end = p;
SKIP_WS (p);
- if (end_tag && *p != '>')
+
+ if (!end_tag)
+ {
+ struct tagstack_item *ts = tagstack_push (&head, &tail);
+ if (ts)
+ {
+ ts->tagname_begin = tag_name_begin;
+ ts->tagname_end = tag_name_end;
+ ts->contents_begin = NULL;
+ }
+ }
+
+ if (end_tag && *p != '>' && *p != '<')
goto backout_tag;
if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))
/* ^ */
ADVANCE (p);
SKIP_WS (p);
- if (*p != '>')
+ if (*p != '<' && *p != '>')
goto backout_tag;
}
/* Check for end of tag definition. */
- if (*p == '>')
+ if (*p == '<' || *p == '>')
break;
/* Establish bounds of attribute name. */
/* Establish bounds of attribute value. */
SKIP_WS (p);
- if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
+
+ if (NAME_CHAR_P (*p) || *p == '/' || *p == '<' || *p == '>')
{
/* Minimized attribute syntax allows `=' to be omitted.
For example, <UL COMPACT> is a valid shorthand for <UL
newline_seen = true;
continue;
}
- else if (newline_seen && *p == '>')
+ else if (newline_seen && (*p == '<' || *p == '>'))
break;
ADVANCE (p);
}
violated by, for instance, `%' in `width=75%'.
We'll be liberal and allow just about anything as
an attribute value. */
- while (!ISSPACE (*p) && *p != '>')
+ while (!c_isspace (*p) && *p != '<' && *p != '>')
ADVANCE (p);
attr_value_end = p; /* <foo bar=baz qux=quix> */
/* ^ */
++nattrs;
}
+ if (!end_tag && tail && (tail->tagname_begin == tag_name_begin))
+ {
+ tail->contents_begin = p+1;
+ }
+
if (uninteresting_tag)
{
ADVANCE (p);
{
int i;
struct taginfo taginfo;
+ struct tagstack_item *ts = NULL;
taginfo.name = pool.contents;
taginfo.end_tag_p = end_tag;
taginfo.attrs = pairs;
taginfo.start_position = tag_start_position;
taginfo.end_position = p + 1;
+ taginfo.contents_begin = NULL;
+ taginfo.contents_end = NULL;
+
+ if (end_tag)
+ {
+ ts = tagstack_find (tail, tag_name_begin, tag_name_end);
+ if (ts)
+ {
+ if (ts->contents_begin)
+ {
+ taginfo.contents_begin = ts->contents_begin;
+ taginfo.contents_end = tag_start_position;
+ }
+ tagstack_pop (&head, &tail, ts);
+ }
+ }
+
mapfun (&taginfo, maparg);
- ADVANCE (p);
+ if (*p != '<')
+ ADVANCE (p);
}
goto look_for_tag;
POOL_FREE (&pool);
if (attr_pair_resized)
xfree (pairs);
+ /* pop any tag stack that's left */
+ tagstack_pop (&head, &tail, head);
}
#undef ADVANCE