X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Fhtml-parse.c;h=20791cd83450272210c17a129ecd6904aec86231;hp=243ead236524d1465fcae4a79226190ea58b88a4;hb=320cfdcb658e8d6556ae9dfd902c2db1db866a6b;hpb=e7d78dd2a75d0ea21a63ac4968df6079b8018cd5
diff --git a/src/html-parse.c b/src/html-parse.c
index 243ead23..20791cd8 100644
--- a/src/html-parse.c
+++ b/src/html-parse.c
@@ -1,6 +1,6 @@
/* HTML parser for Wget.
Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
- 2007 Free Software Foundation, Inc.
+ 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
This file is part of GNU Wget.
@@ -17,15 +17,16 @@ GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Wget. If not, see <http://www.gnu.org/licenses/>.
-In addition, as a special exception, the Free Software Foundation
-gives permission to link the code of its release of Wget with the
-OpenSSL project's "OpenSSL" library (or with modified versions of it
-that use the same license as the "OpenSSL" library), and distribute
-the linked executables. You must obey the GNU General Public License
-in all respects for all of the code used other than "OpenSSL". If you
-modify this file, you may extend this exception to your version of the
-file, but you are not obligated to do so. If you do not wish to do
-so, delete this exception statement from your version. */
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work. */
/* The only entry point to this module is map_html_tags(), which see. */
@@ -88,7 +89,7 @@ so, delete this exception statement from your version. */
/* To test as standalone, compile with `-DSTANDALONE -I.'. You'll
still need Wget headers to compile. */
-#include
+#include "wget.h"
#ifdef STANDALONE
# define I_REALLY_WANT_CTYPE_MACROS
@@ -99,7 +100,7 @@ so, delete this exception statement from your version. */
#include
#include
-#include "wget.h"
+#include "utils.h"
#include "html-parse.h"
#ifdef STANDALONE
@@ -110,21 +111,21 @@ so, delete this exception statement from your version. */
# define xrealloc realloc
# define xfree free
-# undef ISSPACE
-# undef ISDIGIT
-# undef ISXDIGIT
-# undef ISALPHA
-# undef ISALNUM
-# undef TOLOWER
-# undef TOUPPER
-
-# define ISSPACE(x) isspace (x)
-# define ISDIGIT(x) isdigit (x)
-# define ISXDIGIT(x) isxdigit (x)
-# define ISALPHA(x) isalpha (x)
-# define ISALNUM(x) isalnum (x)
-# define TOLOWER(x) tolower (x)
-# define TOUPPER(x) toupper (x)
+# undef c_isspace
+# undef c_isdigit
+# undef c_isxdigit
+# undef c_isalpha
+# undef c_isalnum
+# undef c_tolower
+# undef c_toupper
+
+# define c_isspace(x) isspace (x)
+# define c_isdigit(x) isdigit (x)
+# define c_isxdigit(x) isxdigit (x)
+# define c_isalpha(x) isalpha (x)
+# define c_isalnum(x) isalnum (x)
+# define c_tolower(x) tolower (x)
+# define c_toupper(x) toupper (x)
struct hash_table {
int dummy;
@@ -258,7 +259,7 @@ struct pool {
However, "&lt;foo" will work, as will "&lt!foo", "&lt", etc. In
other words an entity needs to be terminated by either a
non-alphanumeric or the end of string. */
-#define FITS(p, n) (p + n == end || (p + n < end && !ISALNUM (p[n])))
+#define FITS(p, n) (p + n == end || (p + n < end && !c_isalnum (p[n])))
/* Macros that test entity names by returning true if P is followed by
the specified characters. */
@@ -271,6 +272,94 @@ struct pool {
to "prev = ts->next = NULL;
+ }
+ else
+ {
+ (*tail)->next = ts;
+ ts->prev = *tail;
+ *tail = ts;
+ ts->next = NULL;
+ }
+
+ return ts;
+}
+
+/* Remove TS and everything after it from the tag stack.
+   HEAD and TAIL point to the caller's pointers to the two ends of
+   the list and are updated in place; every removed item is released
+   with xfree.  Does nothing when *HEAD is NULL.  TS itself is not
+   checked for NULL before being dereferenced, so callers must pass a
+   valid item whenever the stack is non-empty.  */
+static void
+tagstack_pop (struct tagstack_item **head, struct tagstack_item **tail,
+ struct tagstack_item *ts)
+{
+ if (*head == NULL) /* empty stack: nothing to remove */
+ return;
+
+ if (ts == *tail) /* TS is the last item: only TS itself is freed */
+ {
+ if (ts == *head) /* ...and also the first: the stack becomes empty */
+ {
+ xfree (ts);
+ *head = *tail = NULL;
+ }
+ else
+ {
+ ts->prev->next = NULL; /* detach TS from its predecessor */
+ *tail = ts->prev; /* the predecessor becomes the new last item */
+ xfree (ts);
+ }
+ }
+ else /* TS is not the last item: drop TS and the chain after it */
+ {
+ if (ts == *head) /* removal starts at the first item: no head remains */
+ {
+ *head = NULL;
+ }
+ *tail = ts->prev; /* new last item is whatever preceded TS (may be NULL) */
+
+ if (ts->prev)
+ {
+ ts->prev->next = NULL; /* terminate the surviving part of the list */
+ }
+ while (ts) /* free TS and each item reachable through next */
+ {
+ struct tagstack_item *p = ts->next; /* save the link before freeing TS */
+ xfree (ts);
+ ts = p;
+ }
+ }
+}
+/* Search the tag stack backwards, starting at TAIL and following the
+   prev links, for an item whose tag name has the same length as
+   [TAGNAME_BEGIN, TAGNAME_END) and compares equal to it under
+   strncasecmp.  Return the first such item found, or NULL if the
+   walk runs off the list without a match.  */
+static struct tagstack_item *
+tagstack_find (struct tagstack_item *tail, const char *tagname_begin,
+ const char *tagname_end)
+{
+ int len = tagname_end - tagname_begin; /* length of the name searched for */
+ while (tail)
+ {
+ if (len == (tail->tagname_end - tail->tagname_begin)) /* lengths must match first */
+ {
+ if (0 == strncasecmp (tail->tagname_begin, tagname_begin, len)) /* case-insensitive compare of LEN bytes */
+ return tail;
+ }
+ tail = tail->prev; /* step to the previous item on the stack */
+ }
+ return NULL; /* no item with this tag name */
+}
+
/* Decode the HTML character entity at *PTR, considering END to be end
of buffer. It is assumed that the "&" character that marks the
beginning of the entity has been seen at *PTR-1. If a recognized
@@ -296,10 +385,10 @@ decode_entity (const char **ptr, const char *end)
int digits = 0;
value = 0;
if (*p == 'x')
- for (++p; value < 256 && p < end && ISXDIGIT (*p); p++, digits++)
+ for (++p; value < 256 && p < end && c_isxdigit (*p); p++, digits++)
value = (value << 4) + XDIGIT_TO_NUM (*p);
else
- for (; value < 256 && p < end && ISDIGIT (*p); p++, digits++)
+ for (; value < 256 && p < end && c_isdigit (*p); p++, digits++)
value = (value * 10) + (*p - '0');
if (!digits)
return -1;
@@ -368,9 +457,9 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
` '. */
if (flags & AP_TRIM_BLANKS)
{
- while (beg < end && ISSPACE (*beg))
+ while (beg < end && c_isspace (*beg))
++beg;
- while (end > beg && ISSPACE (end[-1]))
+ while (end > beg && c_isspace (end[-1]))
--end;
}
@@ -425,7 +514,7 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
{
char *p = pool->contents + old_tail;
for (; *p; p++)
- *p = TOLOWER (*p);
+ *p = c_tolower (*p);
}
}
@@ -439,13 +528,14 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
* whitespace
* 8-bit and control chars
* characters that clearly cannot be part of name:
- '=', '>', '/'.
+ '=', '<', '>', '/'.
This only affects attribute and tag names; attribute values allow
an even greater variety of characters. */
#define NAME_CHAR_P(x) ((x) > 32 && (x) < 127 \
- && (x) != '=' && (x) != '>' && (x) != '/')
+ && (x) != '=' && (x) != '<' && (x) != '>' \
+ && (x) != '/')
#ifdef STANDALONE
static int comment_backout_count;
@@ -530,6 +620,7 @@ advance_declaration (const char *beg, const char *end)
case '\n':
ch = *p++;
break;
+ case '<':
case '>':
state = AC_S_DONE;
break;
@@ -705,7 +796,7 @@ name_allowed (const struct hash_table *ht, const char *b, const char *e)
/* Skip whitespace, if any. */
#define SKIP_WS(p) do { \
- while (ISSPACE (*p)) { \
+ while (c_isspace (*p)) { \
ADVANCE (p); \
} \
} while (0)
@@ -713,7 +804,7 @@ name_allowed (const struct hash_table *ht, const char *b, const char *e)
/* Skip non-whitespace, if any. */
#define SKIP_NON_WS(p) do { \
- while (!ISSPACE (*p)) { \
+ while (!c_isspace (*p)) { \
ADVANCE (p); \
} \
} while (0)
@@ -756,6 +847,9 @@ map_html_tags (const char *text, int size,
bool attr_pair_resized = false;
struct attr_pair *pairs = attr_pair_initial_storage;
+ struct tagstack_item *head = NULL;
+ struct tagstack_item *tail = NULL;
+
if (!size)
return;
@@ -822,7 +916,19 @@ map_html_tags (const char *text, int size,
goto look_for_tag;
tag_name_end = p;
SKIP_WS (p);
- if (end_tag && *p != '>')
+
+ if (!end_tag)
+ {
+ struct tagstack_item *ts = tagstack_push (&head, &tail);
+ if (ts)
+ {
+ ts->tagname_begin = tag_name_begin;
+ ts->tagname_end = tag_name_end;
+ ts->contents_begin = NULL;
+ }
+ }
+
+ if (end_tag && *p != '>' && *p != '<')
goto backout_tag;
if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))
@@ -854,12 +960,12 @@ map_html_tags (const char *text, int size,
/* ^ */
ADVANCE (p);
SKIP_WS (p);
- if (*p != '>')
+ if (*p != '<' && *p != '>')
goto backout_tag;
}
/* Check for end of tag definition. */
- if (*p == '>')
+ if (*p == '<' || *p == '>')
break;
/* Establish bounds of attribute name. */
@@ -874,7 +980,8 @@ map_html_tags (const char *text, int size,
/* Establish bounds of attribute value. */
SKIP_WS (p);
- if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
+
+ if (NAME_CHAR_P (*p) || *p == '/' || *p == '<' || *p == '>')
{
/* Minimized attribute syntax allows `=' to be omitted.
For example, is a valid shorthand for ')
+ else if (newline_seen && (*p == '<' || *p == '>'))
break;
ADVANCE (p);
}
@@ -936,7 +1043,7 @@ map_html_tags (const char *text, int size,
violated by, for instance, `%' in `width=75%'.
We'll be liberal and allow just about anything as
an attribute value. */
- while (!ISSPACE (*p) && *p != '>')
+ while (!c_isspace (*p) && *p != '<' && *p != '>')
ADVANCE (p);
attr_value_end = p; /* */
/* ^ */
@@ -983,6 +1090,11 @@ map_html_tags (const char *text, int size,
++nattrs;
}
+ if (!end_tag && tail && (tail->tagname_begin == tag_name_begin))
+ {
+ tail->contents_begin = p+1;
+ }
+
if (uninteresting_tag)
{
ADVANCE (p);
@@ -994,6 +1106,7 @@ map_html_tags (const char *text, int size,
{
int i;
struct taginfo taginfo;
+ struct tagstack_item *ts = NULL;
taginfo.name = pool.contents;
taginfo.end_tag_p = end_tag;
@@ -1010,8 +1123,26 @@ map_html_tags (const char *text, int size,
taginfo.attrs = pairs;
taginfo.start_position = tag_start_position;
taginfo.end_position = p + 1;
+ taginfo.contents_begin = NULL;
+ taginfo.contents_end = NULL;
+
+ if (end_tag)
+ {
+ ts = tagstack_find (tail, tag_name_begin, tag_name_end);
+ if (ts)
+ {
+ if (ts->contents_begin)
+ {
+ taginfo.contents_begin = ts->contents_begin;
+ taginfo.contents_end = tag_start_position;
+ }
+ tagstack_pop (&head, &tail, ts);
+ }
+ }
+
mapfun (&taginfo, maparg);
- ADVANCE (p);
+ if (*p != '<')
+ ADVANCE (p);
}
goto look_for_tag;
@@ -1029,6 +1160,8 @@ map_html_tags (const char *text, int size,
POOL_FREE (&pool);
if (attr_pair_resized)
xfree (pairs);
+ /* pop any tag stack that's left */
+ tagstack_pop (&head, &tail, head);
}
#undef ADVANCE