/* HTML parser for Wget.
Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
- 2007, 2008 Free Software Foundation, Inc.
+ 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
This file is part of GNU Wget.
* whitespace
* 8-bit and control chars
* characters that clearly cannot be part of name:
- '=', '>', '/'.
+ '=', '<', '>', '/'.
This only affects attribute and tag names; attribute values allow
an even greater variety of characters. */
#define NAME_CHAR_P(x) ((x) > 32 && (x) < 127 \
- && (x) != '=' && (x) != '>' && (x) != '/')
+ && (x) != '=' && (x) != '<' && (x) != '>' \
+ && (x) != '/')
#ifdef STANDALONE
static int comment_backout_count;
case '\n':
ch = *p++;
break;
+ case '<':
case '>':
state = AC_S_DONE;
break;
}
}
- if (end_tag && *p != '>')
+ if (end_tag && *p != '>' && *p != '<')
goto backout_tag;
if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))
/* ^ */
ADVANCE (p);
SKIP_WS (p);
- if (*p != '>')
+ if (*p != '<' && *p != '>')
goto backout_tag;
}
/* Check for end of tag definition. */
- if (*p == '>')
+ if (*p == '<' || *p == '>')
break;
/* Establish bounds of attribute name. */
/* Establish bounds of attribute value. */
SKIP_WS (p);
- if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
+
+ if (NAME_CHAR_P (*p) || *p == '/' || *p == '<' || *p == '>')
{
/* Minimized attribute syntax allows `=' to be omitted.
For example, <UL COMPACT> is a valid shorthand for <UL
newline_seen = true;
continue;
}
- else if (newline_seen && *p == '>')
+ else if (newline_seen && (*p == '<' || *p == '>'))
break;
ADVANCE (p);
}
violated by, for instance, `%' in `width=75%'.
We'll be liberal and allow just about anything as
an attribute value. */
- while (!c_isspace (*p) && *p != '>')
+ while (!c_isspace (*p) && *p != '<' && *p != '>')
ADVANCE (p);
attr_value_end = p; /* <foo bar=baz qux=quix> */
/* ^ */
}
mapfun (&taginfo, maparg);
- ADVANCE (p);
+ if (*p != '<')
+ ADVANCE (p);
}
goto look_for_tag;