/* HTML parser for Wget.
Copyright (C) 1998, 2000 Free Software Foundation, Inc.
-This file is part of Wget.
+This file is part of GNU Wget.
-This program is free software; you can redistribute it and/or modify
+GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version.
-This program is distributed in the hope that it will be useful,
+GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+along with Wget; if not, write to the Free Software
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables. You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL". If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so. If you do not wish to do
+so, delete this exception statement from your version. */
/* The only entry point to this module is map_html_tags(), which see. */
as a backend for one.
Due to time and other constraints, this parser was not integrated
- into Wget until the version ???. */
+ into Wget until version 1.7. */
/* DESCRIPTION:
#include <config.h>
+#ifdef STANDALONE
+# define I_REALLY_WANT_CTYPE_MACROS
+#endif
+
#include <stdio.h>
#include <stdlib.h>
-#include <ctype.h>
#ifdef HAVE_STRING_H
# include <string.h>
#else
# define xmalloc malloc
# define xrealloc realloc
# define xfree free
+
+# define ISSPACE(x) isspace (x)
+# define ISDIGIT(x) isdigit (x)
+# define ISALPHA(x) isalpha (x)
+# define ISALNUM(x) isalnum (x)
+# define TOLOWER(x) tolower (x)
#endif /* STANDALONE */
-/* Pool support. For efficiency, map_html_tags() stores temporary
- string data to a single stack-allocated pool. If the pool proves
- too small, additional memory is allocated/resized with
- malloc()/realloc(). */
+/* Pool support. A pool is a resizable chunk of memory. It is first
+ allocated on the stack, and moved to the heap if it needs to be
+ larger than originally expected. map_html_tags() uses it to store
+ the zero-terminated names and values of tags and attributes.
+
+ Thus taginfo->name, and attr->name and attr->value for each
+ attribute, do not point into separately allocated areas, but into
+ different parts of the pool, separated only by terminating zeros.
+ This ensures a minimal amount of allocation and, for most tags, no
+ allocation at all, because the entire pool is kept on the stack. */
struct pool {
char *contents; /* pointer to the contents. */
return 1;
}
\f
-/* RFC1866: name [of attribute or tag] consists of letters, digits,
- periods, or hyphens. We also allow _, for compatibility with
- brain-damaged generators. */
-#define NAME_CHAR_P(x) (ISALNUM (x) || (x) == '.' || (x) == '-' || (x) == '_')
+/* Originally we used to adhere to RFC1866 here, and allowed only
+ letters, digits, periods, and hyphens as names (of tags or
+ attributes). However, this broke too many pages which used
+ proprietary or strange attributes, e.g. <img src="a.gif"
+ v:shapes="whatever">.
+
+ So now we allow any character except:
+ * whitespace
+ * 8-bit and control chars
+ * characters that clearly cannot be part of a name:
+ '=', '>', '/'.
+
+ This only affects attribute and tag names; attribute values allow
+ an even greater variety of characters.
+
+ Concretely, X is accepted when its value lies strictly between 32
+ and 127 (space and DEL in ASCII) and it is none of '=', '>' or '/'.
+ Note that the macro expands its argument several times, so X must
+ be free of side effects (for example, do not pass `*p++'). */
+
+#define NAME_CHAR_P(x) ((x) > 32 && (x) < 127 \
+ && (x) != '=' && (x) != '>' && (x) != '/')
/* States while advancing through comments. */
#define AC_S_DONE 0
}
break;
case AC_S_DCLNAME:
- if (NAME_CHAR_P (ch))
- ch = *p++;
- else if (ch == '-')
+ if (ch == '-')
state = AC_S_DASH1;
+ else if (NAME_CHAR_P (ch))
+ ch = *p++;
else
state = AC_S_DEFAULT;
break;
case AC_S_QUOTE1:
- assert (ch == '\'' || ch == '"');
+ /* We must use 0x22 because broken assert macros choke on
+ '"' and '\"'. */
+ assert (ch == '\'' || ch == 0x22);
quote_char = ch; /* cheating -- I really don't feel like
introducing more different states for
different quote characters. */
SKIP_WS (p);
+ if (*p == '/')
+ {
+ /* A slash at this point means the tag is about to be
+ closed. This is legal in XML and has been popularized
+ in HTML via XHTML. */
+ /* <foo a=b c=d /> */
+ /* ^ */
+ ADVANCE (p);
+ SKIP_WS (p);
+ if (*p != '>')
+ goto backout_tag;
+ }
+
/* Check for end of tag definition. */
if (*p == '>')
break;
/* Establish bounds of attribute value. */
SKIP_WS (p);
- if (NAME_CHAR_P (*p) || *p == '>')
+ if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
{
/* Minimized attribute syntax allows `=' to be omitted.
For example, <UL COMPACT> is a valid shorthand for <UL
/* We skipped the whitespace and found something that is
neither `=' nor the beginning of the next attribute's
name. Back out. */
- goto backout_tag; /* <foo bar /... */
+ goto backout_tag; /* <foo bar [... */
/* ^ */
}