static int
idmatch (struct tag_attr *tags, const char *tag, const char *attr)
{
- int i;
-
- if (!tag || !attr)
- return 0;
-
+ int i, j;
+
+ if (tag == NULL || attr == NULL)
+ return FALSE;
+
for (i = 0; tags[i].tag; i++)
+ /* Loop through all the tags wget ever cares about. TAGS is scanned
+ up to the first entry whose tag field is NULL; tag and attribute
+ names are compared without regard to case. */
if (!strcasecmp (tags[i].tag, tag) && !strcasecmp (tags[i].attr, attr))
- return 1;
- return 0;
+ /* The tag and attribute matched one of the ones wget cares about.
+ From here on only TAG (not ATTR) is checked against the user's
+ --ignore-tags and --follow-tags lists, and every path through this
+ block returns, so no further TAGS entries are examined. */
+ {
+ if (opt.ignore_tags)
+ /* --ignore-tags was specified. Do not match these specific tags.
+ --ignore-tags takes precedence over --follow-tags, so we process
+ --ignore first and fall through if there's no match.
+ opt.ignore_tags is walked as a NULL-terminated list. */
+ for (j = 0; opt.ignore_tags[j] != NULL; j++)
+ /* Loop through all the tags this user doesn't care about.
+ NOTE(review): EQ is defined outside this hunk; presumably 0,
+ i.e. strcasecmp reporting a case-insensitive match -- confirm. */
+ if (strcasecmp(opt.ignore_tags[j], tag) == EQ)
+ return FALSE;
+
+ if (opt.follow_tags)
+ /* --follow-tags was specified. Only match these specific tags, so
+ return FALSE if we don't match one of them. opt.follow_tags is
+ walked as a NULL-terminated list. */
+ {
+ for (j = 0; opt.follow_tags[j] != NULL; j++)
+ /* Loop through all the tags this user cares about. */
+ if (strcasecmp(opt.follow_tags[j], tag) == EQ)
+ return TRUE;
+
+ return FALSE; /* wasn't one of the explicitly desired tags */
+ }
+
+ /* If we get to here, --follow-tags isn't being used, and --ignore-tags,
+ if specified, didn't include this tag, so it's okay to follow. */
+ return TRUE;
+ }
+
+ return FALSE; /* not one of the tag/attribute pairs wget ever cares about */
}
/* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
address and the length of the string. Return NULL if no URL is
found. */
const char *
-htmlfindurl (const char *buf, int bufsize, int *size, int init)
+htmlfindurl (const char *buf, int bufsize, int *size, int init,
+ int dash_p_leaf_HTML)
{
const char *p, *ph;
- state_t *s;
+ state_t *s = &global_state;
+
/* NULL-terminated list of tags and modifiers someone would want to
follow -- feel free to edit to suit your needs: */
static struct tag_attr html_allow[] = {
- { "a", "href" },
- { "link", "href" },
{ "script", "src" },
{ "img", "src" },
{ "img", "href" },
{ "script", "src" },
{ "embed", "src" },
{ "bgsound", "src" },
- { "area", "href" },
{ "img", "lowsrc" },
{ "input", "src" },
{ "layer", "src" },
{ "th", "background"},
{ "td", "background"},
/* Tags below this line are treated specially. */
+ { "a", "href" },
+ { "area", "href" },
{ "base", "href" },
+ { "link", "href" },
+ { "link", "rel" },
{ "meta", "content" },
{ NULL, NULL }
};
- s = &global_state;
-
if (init)
{
DEBUGP (("Resetting a parser state.\n"));
while (1)
{
+ const char* link_href = NULL;
+ const char* link_rel = NULL;
+ int link_href_saved_size = 0; /* init. just to shut up warning */
+
if (!bufsize)
break;
/* Let's look for a tag, if we are not already in one. */
/* Now we must skip the spaces to find '='. */
if (*buf != '=')
{
- for (; bufsize && ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize);
+ for (; bufsize && ISSPACE (*buf) && *buf != '>';
+ ++buf, --bufsize);
if (!bufsize || *buf == '>')
break;
}
for (++buf, --bufsize;
bufsize && *buf != s->quote_char && *buf != '\n';
++buf, --bufsize)
- if (*buf == '#')
+ if (!ph && *buf == '#' && *(buf - 1) != '&')
ph = buf;
if (!bufsize)
{
else
{
p = buf;
- for (; bufsize && !ISSPACE (*buf) && *buf != '>'; ++buf, --bufsize)
- if (*buf == '#')
+ for (; bufsize && !ISSPACE (*buf) && *buf != '>';
+ ++buf, --bufsize)
+ if (!ph && *buf == '#' && *(buf - 1) != '&')
ph = buf;
if (!bufsize)
break;
2) its tag and attribute are found in html_allow. */
if (*size && idmatch (html_allow, s->tag, s->attr))
{
- if (!strcasecmp (s->tag, "base") && !strcasecmp (s->attr, "href"))
+ if (strcasecmp(s->tag, "a") == EQ ||
+ strcasecmp(s->tag, "area") == EQ)
+ {
+ /* Only follow these if we're not at a -p leaf node, as they
+ always link to external documents. */
+ if (!dash_p_leaf_HTML)
+ {
+ s->at_value = 1;
+ return p;
+ }
+ }
+ else if (!strcasecmp (s->tag, "base") &&
+ !strcasecmp (s->attr, "href"))
{
FREE_MAYBE (s->base);
s->base = strdupdelim (p, buf);
}
- else if (!strcasecmp (s->tag, "meta") && !strcasecmp (s->attr, "content"))
+ else if (strcasecmp(s->tag, "link") == EQ)
+ {
+ if (strcasecmp(s->attr, "href") == EQ)
+ {
+ link_href = p;
+ link_href_saved_size = *size; /* for restoration below */
+ }
+ else if (strcasecmp(s->attr, "rel") == EQ)
+ link_rel = p;
+
+ if (link_href != NULL && link_rel != NULL)
+ /* Okay, we've now seen this <LINK> tag's HREF and REL
+ attributes (they may be in either order), so it's now
+ possible to decide if we want to traverse it. */
+ if (!dash_p_leaf_HTML ||
+ strncasecmp(link_rel, "stylesheet",
+ sizeof("stylesheet") - 1) == EQ)
+ /* In the normal case, all <LINK> tags are fair game.
+
+ In the special case of when -p is active, however, and
+ we're at a leaf node (relative to the -l max. depth) in
+ the HTML document tree, the only <LINK> tag we'll
+ follow is a <LINK REL="stylesheet">, as it's necessary
+ for displaying this document properly. We won't follow
+ other <LINK> tags, like <LINK REL="home">, for
+ instance, as they refer to external documents.
+
+ Note that the above strncasecmp() will incorrectly
+ consider something like '<LINK REL="stylesheet.old"' as
+ equivalent to '<LINK REL="stylesheet"'. Not really
+ worth the trouble to explicitly check for such cases --
+ if time is spent, it should be spent ripping out wget's
+ somewhat kludgy HTML parser and hooking in a real,
+ componentized one. */
+ {
+ /* When we return, the 'size' IN/OUT parameter
+ determines where in the buffer the end of the current
+ attribute value is. If REL came after HREF in this
+ <LINK> tag, size is currently set to the size for
+ REL's value -- set it to what it was when we were
+ looking at HREF's value. */
+ *size = link_href_saved_size;
+
+ s->at_value = 1;
+ return link_href;
+ }
+ }
+ else if (!strcasecmp (s->tag, "meta") &&
+ !strcasecmp (s->attr, "content"))
{
/* Some pages use a META tag to specify that the page
be refreshed by a new page after a given number of
for (; *size && ISDIGIT (*p); p++, *size -= 1);
if (*p == ';')
{
- for (p++, *size -= 1; *size && ISSPACE (*p); p++, *size -= 1) ;
+ for (p++, *size -= 1;
+ *size && ISSPACE (*p);
+ p++, *size -= 1) ;
if (!strncasecmp (p, "URL=", 4))
{
p += 4, *size -= 4;
return global_state.base;
}
+/* Create a malloc'ed copy of text in the range [beg, end), but with
+   the HTML entities processed.  Recognized entities are &lt, &gt,
+   &amp, &quot, &nbsp and the numerical entities (&#NNN).
+
+   BEG and END delimit the input; END is exclusive and is never
+   dereferenced, so the input need not be zero-terminated.  An entity
+   may be terminated by `;', by a non-alphanumeric character, or by
+   END.  Anything that starts with `&' but is not a recognized entity
+   is copied through unchanged.
+
+   Numeric entities are reduced to their low 8 bits before being
+   stored.
+
+   The result is zero-terminated and never longer than the input; the
+   caller is responsible for freeing it.  */
+
+char *
+html_decode_entities (const char *beg, const char *end)
+{
+  char *newstr = (char *)xmalloc (end - beg + 1); /* assume worst-case. */
+  const char *from = beg;
+  char *to = newstr;
+
+  while (from < end)
+    {
+      if (*from != '&')
+        *to++ = *from++;
+      else
+        {
+          const char *save = from;
+          int remain;
+
+          if (++from == end) goto lose;
+          remain = end - from;
+
+          if (*from == '#')
+            {
+              /* Accumulate in an unsigned type: only the low 8 bits are
+                 kept, and unsigned arithmetic wraps instead of invoking
+                 undefined behavior on an overlong run of digits.  */
+              unsigned int numeric;
+              ++from;
+              if (from == end || !ISDIGIT (*from)) goto lose;
+              for (numeric = 0; from < end && ISDIGIT (*from); from++)
+                numeric = 10 * numeric + (unsigned int) (*from - '0');
+              if (from < end && ISALPHA (*from)) goto lose;
+              numeric &= 0xff;
+              *to++ = (char) numeric;
+            }
+/* True if the text at FROM begins with LITERAL and the name ends right
+   there.  The `remain == length' test comes first so that the
+   character after the name is only examined when it lies inside
+   [beg, end).  */
+#define FROB(literal) (remain >= (int) (sizeof (literal) - 1)          \
+                       && !memcmp (from, literal, sizeof (literal) - 1) \
+                       && (remain == (int) (sizeof (literal) - 1)       \
+                           || *(from + sizeof (literal) - 1) == ';'     \
+                           || !ISALNUM (*(from + sizeof (literal) - 1))))
+          else if (FROB ("lt"))
+            *to++ = '<', from += 2;
+          else if (FROB ("gt"))
+            *to++ = '>', from += 2;
+          else if (FROB ("amp"))
+            *to++ = '&', from += 3;
+          else if (FROB ("quot"))
+            *to++ = '\"', from += 4;
+          /* We don't implement the "Added Latin 1" entities proposed
+             by rfc1866 (except for nbsp), because it is unnecessary
+             in the context of Wget, and would require hashing to work
+             efficiently.  */
+          else if (FROB ("nbsp"))
+            *to++ = 160, from += 4;
+          else
+            goto lose;
+#undef FROB
+          /* If the entity was followed by `;', we step over the `;'.
+             Otherwise, it was followed by either a non-alphanumeric
+             or EOB, in which case we do nothing.  */
+          if (from < end && *from == ';')
+            ++from;
+          continue;
+
+        lose:
+          /* This was not an entity after all.  Back out: rewind to the
+             `&' and copy it as an ordinary character.  */
+          from = save;
+          *to++ = *from++;
+        }
+    }
+  *to++ = '\0';
+  /* #### Should we try to do this: */
+#if 0
+  newstr = xrealloc (newstr, to - newstr);
+#endif
+  return newstr;
+}
+
/* The function returns the pointer to the malloc-ed quoted version of
string s. It will recognize and quote numeric and special graphic
entities, as per RFC1866: