return FALSE; /* not one of the tag/attribute pairs wget ever cares about */
}
-
/* Parse BUF (a buffer of BUFSIZE characters) searching for HTML tags
describing URLs to follow. When a tag is encountered, extract its
components (as described by html_allow[] array), and return the
for (++buf, --bufsize;
bufsize && *buf != s->quote_char && *buf != '\n';
++buf, --bufsize)
- if (!ph && *buf == '#')
+ if (!ph && *buf == '#' && *(buf - 1) != '&')
ph = buf;
if (!bufsize)
{
p = buf;
for (; bufsize && !ISSPACE (*buf) && *buf != '>';
++buf, --bufsize)
- if (!ph && *buf == '#')
+ if (!ph && *buf == '#' && *(buf - 1) != '&')
ph = buf;
if (!bufsize)
break;
return global_state.base;
}
+/* Create a malloc'ed copy of text in the range [beg, end), but with
+   the HTML entities processed.  Recognized entities are &lt;, &gt;,
+   &amp;, &quot;, &nbsp; and the numerical entities (&#NNN;).
+
+   The terminating `;' of an entity is optional: a named entity is
+   also accepted when it is followed by a non-alphanumeric character
+   or by the end of the range.  Anything that looks like the start of
+   an entity but is not recognized is copied through unchanged.
+
+   Numerical entities are reduced to their low 8 bits.
+
+   The result is NUL-terminated and is allocated with xmalloc(); the
+   caller is responsible for passing it to free().  */
+
+char *
+html_decode_entities (const char *beg, const char *end)
+{
+  /* Every entity is at least two characters long and decodes to a
+     single character, so the output can never be longer than the
+     input (plus the terminating NUL).  */
+  char *newstr = (char *)xmalloc (end - beg + 1);
+  const char *from = beg;
+  char *to = newstr;
+
+  while (from < end)
+    {
+      if (*from != '&')
+        *to++ = *from++;
+      else
+        {
+          const char *save = from;
+          int remain;
+
+          if (++from == end) goto lose;
+          remain = end - from;
+
+          if (*from == '#')
+            {
+              int numeric;
+              ++from;
+              if (from == end || !ISDIGIT (*from)) goto lose;
+              /* Only the low 8 bits of the value are used, so reduce
+                 at every step.  This yields the same result as
+                 masking once at the end, but cannot overflow `int'
+                 on an arbitrarily long run of digits.  */
+              for (numeric = 0; from < end && ISDIGIT (*from); from++)
+                numeric = (10 * numeric + ((*from) - '0')) & 0xff;
+              if (from < end && ISALPHA (*from)) goto lose;
+              *to++ = numeric;
+            }
+          /* FROB (literal) is true when the text at FROM starts with
+             LITERAL and the literal is followed by `;', by the end of
+             the range, or by a non-alphanumeric character.  The
+             end-of-range test must come before the character at
+             from[len] is examined, so that we never read past END.  */
+#define FROB(literal) (remain >= (int) (sizeof (literal) - 1)            \
+                       && !memcmp (from, literal, sizeof (literal) - 1)  \
+                       && (remain == (int) (sizeof (literal) - 1)        \
+                           || *(from + sizeof (literal) - 1) == ';'      \
+                           || !ISALNUM (*(from + sizeof (literal) - 1))))
+          else if (FROB ("lt"))
+            *to++ = '<', from += 2;
+          else if (FROB ("gt"))
+            *to++ = '>', from += 2;
+          else if (FROB ("amp"))
+            *to++ = '&', from += 3;
+          else if (FROB ("quot"))
+            *to++ = '\"', from += 4;
+          /* We don't implement the "Added Latin 1" entities proposed
+             by rfc1866 (except for nbsp), because it is unnecessary
+             in the context of Wget, and would require hashing to work
+             efficiently.  */
+          else if (FROB ("nbsp"))
+            *to++ = 160, from += 4;
+          else
+            goto lose;
+#undef FROB
+          /* If the entity was followed by `;', we step over the `;'.
+             Otherwise, it was followed by either a non-alphanumeric
+             or EOB, in which case we do nothing.  */
+          if (from < end && *from == ';')
+            ++from;
+          continue;
+
+        lose:
+          /* This was not an entity after all.  Back out: rewind to the
+             `&' and copy it through as an ordinary character.  */
+          from = save;
+          *to++ = *from++;
+        }
+    }
+  *to++ = '\0';
+  /* #### Should we try to do this: */
+#if 0
+  newstr = xrealloc (newstr, to - newstr);
+#endif
+  return newstr;
+}
+
/* The function returns the pointer to the malloc-ed quoted version of
string s. It will recognize and quote numeric and special graphic
entities, as per RFC1866:
const char *pbuf = buf;
char *constr, *base;
const char *cbase;
+ char *needs_freeing, *url_data;
first_time = 0;
if (!size)
break;
+ /* It would be nice if we could avoid allocating memory in this
+ loop, but I don't see an easy way. To process the entities,
+ we need to either copy the data, or change it destructively.
+ I choose the former.
+
+ We have two pointers: needs_freeing and url_data, because the
+ code below does things like url_data += <something>, and we
+ want to pass the original string to free(). */
+ needs_freeing = url_data = html_decode_entities (pbuf, pbuf + size);
+ size = strlen (url_data);
+
for (i = 0; protostrings[i]; i++)
{
- if (!strncasecmp (protostrings[i], pbuf,
+ if (!strncasecmp (protostrings[i], url_data,
MINVAL (strlen (protostrings[i]), size)))
break;
}
/* Check for http:RELATIVE_URI. See below for details. */
if (protostrings[i]
- && !(strncasecmp (pbuf, "http:", 5) == 0
- && strncasecmp (pbuf, "http://", 7) != 0))
+ && !(strncasecmp (url_data, "http:", 5) == 0
+ && strncasecmp (url_data, "http://", 7) != 0))
{
no_proto = 0;
}
relative URI-s as <a href="http:URL">. Just strip off the
silly leading "http:" (as well as any leading blanks
before it). */
- if ((size > 5) && !strncasecmp ("http:", pbuf, 5))
- pbuf += 5, size -= 5;
+ if ((size > 5) && !strncasecmp ("http:", url_data, 5))
+ url_data += 5, size -= 5;
}
if (!no_proto)
{
for (i = 0; i < ARRAY_SIZE (sup_protos); i++)
{
- if (!strncasecmp (sup_protos[i].name, pbuf,
+ if (!strncasecmp (sup_protos[i].name, url_data,
MINVAL (strlen (sup_protos[i].name), size)))
break;
}
/* Do *not* accept a non-supported protocol. */
if (i == ARRAY_SIZE (sup_protos))
- continue;
+ {
+ free (needs_freeing);
+ continue;
+ }
}
if (no_proto)
{
/* Use malloc, not alloca because this is called in
a loop. */
char *temp = (char *)malloc (size + 1);
- strncpy (temp, pbuf, size);
+ strncpy (temp, url_data, size);
temp[size] = '\0';
logprintf (LOG_NOTQUIET,
_("Error (%s): Link %s without a base provided.\n"),
file, temp);
free (temp);
}
+ free (needs_freeing);
continue;
}
if (this_url)
logprintf (LOG_NOTQUIET, _("\
Error (%s): Base %s relative, without referer URL.\n"),
file, cbase);
+ free (needs_freeing);
continue;
}
base = xstrdup (cbase);
}
- constr = construct (base, pbuf, size, no_proto);
+ constr = construct (base, url_data, size, no_proto);
free (base);
}
else /* has proto */
{
constr = (char *)xmalloc (size + 1);
- strncpy (constr, pbuf, size);
+ strncpy (constr, url_data, size);
constr[size] = '\0';
}
#ifdef DEBUG
tmp2 = html_base ();
/* Use malloc, not alloca because this is called in a loop. */
tmp = (char *)xmalloc (size + 1);
- strncpy (tmp, pbuf, size);
+ strncpy (tmp, url_data, size);
tmp[size] = '\0';
logprintf (LOG_ALWAYS,
"file %s; this_url %s; base %s\nlink: %s; constr: %s\n",
memset (current, 0, sizeof (*current));
current->next = NULL;
current->url = constr;
- current->size = size;
- current->pos = pbuf - orig_buf;
+ current->size = step;
+ current->pos = buf - orig_buf;
/* A URL is relative if the host and protocol are not named,
and the name does not start with `/'. */
- if (no_proto && *pbuf != '/')
+ if (no_proto && *url_data != '/')
current->flags |= (URELATIVE | UNOPROTO);
else if (no_proto)
current->flags |= UNOPROTO;
+ free (needs_freeing);
}
free (orig_buf);