- case TC_LINK:
- {
- int i;
- int size = ARRAY_SIZE (url_tag_attr_map);
- for (i = 0; i < size; i++)
- if (url_tag_attr_map[i].tagid == tagid)
- break;
- /* We've found the index of url_tag_attr_map where the
- attributes of our tags begin. Now, look for every one of
- them, and handle it. */
- for (; (i < size && url_tag_attr_map[i].tagid == tagid); i++)
- {
- char *attr_value;
- int id;
- if (closure->dash_p_leaf_HTML
- && (url_tag_attr_map[i].flags & AF_EXTERNAL))
- /* If we're at a -p leaf node, we don't want to retrieve
- links to references we know are external, such as <a
- href=...>. */
- continue;
-
- /* This find_attr() buried in a loop may seem inefficient
- (O(n^2)), but it's not, since the number of attributes
- (n) we loop over is extremely small. In the worst case
- of IMG with all its possible attributes, n^2 will be
- only 9. */
- attr_value = find_attr (tag, url_tag_attr_map[i].attr_name, &id);
- if (attr_value)
- handle_link (closure, attr_value, tag, id);
- }
- }
- break;
- case TC_SPEC:
- switch (tagid)
- {
- case TAG_BASE:
- {
- char *newbase = find_attr (tag, "href", NULL);
- if (!newbase)
- break;
- if (closure->base)
- xfree (closure->base);
- if (closure->parent_base)
- closure->base = url_concat (closure->parent_base, newbase);
- else
- closure->base = xstrdup (newbase);
- }
- break;
- case TAG_LINK:
- {
- int id;
- char *rel = find_attr (tag, "rel", NULL);
- char *href = find_attr (tag, "href", &id);
- if (href)
- {
- /* In the normal case, all <link href=...> tags are
- fair game.
-
- In the special case of when -p is active, however,
- and we're at a leaf node (relative to the -l
- max. depth) in the HTML document tree, the only
- <LINK> tag we'll follow is a <LINK REL=
- "stylesheet">, as it's necessary for displaying
- this document properly. We won't follow other
- <LINK> tags, like <LINK REL="home">, for instance,
- as they refer to external documents. */
- if (!closure->dash_p_leaf_HTML
- || (rel && !strcasecmp (rel, "stylesheet")))
- handle_link (closure, href, tag, id);
- }
- }
- break;
- case TAG_META:
- /* Some pages use a META tag to specify that the page be
- refreshed by a new page after a given number of seconds.
- The general format for this is:
-
- <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
-
- So we just need to skip past the "NUMBER; URL=" garbage
- to get to the URL. */
- {
- int id;
- char *name = find_attr (tag, "name", NULL);
- char *http_equiv = find_attr (tag, "http-equiv", &id);
- if (http_equiv && !strcasecmp (http_equiv, "refresh"))
- {
- char *refresh = find_attr (tag, "content", NULL);
- char *p = refresh;
- int offset;
- while (ISDIGIT (*p))
- ++p;
- if (*p++ != ';')
- return;
- while (ISSPACE (*p))
- ++p;
- if (!(TOUPPER (*p) == 'U'
- && TOUPPER (*(p + 1)) == 'R'
- && TOUPPER (*(p + 2)) == 'L'
- && *(p + 3) == '='))
- return;
- p += 4;
- while (ISSPACE (*p))
- ++p;
- offset = p - refresh;
- tag->attrs[id].value_raw_beginning += offset;
- tag->attrs[id].value_raw_size -= offset;
- handle_link (closure, p, tag, id);
- }
- else if (name && !strcasecmp (name, "robots"))
- {
- /* Handle stuff like:
- <meta name="robots" content="index,nofollow"> */
- char *content = find_attr (tag, "content", NULL);
- if (!content)
- return;
- if (!strcasecmp (content, "none"))
- closure->nofollow = 1;
- else
- {
- while (*content)
- {
- /* Find the next occurrence of ',' or the end of
- the string. */
- char *end = strchr (content, ',');
- if (end)
- ++end;
- else
- end = content + strlen (content);
- if (!strncasecmp (content, "nofollow", end - content))
- closure->nofollow = 1;
- content = end;
- }
- }
- }
- }
- break;
- default:
- /* Category is TC_SPEC, but tag name is unhandled. This
- must not be. */
- abort ();
- }
- break;
+ /* Some pages use a META tag to specify that the page be
+ refreshed by a new page after a given number of seconds. The
+ general format for this is:
+
+ <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
+
+ So we just need to skip past the "NUMBER; URL=" garbage to
+ get to the URL. */
+
+ struct urlpos *entry;
+ int attrind;
+ int timeout = 0;
+ char *p;
+
+ char *refresh = find_attr (tag, "content", &attrind);
+ if (!refresh)
+ return;
+
+ for (p = refresh; c_isdigit (*p); p++)
+ timeout = 10 * timeout + *p - '0';
+ if (*p++ != ';')
+ return;
+
+ while (c_isspace (*p))
+ ++p;
+ if (!( c_toupper (*p) == 'U'
+ && c_toupper (*(p + 1)) == 'R'
+ && c_toupper (*(p + 2)) == 'L'
+ && *(p + 3) == '='))
+ return;
+ p += 4;
+ while (c_isspace (*p))
+ ++p;
+
+ entry = append_url (p, ATTR_POS(tag,attrind,ctx),
+ ATTR_SIZE(tag,attrind), ctx);
+ if (entry)
+ {
+ entry->link_refresh_p = 1;
+ entry->refresh_timeout = timeout;
+ entry->link_expect_html = 1;
+ }
+ }
+ else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
+ {
+ /* Handle stuff like:
+ <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
+
+ char *mcharset;
+ char *content = find_attr (tag, "content", NULL);
+ if (!content)
+ return;
+
+ mcharset = parse_charset (content);
+ if (!mcharset)
+ return;
+
+ xfree_null (meta_charset);
+ meta_charset = mcharset;
+ }
+ else if (name && 0 == strcasecmp (name, "robots"))
+ {
+ /* Handle stuff like:
+ <meta name="robots" content="index,nofollow"> */
+ char *content = find_attr (tag, "content", NULL);
+ if (!content)
+ return;
+ if (!strcasecmp (content, "none"))
+ ctx->nofollow = true;
+ else
+ {
+ while (*content)
+ {
+ char *end;
+ /* Skip any initial whitespace. */
+ content += strspn (content, " \f\n\r\t\v");
+ /* Find the next occurrence of ',' or whitespace,
+ * or the end of the string. */
+ end = content + strcspn (content, ", \f\n\r\t\v");
+ if (!strncasecmp (content, "nofollow", end - content))
+ ctx->nofollow = true;
+ /* Skip past the next comma, if any. */
+ if (*end == ',')
+ ++end;
+ else
+ {
+ end = strchr (end, ',');
+ if (end)
+ ++end;
+ else
+ end = content + strlen (content);
+ }
+ content = end;
+ }
+ }