+static void
+tag_handle_form (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx)
+{
+  /* Record the URL found in the tag's ACTION attribute and mark the
+     resulting entry with ignore_when_downloading.  TAGID is unused.  */
+  int action_idx;
+  struct urlpos *entry;
+  char *action_url = find_attr (tag, "action", &action_idx);
+
+  /* Without an ACTION attribute there is nothing to record.  */
+  if (!action_url)
+    return;
+
+  entry = append_url (action_url, ATTR_POS(tag,action_idx,ctx),
+                      ATTR_SIZE(tag,action_idx), ctx);
+
+  /* append_url may decline to produce an entry.  */
+  if (!entry)
+    return;
+
+  entry->ignore_when_downloading = 1;
+}
+
+/* Handle the LINK tag.  A dedicated handler is needed because the way
+   its link is followed in -p mode depends on the REL attribute.  */
+
+static void
+tag_handle_link (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx)
+{
+  int href_idx;
+  struct urlpos *entry;
+  char *rel_value;
+  char *target = find_attr (tag, "href", &href_idx);
+
+  /* A LINK without HREF references nothing.  */
+  if (!target)
+    return;
+
+  entry = append_url (target, ATTR_POS(tag,href_idx,ctx),
+                      ATTR_SIZE(tag,href_idx), ctx);
+  if (!entry)
+    return;
+
+  /* Every <link href="..."> reference is treated as external, except
+     those known not to be, such as style sheets and shortcut icons:
+
+       <link rel="stylesheet" href="...">
+       <link rel="shortcut icon" href="...">
+
+     When REL is absent the entry is left exactly as append_url
+     created it.  */
+  rel_value = find_attr (tag, "rel", NULL);
+  if (!rel_value)
+    return;
+
+  if (strcasecmp (rel_value, "stylesheet") == 0)
+    {
+      entry->link_inline_p = 1;
+      entry->link_expect_css = 1;
+    }
+  else if (strcasecmp (rel_value, "shortcut icon") == 0)
+    {
+      entry->link_inline_p = 1;
+    }
+  else
+    {
+      /* External links usually point to HTML pages, e.g.
+           <link rel="next" href="...">
+         unless the TYPE attribute says otherwise, e.g.
+           <link rel="alternate" type="application/rss+xml" href=".../?feed=rss2" />
+       */
+      char *mime = find_attr (tag, "type", NULL);
+      if (mime == NULL || strcasecmp (mime, "text/html") == 0)
+        entry->link_expect_html = 1;
+    }
+}
+
+/* Handle the META tag.  This requires special handling because of the
+   refresh feature and because of robot exclusion.
+
+   Three forms are recognized (attribute values compared
+   case-insensitively):
+
+     http-equiv="refresh"       -- the URL in CONTENT is recorded with
+                                   its refresh delay;
+     http-equiv="content-type"  -- the charset parsed from CONTENT
+                                   replaces meta_charset;
+     name="robots"              -- ctx->nofollow is set when CONTENT is
+                                   "none" or lists a "nofollow" token.
+
+   Any other META tag is ignored.  TAGID is unused.  */
+
+static void
+tag_handle_meta (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx)
+{
+  char *name = find_attr (tag, "name", NULL);
+  char *http_equiv = find_attr (tag, "http-equiv", NULL);
+
+  if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
+    {
+      /* Some pages use a META tag to specify that the page be
+         refreshed by a new page after a given number of seconds.  The
+         general format for this is:
+
+           <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
+
+         So we just need to skip past the "NUMBER; URL=" garbage to
+         get to the URL.  */
+
+      /* Ceiling for the parsed delay.  The digits come from an
+         untrusted page, so the accumulation below saturates here
+         instead of overflowing a signed int.  */
+      enum { REFRESH_TIMEOUT_MAX = 1000000000 };
+
+      struct urlpos *entry;
+      int attrind;
+      int timeout = 0;
+      char *p;
+
+      char *refresh = find_attr (tag, "content", &attrind);
+      if (!refresh)
+        return;
+
+      for (p = refresh; c_isdigit (*p); p++)
+        {
+          /* Only accumulate while 10 * timeout + 9 still fits below
+             the ceiling; once saturated, remaining digits are merely
+             skipped.  */
+          if (timeout <= (REFRESH_TIMEOUT_MAX - 9) / 10)
+            timeout = 10 * timeout + (*p - '0');
+          else
+            timeout = REFRESH_TIMEOUT_MAX;
+        }
+      if (*p++ != ';')
+        return;
+
+      while (c_isspace (*p))
+        ++p;
+      /* The && chain stops at the first mismatch, so a terminating
+         NUL is never read past.  */
+      if (!(   c_toupper (*p)       == 'U'
+            && c_toupper (*(p + 1)) == 'R'
+            && c_toupper (*(p + 2)) == 'L'
+            &&          *(p + 3)    == '='))
+        return;
+      p += 4;
+      while (c_isspace (*p))
+        ++p;
+
+      entry = append_url (p, ATTR_POS(tag,attrind,ctx),
+                          ATTR_SIZE(tag,attrind), ctx);
+      if (entry)
+        {
+          entry->link_refresh_p = 1;
+          entry->refresh_timeout = timeout;
+          entry->link_expect_html = 1;
+        }
+    }
+  else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
+    {
+      /* Handle stuff like:
+         <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
+
+      char *mcharset;
+      char *content = find_attr (tag, "content", NULL);
+      if (!content)
+        return;
+
+      mcharset = parse_charset (content);
+      if (!mcharset)
+        return;
+
+      /* Replace any previously recorded charset with the new one.  */
+      xfree_null (meta_charset);
+      meta_charset = mcharset;
+    }
+  else if (name && 0 == strcasecmp (name, "robots"))
+    {
+      /* Handle stuff like:
+         <meta name="robots" content="index,nofollow"> */
+      static const char nofollow_token[] = "nofollow";
+      const size_t nofollow_len = sizeof nofollow_token - 1;
+
+      char *content = find_attr (tag, "content", NULL);
+      if (!content)
+        return;
+      if (!strcasecmp (content, "none"))
+        ctx->nofollow = true;
+      else
+        {
+          while (*content)
+            {
+              char *end;
+              size_t token_len;
+
+              /* Skip any initial whitespace.  */
+              content += strspn (content, " \f\n\r\t\v");
+              /* Find the next occurrence of ',' or whitespace,
+                 or the end of the string.  */
+              end = content + strcspn (content, ", \f\n\r\t\v");
+              token_len = (size_t) (end - content);
+
+              /* The token must be exactly "nofollow".  Comparing only
+                 the first TOKEN_LEN bytes would also accept an empty
+                 token (zero bytes compared) and any prefix such as
+                 "no".  */
+              if (token_len == nofollow_len
+                  && !strncasecmp (content, nofollow_token, nofollow_len))
+                ctx->nofollow = true;
+
+              /* Skip past the next comma, if any.  */
+              if (*end == ',')
+                ++end;
+              else
+                {
+                  end = strchr (end, ',');
+                  if (end)
+                    ++end;
+                  else
+                    end = content + strlen (content);
+                }
+              content = end;
+            }
+        }
+    }
+}
+
+/* Dispatch the tag handler appropriate for the tag we're mapping
+   over.  See known_tags[] for definition of tag handlers.  */
+
+static void
+collect_tags_mapper (struct taginfo *tag, void *arg)
+{
+  struct map_context *ctx = (struct map_context *)arg;
+  struct known_tag *known;
+
+  /* Look the tag up in our table of tags.  The lookup is allowed to
+     fail: for now NULL is passed as interesting_tags to
+     map_html_tags, so every tag reaches this mapper and can be
+     checked for a style attribute.  */
+  known = hash_table_get (interesting_tags, tag->name);
+  if (known)
+    known->handler (known->tagid, tag, ctx);
+
+  check_style_attr (tag, ctx);
+
+  /* Only the end tag of a STYLE element is of further interest.  */
+  if (!tag->end_tag_p || strcasecmp (tag->name, "style") != 0)
+    return;
+
+  /* Both content boundaries must be set, and in order.  */
+  if (!tag->contents_begin || !tag->contents_end)
+    return;
+  if (tag->contents_begin > tag->contents_end)
+    return;
+
+  /* Parse the element's contents as CSS; the offset passed is
+     relative to ctx->text.  */
+  get_urls_css (ctx, tag->contents_begin - ctx->text,
+                tag->contents_end - tag->contents_begin);
+}
+\f
+/* Analyze HTML tags FILE and construct a list of URLs referenced from
+ it. It merges relative links in FILE with URL. It is aware of
+ <base href=...> and does the right thing. */
+
+struct urlpos *
+get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
+ struct iri *iri)