- break;
- case TC_SPEC:
- switch (tagid)
- {
- case TAG_BASE:
- {
- char *newbase = find_attr (tag, "href", NULL);
- if (!newbase)
- break;
- if (closure->base)
- xfree (closure->base);
- if (closure->parent_base)
- closure->base = uri_merge (closure->parent_base, newbase);
- else
- closure->base = xstrdup (newbase);
- }
- break;
- case TAG_LINK:
- {
- int id;
- char *rel = find_attr (tag, "rel", NULL);
- char *href = find_attr (tag, "href", &id);
- if (href)
- {
- /* In the normal case, all <link href=...> tags are
- fair game.
-
- In the special case of when -p is active, however,
- and we're at a leaf node (relative to the -l
- max. depth) in the HTML document tree, the only
- <LINK> tag we'll follow is a <LINK REL=
- "stylesheet">, as it'll be necessary for displaying
- this document properly. We won't follow other
- <LINK> tags, like <LINK REL="home">, for instance,
- as they refer to external documents. */
- if (!closure->dash_p_leaf_HTML
- || (rel && !strcasecmp (rel, "stylesheet")))
- handle_link (closure, href, tag, id);
- }
- }
- break;
- case TAG_META:
- /* Some pages use a META tag to specify that the page be
- refreshed by a new page after a given number of seconds.
- The general format for this is:
-
- <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
-
- So we just need to skip past the "NUMBER; URL=" garbage
- to get to the URL. */
- {
- int id;
- char *name = find_attr (tag, "name", NULL);
- char *http_equiv = find_attr (tag, "http-equiv", &id);
- if (http_equiv && !strcasecmp (http_equiv, "refresh"))
- {
- char *refresh = find_attr (tag, "content", NULL);
- char *p = refresh;
- int offset;
- while (ISDIGIT (*p))
- ++p;
- if (*p++ != ';')
- return;
- while (ISSPACE (*p))
- ++p;
- if (!(TOUPPER (*p) == 'U'
- && TOUPPER (*(p + 1)) == 'R'
- && TOUPPER (*(p + 2)) == 'L'
- && *(p + 3) == '='))
- return;
- p += 4;
- while (ISSPACE (*p))
- ++p;
- offset = p - refresh;
- tag->attrs[id].value_raw_beginning += offset;
- tag->attrs[id].value_raw_size -= offset;
- handle_link (closure, p, tag, id);
- }
- else if (name && !strcasecmp (name, "robots"))
- {
- /* Handle stuff like:
- <meta name="robots" content="index,nofollow"> */
- char *content = find_attr (tag, "content", NULL);
- if (!content)
- return;
- if (!strcasecmp (content, "none"))
- closure->nofollow = 1;
- else
- {
- while (*content)
- {
- /* Find the next occurrence of ',' or the end of
- the string. */
- char *end = strchr (content, ',');
- if (end)
- ++end;
- else
- end = content + strlen (content);
- if (!strncasecmp (content, "nofollow", end - content))
- closure->nofollow = 1;
- content = end;
- }
- }
- }
- }
- break;
- default:
- /* Category is TC_SPEC, but tag name is unhandled. This
- must not be. */
- abort ();
- }
- break;
+ assert (first != -1);
+
+ /* Loop over the "interesting" attributes of this tag. In this
+ example, it will loop over "src" and "lowsrc".
+
+ <img src="foo.png" lowsrc="bar.png">
+
+ This has to be done in the outer loop so that the attributes are
+ processed in the same order in which they appear in the page.
+ This is required when converting links. */
+
+ for (attrind = 0; attrind < tag->nattrs; attrind++)
+ {
+ /* Find whether TAG/ATTRIND is a combination that contains a
+ URL. */
+ char *link = tag->attrs[attrind].value;
+ const size_t size = countof (tag_url_attributes);
+
+ /* If you're cringing at the inefficiency of the nested loops,
+ remember that they both iterate over a very small number of
+ items. The worst-case inner loop is for the IMG tag, which
+ has three attributes. */
+ for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
+ {
+ if (0 == strcasecmp (tag->attrs[attrind].name,
+ tag_url_attributes[i].attr_name))
+ {
+ struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
+ ATTR_SIZE(tag,attrind), ctx);
+ if (up)
+ {
+ int flags = tag_url_attributes[i].flags;
+ if (flags & ATTR_INLINE)
+ up->link_inline_p = 1;
+ if (flags & ATTR_HTML)
+ up->link_expect_html = 1;
+ }
+ }
+ }
+ }
+}
+
+ /* Process a <base href=...> tag.
+
+    The href value is registered through append_url, and the resulting
+    entry is flagged with ignore_when_downloading and link_base_p.
+    ctx->base is then replaced: by uri_merge (ctx->parent_base, href)
+    when ctx->parent_base is set, otherwise by an xstrdup'ed copy of
+    the href.  Nothing is changed when the tag has no href attribute or
+    when append_url returns NULL.  TAGID is not used.  */
+
+ static void
+ tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
+ {
+   int href_index;
+   char *href = find_attr (tag, "href", &href_index);
+
+   if (href)
+     {
+       struct urlpos *entry = append_url (href,
+                                          ATTR_POS(tag,href_index,ctx),
+                                          ATTR_SIZE(tag,href_index),
+                                          ctx);
+       if (entry)
+         {
+           entry->ignore_when_downloading = 1;
+           entry->link_base_p = 1;
+
+           /* Drop the previous base, if any, before installing the
+              new one.  */
+           if (ctx->base)
+             xfree (ctx->base);
+           ctx->base = (ctx->parent_base
+                        ? uri_merge (ctx->parent_base, href)
+                        : xstrdup (href));
+         }
+     }
+ }
+
+ /* Process a <form action=...> tag.
+
+    The action value is registered through append_url, and the
+    resulting entry is flagged with ignore_when_downloading.  Nothing
+    is done when the tag has no action attribute or when append_url
+    returns NULL.  TAGID is not used.  */
+
+ static void
+ tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
+ {
+   int action_index;
+   struct urlpos *entry;
+   char *target = find_attr (tag, "action", &action_index);
+
+   if (!target)
+     return;
+
+   entry = append_url (target,
+                       ATTR_POS(tag,action_index,ctx),
+                       ATTR_SIZE(tag,action_index),
+                       ctx);
+   if (!entry)
+     return;
+
+   entry->ignore_when_downloading = 1;
+ }
+
+/* Handle the LINK tag. It requires special handling because how its
+ links will be followed in -p mode depends on the REL attribute. */
+
+static void
+tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
+{
+ int attrind;
+ char *href = find_attr (tag, "href", &attrind);
+
+ /* All <link href="..."> link references are external, except those
+ known not to be, such as style sheet and shortcut icon:
+
+ <link rel="stylesheet" href="...">
+ <link rel="shortcut icon" href="...">
+ */
+ if (href)
+ {
+ struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
+ ATTR_SIZE(tag,attrind), ctx);
+ if (up)
+ {
+ char *rel = find_attr (tag, "rel", NULL);
+ if (rel)
+ {
+ if (0 == strcasecmp (rel, "stylesheet"))
+ {
+ up->link_inline_p = 1;
+ up->link_expect_css = 1;
+ }
+ else if (0 == strcasecmp (rel, "shortcut icon"))
+ {
+ up->link_inline_p = 1;
+ }
+ }
+ else
+ /* The external ones usually point to HTML pages, such as
+ <link rel="next" href="..."> */
+ up->link_expect_html = 1;
+ }