#include <config.h>
#include <stdio.h>
-#ifdef HAVE_STRING_H
-# include <string.h>
-#else
-# include <strings.h>
-#endif
+#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
#include "utils.h"
#include "hash.h"
#include "convert.h"
-
-#ifndef errno
-extern int errno;
-#endif
+#include "recur.h" /* declaration of get_urls_html */
struct map_context;
-typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,
- struct map_context *));
+typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
-#define DECLARE_TAG_HANDLER(fun) \
- static void fun PARAMS ((int, struct taginfo *, struct map_context *))
+#define DECLARE_TAG_HANDLER(fun) \
+ static void fun (int, struct taginfo *, struct map_context *)
DECLARE_TAG_HANDLER (tag_find_urls);
DECLARE_TAG_HANDLER (tag_handle_base);
TAG_LAYER,
TAG_LINK,
TAG_META,
+ TAG_OBJECT,
TAG_OVERLAY,
TAG_SCRIPT,
TAG_TABLE,
{ TAG_LAYER, "layer", tag_find_urls },
{ TAG_LINK, "link", tag_handle_link },
{ TAG_META, "meta", tag_handle_meta },
+ { TAG_OBJECT, "object", tag_find_urls },
{ TAG_OVERLAY, "overlay", tag_find_urls },
{ TAG_SCRIPT, "script", tag_find_urls },
{ TAG_TABLE, "table", tag_find_urls },
{ TAG_IMG, "src", ATTR_INLINE },
{ TAG_INPUT, "src", ATTR_INLINE },
{ TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML },
+ { TAG_OBJECT, "data", ATTR_INLINE },
{ TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML },
{ TAG_SCRIPT, "src", ATTR_INLINE },
{ TAG_TABLE, "background", ATTR_INLINE },
DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
newel = xnew0 (struct urlpos);
- newel->next = NULL;
newel->url = url;
newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
newel->size = tag->attrs[attrind].value_raw_size;
&& (0 == strcasecmp (rel, "stylesheet")
|| 0 == strcasecmp (rel, "shortcut icon")))
up->link_inline_p = 1;
+ else
+ /* The external ones usually point to HTML pages, such as
+ <link rel="next" href="..."> */
+ up->link_expect_html = 1;
}
}
}
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
return NULL;
}
- DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
+ DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
ctx.text = fm->content;
ctx.head = ctx.tail = NULL;
init_interesting ();
/* Specify MHT_TRIM_VALUES because of buggy HTML generators that
- generate <a href=" foo"> instead of <a href="foo"> (Netscape
- ignores spaces as well.) If you really mean space, use &32; or
- %20. */
+ generate <a href=" foo"> instead of <a href="foo"> (browsers
+ ignore spaces as well.) If you really mean space, use &#32; or
+ %20. MHT_TRIM_VALUES also causes squashing of embedded newlines,
+ e.g. in <img src="foo.[newline]html">. Such newlines are also
+ ignored by IE and Mozilla and are presumably introduced by
+ writing HTML with editors that force word wrap. */
flags = MHT_TRIM_VALUES;
if (opt.strict_comments)
flags |= MHT_STRICT_COMMENTS;
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
return NULL;
}
- DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
+ DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
head = tail = NULL;
text = fm->content;
url = url_parse (url_text, &up_error_code);
if (!url)
{
- logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
+ logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
file, url_text, url_error (up_error_code));
xfree (url_text);
continue;