X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Fhtml-url.c;h=853226d939da7c981b61bb6b86f7c6b1afe564c4;hb=4206f966148efcee7291a12ac49e653f7c25486c;hp=e26bd175afbec962eafe44ea79dd0a7c3f7bbdff;hpb=5f0a2b3f0846dd4c2f72fc62e7171200d1fd6e06;p=wget
diff --git a/src/html-url.c b/src/html-url.c
index e26bd175..853226d9 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -45,6 +45,7 @@ so, delete this exception statement from your version. */
#include "utils.h"
#include "hash.h"
#include "convert.h"
+#include "recur.h" /* declaration of get_urls_html */
#ifndef errno
extern int errno;
@@ -81,6 +82,7 @@ enum {
TAG_LAYER,
TAG_LINK,
TAG_META,
+ TAG_OBJECT,
TAG_OVERLAY,
TAG_SCRIPT,
TAG_TABLE,
@@ -111,6 +113,7 @@ static struct known_tag {
{ TAG_LAYER, "layer", tag_find_urls },
{ TAG_LINK, "link", tag_handle_link },
{ TAG_META, "meta", tag_handle_meta },
+ { TAG_OBJECT, "object", tag_find_urls },
{ TAG_OVERLAY, "overlay", tag_find_urls },
{ TAG_SCRIPT, "script", tag_find_urls },
{ TAG_TABLE, "table", tag_find_urls },
@@ -157,6 +160,7 @@ static struct {
{ TAG_IMG, "src", ATTR_INLINE },
{ TAG_INPUT, "src", ATTR_INLINE },
{ TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML },
+ { TAG_OBJECT, "data", ATTR_INLINE },
{ TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML },
{ TAG_SCRIPT, "src", ATTR_INLINE },
{ TAG_TABLE, "background", ATTR_INLINE },
@@ -227,9 +231,10 @@ init_interesting (void)
/* Add the attributes we care about. */
interesting_attributes = make_nocase_string_hash_table (10);
for (i = 0; i < countof (additional_attributes); i++)
- string_set_add (interesting_attributes, additional_attributes[i]);
+ hash_table_put (interesting_attributes, additional_attributes[i], "1");
for (i = 0; i < countof (tag_url_attributes); i++)
- string_set_add (interesting_attributes, tag_url_attributes[i].attr_name);
+ hash_table_put (interesting_attributes,
+ tag_url_attributes[i].attr_name, "1");
}
/* Find the value of attribute named NAME in the taginfo TAG. If the
@@ -328,7 +333,6 @@ append_url (const char *link_uri,
DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
newel = xnew0 (struct urlpos);
- newel->next = NULL;
newel->url = url;
newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
newel->size = tag->attrs[attrind].value_raw_size;
@@ -477,6 +481,10 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
&& (0 == strcasecmp (rel, "stylesheet")
|| 0 == strcasecmp (rel, "shortcut icon")))
up->link_inline_p = 1;
+ else
+ /* The external ones usually point to HTML pages, such as
+ <link rel="next" href="..."> */
+ up->link_expect_html = 1;
}
}
}
@@ -596,7 +604,7 @@ get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
return NULL;
}
- DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
+ DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
ctx.text = fm->content;
ctx.head = ctx.tail = NULL;
@@ -609,9 +617,12 @@ get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
init_interesting ();
/* Specify MHT_TRIM_VALUES because of buggy HTML generators that
- generate <a href=" foo"> instead of <a href="foo"> (Netscape
- ignores spaces as well.) If you really mean space, use &32; or
- %20. */
+ generate <a href=" foo"> instead of <a href="foo"> (browsers
+ ignore spaces as well.) If you really mean space, use &32; or
+ %20. MHT_TRIM_VALUES also causes squashing of embedded newlines,
+ e.g. in <img src="foo.[newline]html">. Such newlines are also
+ ignored by IE and Mozilla and are presumably introduced by
+ writing HTML with editors that force word wrap. */
flags = MHT_TRIM_VALUES;
if (opt.strict_comments)
flags |= MHT_STRICT_COMMENTS;
@@ -623,7 +634,7 @@ get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
if (meta_disallow_follow)
*meta_disallow_follow = ctx.nofollow;
- FREE_MAYBE (ctx.base);
+ xfree_null (ctx.base);
read_file_free (fm);
return ctx.head;
}
@@ -645,7 +656,7 @@ get_urls_file (const char *file)
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
return NULL;
}
- DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
+ DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
head = tail = NULL;
text = fm->content;
@@ -691,7 +702,7 @@ get_urls_file (const char *file)
url = url_parse (url_text, &up_error_code);
if (!url)
{
- logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
+ logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
file, url_text, url_error (up_error_code));
xfree (url_text);
continue;
@@ -715,6 +726,10 @@ get_urls_file (const char *file)
void
cleanup_html_url (void)
{
- FREE_MAYBE (interesting_tags);
- FREE_MAYBE (interesting_attributes);
+ /* Destroy the hash tables. The hash table keys and values are not
+ allocated by this code, so we don't need to free them here. */
+ if (interesting_tags)
+ hash_table_destroy (interesting_tags);
+ if (interesting_attributes)
+ hash_table_destroy (interesting_attributes);
}