X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Fhtml-url.c;h=853226d939da7c981b61bb6b86f7c6b1afe564c4;hb=4206f966148efcee7291a12ac49e653f7c25486c;hp=e26bd175afbec962eafe44ea79dd0a7c3f7bbdff;hpb=5f0a2b3f0846dd4c2f72fc62e7171200d1fd6e06;p=wget diff --git a/src/html-url.c b/src/html-url.c index e26bd175..853226d9 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -45,6 +45,7 @@ so, delete this exception statement from your version. */ #include "utils.h" #include "hash.h" #include "convert.h" +#include "recur.h" /* declaration of get_urls_html */ #ifndef errno extern int errno; @@ -81,6 +82,7 @@ enum { TAG_LAYER, TAG_LINK, TAG_META, + TAG_OBJECT, TAG_OVERLAY, TAG_SCRIPT, TAG_TABLE, @@ -111,6 +113,7 @@ static struct known_tag { { TAG_LAYER, "layer", tag_find_urls }, { TAG_LINK, "link", tag_handle_link }, { TAG_META, "meta", tag_handle_meta }, + { TAG_OBJECT, "object", tag_find_urls }, { TAG_OVERLAY, "overlay", tag_find_urls }, { TAG_SCRIPT, "script", tag_find_urls }, { TAG_TABLE, "table", tag_find_urls }, @@ -157,6 +160,7 @@ static struct { { TAG_IMG, "src", ATTR_INLINE }, { TAG_INPUT, "src", ATTR_INLINE }, { TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_OBJECT, "data", ATTR_INLINE }, { TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML }, { TAG_SCRIPT, "src", ATTR_INLINE }, { TAG_TABLE, "background", ATTR_INLINE }, @@ -227,9 +231,10 @@ init_interesting (void) /* Add the attributes we care about. */ interesting_attributes = make_nocase_string_hash_table (10); for (i = 0; i < countof (additional_attributes); i++) - string_set_add (interesting_attributes, additional_attributes[i]); + hash_table_put (interesting_attributes, additional_attributes[i], "1"); for (i = 0; i < countof (tag_url_attributes); i++) - string_set_add (interesting_attributes, tag_url_attributes[i].attr_name); + hash_table_put (interesting_attributes, + tag_url_attributes[i].attr_name, "1"); } /* Find the value of attribute named NAME in the taginfo TAG. 
If the @@ -328,7 +333,6 @@ append_url (const char *link_uri, DEBUGP (("appending \"%s\" to urlpos.\n", url->url)); newel = xnew0 (struct urlpos); - newel->next = NULL; newel->url = url; newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text; newel->size = tag->attrs[attrind].value_raw_size; @@ -477,6 +481,10 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx) && (0 == strcasecmp (rel, "stylesheet") || 0 == strcasecmp (rel, "shortcut icon"))) up->link_inline_p = 1; + else + /* The external ones usually point to HTML pages, such as + <link rel="next" href="..."> */ + up->link_expect_html = 1; } } } @@ -596,7 +604,7 @@ get_urls_html (const char *file, const char *url, int *meta_disallow_follow) logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); return NULL; } - DEBUGP (("Loaded %s (size %ld).\n", file, fm->length)); + DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); ctx.text = fm->content; ctx.head = ctx.tail = NULL; @@ -609,9 +617,12 @@ get_urls_html (const char *file, const char *url, int *meta_disallow_follow) init_interesting (); /* Specify MHT_TRIM_VALUES because of buggy HTML generators that - generate <a href=" foo"> instead of <a href="foo"> (Netscape - ignores spaces as well.) If you really mean space, use &32; or - %20. */ + generate <a href=" foo"> instead of <a href="foo"> (browsers + ignore spaces as well.) If you really mean space, use &32; or + %20. MHT_TRIM_VALUES also causes squashing of embedded newlines, + e.g. in <img src="foo.[newline]html">. Such newlines are also + ignored by IE and Mozilla and are presumably introduced by + writing HTML with editors that force word wrap. 
*/ flags = MHT_TRIM_VALUES; if (opt.strict_comments) flags |= MHT_STRICT_COMMENTS; @@ -623,7 +634,7 @@ get_urls_html (const char *file, const char *url, int *meta_disallow_follow) if (meta_disallow_follow) *meta_disallow_follow = ctx.nofollow; - FREE_MAYBE (ctx.base); + xfree_null (ctx.base); read_file_free (fm); return ctx.head; } @@ -645,7 +656,7 @@ get_urls_file (const char *file) logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); return NULL; } - DEBUGP (("Loaded %s (size %ld).\n", file, fm->length)); + DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); head = tail = NULL; text = fm->content; @@ -691,7 +702,7 @@ get_urls_file (const char *file) url = url_parse (url_text, &up_error_code); if (!url) { - logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n", + logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"), file, url_text, url_error (up_error_code)); xfree (url_text); continue; @@ -715,6 +726,10 @@ get_urls_file (const char *file) void cleanup_html_url (void) { - FREE_MAYBE (interesting_tags); - FREE_MAYBE (interesting_attributes); + /* Destroy the hash tables. The hash table keys and values are not + allocated by this code, so we don't need to free them here. */ + if (interesting_tags) + hash_table_destroy (interesting_tags); + if (interesting_attributes) + hash_table_destroy (interesting_attributes); }