X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Fhtml-url.c;h=3c7c409e82e4c9f48e490abee7f4131a9113da0f;hb=4d7c5e087b2bc82c9f503dff003916d1047903ce;hp=a3208e468395c33d22325dc5da982a85052b7090;hpb=7b5fb50cb1ce30fd0ddc3e77e376613a861c10aa;p=wget diff --git a/src/html-url.c b/src/html-url.c index a3208e46..3c7c409e 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -1,12 +1,12 @@ /* Collect URLs from HTML source. - Copyright (C) 1998, 2000, 2001, 2002 Free Software Foundation, Inc. + Copyright (C) 1998-2006 Free Software Foundation, Inc. This file is part of GNU Wget. GNU Wget is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. +the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. GNU Wget is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -14,8 +14,7 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with Wget; if not, write to the Free Software -Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +along with Wget. If not, see . In addition, as a special exception, the Free Software Foundation gives permission to link the code of its release of Wget with the @@ -30,11 +29,7 @@ so, delete this exception statement from your version. */ #include #include -#ifdef HAVE_STRING_H -# include -#else -# include -#endif +#include #include #include #include @@ -43,18 +38,16 @@ so, delete this exception statement from your version. */ #include "html-parse.h" #include "url.h" #include "utils.h" - -#ifndef errno -extern int errno; -#endif +#include "hash.h" +#include "convert.h" +#include "recur.h" /* declaration of get_urls_html */ struct map_context; -typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *, - struct map_context *)); +typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *); -#define DECLARE_TAG_HANDLER(fun) \ - static void fun PARAMS ((int, struct taginfo *, struct map_context *)) +#define DECLARE_TAG_HANDLER(fun) \ + static void fun (int, struct taginfo *, struct map_context *) DECLARE_TAG_HANDLER (tag_find_urls); DECLARE_TAG_HANDLER (tag_handle_base); @@ -62,64 +55,78 @@ DECLARE_TAG_HANDLER (tag_handle_form); DECLARE_TAG_HANDLER (tag_handle_link); DECLARE_TAG_HANDLER (tag_handle_meta); +enum { + TAG_A, + TAG_APPLET, + TAG_AREA, + TAG_BASE, + TAG_BGSOUND, + TAG_BODY, + TAG_EMBED, + TAG_FIG, + TAG_FORM, + TAG_FRAME, + TAG_IFRAME, + TAG_IMG, + TAG_INPUT, + TAG_LAYER, + TAG_LINK, + TAG_META, + TAG_OBJECT, + TAG_OVERLAY, + TAG_SCRIPT, + TAG_TABLE, + TAG_TD, + TAG_TH +}; + /* The list of known tags and functions used for handling them. Most tags are simply harvested for URLs. */ -static struct { +static struct known_tag { + int tagid; const char *name; tag_handler_t handler; } known_tags[] = { -#define TAG_A 0 - { "a", tag_find_urls }, -#define TAG_APPLET 1 - { "applet", tag_find_urls }, -#define TAG_AREA 2 - { "area", tag_find_urls }, -#define TAG_BASE 3 - { "base", tag_handle_base }, -#define TAG_BGSOUND 4 - { "bgsound", tag_find_urls }, -#define TAG_BODY 5 - { "body", tag_find_urls }, -#define TAG_EMBED 6 - { "embed", tag_find_urls }, -#define TAG_FIG 7 - { "fig", tag_find_urls }, -#define TAG_FORM 8 - { "form", tag_handle_form }, -#define TAG_FRAME 9 - { "frame", tag_find_urls }, -#define TAG_IFRAME 10 - { "iframe", tag_find_urls }, -#define TAG_IMG 11 - { "img", tag_find_urls }, -#define TAG_INPUT 12 - { "input", tag_find_urls }, -#define TAG_LAYER 13 - { "layer", tag_find_urls }, -#define TAG_LINK 14 - { "link", tag_handle_link }, -#define TAG_META 15 - { "meta", tag_handle_meta }, -#define TAG_OVERLAY 16 - { "overlay", tag_find_urls }, -#define TAG_SCRIPT 17 - { "script", tag_find_urls }, -#define TAG_TABLE 18 - { "table", tag_find_urls }, -#define TAG_TD 19 - { "td", tag_find_urls }, -#define TAG_TH 20 - { "th", tag_find_urls } + { TAG_A, "a", tag_find_urls }, + { TAG_APPLET, "applet", tag_find_urls }, + { TAG_AREA, "area", tag_find_urls }, + { TAG_BASE, "base", tag_handle_base }, + { TAG_BGSOUND, "bgsound", tag_find_urls }, + { TAG_BODY, "body", tag_find_urls }, + { TAG_EMBED, "embed", tag_find_urls }, + { TAG_FIG, "fig", tag_find_urls }, + { TAG_FORM, "form", tag_handle_form }, + { TAG_FRAME, "frame", tag_find_urls }, + { TAG_IFRAME, "iframe", tag_find_urls }, + { TAG_IMG, "img", tag_find_urls }, + { TAG_INPUT, "input", tag_find_urls }, + { TAG_LAYER, "layer", tag_find_urls }, + { TAG_LINK, "link", tag_handle_link }, + { TAG_META, "meta", tag_handle_meta }, + { TAG_OBJECT, "object", tag_find_urls }, + { TAG_OVERLAY, "overlay", tag_find_urls }, + { TAG_SCRIPT, "script", tag_find_urls }, + { TAG_TABLE, "table", tag_find_urls }, + { TAG_TD, "td", tag_find_urls }, + { TAG_TH, "th", tag_find_urls } }; /* tag_url_attributes documents which attributes of which tags contain URLs to harvest. It is used by tag_find_urls. */ -/* Defines for the FLAGS field; currently only one flag is defined. */ +/* Defines for the FLAGS. */ -/* This tag points to an external document not necessary for rendering this - document (i.e. it's not an inlined image, stylesheet, etc.). */ -#define TUA_EXTERNAL 1 +/* The link is "inline", i.e. needs to be retrieved for this document + to be correctly rendered. Inline links include inlined images, + stylesheets, children frames, etc. */ +#define ATTR_INLINE 1 + +/* The link is expected to yield HTML contents. It's important not to + try to follow HTML obtained by following e.g. + regardless of content-type. Doing this causes infinite loops for + "images" that return non-404 error pages with links to the same + image. */ +#define ATTR_HTML 2 /* For tags handled by tag_find_urls: attributes that contain URLs to download. */ @@ -128,26 +135,27 @@ static struct { const char *attr_name; int flags; } tag_url_attributes[] = { - { TAG_A, "href", TUA_EXTERNAL }, - { TAG_APPLET, "code", 0 }, - { TAG_AREA, "href", TUA_EXTERNAL }, - { TAG_BGSOUND, "src", 0 }, - { TAG_BODY, "background", 0 }, - { TAG_EMBED, "href", TUA_EXTERNAL }, - { TAG_EMBED, "src", 0 }, - { TAG_FIG, "src", 0 }, - { TAG_FRAME, "src", 0 }, - { TAG_IFRAME, "src", 0 }, - { TAG_IMG, "href", 0 }, - { TAG_IMG, "lowsrc", 0 }, - { TAG_IMG, "src", 0 }, - { TAG_INPUT, "src", 0 }, - { TAG_LAYER, "src", 0 }, - { TAG_OVERLAY, "src", 0 }, - { TAG_SCRIPT, "src", 0 }, - { TAG_TABLE, "background", 0 }, - { TAG_TD, "background", 0 }, - { TAG_TH, "background", 0 } + { TAG_A, "href", ATTR_HTML }, + { TAG_APPLET, "code", ATTR_INLINE }, + { TAG_AREA, "href", ATTR_HTML }, + { TAG_BGSOUND, "src", ATTR_INLINE }, + { TAG_BODY, "background", ATTR_INLINE }, + { TAG_EMBED, "href", ATTR_HTML }, + { TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_FIG, "src", ATTR_INLINE }, + { TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_IMG, "href", ATTR_INLINE }, + { TAG_IMG, "lowsrc", ATTR_INLINE }, + { TAG_IMG, "src", ATTR_INLINE }, + { TAG_INPUT, "src", ATTR_INLINE }, + { TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_OBJECT, "data", ATTR_INLINE }, + { TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_SCRIPT, "src", ATTR_INLINE }, + { TAG_TABLE, "background", ATTR_INLINE }, + { TAG_TD, "background", ATTR_INLINE }, + { TAG_TH, "background", ATTR_INLINE } }; /* The lists of interesting tags and attributes are built dynamically, @@ -161,8 +169,8 @@ static const char *additional_attributes[] = { "action" /* used by tag_handle_form */ }; -static const char **interesting_tags; -static const char **interesting_attributes; +static struct hash_table *interesting_tags; +static struct hash_table *interesting_attributes; static void init_interesting (void) @@ -174,125 +182,55 @@ init_interesting (void) Here we also make sure that what we put in interesting_tags matches the user's preferences as specified through --ignore-tags - and --follow-tags. - - This function is as large as this only because of the glorious - expressivity of the C programming language. */ - - { - int i, ind = 0; - int size = countof (known_tags); - interesting_tags = (const char **)xmalloc ((size + 1) * sizeof (char *)); - - for (i = 0; i < size; i++) - { - const char *name = known_tags[i].name; - - /* Normally here we could say: - interesting_tags[i] = name; - But we need to respect the settings of --ignore-tags and - --follow-tags, so the code gets a bit hairier. */ - - if (opt.ignore_tags) - { - /* --ignore-tags was specified. Do not match these - specific tags. --ignore-tags takes precedence over - --follow-tags, so we process --ignore first and fall - through if there's no match. */ - int j, lose = 0; - for (j = 0; opt.ignore_tags[j] != NULL; j++) - /* Loop through all the tags this user doesn't care about. */ - if (strcasecmp(opt.ignore_tags[j], name) == EQ) - { - lose = 1; - break; - } - if (lose) - continue; - } - - if (opt.follow_tags) - { - /* --follow-tags was specified. Only match these specific tags, so - continue back to top of for if we don't match one of them. */ - int j, win = 0; - for (j = 0; opt.follow_tags[j] != NULL; j++) - /* Loop through all the tags this user cares about. */ - if (strcasecmp(opt.follow_tags[j], name) == EQ) - { - win = 1; - break; - } - if (!win) - continue; /* wasn't one of the explicitly desired tags */ - } - - /* If we get to here, --follow-tags isn't being used or the - tag is among the ones that are followed, and --ignore-tags, - if specified, didn't include this tag, so it's an - "interesting" one. */ - interesting_tags[ind++] = name; - } - interesting_tags[ind] = NULL; - } - - /* The same for attributes, except we loop through tag_url_attributes. - Here we also need to make sure that the list of attributes is - unique, and to include the attributes from additional_attributes. */ - { - int i, ind; - const char **att = xmalloc ((countof (additional_attributes) + 1) - * sizeof (char *)); - /* First copy the "additional" attributes. */ - for (i = 0; i < countof (additional_attributes); i++) - att[i] = additional_attributes[i]; - ind = i; - att[ind] = NULL; - for (i = 0; i < countof (tag_url_attributes); i++) - { - int j, seen = 0; - const char *look_for = tag_url_attributes[i].attr_name; - for (j = 0; j < ind - 1; j++) - if (!strcmp (att[j], look_for)) - { - seen = 1; - break; - } - if (!seen) - { - att = xrealloc (att, (ind + 2) * sizeof (*att)); - att[ind++] = look_for; - att[ind] = NULL; - } - } - interesting_attributes = att; - } -} + and --follow-tags. */ -static int -find_tag (const char *tag_name) -{ int i; + interesting_tags = make_nocase_string_hash_table (countof (known_tags)); - /* This is linear search; if the number of tags grow, we can switch - to binary search. */ - + /* First, add all the tags we know hot to handle, mapped to their + respective entries in known_tags. */ for (i = 0; i < countof (known_tags); i++) + hash_table_put (interesting_tags, known_tags[i].name, known_tags + i); + + /* Then remove the tags ignored through --ignore-tags. */ + if (opt.ignore_tags) { - int cmp = strcasecmp (known_tags[i].name, tag_name); - /* known_tags are sorted alphabetically, so we can - micro-optimize. */ - if (cmp > 0) - break; - else if (cmp == 0) - return i; + char **ignored; + for (ignored = opt.ignore_tags; *ignored; ignored++) + hash_table_remove (interesting_tags, *ignored); + } + + /* If --follow-tags is specified, use only those tags. */ + if (opt.follow_tags) + { + /* Create a new table intersecting --follow-tags and known_tags, + and use it as interesting_tags. */ + struct hash_table *intersect = make_nocase_string_hash_table (0); + char **followed; + for (followed = opt.follow_tags; *followed; followed++) + { + struct known_tag *t = hash_table_get (interesting_tags, *followed); + if (!t) + continue; /* ignore unknown --follow-tags entries. */ + hash_table_put (intersect, *followed, t); + } + hash_table_destroy (interesting_tags); + interesting_tags = intersect; } - return -1; + + /* Add the attributes we care about. */ + interesting_attributes = make_nocase_string_hash_table (10); + for (i = 0; i < countof (additional_attributes); i++) + hash_table_put (interesting_attributes, additional_attributes[i], "1"); + for (i = 0; i < countof (tag_url_attributes); i++) + hash_table_put (interesting_attributes, + tag_url_attributes[i].attr_name, "1"); } /* Find the value of attribute named NAME in the taginfo TAG. If the attribute is not present, return NULL. If ATTRIND is non-NULL, the index of the attribute in TAG will be stored there. */ + static char * find_attr (struct taginfo *tag, const char *name, int *attrind) { @@ -313,7 +251,7 @@ struct map_context { changed through . */ const char *parent_base; /* Base of the current document. */ const char *document_file; /* File name of this document. */ - int nofollow; /* whether NOFOLLOW was specified in a + bool nofollow; /* whether NOFOLLOW was specified in a tag. */ struct urlpos *head, *tail; /* List of URLs that is being @@ -327,8 +265,8 @@ struct map_context { size. */ static struct urlpos * -append_one_url (const char *link_uri, int inlinep, - struct taginfo *tag, int attrind, struct map_context *ctx) +append_url (const char *link_uri, + struct taginfo *tag, int attrind, struct map_context *ctx) { int link_has_scheme = url_has_scheme (link_uri); struct urlpos *newel; @@ -384,14 +322,10 @@ append_one_url (const char *link_uri, int inlinep, DEBUGP (("appending \"%s\" to urlpos.\n", url->url)); - newel = (struct urlpos *)xmalloc (sizeof (struct urlpos)); - memset (newel, 0, sizeof (*newel)); - - newel->next = NULL; + newel = xnew0 (struct urlpos); newel->url = url; newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text; newel->size = tag->attrs[attrind].value_raw_size; - newel->link_inline_p = inlinep; /* A URL is relative if the host is not named, and the name does not start with `/'. */ @@ -420,10 +354,10 @@ append_one_url (const char *link_uri, int inlinep, static void tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx) { - int i, attrind, first = -1; - int size = countof (tag_url_attributes); + int i, attrind; + int first = -1; - for (i = 0; i < size; i++) + for (i = 0; i < countof (tag_url_attributes); i++) if (tag_url_attributes[i].tagid == tagid) { /* We've found the index of tag_url_attributes where the @@ -447,18 +381,26 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx) /* Find whether TAG/ATTRIND is a combination that contains a URL. */ char *link = tag->attrs[attrind].value; + const int size = countof (tag_url_attributes); /* If you're cringing at the inefficiency of the nested loops, - remember that they both iterate over a laughably small - quantity of items. The worst-case inner loop is for the IMG - tag, which has three attributes. */ + remember that they both iterate over a very small number of + items. The worst-case inner loop is for the IMG tag, which + has three attributes. */ for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++) { if (0 == strcasecmp (tag->attrs[attrind].name, tag_url_attributes[i].attr_name)) { - int flags = tag_url_attributes[i].flags; - append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx); + struct urlpos *up = append_url (link, tag, attrind, ctx); + if (up) + { + int flags = tag_url_attributes[i].flags; + if (flags & ATTR_INLINE) + up->link_inline_p = 1; + if (flags & ATTR_HTML) + up->link_expect_html = 1; + } } } } @@ -475,7 +417,7 @@ tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx) if (!newbase) return; - base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx); + base_urlpos = append_url (newbase, tag, attrind, ctx); if (!base_urlpos) return; base_urlpos->ignore_when_downloading = 1; @@ -498,10 +440,9 @@ tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx) char *action = find_attr (tag, "action", &attrind); if (action) { - struct urlpos *action_urlpos = append_one_url (action, 0, tag, - attrind, ctx); - if (action_urlpos) - action_urlpos->ignore_when_downloading = 1; + struct urlpos *up = append_url (action, tag, attrind, ctx); + if (up) + up->ignore_when_downloading = 1; } } @@ -522,11 +463,19 @@ tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx) */ if (href) { - char *rel = find_attr (tag, "rel", NULL); - int inlinep = (rel - && (0 == strcasecmp (rel, "stylesheet") - || 0 == strcasecmp (rel, "shortcut icon"))); - append_one_url (href, inlinep, tag, attrind, ctx); + struct urlpos *up = append_url (href, tag, attrind, ctx); + if (up) + { + char *rel = find_attr (tag, "rel", NULL); + if (rel + && (0 == strcasecmp (rel, "stylesheet") + || 0 == strcasecmp (rel, "shortcut icon"))) + up->link_inline_p = 1; + else + /* The external ones usually point to HTML pages, such as + */ + up->link_expect_html = 1; + } } } @@ -575,11 +524,12 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) while (ISSPACE (*p)) ++p; - entry = append_one_url (p, 0, tag, attrind, ctx); + entry = append_url (p, tag, attrind, ctx); if (entry) { entry->link_refresh_p = 1; entry->refresh_timeout = timeout; + entry->link_expect_html = 1; } } else if (name && 0 == strcasecmp (name, "robots")) @@ -590,7 +540,7 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) if (!content) return; if (!strcasecmp (content, "none")) - ctx->nofollow = 1; + ctx->nofollow = true; else { while (*content) @@ -603,38 +553,39 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) else end = content + strlen (content); if (!strncasecmp (content, "nofollow", end - content)) - ctx->nofollow = 1; + ctx->nofollow = true; content = end; } } } } -/* Examine name and attributes of TAG and take appropriate action - according to the tag. */ +/* Dispatch the tag handler appropriate for the tag we're mapping + over. See known_tags[] for definition of tag handlers. */ static void collect_tags_mapper (struct taginfo *tag, void *arg) { struct map_context *ctx = (struct map_context *)arg; - int tagid; - tag_handler_t handler; - tagid = find_tag (tag->name); - assert (tagid != -1); - handler = known_tags[tagid].handler; + /* Find the tag in our table of tags. This must not fail because + map_html_tags only returns tags found in interesting_tags. */ + struct known_tag *t = hash_table_get (interesting_tags, tag->name); + assert (t != NULL); - handler (tagid, tag, ctx); + t->handler (t->tagid, tag, ctx); } /* Analyze HTML tags FILE and construct a list of URLs referenced from it. It merges relative links in FILE with URL. It is aware of and does the right thing. */ + struct urlpos * -get_urls_html (const char *file, const char *url, int *meta_disallow_follow) +get_urls_html (const char *file, const char *url, bool *meta_disallow_follow) { struct file_memory *fm; struct map_context ctx; + int flags; /* Load the file. */ fm = read_file (file); @@ -643,33 +594,131 @@ get_urls_html (const char *file, const char *url, int *meta_disallow_follow) logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); return NULL; } - DEBUGP (("Loaded %s (size %ld).\n", file, fm->length)); + DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); ctx.text = fm->content; ctx.head = ctx.tail = NULL; ctx.base = NULL; ctx.parent_base = url ? url : opt.base_href; ctx.document_file = file; - ctx.nofollow = 0; + ctx.nofollow = false; if (!interesting_tags) init_interesting (); - map_html_tags (fm->content, fm->length, interesting_tags, - interesting_attributes, collect_tags_mapper, &ctx); + /* Specify MHT_TRIM_VALUES because of buggy HTML generators that + generate instead of (browsers + ignore spaces as well.) If you really mean space, use &32; or + %20. MHT_TRIM_VALUES also causes squashing of embedded newlines, + e.g. in . Such newlines are also + ignored by IE and Mozilla and are presumably introduced by + writing HTML with editors that force word wrap. */ + flags = MHT_TRIM_VALUES; + if (opt.strict_comments) + flags |= MHT_STRICT_COMMENTS; + + map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags, + interesting_tags, interesting_attributes); DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow)); if (meta_disallow_follow) *meta_disallow_follow = ctx.nofollow; - FREE_MAYBE (ctx.base); + xfree_null (ctx.base); read_file_free (fm); return ctx.head; } +/* This doesn't really have anything to do with HTML, but it's similar + to get_urls_html, so we put it here. */ + +struct urlpos * +get_urls_file (const char *file) +{ + struct file_memory *fm; + struct urlpos *head, *tail; + const char *text, *text_end; + + /* Load the file. */ + fm = read_file (file); + if (!fm) + { + logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); + return NULL; + } + DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); + + head = tail = NULL; + text = fm->content; + text_end = fm->content + fm->length; + while (text < text_end) + { + int up_error_code; + char *url_text; + struct urlpos *entry; + struct url *url; + + const char *line_beg = text; + const char *line_end = memchr (text, '\n', text_end - text); + if (!line_end) + line_end = text_end; + else + ++line_end; + text = line_end; + + /* Strip whitespace from the beginning and end of line. */ + while (line_beg < line_end && ISSPACE (*line_beg)) + ++line_beg; + while (line_end > line_beg && ISSPACE (*(line_end - 1))) + --line_end; + + if (line_beg == line_end) + continue; + + /* The URL is in the [line_beg, line_end) region. */ + + /* We must copy the URL to a zero-terminated string, and we + can't use alloca because we're in a loop. *sigh*. */ + url_text = strdupdelim (line_beg, line_end); + + if (opt.base_href) + { + /* Merge opt.base_href with URL. */ + char *merged = uri_merge (opt.base_href, url_text); + xfree (url_text); + url_text = merged; + } + + url = url_parse (url_text, &up_error_code); + if (!url) + { + logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"), + file, url_text, url_error (up_error_code)); + xfree (url_text); + continue; + } + xfree (url_text); + + entry = xnew0 (struct urlpos); + entry->url = url; + + if (!head) + head = entry; + else + tail->next = entry; + tail = entry; + } + read_file_free (fm); + return head; +} + void cleanup_html_url (void) { - FREE_MAYBE (interesting_tags); - FREE_MAYBE (interesting_attributes); + /* Destroy the hash tables. The hash table keys and values are not + allocated by this code, so we don't need to free them here. */ + if (interesting_tags) + hash_table_destroy (interesting_tags); + if (interesting_attributes) + hash_table_destroy (interesting_attributes); }