X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Fhtml-url.c;h=74703ce6da932bd20782ad8dcdcede764437d242;hb=ca9319aaba2f7e5c1097e2642a6cd5f6d301e014;hp=0441b470c2891c95a86b4709330aca6cf88bf38d;hpb=b0b1c815c15e49c9172f59428810713097a65e37;p=wget diff --git a/src/html-url.c b/src/html-url.c index 0441b470..74703ce6 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -1,20 +1,20 @@ /* Collect URLs from HTML source. - Copyright (C) 1998, 2000 Free Software Foundation, Inc. + Copyright (C) 1998, 2000, 2001 Free Software Foundation, Inc. -This file is part of Wget. +This file is part of GNU Wget. -This program is free software; you can redistribute it and/or modify +GNU Wget is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. -This program is distributed in the hope that it will be useful, +GNU Wget is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software +along with Wget; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include @@ -26,7 +26,6 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ # include #endif #include -#include #include #include @@ -39,74 +38,89 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ extern int errno; #endif -enum tag_category { TC_LINK, TC_SPEC }; +struct map_context; -/* Here we try to categorize the known tags. Each tag has its ID and - cetegory. Category TC_LINK means that one or more of its - attributes contain links that should be retrieved. TC_SPEC means - that the tag is specific in some way, and has to be handled - specially. */ +typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *, + struct map_context *)); + +#define DECLARE_TAG_HANDLER(fun) \ + static void fun PARAMS ((int, struct taginfo *, struct map_context *)) + +DECLARE_TAG_HANDLER (tag_find_urls); +DECLARE_TAG_HANDLER (tag_handle_base); +DECLARE_TAG_HANDLER (tag_handle_link); +DECLARE_TAG_HANDLER (tag_handle_meta); + +/* The list of known tags and functions used for handling them. Most + tags are simply harvested for URLs. */ static struct { const char *name; - enum tag_category category; + tag_handler_t handler; } known_tags[] = { #define TAG_A 0 - { "a", TC_LINK }, + { "a", tag_find_urls }, #define TAG_APPLET 1 - { "applet", TC_LINK }, + { "applet", tag_find_urls }, #define TAG_AREA 2 - { "area", TC_LINK }, + { "area", tag_find_urls }, #define TAG_BASE 3 - { "base", TC_SPEC }, + { "base", tag_handle_base }, #define TAG_BGSOUND 4 - { "bgsound", TC_LINK }, + { "bgsound", tag_find_urls }, #define TAG_BODY 5 - { "body", TC_LINK }, + { "body", tag_find_urls }, #define TAG_EMBED 6 - { "embed", TC_LINK }, + { "embed", tag_find_urls }, #define TAG_FIG 7 - { "fig", TC_LINK }, + { "fig", tag_find_urls }, #define TAG_FRAME 8 - { "frame", TC_LINK }, + { "frame", tag_find_urls }, #define TAG_IFRAME 9 - { "iframe", TC_LINK }, + { "iframe", tag_find_urls }, #define TAG_IMG 10 - { "img", TC_LINK }, + { "img", tag_find_urls }, #define TAG_INPUT 11 - { "input", TC_LINK }, + { "input", tag_find_urls }, #define TAG_LAYER 12 - { "layer", TC_LINK }, + { "layer", tag_find_urls }, #define TAG_LINK 13 - { "link", TC_SPEC }, + { "link", tag_handle_link }, #define TAG_META 14 - { "meta", TC_SPEC }, + { "meta", tag_handle_meta }, #define TAG_OVERLAY 15 - { "overlay", TC_LINK }, + { "overlay", tag_find_urls }, #define TAG_SCRIPT 16 - { "script", TC_LINK }, + { "script", tag_find_urls }, #define TAG_TABLE 17 - { "table", TC_LINK }, + { "table", tag_find_urls }, #define TAG_TD 18 - { "td", TC_LINK }, + { "td", tag_find_urls }, #define TAG_TH 19 - { "th", TC_LINK } + { "th", tag_find_urls } }; -/* Flags for specific url-attr pairs handled through TC_LINK: */ -#define AF_EXTERNAL 1 +/* tag_url_attributes documents which attributes of which tags contain + URLs to harvest. It is used by tag_find_urls. */ -/* For tags handled by TC_LINK: attributes that contain URLs to +/* Defines for the FLAGS field; currently only one flag is defined. */ + +/* This tag points to an external document not necessary for rendering this + document (i.e. it's not an inlined image, stylesheet, etc.). */ +#define TUA_EXTERNAL 1 + +/* For tags handled by tag_find_urls: attributes that contain URLs to download. */ static struct { int tagid; const char *attr_name; int flags; -} url_tag_attr_map[] = { - { TAG_A, "href", AF_EXTERNAL }, +} tag_url_attributes[] = { + { TAG_A, "href", TUA_EXTERNAL }, { TAG_APPLET, "code", 0 }, - { TAG_AREA, "href", AF_EXTERNAL }, + { TAG_AREA, "href", TUA_EXTERNAL }, { TAG_BGSOUND, "src", 0 }, { TAG_BODY, "background", 0 }, + { TAG_EMBED, "href", TUA_EXTERNAL }, { TAG_EMBED, "src", 0 }, { TAG_FIG, "src", 0 }, { TAG_FRAME, "src", 0 }, @@ -136,7 +150,7 @@ static const char *additional_attributes[] = { static const char **interesting_tags; static const char **interesting_attributes; -void +static void init_interesting (void) { /* Init the variables interesting_tags and interesting_attributes @@ -146,7 +160,10 @@ init_interesting (void) Here we also make sure that what we put in interesting_tags matches the user's preferences as specified through --ignore-tags - and --follow-tags. */ + and --follow-tags. + + This function is as large as this only because of the glorious + expressivity of the C programming language. */ { int i, ind = 0; @@ -160,7 +177,7 @@ init_interesting (void) /* Normally here we could say: interesting_tags[i] = name; But we need to respect the settings of --ignore-tags and - --follow-tags, so the code gets a bit harier. */ + --follow-tags, so the code gets a bit hairier. */ if (opt.ignore_tags) { @@ -170,8 +187,7 @@ init_interesting (void) through if there's no match. */ int j, lose = 0; for (j = 0; opt.ignore_tags[j] != NULL; j++) - /* Loop through all the tags this user doesn't care - about. */ + /* Loop through all the tags this user doesn't care about. */ if (strcasecmp(opt.ignore_tags[j], name) == EQ) { lose = 1; @@ -183,8 +199,8 @@ init_interesting (void) if (opt.follow_tags) { - /* --follow-tags was specified. Only match these specific - tags, so return FALSE if we don't match one of them. */ + /* --follow-tags was specified. Only match these specific tags, so + continue back to top of for if we don't match one of them. */ int j, win = 0; for (j = 0; opt.follow_tags[j] != NULL; j++) /* Loop through all the tags this user cares about. */ @@ -194,12 +210,11 @@ init_interesting (void) break; } if (!win) - continue; /* wasn't one of the explicitly - desired tags */ + continue; /* wasn't one of the explicitly desired tags */ } /* If we get to here, --follow-tags isn't being used or the - tag is among the ones that are follwed, and --ignore-tags, + tag is among the ones that are followed, and --ignore-tags, if specified, didn't include this tag, so it's an "interesting" one. */ interesting_tags[ind++] = name; @@ -207,7 +222,7 @@ init_interesting (void) interesting_tags[ind] = NULL; } - /* The same for attributes, except we loop through url_tag_attr_map. + /* The same for attributes, except we loop through tag_url_attributes. Here we also need to make sure that the list of attributes is unique, and to include the attributes from additional_attributes. */ { @@ -219,10 +234,10 @@ init_interesting (void) att[i] = additional_attributes[i]; ind = i; att[ind] = NULL; - for (i = 0; i < ARRAY_SIZE (url_tag_attr_map); i++) + for (i = 0; i < ARRAY_SIZE (tag_url_attributes); i++) { int j, seen = 0; - const char *look_for = url_tag_attr_map[i].attr_name; + const char *look_for = tag_url_attributes[i].attr_name; for (j = 0; j < ind - 1; j++) if (!strcmp (att[j], look_for)) { @@ -262,279 +277,331 @@ find_tag (const char *tag_name) } /* Find the value of attribute named NAME in the taginfo TAG. If the - attribute is not present, return NULL. If ATTRID is non-NULL, the - exact identity of the attribute will be returned. */ + attribute is not present, return NULL. If ATTRIND is non-NULL, the + index of the attribute in TAG will be stored there. */ static char * -find_attr (struct taginfo *tag, const char *name, int *attrid) +find_attr (struct taginfo *tag, const char *name, int *attrind) { int i; for (i = 0; i < tag->nattrs; i++) if (!strcasecmp (tag->attrs[i].name, name)) { - if (attrid) - *attrid = i; + if (attrind) + *attrind = i; return tag->attrs[i].value; } return NULL; } -struct collect_urls_closure { +struct map_context { char *text; /* HTML text. */ char *base; /* Base URI of the document, possibly changed through . */ - urlpos *head, *tail; /* List of URLs */ const char *parent_base; /* Base of the current document. */ const char *document_file; /* File name of this document. */ - int dash_p_leaf_HTML; /* Whether -p is specified, and this - document is the "leaf" node of the - HTML tree. */ int nofollow; /* whether NOFOLLOW was specified in a tag. */ -}; -/* Resolve LINK_URI and append it to closure->tail. TAG and ATTRID - are the necessary context to store the position and size. */ - -static void -handle_link (struct collect_urls_closure *closure, const char *link_uri, - struct taginfo *tag, int attrid) -{ - int no_proto = !has_proto (link_uri); - urlpos *newel; + struct urlpos *head, *tail; /* List of URLs that is being + built. */ +}; - const char *base = closure->base ? closure->base : closure->parent_base; - char *complete_uri; +/* Append LINK_URI to the urlpos structure that is being built. - char *fragment = strrchr (link_uri, '#'); + LINK_URI will be merged with the current document base. TAG and + ATTRIND are the necessary context to store the position and + size. */ - if (fragment) - { - /* Nullify the fragment identifier, i.e. everything after the - last occurrence of `#', inclusive. This copying is - relatively inefficient, but it doesn't matter because - fragment identifiers don't come up all that often. */ - int hashlen = fragment - link_uri; - char *p = alloca (hashlen + 1); - memcpy (p, link_uri, hashlen); - p[hashlen] = '\0'; - link_uri = p; - } +static struct urlpos * +append_one_url (const char *link_uri, int inlinep, + struct taginfo *tag, int attrind, struct map_context *ctx) +{ + int link_has_scheme = url_has_scheme (link_uri); + struct urlpos *newel; + const char *base = ctx->base ? ctx->base : ctx->parent_base; + struct url *url; if (!base) { - if (no_proto) + DEBUGP (("%s: no base, merge will use \"%s\".\n", + ctx->document_file, link_uri)); + + if (!link_has_scheme) { - /* We have no base, and the link does not have a protocol or - a host attached to it. Nothing we can do. */ - /* #### Should we print a warning here? Wget 1.5.x used to. */ - return; + /* Base URL is unavailable, and the link does not have a + location attached to it -- we have to give up. Since + this can only happen when using `--force-html -i', print + a warning. */ + logprintf (LOG_NOTQUIET, + _("%s: Cannot resolve incomplete link %s.\n"), + ctx->document_file, link_uri); + return NULL; + } + + url = url_parse (link_uri, NULL); + if (!url) + { + DEBUGP (("%s: link \"%s\" doesn't parse.\n", + ctx->document_file, link_uri)); + return NULL; } - else - complete_uri = xstrdup (link_uri); } else - complete_uri = url_concat (base, link_uri); + { + /* Merge BASE with LINK_URI, but also make sure the result is + canonicalized, i.e. that "../" have been resolved. + (parse_url will do that for us.) */ + + char *complete_uri = uri_merge (base, link_uri); - DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n", - closure->document_file, base ? base : "(null)", - link_uri, complete_uri)); + DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n", + ctx->document_file, base, link_uri, complete_uri)); - newel = (urlpos *)xmalloc (sizeof (urlpos)); + url = url_parse (complete_uri, NULL); + if (!url) + { + DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", + ctx->document_file, complete_uri)); + xfree (complete_uri); + return NULL; + } + xfree (complete_uri); + } + DEBUGP (("appending \"%s\" to urlpos.\n", url->url)); + + newel = (struct urlpos *)xmalloc (sizeof (struct urlpos)); memset (newel, 0, sizeof (*newel)); + newel->next = NULL; - newel->url = complete_uri; - newel->pos = tag->attrs[attrid].value_raw_beginning - closure->text; - newel->size = tag->attrs[attrid].value_raw_size; - - /* A URL is relative if the host and protocol are not named, and the - name does not start with `/'. - #### This logic might need some rethinking. */ - if (no_proto && *link_uri != '/') - newel->flags |= (URELATIVE | UNOPROTO); - else if (no_proto) - newel->flags |= UNOPROTO; - - if (closure->tail) + newel->url = url; + newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text; + newel->size = tag->attrs[attrind].value_raw_size; + newel->link_inline_p = inlinep; + + /* A URL is relative if the host is not named, and the name does not + start with `/'. */ + if (!link_has_scheme && *link_uri != '/') + newel->link_relative_p = 1; + else if (link_has_scheme) + newel->link_complete_p = 1; + + if (ctx->tail) { - closure->tail->next = newel; - closure->tail = newel; + ctx->tail->next = newel; + ctx->tail = newel; } else - closure->tail = closure->head = newel; + ctx->tail = ctx->head = newel; + + return newel; } + +/* All the tag_* functions are called from collect_tags_mapper, as + specified by KNOWN_TAGS. */ -/* #### Document what this does. - #### It would be nice to split this into several functions. */ +/* Default tag handler: collect URLs from attributes specified for + this tag by tag_url_attributes. */ static void -collect_tags_mapper (struct taginfo *tag, void *arg) +tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx) { - struct collect_urls_closure *closure = (struct collect_urls_closure *)arg; - int tagid = find_tag (tag->name); - assert (tagid != -1); + int i, attrind, first = -1; + int size = ARRAY_SIZE (tag_url_attributes); - switch (known_tags[tagid].category) - { - case TC_LINK: + for (i = 0; i < size; i++) + if (tag_url_attributes[i].tagid == tagid) { - int i; - int size = ARRAY_SIZE (url_tag_attr_map); - for (i = 0; i < size; i++) - if (url_tag_attr_map[i].tagid == tagid) - break; - /* We've found the index of url_tag_attr_map where the - attributes of our tags begin. Now, look for every one of - them, and handle it. */ - for (; (i < size && url_tag_attr_map[i].tagid == tagid); i++) - { - char *attr_value; - int id; - if (closure->dash_p_leaf_HTML - && (url_tag_attr_map[i].flags & AF_EXTERNAL)) - /* If we're at a -p leaf node, we don't want to retrieve - links to references we know are external, such as . */ - continue; - - /* This find_attr() buried in a loop may seem inefficient - (O(n^2)), but it's not, since the number of attributes - (n) we loop over is extremely small. In the worst case - of IMG with all its possible attributes, n^2 will be - only 9. */ - attr_value = find_attr (tag, url_tag_attr_map[i].attr_name, &id); - if (attr_value) - handle_link (closure, attr_value, tag, id); - } + /* We've found the index of tag_url_attributes where the + attributes of our tag begin. */ + first = i; + break; } - break; - case TC_SPEC: - switch (tagid) + assert (first != -1); + + /* Loop over the "interesting" attributes of this tag. In this + example, it will loop over "src" and "lowsrc". + + + + This has to be done in the outer loop so that the attributes are + processed in the same order in which they appear in the page. + This is required when converting links. */ + + for (attrind = 0; attrind < tag->nattrs; attrind++) + { + /* Find whether TAG/ATTRIND is a combination that contains a + URL. */ + char *link = tag->attrs[attrind].value; + + /* If you're cringing at the inefficiency of the nested loops, + remember that they both iterate over a laughably small + quantity of items. The worst-case inner loop is for the IMG + tag, which has three attributes. */ + for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++) { - case TAG_BASE: - { - char *newbase = find_attr (tag, "href", NULL); - if (!newbase) - break; - if (closure->base) - free (closure->base); - if (closure->parent_base) - closure->base = url_concat (closure->parent_base, newbase); - else - closure->base = xstrdup (newbase); - } - break; - case TAG_LINK: - { - int id; - char *rel = find_attr (tag, "rel", NULL); - char *href = find_attr (tag, "href", &id); - if (href) - { - /* In the normal case, all tags are - fair game. - - In the special case of when -p is active, however, - and we're at a leaf node (relative to the -l - max. depth) in the HTML document tree, the only - tag we'll follow is a , as it's necessary for displaying - this document properly. We won't follow other - tags, like , for instance, - as they refer to external documents. */ - if (!closure->dash_p_leaf_HTML - || (rel && !strcasecmp (rel, "stylesheet"))) - handle_link (closure, href, tag, id); - } - } - break; - case TAG_META: - /* Some pages use a META tag to specify that the page be - refreshed by a new page after a given number of seconds. - The general format for this is: + if (0 == strcasecmp (tag->attrs[attrind].name, + tag_url_attributes[i].attr_name)) + { + int flags = tag_url_attributes[i].flags; + append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx); + } + } + } +} - +/* Handle the BASE tag, for . */ - So we just need to skip past the "NUMBER; URL=" garbage - to get to the URL. */ - { - int id; - char *name = find_attr (tag, "name", NULL); - char *http_equiv = find_attr (tag, "http-equiv", &id); - if (http_equiv && !strcasecmp (http_equiv, "refresh")) - { - char *refresh = find_attr (tag, "content", NULL); - char *p = refresh; - int offset; - while (ISDIGIT (*p)) - ++p; - if (*p++ != ';') - return; - while (ISSPACE (*p)) - ++p; - if (!(TOUPPER (*p) == 'U' - && TOUPPER (*(p + 1)) == 'R' - && TOUPPER (*(p + 2)) == 'L' - && *(p + 3) == '=')) - return; - p += 4; - while (ISSPACE (*p)) - ++p; - offset = p - refresh; - tag->attrs[id].value_raw_beginning += offset; - tag->attrs[id].value_raw_size -= offset; - handle_link (closure, p, tag, id); - } - else if (name && !strcasecmp (name, "robots")) - { - /* Handle stuff like: - */ - char *content = find_attr (tag, "content", NULL); - if (!content) - return; - if (!strcasecmp (content, "none")) - closure->nofollow = 1; - else - { - while (*content) - { - /* Find the next occurrence of ',' or the end of - the string. */ - char *end = strchr (content, ','); - if (end) - ++end; - else - end = content + strlen (content); - if (!strncasecmp (content, "nofollow", end - content)) - closure->nofollow = 1; - content = end; - } - } - } - } - break; - default: - /* Category is TC_SPEC, but tag name is unhandled. This - must not be. */ - abort (); +static void +tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx) +{ + struct urlpos *base_urlpos; + int attrind; + char *newbase = find_attr (tag, "href", &attrind); + if (!newbase) + return; + + base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx); + if (!base_urlpos) + return; + base_urlpos->ignore_when_downloading = 1; + base_urlpos->link_base_p = 1; + + if (ctx->base) + xfree (ctx->base); + if (ctx->parent_base) + ctx->base = uri_merge (ctx->parent_base, newbase); + else + ctx->base = xstrdup (newbase); +} + +/* Handle the LINK tag. It requires special handling because how its + links will be followed in -p mode depends on the REL attribute. */ + +static void +tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx) +{ + int attrind; + char *href = find_attr (tag, "href", &attrind); + + /* All link references are external, except those + known not to be, such as style sheet and shortcut icon: + + + + */ + if (href) + { + char *rel = find_attr (tag, "rel", NULL); + int inlinep = (rel + && (0 == strcasecmp (rel, "stylesheet") + || 0 == strcasecmp (rel, "shortcut icon"))); + append_one_url (href, inlinep, tag, attrind, ctx); + } +} + +/* Handle the META tag. This requires special handling because of the + refresh feature and because of robot exclusion. */ + +static void +tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) +{ + char *name = find_attr (tag, "name", NULL); + char *http_equiv = find_attr (tag, "http-equiv", NULL); + + if (http_equiv && 0 == strcasecmp (http_equiv, "refresh")) + { + /* Some pages use a META tag to specify that the page be + refreshed by a new page after a given number of seconds. The + general format for this is: + + + + So we just need to skip past the "NUMBER; URL=" garbage to + get to the URL. */ + + struct urlpos *entry; + + int attrind; + char *p, *refresh = find_attr (tag, "content", &attrind); + int timeout = 0; + + for (p = refresh; ISDIGIT (*p); p++) + timeout = 10 * timeout + *p - '0'; + if (*p++ != ';') + return; + + while (ISSPACE (*p)) + ++p; + if (!( TOUPPER (*p) == 'U' + && TOUPPER (*(p + 1)) == 'R' + && TOUPPER (*(p + 2)) == 'L' + && *(p + 3) == '=')) + return; + p += 4; + while (ISSPACE (*p)) + ++p; + + entry = append_one_url (p, 0, tag, attrind, ctx); + if (entry) + { + entry->link_refresh_p = 1; + entry->refresh_timeout = timeout; + } + } + else if (name && 0 == strcasecmp (name, "robots")) + { + /* Handle stuff like: + */ + char *content = find_attr (tag, "content", NULL); + if (!content) + return; + if (!strcasecmp (content, "none")) + ctx->nofollow = 1; + else + { + while (*content) + { + /* Find the next occurrence of ',' or the end of + the string. */ + char *end = strchr (content, ','); + if (end) + ++end; + else + end = content + strlen (content); + if (!strncasecmp (content, "nofollow", end - content)) + ctx->nofollow = 1; + content = end; + } } - break; } } -/* Scan FILE, retrieving links to HTML documents from it. Each link is +/* Examine name and attributes of TAG and take appropriate action + according to the tag. */ - Similar to get_urls_file, but for HTML files. FILE is scanned as - an HTML document. get_urls_html() constructs the URLs from the - relative href-s. +static void +collect_tags_mapper (struct taginfo *tag, void *arg) +{ + struct map_context *ctx = (struct map_context *)arg; + int tagid; + tag_handler_t handler; + + tagid = find_tag (tag->name); + assert (tagid != -1); + handler = known_tags[tagid].handler; - If SILENT is non-zero, do not barf on baseless relative links. */ -urlpos * -get_urls_html (const char *file, const char *this_url, int dash_p_leaf_HTML, - int *meta_disallow_follow) + handler (tagid, tag, ctx); +} + +/* Analyze HTML tags FILE and construct a list of URLs referenced from + it. It merges relative links in FILE with URL. It is aware of + and does the right thing. */ +struct urlpos * +get_urls_html (const char *file, const char *url, int *meta_disallow_follow) { struct file_memory *fm; - struct collect_urls_closure closure; + struct map_context ctx; /* Load the file. */ fm = read_file (file); @@ -545,25 +612,31 @@ get_urls_html (const char *file, const char *this_url, int dash_p_leaf_HTML, } DEBUGP (("Loaded %s (size %ld).\n", file, fm->length)); - closure.text = fm->content; - closure.head = closure.tail = NULL; - closure.base = NULL; - closure.parent_base = this_url ? this_url : opt.base_href; - closure.document_file = file; - closure.dash_p_leaf_HTML = dash_p_leaf_HTML; - closure.nofollow = 0; + ctx.text = fm->content; + ctx.head = ctx.tail = NULL; + ctx.base = NULL; + ctx.parent_base = url ? url : opt.base_href; + ctx.document_file = file; + ctx.nofollow = 0; if (!interesting_tags) init_interesting (); map_html_tags (fm->content, fm->length, interesting_tags, - interesting_attributes, collect_tags_mapper, &closure); + interesting_attributes, collect_tags_mapper, &ctx); - DEBUGP (("no-follow in %s: %d\n", file, closure.nofollow)); + DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow)); if (meta_disallow_follow) - *meta_disallow_follow = closure.nofollow; + *meta_disallow_follow = ctx.nofollow; - FREE_MAYBE (closure.base); + FREE_MAYBE (ctx.base); read_file_free (fm); - return closure.head; + return ctx.head; +} + +void +cleanup_html_url (void) +{ + FREE_MAYBE (interesting_tags); + FREE_MAYBE (interesting_attributes); }