X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Fhtml-url.c;h=fdafe0f1c46d19caf92f136f38f40da80fd87825;hb=0e40fc9a3c75331606d3e584e268b40a51abbcd3;hp=1c8856be406adb0180bde689e774e3a85f5a7b40;hpb=8817f4c1a4fe1e3a5726d94703b6e32f723cf59b;p=wget diff --git a/src/html-url.c b/src/html-url.c index 1c8856be..fdafe0f1 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -120,7 +120,7 @@ static struct { { TAG_AREA, "href", TUA_EXTERNAL }, { TAG_BGSOUND, "src", 0 }, { TAG_BODY, "background", 0 }, - { TAG_EMBED, "href", 0 }, + { TAG_EMBED, "href", TUA_EXTERNAL }, { TAG_EMBED, "src", 0 }, { TAG_FIG, "src", 0 }, { TAG_FRAME, "src", 0 }, @@ -328,9 +328,13 @@ append_one_url (const char *link_uri, int inlinep, if (!link_has_scheme) { - /* We have no base, and the link does not have a host - attached to it. Nothing we can do. */ - /* #### Should we print a warning here? Wget 1.5.x used to. */ + /* Base URL is unavailable, and the link does not have a + location attached to it -- we have to give up. Since + this can only happen when using `--force-html -i', print + a warning. */ + logprintf (LOG_NOTQUIET, + _("%s: Cannot resolve incomplete link %s.\n"), + ctx->document_file, link_uri); return NULL; } @@ -364,6 +368,8 @@ append_one_url (const char *link_uri, int inlinep, xfree (complete_uri); } + DEBUGP (("appending \"%s\" to urlpos.\n", url->url)); + newel = (struct urlpos *)xmalloc (sizeof (struct urlpos)); memset (newel, 0, sizeof (*newel)); @@ -394,8 +400,8 @@ append_one_url (const char *link_uri, int inlinep, /* All the tag_* functions are called from collect_tags_mapper, as specified by KNOWN_TAGS. */ -/* For most tags, all we want to do is harvest URLs from their - attributes. */ +/* Default tag handler: collect URLs from attributes specified for + this tag by tag_url_attributes. 
*/ static void tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx) @@ -407,7 +413,7 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx) if (tag_url_attributes[i].tagid == tagid) { /* We've found the index of tag_url_attributes where the - attributes of our tags begin. */ + attributes of our tag begin. */ first = i; break; } @@ -426,25 +432,26 @@ tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx) { /* Find whether TAG/ATTRIND is a combination that contains a URL. */ - char *attrvalue = tag->attrs[attrind].value; + char *link = tag->attrs[attrind].value; /* If you're cringing at the inefficiency of the nested loops, - remember that the number of attributes the inner loop - iterates over is laughably small -- three in the worst case - (IMG). */ + remember that they both iterate over a laughably small + quantity of items. The worst-case inner loop is for the IMG + tag, which has three attributes. */ for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++) { if (0 == strcasecmp (tag->attrs[attrind].name, tag_url_attributes[i].attr_name)) { int flags = tag_url_attributes[i].flags; - append_one_url (attrvalue, !(flags & TUA_EXTERNAL), - tag, attrind, ctx); + append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx); } } } } +/* Handle the BASE tag, for <base href=...>. */ + static void tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx) { @@ -468,30 +475,33 @@ tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx) ctx->base = xstrdup (newbase); } +/* Handle the LINK tag. It requires special handling because how its + links will be followed in -p mode depends on the REL attribute. */ + static void tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx) { int attrind; char *href = find_attr (tag, "href", &attrind); - /* All link references are external, - except for <link rel="stylesheet" href="...">. 
*/ + /* All link references are external, except those + known not to be, such as style sheet and shortcut icon: + + <link rel="stylesheet" href="..."> + <link rel="shortcut icon" href="..."> + */ if (href) { char *rel = find_attr (tag, "rel", NULL); - int inlinep = (rel && 0 == strcasecmp (rel, "stylesheet")); + int inlinep = (rel + && (0 == strcasecmp (rel, "stylesheet") + || 0 == strcasecmp (rel, "shortcut icon"))); append_one_url (href, inlinep, tag, attrind, ctx); } } -/* Some pages use a META tag to specify that the page be refreshed by - a new page after a given number of seconds. The general format for - this is: - - <meta http-equiv=Refresh content="NUMBER; URL=index2.html"> - - So we just need to skip past the "NUMBER; URL=" garbage to get to - the URL. */ +/* Handle the META tag. This requires special handling because of the + refresh feature and because of robot exclusion. */ static void tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) @@ -501,11 +511,23 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) if (http_equiv && 0 == strcasecmp (http_equiv, "refresh")) { - struct urlpos *entry; + /* Some pages use a META tag to specify that the page be + refreshed by a new page after a given number of seconds. The + general format for this is: + + <meta http-equiv=Refresh content="NUMBER; URL=index2.html"> + + So we just need to skip past the "NUMBER; URL=" garbage to + get to the URL. */ + struct urlpos *entry; int attrind; - char *p, *refresh = find_attr (tag, "content", &attrind); int timeout = 0; + char *p; + + char *refresh = find_attr (tag, "content", &attrind); + if (!refresh) + return; for (p = refresh; ISDIGIT (*p); p++) timeout = 10 * timeout + *p - '0';