X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Fhtml-url.c;h=523f5e0dab1aa82f1431f7752be20742c356f4ec;hb=e095cc064eb72ca0cee6d41622e39e7ea3f211a6;hp=6e8860834338425ac3e9baa878b3d12300623a70;hpb=ccd62071dcbdfc0269813746b9f51ff9c23261db;p=wget diff --git a/src/html-url.c b/src/html-url.c index 6e886083..523f5e0d 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -1,6 +1,6 @@ /* Collect URLs from HTML source. Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, - 2007, 2008 Free Software Foundation, Inc. + 2007, 2008, 2009, 2010 Free Software Foundation, Inc. This file is part of GNU Wget. @@ -288,7 +288,7 @@ append_url (const char *link_uri, int position, int size, return NULL; } - url = url_parse (link_uri, NULL, NULL); + url = url_parse (link_uri, NULL, NULL, false); if (!url) { DEBUGP (("%s: link \"%s\" doesn't parse.\n", @@ -304,10 +304,13 @@ append_url (const char *link_uri, int position, int size, char *complete_uri = uri_merge (base, link_uri); - DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n", - ctx->document_file, base, link_uri, complete_uri)); + DEBUGP (("%s: merge(%s, %s) -> %s\n", + quotearg_n_style (0, escape_quoting_style, ctx->document_file), + quote_n (1, base), + quote_n (2, link_uri), + quotearg_n_style (3, escape_quoting_style, complete_uri))); - url = url_parse (complete_uri, NULL, NULL); + url = url_parse (complete_uri, NULL, NULL, false); if (!url) { DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", @@ -318,7 +321,7 @@ append_url (const char *link_uri, int position, int size, xfree (complete_uri); } - DEBUGP (("appending \"%s\" to urlpos.\n", url->url)); + DEBUGP (("appending %s to urlpos.\n", quote (url->url))); newel = xnew0 (struct urlpos); newel->url = url; @@ -332,13 +335,27 @@ append_url (const char *link_uri, int position, int size, else if (link_has_scheme) newel->link_complete_p = 1; - if (ctx->tail) + /* Append the new URL maintaining the order by position. */ + if (ctx->head == NULL) + ctx->head = newel; + else { - ctx->tail->next = newel; - ctx->tail = newel; + struct urlpos *it, *prev = NULL; + + it = ctx->head; + while (it && position > it->pos) + { + prev = it; + it = it->next; + } + + newel->next = it; + + if (prev) + prev->next = newel; + else + ctx->head = newel; } - else - ctx->tail = ctx->head = newel; return newel; } @@ -571,7 +588,6 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) if (!mcharset) return; - /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/ xfree_null (meta_charset); meta_charset = mcharset; } @@ -588,15 +604,25 @@ tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) { while (*content) { - /* Find the next occurrence of ',' or the end of - the string. */ - char *end = strchr (content, ','); - if (end) - ++end; - else - end = content + strlen (content); + char *end; + /* Skip any initial whitespace. */ + content += strspn (content, " \f\n\r\t\v"); + /* Find the next occurrence of ',' or whitespace, + * or the end of the string. */ + end = content + strcspn (content, ", \f\n\r\t\v"); if (!strncasecmp (content, "nofollow", end - content)) ctx->nofollow = true; + /* Skip past the next comma, if any. */ + if (*end == ',') + ++end; + else + { + end = strchr (end, ','); + if (end) + ++end; + else + end = content + strlen (content); + } content = end; } } @@ -613,7 +639,7 @@ collect_tags_mapper (struct taginfo *tag, void *arg) /* Find the tag in our table of tags. This must not fail because map_html_tags only returns tags found in interesting_tags. - + I've changed this for now, I'm passing NULL as interesting_tags to map_html_tags. This way we can check all tags for a style attribute. @@ -647,7 +673,7 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow, int flags; /* Load the file. */ - fm = read_file (file); + fm = wget_read_file (file); if (!fm) { logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); @@ -656,7 +682,7 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow, DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); ctx.text = fm->content; - ctx.head = ctx.tail = NULL; + ctx.head = NULL; ctx.base = NULL; ctx.parent_base = url ? url : opt.base_href; ctx.document_file = file; @@ -689,7 +715,7 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow, *meta_disallow_follow = ctx.nofollow; xfree_null (ctx.base); - read_file_free (fm); + wget_read_file_free (fm); return ctx.head; } @@ -704,7 +730,7 @@ get_urls_file (const char *file) const char *text, *text_end; /* Load the file. */ - fm = read_file (file); + fm = wget_read_file (file); if (!fm) { logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); @@ -753,12 +779,14 @@ get_urls_file (const char *file) url_text = merged; } - url = url_parse (url_text, &up_error_code, NULL); + url = url_parse (url_text, &up_error_code, NULL, false); if (!url) { + char *error = url_error (url_text, up_error_code); logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"), - file, url_text, url_error (up_error_code)); + file, url_text, error); xfree (url_text); + xfree (error); continue; } xfree (url_text); @@ -772,7 +800,7 @@ get_urls_file (const char *file) tail->next = entry; tail = entry; } - read_file_free (fm); + wget_read_file_free (fm); return head; }