X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Fconvert.c;h=1bf9d274d5923e819ee8ff6e4a456ffb8ec035cf;hp=cd4873ab0ed3c6ed6f4117343cc11750a6ea1120;hb=d5e283b1a75c5f8249300b465b4e7b55130bec49;hpb=60c88ee992b501590aeed111a669e99fbff7ef82

diff --git a/src/convert.c b/src/convert.c
index cd4873ab..1bf9d274 100644
--- a/src/convert.c
+++ b/src/convert.c
@@ -1,11 +1,12 @@
 /* Conversion of links to local files.
-   Copyright (C) 2003-2005 Free Software Foundation, Inc.
+   Copyright (C) 2003, 2004, 2005, 2006, 2007,
+   2008 Free Software Foundation, Inc.
 
 This file is part of GNU Wget.
 
 GNU Wget is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
+the Free Software Foundation; either version 3 of the License, or
 (at your option) any later version.
 
 GNU Wget is distributed in the hope that it will be useful,
@@ -14,20 +15,20 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License
-along with Wget; if not, write to the Free Software Foundation, Inc.,
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+along with Wget.  If not, see <http://www.gnu.org/licenses/>.
 
-In addition, as a special exception, the Free Software Foundation
-gives permission to link the code of its release of Wget with the
-OpenSSL project's "OpenSSL" library (or with modified versions of it
-that use the same license as the "OpenSSL" library), and distribute
-the linked executables.  You must obey the GNU General Public License
-in all respects for all of the code used other than "OpenSSL".  If you
-modify this file, you may extend this exception to your version of the
-file, but you are not obligated to do so.  If you do not wish to do
-so, delete this exception statement from your version.  */
+Additional permission under GNU GPL version 3 section 7
 
-#include <config.h>
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work.  */
+
+#include "wget.h"
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -37,8 +38,6 @@ so, delete this exception statement from your version.  */
 #endif /* HAVE_UNISTD_H */
 #include <errno.h>
 #include <assert.h>
-
-#include "wget.h"
 #include "convert.h"
 #include "url.h"
 #include "recur.h"
@@ -46,52 +45,37 @@ so, delete this exception statement from your version.  */
 #include "hash.h"
 #include "ptimer.h"
 #include "res.h"
+#include "html-url.h"
+#include "css-url.h"
 
 static struct hash_table *dl_file_url_map;
 struct hash_table *dl_url_file_map;
 
-/* Set of HTML files downloaded in this Wget run, used for link
+/* Set of HTML/CSS files downloaded in this Wget run, used for link
    conversion after Wget is done. */
 struct hash_table *downloaded_html_set;
+struct hash_table *downloaded_css_set;
-
-static struct hash_table *nonexisting_urls_hash;
 
 static void convert_links (const char *, struct urlpos *);
 
-/* This function is called when the retrieval is done to convert the
-   links that have been downloaded.  It has to be called at the end of
-   the retrieval, because only then does Wget know conclusively which
-   URLs have been downloaded, and which not, so it can tell which
-   direction to convert to.
-
-   The "direction" means that the URLs to the files that have been
-   downloaded get converted to the relative URL which will point to
-   that file.  And the other URLs get converted to the remote URL on
-   the server.
-
-   All the downloaded HTMLs are kept in downloaded_html_files, and
-   downloaded URLs in urls_downloaded.  All the information is
-   extracted from these two lists. */
 void
-convert_all_links (void)
+convert_links_in_hashtable (struct hash_table *downloaded_set,
+                            int is_css,
+                            int *file_count)
 {
   int i;
-  double secs;
-  int file_count = 0;
-
-  struct ptimer *timer = ptimer_new ();
 
   int cnt;
   char **file_array;
 
   cnt = 0;
-  if (downloaded_html_set)
-    cnt = hash_table_count (downloaded_html_set);
+  if (downloaded_set)
+    cnt = hash_table_count (downloaded_set);
   if (cnt == 0)
     return;
  file_array = alloca_array (char *, cnt);
-  string_set_to_array (downloaded_html_set, file_array);
+  string_set_to_array (downloaded_set, file_array);
 
   for (i = 0; i < cnt; i++)
     {
@@ -99,7 +83,7 @@ convert_all_links (void)
       char *url;
       char *file = file_array[i];
 
-      /* Determine the URL of the HTML file.  get_urls_html will need
+      /* Determine the URL of the file.  get_urls_{html,css} will need
          it.  */
       url = hash_table_get (dl_file_url_map, file);
       if (!url)
@@ -110,8 +94,9 @@ convert_all_links (void)
 
       DEBUGP (("Scanning %s (from %s)\n", file, url));
 
-      /* Parse the HTML file... */
-      urls = get_urls_html (file, url, NULL);
+      /* Parse the file... */
+      urls = is_css ? get_urls_css_file (file, url) :
+                      get_urls_html (file, url, NULL, NULL);
 
       /* We don't respect meta_disallow_follow here because, even if
          the file is not followed, we might still want to convert the
@@ -163,26 +148,55 @@ convert_all_links (void)
       /* Convert the links in the file.  */
       convert_links (file, urls);
-      ++file_count;
+      ++*file_count;
 
       /* Free the data.  */
       free_urlpos (urls);
     }
+}
+
+/* This function is called when the retrieval is done to convert the
+   links that have been downloaded.  It has to be called at the end of
+   the retrieval, because only then does Wget know conclusively which
+   URLs have been downloaded, and which not, so it can tell which
+   direction to convert to.
+
+   The "direction" means that the URLs to the files that have been
+   downloaded get converted to the relative URL which will point to
+   that file.  And the other URLs get converted to the remote URL on
+   the server.
+
+   All the downloaded HTMLs are kept in downloaded_html_files, and
+   downloaded URLs in urls_downloaded.  All the information is
+   extracted from these two lists.  */
+
+void
+convert_all_links (void)
+{
+  double secs;
+  int file_count = 0;
+
+  struct ptimer *timer = ptimer_new ();
+
+  convert_links_in_hashtable (downloaded_html_set, 0, &file_count);
+  convert_links_in_hashtable (downloaded_css_set, 1, &file_count);
 
   secs = ptimer_measure (timer);
-  ptimer_destroy (timer);
   logprintf (LOG_VERBOSE, _("Converted %d files in %s seconds.\n"),
              file_count, print_decimal (secs));
+
+  ptimer_destroy (timer);
 }
 
 static void write_backup_file (const char *, downloaded_file_t);
+static const char *replace_plain (const char*, int, FILE*, const char *);
 static const char *replace_attr (const char *, int, FILE *, const char *);
 static const char *replace_attr_refresh_hack (const char *, int, FILE *,
                                               const char *, int);
 static char *local_quote_string (const char *);
 static char *construct_relative (const char *, const char *);
 
-/* Change the links in one HTML file.  LINKS is a list of links in the
+/* Change the links in one file.  LINKS is a list of links in the
    document, along with their positions and the desired direction of
    the conversion.  */
 static void
@@ -232,8 +246,8 @@ convert_links (const char *file, struct urlpos *links)
      zeroes from the mmaped region.  */
   if (unlink (file) < 0 && errno != ENOENT)
     {
-      logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
-                 file, strerror (errno));
+      logprintf (LOG_NOTQUIET, _("Unable to delete %s: %s\n"),
+                 quote (file), strerror (errno));
       read_file_free (fm);
       return;
     }
@@ -279,7 +293,9 @@ convert_links (const char *file, struct urlpos *links)
             char *newname = construct_relative (file, link->local_name);
             char *quoted_newname = local_quote_string (newname);
 
-            if (!link->link_refresh_p)
+            if (link->link_css_p)
+              p = replace_plain (p, link->size, fp, quoted_newname);
+            else if (!link->link_refresh_p)
               p = replace_attr (p, link->size, fp, quoted_newname);
             else
               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
@@ -298,7 +314,9 @@ convert_links (const char *file, struct urlpos *links)
             char *newlink = link->url->url;
             char *quoted_newlink = html_quote_string (newlink);
 
-            if (!link->link_refresh_p)
+            if (link->link_css_p)
+              p = replace_plain (p, link->size, fp, quoted_newlink);
+            else if (!link->link_refresh_p)
               p = replace_attr (p, link->size, fp, quoted_newlink);
             else
               p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
@@ -402,12 +420,14 @@ write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
   /* Rather than just writing over the original .html file with the
      converted version, save the former to *.orig.  Note we only do
      this for files we've _successfully_ downloaded, so we don't
-     clobber .orig files sitting around from previous invocations. */
+     clobber .orig files sitting around from previous invocations.
+     On VMS, use "_orig" instead of ".orig".  See "wget.h". */
 
   /* Construct the backup filename as the original name plus ".orig". */
   size_t filename_len = strlen (file);
   char*  filename_plus_orig_suffix;
 
+  /* TODO: hack this to work with css files */
   if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
     {
       /* Just write "orig" over "html".  We need to do it this way
@@ -424,9 +444,9 @@ write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
   else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
     {
       /* Append ".orig" to the name. */
-      filename_plus_orig_suffix = alloca (filename_len + sizeof (".orig"));
+      filename_plus_orig_suffix = alloca (filename_len + sizeof (ORIG_SFX));
       strcpy (filename_plus_orig_suffix, file);
-      strcpy (filename_plus_orig_suffix + filename_len, ".orig");
+      strcpy (filename_plus_orig_suffix + filename_len, ORIG_SFX);
     }
 
   if (!converted_files)
@@ -467,6 +487,15 @@
 
 static bool find_fragment (const char *, int, const char **, const char **);
 
+/* Replace a string with NEW_TEXT.  Ignore quoting. */
+static const char *
+replace_plain (const char *p, int size, FILE *fp, const char *new_text)
+{
+  fputs (new_text, fp);
+  p += size;
+  return p;
+}
+
 /* Replace an attribute's original text with NEW_TEXT. */
 
 static const char *
@@ -834,8 +863,17 @@ register_html (const char *url, const char *file)
   string_set_add (downloaded_html_set, file);
 }
 
+/* Register that FILE is a CSS file that has been downloaded. */
+
+void
+register_css (const char *url, const char *file)
+{
+  if (!downloaded_css_set)
+    downloaded_css_set = make_string_hash_table (0);
+  string_set_add (downloaded_css_set, file);
+}
+
 static void downloaded_files_free (void);
-static void nonexisting_urls_free (void);
 
 /* Cleanup the data structures associated with this file.  */
@@ -857,7 +895,6 @@ convert_cleanup (void)
   if (downloaded_html_set)
     string_set_free (downloaded_html_set);
   downloaded_files_free ();
-  nonexisting_urls_free ();
   if (converted_files)
     string_set_free (converted_files);
 }
@@ -938,7 +975,7 @@ downloaded_file (downloaded_file_t mode, const char *file)
     return *ptr;
 
   ptr = downloaded_mode_to_ptr (mode);
-  hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
+  hash_table_put (downloaded_files_hash, xstrdup (file), ptr);
 
   return FILE_NOT_ALREADY_DOWNLOADED;
 }
@@ -957,122 +994,6 @@ downloaded_files_free (void)
       downloaded_files_hash = NULL;
     }
 }
-
-/* Remembers broken links. */
-
-struct broken_urls_list
-{
-  char *url;
-  struct broken_urls_list *next;
-};
-
-static bool
-in_list (const struct broken_urls_list *list, const char *url)
-{
-  const struct broken_urls_list *ptr;
-
-  for (ptr = list; ptr; ptr = ptr->next)
-    {
-      /* str[case]cmp is inadequate for URL comparison */
-      if (are_urls_equal (url, ptr->url) == 0) return true;
-    }
-
-  return false;
-}
-
-void
-nonexisting_url (const char *url, const char *referrer)
-{
-  struct broken_urls_list *list;
-
-  /* Ignore robots.txt URLs */
-  if (is_robots_txt_url (url))
-    return;
-
-  if (!nonexisting_urls_hash)
-    nonexisting_urls_hash = make_string_hash_table (0);
-
-  list = hash_table_get (nonexisting_urls_hash, url);
-  if (!list)
-    {
-      list = (struct broken_urls_list *) xnew0 (struct broken_urls_list);
-      list->url = referrer ? xstrdup (referrer) : NULL;
-      hash_table_put (nonexisting_urls_hash, xstrdup (url), list);
-    }
-  else if (list && !in_list (list, referrer))
-    {
-      /* Append referrer at the end of the list */
-      struct broken_urls_list *newnode;
-
-      while (list->next) list = list->next;
-
-      newnode = xnew0 (struct broken_urls_list);
-      newnode->url = xstrdup (referrer);
-      list->next = newnode;
-    }
-}
-
-static void
-nonexisting_urls_free (void)
-{
-  if (nonexisting_urls_hash)
-    {
-      hash_table_iterator iter;
-      for (hash_table_iterate (nonexisting_urls_hash, &iter);
-           hash_table_iter_next (&iter);
-           )
-        {
-          xfree (iter.key);
-          xfree (iter.value);
-        }
-      hash_table_destroy (nonexisting_urls_hash);
-      nonexisting_urls_hash = NULL;
-    }
-}
-
-void
-print_broken_links (void)
-{
-  hash_table_iterator iter;
-  int num_elems;
-
-  if (!nonexisting_urls_hash)
-    {
-      logprintf (LOG_NOTQUIET, _("Found no broken links.\n\n"));
-      return;
-    }
-
-  num_elems = hash_table_count (nonexisting_urls_hash);
-  assert (num_elems > 0);
-
-  if (num_elems > 1)
-    {
-      logprintf (LOG_NOTQUIET, _("Found %d broken links.\n\n"),
-                 num_elems);
-    }
-  else
-    {
-      logprintf (LOG_NOTQUIET, _("Found 1 broken link.\n\n"));
-    }
-
-  for (hash_table_iterate (nonexisting_urls_hash, &iter);
-       hash_table_iter_next (&iter);
-       )
-    {
-      struct broken_urls_list *list;
-
-      logprintf (LOG_NOTQUIET, _("%s referred by:\n"), (const char *)iter.key);
-
-      for (list = (struct broken_urls_list *) iter.value;
-           list;
-           list = list->next)
-        {
-          logprintf (LOG_NOTQUIET, _("  %s\n"), list->url);
-        }
-    }
-  logputs (LOG_NOTQUIET, "\n");
-}
-
 /* The function returns the pointer to the malloc-ed quoted version of
    string s.  It will recognize and quote numeric and special graphic
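
A note on the mechanics of this patch: convert_links() walks the file with a
read pointer, and for each recorded link it copies the untouched text before
the link, emits the replacement, and skips the original URL span; the new
replace_plain() added above is the CSS-specific emitter, which writes the new
text verbatim because CSS needs no attribute-quoting tricks.  The following is
a minimal, self-contained sketch of that loop, not Wget code: the "span"
struct and the rewrite() helper are hypothetical stand-ins for the bookkeeping
Wget keeps in struct urlpos (pos, size, and the chosen replacement), and the
offsets in main() are hand-counted for the example string.

/* Sketch of the convert_links()/replace_plain() rewrite loop.
   Assumes LINKS is sorted by position and non-overlapping, as the
   link list extracted from a document is.  */

#include <stdio.h>

struct span {
  int pos;               /* byte offset of the URL within the document */
  int size;              /* length of the original URL text */
  const char *new_text;  /* replacement: local name or remote URL */
};

static void
rewrite (const char *doc, const struct span *links, int nlinks, FILE *out)
{
  const char *p = doc;   /* read head, like P in convert_links() */
  int i;

  for (i = 0; i < nlinks; i++)
    {
      const char *url_start = doc + links[i].pos;
      /* Copy the untouched text up to the link...  */
      fwrite (p, 1, (size_t) (url_start - p), out);
      /* ...then emit the replacement and step over the old URL,
         which is all replace_plain() does for CSS links.  */
      fputs (links[i].new_text, out);
      p = url_start + links[i].size;
    }
  fputs (p, out);        /* copy the tail after the last link */
}

int
main (void)
{
  const char *css = "body { background: url(http://example.com/bg.png); }";
  /* One link: 25 bytes of URL starting at offset 23, inside url(...).  */
  struct span links[] = { { 23, 25, "bg.png" } };

  rewrite (css, links, 1, stdout);
  putchar ('\n');
  return 0;
}

Compiled and run, this prints "body { background: url(bg.png); }", which
mirrors what the patch does when it rewrites a downloaded stylesheet to point
at the local copy of an image; converting in the other direction is the same
loop with the absolute remote URL as the replacement text.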