X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Fconvert.c;h=f5a9cba328cfeb88bcf33f2965a8b53e19cc0044;hb=eee1589ef3d198a21635d15c9086df2b99f9013d;hp=7def7c89195ebedd88fe688def904650cfb8c58d;hpb=1c7493b83ed8cecbbf1f70ef6bf834f94c5fcd43;p=wget
diff --git a/src/convert.c b/src/convert.c
index 7def7c89..f5a9cba3 100644
--- a/src/convert.c
+++ b/src/convert.c
@@ -1,11 +1,12 @@
/* Conversion of links to local files.
- Copyright (C) 2003-2006 Free Software Foundation, Inc.
+ Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
+ Free Software Foundation, Inc.
This file is part of GNU Wget.
GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
+the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
GNU Wget is distributed in the hope that it will be useful,
@@ -14,31 +15,27 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
-along with Wget; if not, write to the Free Software Foundation, Inc.,
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+along with Wget.  If not, see <http://www.gnu.org/licenses/>.
-In addition, as a special exception, the Free Software Foundation
-gives permission to link the code of its release of Wget with the
-OpenSSL project's "OpenSSL" library (or with modified versions of it
-that use the same license as the "OpenSSL" library), and distribute
-the linked executables. You must obey the GNU General Public License
-in all respects for all of the code used other than "OpenSSL". If you
-modify this file, you may extend this exception to your version of the
-file, but you are not obligated to do so. If you do not wish to do
-so, delete this exception statement from your version. */
+Additional permission under GNU GPL version 3 section 7
-#include <config.h>
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work. */
+
+#include "wget.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#ifdef HAVE_UNISTD_H
-# include <unistd.h>
-#endif /* HAVE_UNISTD_H */
+#include <unistd.h>
#include <errno.h>
#include <assert.h>
-
-#include "wget.h"
#include "convert.h"
#include "url.h"
#include "recur.h"
@@ -46,52 +43,38 @@ so, delete this exception statement from your version. */
#include "hash.h"
#include "ptimer.h"
#include "res.h"
+#include "html-url.h"
+#include "css-url.h"
+#include "iri.h"
static struct hash_table *dl_file_url_map;
struct hash_table *dl_url_file_map;
-/* Set of HTML files downloaded in this Wget run, used for link
+/* Set of HTML/CSS files downloaded in this Wget run, used for link
conversion after Wget is done. */
struct hash_table *downloaded_html_set;
-
-static struct hash_table *nonexisting_urls_hash;
+struct hash_table *downloaded_css_set;
static void convert_links (const char *, struct urlpos *);
-/* This function is called when the retrieval is done to convert the
- links that have been downloaded. It has to be called at the end of
- the retrieval, because only then does Wget know conclusively which
- URLs have been downloaded, and which not, so it can tell which
- direction to convert to.
-
- The "direction" means that the URLs to the files that have been
- downloaded get converted to the relative URL which will point to
- that file. And the other URLs get converted to the remote URL on
- the server.
-
- All the downloaded HTMLs are kept in downloaded_html_files, and
- downloaded URLs in urls_downloaded. All the information is
- extracted from these two lists. */
-void
-convert_all_links (void)
+static void
+convert_links_in_hashtable (struct hash_table *downloaded_set,
+ int is_css,
+ int *file_count)
{
int i;
- double secs;
- int file_count = 0;
-
- struct ptimer *timer = ptimer_new ();
int cnt;
char **file_array;
cnt = 0;
- if (downloaded_html_set)
- cnt = hash_table_count (downloaded_html_set);
+ if (downloaded_set)
+ cnt = hash_table_count (downloaded_set);
if (cnt == 0)
return;
file_array = alloca_array (char *, cnt);
- string_set_to_array (downloaded_html_set, file_array);
+ string_set_to_array (downloaded_set, file_array);
for (i = 0; i < cnt; i++)
{
@@ -99,7 +82,7 @@ convert_all_links (void)
char *url;
char *file = file_array[i];
- /* Determine the URL of the HTML file. get_urls_html will need
+ /* Determine the URL of the file. get_urls_{html,css} will need
it. */
url = hash_table_get (dl_file_url_map, file);
if (!url)
@@ -110,8 +93,9 @@ convert_all_links (void)
DEBUGP (("Scanning %s (from %s)\n", file, url));
- /* Parse the HTML file... */
- urls = get_urls_html (file, url, NULL);
+ /* Parse the file... */
+ urls = is_css ? get_urls_css_file (file, url) :
+ get_urls_html (file, url, NULL, NULL);
/* We don't respect meta_disallow_follow here because, even if
the file is not followed, we might still want to convert the
@@ -120,7 +104,8 @@ convert_all_links (void)
for (cur_url = urls; cur_url; cur_url = cur_url->next)
{
char *local_name;
- struct url *u = cur_url->url;
+ struct url *u;
+ struct iri *pi;
if (cur_url->link_base_p)
{
@@ -134,6 +119,14 @@ convert_all_links (void)
/* We decide the direction of conversion according to whether
a URL was downloaded. Downloaded URLs will be converted
ABS2REL, whereas non-downloaded will be converted REL2ABS. */
+
+ pi = iri_new ();
+ set_uri_encoding (pi, opt.locale, true);
+
+ u = url_parse (cur_url->url->url, NULL, pi, true);
+ if (!u)
+ continue;
+
local_name = hash_table_get (dl_url_file_map, u->url);
/* Decide on the conversion type. */
@@ -159,30 +152,62 @@ convert_all_links (void)
cur_url->local_name = NULL;
DEBUGP (("will convert url %s to complete\n", u->url));
}
+
+ url_free (u);
+ iri_free (pi);
}
/* Convert the links in the file. */
convert_links (file, urls);
- ++file_count;
+ ++*file_count;
/* Free the data. */
free_urlpos (urls);
}
+}
+
+/* This function is called when the retrieval is done to convert the
+ links that have been downloaded. It has to be called at the end of
+ the retrieval, because only then does Wget know conclusively which
+ URLs have been downloaded, and which not, so it can tell which
+ direction to convert to.
+
+ The "direction" means that the URLs to the files that have been
+ downloaded get converted to the relative URL which will point to
+ that file. And the other URLs get converted to the remote URL on
+ the server.
+
+ All the downloaded HTMLs are kept in downloaded_html_files, and
+ downloaded URLs in urls_downloaded. All the information is
+ extracted from these two lists. */
+
+void
+convert_all_links (void)
+{
+ double secs;
+ int file_count = 0;
+
+ struct ptimer *timer = ptimer_new ();
+
+ convert_links_in_hashtable (downloaded_html_set, 0, &file_count);
+ convert_links_in_hashtable (downloaded_css_set, 1, &file_count);
secs = ptimer_measure (timer);
- ptimer_destroy (timer);
logprintf (LOG_VERBOSE, _("Converted %d files in %s seconds.\n"),
file_count, print_decimal (secs));
+
+ ptimer_destroy (timer);
}
static void write_backup_file (const char *, downloaded_file_t);
+static const char *replace_plain (const char*, int, FILE*, const char *);
static const char *replace_attr (const char *, int, FILE *, const char *);
static const char *replace_attr_refresh_hack (const char *, int, FILE *,
const char *, int);
-static char *local_quote_string (const char *);
+static char *local_quote_string (const char *, bool);
static char *construct_relative (const char *, const char *);
-/* Change the links in one HTML file. LINKS is a list of links in the
+/* Change the links in one file. LINKS is a list of links in the
document, along with their positions and the desired direction of
the conversion. */
static void
@@ -214,7 +239,7 @@ convert_links (const char *file, struct urlpos *links)
}
}
- fm = read_file (file);
+ fm = wget_read_file (file);
if (!fm)
{
logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
@@ -232,9 +257,9 @@ convert_links (const char *file, struct urlpos *links)
zeroes from the mmaped region. */
if (unlink (file) < 0 && errno != ENOENT)
{
- logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
- file, strerror (errno));
- read_file_free (fm);
+ logprintf (LOG_NOTQUIET, _("Unable to delete %s: %s\n"),
+ quote (file), strerror (errno));
+ wget_read_file_free (fm);
return;
}
/* Now open the file for writing. */
@@ -243,7 +268,7 @@ convert_links (const char *file, struct urlpos *links)
{
logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
file, strerror (errno));
- read_file_free (fm);
+ wget_read_file_free (fm);
return;
}
@@ -277,9 +302,12 @@ convert_links (const char *file, struct urlpos *links)
/* Convert absolute URL to relative. */
{
char *newname = construct_relative (file, link->local_name);
- char *quoted_newname = local_quote_string (newname);
+ char *quoted_newname = local_quote_string (newname,
+ link->link_css_p);
- if (!link->link_refresh_p)
+ if (link->link_css_p)
+ p = replace_plain (p, link->size, fp, quoted_newname);
+ else if (!link->link_refresh_p)
p = replace_attr (p, link->size, fp, quoted_newname);
else
p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
@@ -298,7 +326,9 @@ convert_links (const char *file, struct urlpos *links)
char *newlink = link->url->url;
char *quoted_newlink = html_quote_string (newlink);
- if (!link->link_refresh_p)
+ if (link->link_css_p)
+ p = replace_plain (p, link->size, fp, newlink);
+ else if (!link->link_refresh_p)
p = replace_attr (p, link->size, fp, quoted_newlink);
else
p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
@@ -324,7 +354,7 @@ convert_links (const char *file, struct urlpos *links)
if (p - fm->content < fm->length)
fwrite (p, 1, fm->length - (p - fm->content), fp);
fclose (fp);
- read_file_free (fm);
+ wget_read_file_free (fm);
logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
}
@@ -402,12 +432,14 @@ write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
/* Rather than just writing over the original .html file with the
converted version, save the former to *.orig. Note we only do
this for files we've _successfully_ downloaded, so we don't
- clobber .orig files sitting around from previous invocations. */
+ clobber .orig files sitting around from previous invocations.
+ On VMS, use "_orig" instead of ".orig". See "wget.h". */
/* Construct the backup filename as the original name plus ".orig". */
size_t filename_len = strlen (file);
char* filename_plus_orig_suffix;
+ /* TODO: hack this to work with css files */
if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
{
/* Just write "orig" over "html". We need to do it this way
@@ -424,9 +456,9 @@ write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
{
/* Append ".orig" to the name. */
- filename_plus_orig_suffix = alloca (filename_len + sizeof (".orig"));
+ filename_plus_orig_suffix = alloca (filename_len + sizeof (ORIG_SFX));
strcpy (filename_plus_orig_suffix, file);
- strcpy (filename_plus_orig_suffix + filename_len, ".orig");
+ strcpy (filename_plus_orig_suffix + filename_len, ORIG_SFX);
}
if (!converted_files)
@@ -467,6 +499,15 @@ write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
static bool find_fragment (const char *, int, const char **, const char **);
+/* Replace a string with NEW_TEXT. Ignore quoting. */
+static const char *
+replace_plain (const char *p, int size, FILE *fp, const char *new_text)
+{
+ fputs (new_text, fp);
+ p += size;
+ return p;
+}
+
/* Replace an attribute's original text with NEW_TEXT. */
static const char *
@@ -562,25 +603,25 @@ find_fragment (const char *beg, int size, const char **bp, const char **ep)
We quote ? as %3F to avoid passing part of the file name as the
parameter when browsing the converted file through HTTP. However,
- it is safe to do this only when `--html-extension' is turned on.
+ it is safe to do this only when `--adjust-extension' is turned on.
This is because converting "index.html?foo=bar" to
"index.html%3Ffoo=bar" would break local browsing, as the latter
isn't even recognized as an HTML file! However, converting
"index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
safe for both local and HTTP-served browsing.
- We always quote "#" as "%23" and "%" as "%25" because those
- characters have special meanings in URLs. */
+ We always quote "#" as "%23", "%" as "%25" and ";" as "%3B"
+ because those characters have special meanings in URLs. */
static char *
-local_quote_string (const char *file)
+local_quote_string (const char *file, bool no_html_quote)
{
const char *from;
char *newname, *to;
- char *any = strpbrk (file, "?#%");
+ char *any = strpbrk (file, "?#%;");
if (!any)
- return html_quote_string (file);
+ return no_html_quote ? strdup (file) : html_quote_string (file);
/* Allocate space assuming the worst-case scenario, each character
having to be quoted. */
@@ -598,8 +639,13 @@ local_quote_string (const char *file)
*to++ = '2';
*to++ = '3';
break;
+ case ';':
+ *to++ = '%';
+ *to++ = '3';
+ *to++ = 'B';
+ break;
case '?':
- if (opt.html_extension)
+ if (opt.adjust_extension)
{
*to++ = '%';
*to++ = '3';
@@ -612,7 +658,7 @@ local_quote_string (const char *file)
}
*to = '\0';
- return html_quote_string (newname);
+ return no_html_quote ? strdup (newname) : html_quote_string (newname);
}
/* Book-keeping code for dl_file_url_map, dl_url_file_map,
@@ -827,15 +873,24 @@ register_delete_file (const char *file)
/* Register that FILE is an HTML file that has been downloaded. */
void
-register_html (const char *url, const char *file)
+register_html (const char *file)
{
if (!downloaded_html_set)
downloaded_html_set = make_string_hash_table (0);
string_set_add (downloaded_html_set, file);
}
+/* Register that FILE is a CSS file that has been downloaded. */
+
+void
+register_css (const char *file)
+{
+ if (!downloaded_css_set)
+ downloaded_css_set = make_string_hash_table (0);
+ string_set_add (downloaded_css_set, file);
+}
+
static void downloaded_files_free (void);
-static void nonexisting_urls_free (void);
/* Cleanup the data structures associated with this file. */
@@ -857,7 +912,6 @@ convert_cleanup (void)
if (downloaded_html_set)
string_set_free (downloaded_html_set);
downloaded_files_free ();
- nonexisting_urls_free ();
if (converted_files)
string_set_free (converted_files);
}
@@ -876,7 +930,7 @@ static struct hash_table *downloaded_files_hash;
However, our hash tables only accept pointers for keys and values.
So when we need a pointer, we use the address of a
downloaded_file_t variable of static storage. */
-
+
static downloaded_file_t *
downloaded_mode_to_ptr (downloaded_file_t mode)
{
@@ -938,7 +992,7 @@ downloaded_file (downloaded_file_t mode, const char *file)
return *ptr;
ptr = downloaded_mode_to_ptr (mode);
- hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
+ hash_table_put (downloaded_files_hash, xstrdup (file), ptr);
return FILE_NOT_ALREADY_DOWNLOADED;
}
@@ -957,122 +1011,6 @@ downloaded_files_free (void)
downloaded_files_hash = NULL;
}
}
-
-/* Remembers broken links. */
-
-struct broken_urls_list
-{
- char *url;
- struct broken_urls_list *next;
-};
-
-static bool
-in_list (const struct broken_urls_list *list, const char *url)
-{
- const struct broken_urls_list *ptr;
-
- for (ptr = list; ptr; ptr = ptr->next)
- {
- /* str[case]cmp is inadequate for URL comparison */
- if (are_urls_equal (url, ptr->url) == 0) return true;
- }
-
- return false;
-}
-
-void
-nonexisting_url (const char *url, const char *referrer)
-{
- struct broken_urls_list *list;
-
- /* Ignore robots.txt URLs */
- if (is_robots_txt_url (url))
- return;
-
- if (!nonexisting_urls_hash)
- nonexisting_urls_hash = make_string_hash_table (0);
-
- list = hash_table_get (nonexisting_urls_hash, url);
- if (!list)
- {
- list = (struct broken_urls_list *) xnew0 (struct broken_urls_list);
- list->url = referrer ? xstrdup (referrer) : NULL;
- hash_table_put (nonexisting_urls_hash, xstrdup (url), list);
- }
- else if (list && !in_list (list, referrer))
- {
- /* Append referrer at the end of the list */
- struct broken_urls_list *newnode;
-
- while (list->next) list = list->next;
-
- newnode = xnew0 (struct broken_urls_list);
- newnode->url = xstrdup (referrer);
- list->next = newnode;
- }
-}
-
-static void
-nonexisting_urls_free (void)
-{
- if (nonexisting_urls_hash)
- {
- hash_table_iterator iter;
- for (hash_table_iterate (nonexisting_urls_hash, &iter);
- hash_table_iter_next (&iter);
- )
- {
- xfree (iter.key);
- xfree (iter.value);
- }
- hash_table_destroy (nonexisting_urls_hash);
- nonexisting_urls_hash = NULL;
- }
-}
-
-void
-print_broken_links (void)
-{
- hash_table_iterator iter;
- int num_elems;
-
- if (!nonexisting_urls_hash)
- {
- logprintf (LOG_NOTQUIET, _("Found no broken links.\n\n"));
- return;
- }
-
- num_elems = hash_table_count (nonexisting_urls_hash);
- assert (num_elems > 0);
-
- if (num_elems > 1)
- {
- logprintf (LOG_NOTQUIET, _("Found %d broken links.\n\n"),
- num_elems);
- }
- else
- {
- logprintf (LOG_NOTQUIET, _("Found 1 broken link.\n\n"));
- }
-
- for (hash_table_iterate (nonexisting_urls_hash, &iter);
- hash_table_iter_next (&iter);
- )
- {
- struct broken_urls_list *list;
-
- logprintf (LOG_NOTQUIET, _("%s referred by:\n"), (const char *)iter.key);
-
- for (list = (struct broken_urls_list *) iter.value;
- list;
- list = list->next)
- {
- logprintf (LOG_NOTQUIET, _(" %s\n"), list->url);
- }
- }
- logputs (LOG_NOTQUIET, "\n");
-}
-
/* The function returns the pointer to the malloc-ed quoted version of
string s. It will recognize and quote numeric and special graphic