/* Conversion of links to local files.
- Copyright (C) 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
+ Copyright (C) 2005 Free Software Foundation, Inc.
This file is part of GNU Wget.
#include <stdio.h>
#include <stdlib.h>
-#ifdef HAVE_STRING_H
-# include <string.h>
-#else
-# include <strings.h>
-#endif /* HAVE_STRING_H */
+#include <string.h>
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <errno.h>
#include <assert.h>
-#include <sys/types.h>
#include "wget.h"
#include "convert.h"
#include "recur.h"
#include "utils.h"
#include "hash.h"
+#include "ptimer.h"
static struct hash_table *dl_file_url_map;
struct hash_table *dl_url_file_map;
-/* List of HTML files downloaded in this Wget run, used for link
- conversion after Wget is done. The list and the set contain the
- same information, except the list maintains the order. Perhaps I
- should get rid of the list, it's there for historical reasons. */
-static slist *downloaded_html_list;
+/* Set of HTML files downloaded in this Wget run, used for link
+ conversion after Wget is done. */
struct hash_table *downloaded_html_set;
-static void convert_links PARAMS ((const char *, struct urlpos *));
+static void convert_links (const char *, struct urlpos *);
/* This function is called when the retrieval is done to convert the
links that have been downloaded. It has to be called at the end of
void
convert_all_links (void)
{
- slist *html;
- long msecs;
+ int i;
+ double secs;
int file_count = 0;
- struct wget_timer *timer = wtimer_new ();
+ struct ptimer *timer = ptimer_new ();
+
+ int cnt;
+ char **file_array;
- /* Destructively reverse downloaded_html_files to get it in the right order.
- recursive_retrieve() used slist_prepend() consistently. */
- downloaded_html_list = slist_nreverse (downloaded_html_list);
+ cnt = 0;
+ if (downloaded_html_set)
+ cnt = hash_table_count (downloaded_html_set);
+ if (cnt == 0)
+ return;
+ file_array = alloca_array (char *, cnt);
+ string_set_to_array (downloaded_html_set, file_array);
- for (html = downloaded_html_list; html; html = html->next)
+ for (i = 0; i < cnt; i++)
{
struct urlpos *urls, *cur_url;
char *url;
- char *file = html->string;
+ char *file = file_array[i];
/* Determine the URL of the HTML file. get_urls_html will need
it. */
free_urlpos (urls);
}
- msecs = wtimer_elapsed (timer);
- wtimer_delete (timer);
- logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
- file_count, (double)msecs / 1000);
+ secs = ptimer_measure (timer) / 1000;
+ ptimer_destroy (timer);
+ logprintf (LOG_VERBOSE, _("Converted %d files in %.*f seconds.\n"),
+ file_count, secs < 10 ? 3 : 1, secs);
}
-static void write_backup_file PARAMS ((const char *, downloaded_file_t));
-static const char *replace_attr PARAMS ((const char *, int, FILE *,
- const char *));
-static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
- const char *, int));
-static char *local_quote_string PARAMS ((const char *));
-static char *construct_relative PARAMS ((const char *, const char *));
+static void write_backup_file (const char *, downloaded_file_t);
+static const char *replace_attr (const char *, int, FILE *, const char *);
+static const char *replace_attr_refresh_hack (const char *, int, FILE *,
+ const char *, int);
+static char *local_quote_string (const char *);
+static char *construct_relative (const char *, const char *);
/* Change the links in one HTML file. LINKS is a list of links in the
document, along with their positions and the desired direction of
any URL needs to be converted in the first place. If not, just
leave the file alone. */
int dry_count = 0;
- struct urlpos *dry = links;
+ struct urlpos *dry;
for (dry = links; dry; dry = dry->next)
if (dry->convert != CO_NOCONVERT)
++dry_count;
return link;
}
+/* Used by write_backup_file to remember which files have been
+ written. */
+static struct hash_table *converted_files;
+
static void
write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
{
clobber .orig files sitting around from previous invocations. */
/* Construct the backup filename as the original name plus ".orig". */
- size_t filename_len = strlen(file);
+ size_t filename_len = strlen (file);
char* filename_plus_orig_suffix;
- int already_wrote_backup_file = 0;
- slist* converted_file_ptr;
- static slist* converted_files = NULL;
if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
{
".html", so we need to compare vs. the original URL plus
".orig", not the original URL plus ".html.orig". */
filename_plus_orig_suffix = alloca (filename_len + 1);
- strcpy(filename_plus_orig_suffix, file);
- strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
+ strcpy (filename_plus_orig_suffix, file);
+ strcpy ((filename_plus_orig_suffix + filename_len) - 4, "orig");
}
else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
{
/* Append ".orig" to the name. */
- filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
- strcpy(filename_plus_orig_suffix, file);
- strcpy(filename_plus_orig_suffix + filename_len, ".orig");
+ filename_plus_orig_suffix = alloca (filename_len + sizeof (".orig"));
+ strcpy (filename_plus_orig_suffix, file);
+ strcpy (filename_plus_orig_suffix + filename_len, ".orig");
}
+ if (!converted_files)
+ converted_files = make_string_hash_table (0);
+
/* We can get called twice on the same URL thanks to the
convert_all_links() call in main(). If we write the .orig file
each time in such a case, it'll end up containing the first-pass
conversion, not the original file. So, see if we've already been
called on this file. */
- converted_file_ptr = converted_files;
- while (converted_file_ptr != NULL)
- if (strcmp(converted_file_ptr->string, file) == 0)
- {
- already_wrote_backup_file = 1;
- break;
- }
- else
- converted_file_ptr = converted_file_ptr->next;
-
- if (!already_wrote_backup_file)
+ if (!string_set_contains (converted_files, file))
{
/* Rename <file> to <file>.orig before former gets written over. */
- if (rename(file, filename_plus_orig_suffix) != 0)
+ if (rename (file, filename_plus_orig_suffix) != 0)
logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
file, filename_plus_orig_suffix, strerror (errno));
list.
-- Hrvoje Niksic <hniksic@xemacs.org>
*/
- converted_file_ptr = xmalloc (sizeof (*converted_file_ptr));
- converted_file_ptr->string = xstrdup (file);
- converted_file_ptr->next = converted_files;
- converted_files = converted_file_ptr;
+ string_set_add (converted_files, file);
}
}
-static int find_fragment PARAMS ((const char *, int, const char **,
- const char **));
+static int find_fragment (const char *, int, const char **, const char **);
/* Replace an attribute's original text with NEW_TEXT. */
"index.html%3Ffoo=bar" would break local browsing, as the latter
isn't even recognized as an HTML file! However, converting
"index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
- safe for both local and HTTP-served browsing. */
+ safe for both local and HTTP-served browsing.
+
+ We always quote "#" as "%23" and "%" as "%25" because those
+ characters have special meanings in URLs. */
static char *
local_quote_string (const char *file)
{
- const char *file_sans_qmark;
- int qm;
+ const char *from;
+ char *newname, *to;
- if (!opt.html_extension)
+ char *any = strpbrk (file, "?#%"); /* first char that may need %-quoting, or NULL */
+ if (!any) /* nothing to percent-quote: just HTML-quote FILE */
return html_quote_string (file);
- qm = count_char (file, '?');
-
- if (qm)
- {
- const char *from = file;
- char *to, *newname;
-
- /* qm * 2 because we replace each question mark with "%3F",
- i.e. replace one char with three, hence two more. */
- int fsqlen = strlen (file) + qm * 2;
-
- to = newname = (char *)alloca (fsqlen + 1);
- for (; *from; from++)
- {
- if (*from != '?')
- *to++ = *from;
- else
- {
- *to++ = '%';
- *to++ = '3';
- *to++ = 'F';
- }
- }
- assert (to - newname == fsqlen);
- *to = '\0';
-
- file_sans_qmark = newname;
- }
- else
- file_sans_qmark = file;
+ /* Allocate space assuming the worst-case scenario, each character
+ having to be quoted.  Every quoted character below expands to
+ exactly three bytes, hence 3 * length, plus one for the NUL. */
+ to = newname = (char *)alloca (3 * strlen (file) + 1);
+ for (from = file; *from; from++)
+ switch (*from)
+ {
+ case '%': /* written as %25 */
+ *to++ = '%';
+ *to++ = '2';
+ *to++ = '5';
+ break;
+ case '#': /* written as %23 */
+ *to++ = '%';
+ *to++ = '2';
+ *to++ = '3';
+ break;
+ case '?':
+ if (opt.html_extension) /* written as %3F only when this option is set */
+ {
+ *to++ = '%';
+ *to++ = '3';
+ *to++ = 'F';
+ break;
+ }
+ /* fallthrough: without html_extension the question mark is copied as is */
+ default:
+ *to++ = *from;
+ }
+ *to = '\0';
- return html_quote_string (file_sans_qmark);
+ return html_quote_string (newname); /* NEWNAME is alloca scratch; the HTML-quoted copy is returned */
}
\f
/* Book-keeping code for dl_file_url_map, dl_url_file_map,
{
if (!downloaded_html_set)
downloaded_html_set = make_string_hash_table (0);
- else if (hash_table_contains (downloaded_html_set, file))
- return;
-
- /* The set and the list should use the same copy of FILE, but the
- slist interface insists on strduping the string it gets. Oh
- well. */
string_set_add (downloaded_html_set, file);
- downloaded_html_list = slist_prepend (downloaded_html_list, file);
}
-/* Cleanup the data structures associated with recursive retrieving
- (the variables above). */
+static void downloaded_files_free (void);
+
+/* Cleanup the data structures associated with this file. */
+
void
convert_cleanup (void)
{
}
if (downloaded_html_set)
string_set_free (downloaded_html_set);
- slist_free (downloaded_html_list);
- downloaded_html_list = NULL;
+ downloaded_files_free ();
+ if (converted_files)
+ string_set_free (converted_files);
}
\f
/* Book-keeping code for downloaded files that enables extension
return 0;
}
-void
+static void
downloaded_files_free (void)
{
if (downloaded_files_hash)
downloaded_files_hash = NULL;
}
}
+\f
+/* The function returns the pointer to the malloc-ed quoted version of
+ string s. It will recognize and quote numeric and special graphic
+ entities, as per RFC1866:
+
+ `&' -> `&amp;'
+ `<' -> `&lt;'
+ `>' -> `&gt;'
+ `"' -> `&quot;'
+ SP -> `&#32;'
+
+ No other entities are recognized or replaced. */
+char *
+html_quote_string (const char *s)
+{
+ const char *b = s; /* start of S, kept so the second pass can rewind */
+ char *p, *res;
+ int i;
+
+ /* First pass: walk the string and count the size of the quoted
+ result.  The loop header's i++ accounts for one output character
+ per input character; the additions below are the extra characters
+ each entity needs beyond that. */
+ for (i = 0; *s; s++, i++)
+ {
+ if (*s == '&')
+ i += 4; /* `amp;' */
+ else if (*s == '<' || *s == '>')
+ i += 3; /* `lt;' and `gt;' */
+ else if (*s == '\"')
+ i += 5; /* `quot;' */
+ else if (*s == ' ')
+ i += 4; /* `#32;' */
+ }
+ res = (char *)xmalloc (i + 1); /* one extra byte for the terminating NUL */
+ s = b; /* rewind to the start of the input */
+ for (p = res; *s; s++) /* second pass: write the quoted text into RES */
+ {
+ switch (*s)
+ {
+ case '&': /* emits &amp; */
+ *p++ = '&';
+ *p++ = 'a';
+ *p++ = 'm';
+ *p++ = 'p';
+ *p++ = ';';
+ break;
+ case '<': case '>': /* emits &lt; or &gt; */
+ *p++ = '&';
+ *p++ = (*s == '<' ? 'l' : 'g');
+ *p++ = 't';
+ *p++ = ';';
+ break;
+ case '\"': /* emits &quot; */
+ *p++ = '&';
+ *p++ = 'q';
+ *p++ = 'u';
+ *p++ = 'o';
+ *p++ = 't';
+ *p++ = ';';
+ break;
+ case ' ': /* emits &#32; */
+ *p++ = '&';
+ *p++ = '#';
+ *p++ = '3';
+ *p++ = '2';
+ *p++ = ';';
+ break;
+ default: /* every other character is copied unchanged */
+ *p++ = *s;
+ }
+ }
+ *p = '\0';
+ return res; /* buffer obtained from xmalloc above */
+}