/* Conversion of links to local files.
- Copyright (C) 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
+ Copyright (C) 2005 Free Software Foundation, Inc.
This file is part of GNU Wget.
#include <stdio.h>
#include <stdlib.h>
-#ifdef HAVE_STRING_H
-# include <string.h>
-#else
-# include <strings.h>
-#endif /* HAVE_STRING_H */
+#include <string.h>
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <errno.h>
#include <assert.h>
-#include <sys/types.h>
#include "wget.h"
#include "convert.h"
#include "recur.h"
#include "utils.h"
#include "hash.h"
+#include "ptimer.h"
static struct hash_table *dl_file_url_map;
struct hash_table *dl_url_file_map;
-/* List of HTML files downloaded in this Wget run, used for link
- conversion after Wget is done. The list and the set contain the
- same information, except the list maintains the order. Perhaps I
- should get rid of the list, it's there for historical reasons. */
-static slist *downloaded_html_list;
+/* Set of HTML files downloaded in this Wget run, used for link
+ conversion after Wget is done. */
struct hash_table *downloaded_html_set;
-static void convert_links PARAMS ((const char *, struct urlpos *));
+static void convert_links (const char *, struct urlpos *);
/* This function is called when the retrieval is done to convert the
   links that have been downloaded.  It has to be called at the end of
void
convert_all_links (void)
{
-  slist *html;
-  long msecs;
+  int i;
+  double secs;
   int file_count = 0;
-  struct wget_timer *timer = wtimer_new ();
+  struct ptimer *timer = ptimer_new ();
-  /* Destructively reverse downloaded_html_files to get it in the right order.
-     recursive_retrieve() used slist_prepend() consistently. */
-  downloaded_html_list = slist_nreverse (downloaded_html_list);
+  int cnt;
+  char **file_array;
-  for (html = downloaded_html_list; html; html = html->next)
+  /* Snapshot the set of downloaded HTML files into a flat array so it
+     can be walked with a plain index loop.  NOTE(review): alloca_array
+     places the snapshot on the stack -- presumably the number of HTML
+     files per run is modest; confirm before relying on huge runs.  */
+  cnt = 0;
+  if (downloaded_html_set)
+    cnt = hash_table_count (downloaded_html_set);
+  if (cnt == 0)
+    return;
+  file_array = alloca_array (char *, cnt);
+  string_set_to_array (downloaded_html_set, file_array);
+
+  for (i = 0; i < cnt; i++)
    {
      struct urlpos *urls, *cur_url;
      char *url;
-      char *file = html->string;
+      char *file = file_array[i];
      /* Determine the URL of the HTML file.  get_urls_html will need
	 it.  */
      free_urlpos (urls);
    }
-  wtimer_update (timer);
-  msecs = wtimer_read (timer);
-  wtimer_delete (timer);
-  logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
-	     file_count, (double)msecs / 1000);
+  /* NOTE(review): the /1000 assumes ptimer_measure reports
+     milliseconds -- confirm against ptimer.h.  print_decimal is used
+     so the float is formatted without locale surprises.  */
+  secs = ptimer_measure (timer) / 1000;
+  ptimer_destroy (timer);
+  logprintf (LOG_VERBOSE, _("Converted %d files in %s seconds.\n"),
+	     file_count, print_decimal (secs));
}
-static void write_backup_file PARAMS ((const char *, downloaded_file_t));
-static const char *replace_attr PARAMS ((const char *, int, FILE *,
- const char *));
-static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
- const char *, int));
-static char *local_quote_string PARAMS ((const char *));
-static char *construct_relative PARAMS ((const char *, const char *));
+static void write_backup_file (const char *, downloaded_file_t);
+static const char *replace_attr (const char *, int, FILE *, const char *);
+static const char *replace_attr_refresh_hack (const char *, int, FILE *,
+ const char *, int);
+static char *local_quote_string (const char *);
+static char *construct_relative (const char *, const char *);
/* Change the links in one HTML file. LINKS is a list of links in the
document, along with their positions and the desired direction of
}
/* Construct LINK as explained above. */
- link = (char *)xmalloc (3 * basedirs + strlen (linkfile) + 1);
+ link = xmalloc (3 * basedirs + strlen (linkfile) + 1);
for (i = 0; i < basedirs; i++)
memcpy (link + 3 * i, "../", 3);
strcpy (link + 3 * i, linkfile);
return link;
}
+/* Used by write_backup_file to remember which files have been
+ written. */
+static struct hash_table *converted_files;
+
static void
write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
{
   clobber .orig files sitting around from previous invocations. */
  /* Construct the backup filename as the original name plus ".orig". */
-  size_t filename_len = strlen(file);
+  size_t filename_len = strlen (file);
  char* filename_plus_orig_suffix;
-  int already_wrote_backup_file = 0;
-  slist* converted_file_ptr;
-  static slist* converted_files = NULL;
  if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
    {
         ".html", so we need to compare vs. the original URL plus
         ".orig", not the original URL plus ".html.orig". */
      filename_plus_orig_suffix = alloca (filename_len + 1);
-      strcpy(filename_plus_orig_suffix, file);
-      strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
+      /* Overwrite the trailing "html" with "orig" in place; this
+         relies on the name ending in ".html" (see comment above).  */
+      strcpy (filename_plus_orig_suffix, file);
+      strcpy ((filename_plus_orig_suffix + filename_len) - 4, "orig");
    }
  else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
    {
      /* Append ".orig" to the name. */
-      filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
-      strcpy(filename_plus_orig_suffix, file);
-      strcpy(filename_plus_orig_suffix + filename_len, ".orig");
+      filename_plus_orig_suffix = alloca (filename_len + sizeof (".orig"));
+      strcpy (filename_plus_orig_suffix, file);
+      strcpy (filename_plus_orig_suffix + filename_len, ".orig");
    }
+  /* Lazily create the file-level set of files already backed up.  */
+  if (!converted_files)
+    converted_files = make_string_hash_table (0);
+
  /* We can get called twice on the same URL thanks to the
     convert_all_links() call in main().  If we write the .orig file
     each time in such a case, it'll end up containing the first-pass
     conversion, not the original file.  So, see if we've already been
     called on this file. */
-  converted_file_ptr = converted_files;
-  while (converted_file_ptr != NULL)
-    if (strcmp(converted_file_ptr->string, file) == 0)
-      {
-        already_wrote_backup_file = 1;
-        break;
-      }
-    else
-      converted_file_ptr = converted_file_ptr->next;
-
-  if (!already_wrote_backup_file)
+  if (!string_set_contains (converted_files, file))
    {
      /* Rename <file> to <file>.orig before former gets written over. */
-      if (rename(file, filename_plus_orig_suffix) != 0)
+      if (rename (file, filename_plus_orig_suffix) != 0)
        logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
                   file, filename_plus_orig_suffix, strerror (errno));
         list.
         -- Hrvoje Niksic <hniksic@xemacs.org>
      */
-      converted_file_ptr = xmalloc (sizeof (*converted_file_ptr));
-      converted_file_ptr->string = xstrdup (file);
-      converted_file_ptr->next = converted_files;
-      converted_files = converted_file_ptr;
+      /* NOTE(review): presumably string_set_add copies FILE (matching
+         the xstrdup in the slist code it replaces) -- confirm in
+         hash.c before letting FILE's storage die.  */
+      string_set_add (converted_files, file);
    }
}
-static int find_fragment PARAMS ((const char *, int, const char **,
- const char **));
+static bool find_fragment (const char *, int, const char **, const char **);
/* Replace an attribute's original text with NEW_TEXT. */
static const char *
replace_attr (const char *p, int size, FILE *fp, const char *new_text)
{
- int quote_flag = 0;
+ bool quote_flag = false;
char quote_char = '\"'; /* use "..." for quoting, unless the
original value is quoted, in which
case reuse its quoting char. */
if (*p == '\"' || *p == '\'')
{
quote_char = *p;
- quote_flag = 1;
+ quote_flag = true;
++p;
size -= 2; /* disregard opening and closing quote */
}
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
preceded by '&'. If the character is not found, return zero. If
- the character is found, return 1 and set BP and EP to point to the
- beginning and end of the region.
+ the character is found, return true and set BP and EP to point to
+ the beginning and end of the region.
This is used for finding the fragment indentifiers in URLs. */
-static int
+static bool
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *end = beg + size;
-  int saw_amp = 0;
+  /* saw_amp records whether the previous character was '&', so that
+     the '#' inside a numeric SGML entity such as "&#32;" is not
+     mistaken for the start of a fragment identifier.  */
+  bool saw_amp = false;
  for (; beg < end; beg++)
    {
      switch (*beg)
        {
        case '&':
-          saw_amp = 1;
+          saw_amp = true;
          break;
        case '#':
          if (!saw_amp)
            {
              *bp = beg;
              *ep = end;
-              return 1;
+              return true;
            }
          /* fallthrough */
        default:
-          saw_amp = 0;
+          saw_amp = false;
        }
    }
-  return 0;
+  return false;
}
/* Quote FILE for use as local reference to an HTML file.
dl_url_file_map = make_string_hash_table (0); \
} while (0)
-/* Return 1 if S1 and S2 are the same, except for "/index.html". The
- three cases in which it returns one are (substitute any substring
- for "foo"):
+/* Return true if S1 and S2 are the same, except for "/index.html".
+ The three cases in which it returns one are (substitute any
+ substring for "foo"):
m("foo/index.html", "foo/") ==> 1
m("foo/", "foo/index.html") ==> 1
m("foo", "foo/" ==> 1
m("foo", "foo") ==> 1 */
-static int
+static bool
match_except_index (const char *s1, const char *s2)
{
int i;
/* Strings differ at the very beginning -- bail out. We need to
check this explicitly to avoid `lng - 1' reading outside the
array. */
- return 0;
+ return false;
if (!*s1 && !*s2)
/* Both strings hit EOF -- strings are equal. */
- return 1;
+ return true;
else if (*s1 && *s2)
/* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux". */
- return 0;
+ return false;
else if (*s1)
/* S1 is the longer one. */
lng = s1;
if (*lng == '/' && *(lng + 1) == '\0')
/* foo */
/* foo/ */
- return 1;
+ return true;
return 0 == strcmp (lng, "/index.html");
}
{
if (!downloaded_html_set)
downloaded_html_set = make_string_hash_table (0);
- else if (hash_table_contains (downloaded_html_set, file))
- return;
-
- /* The set and the list should use the same copy of FILE, but the
- slist interface insists on strduping the string it gets. Oh
- well. */
string_set_add (downloaded_html_set, file);
- downloaded_html_list = slist_prepend (downloaded_html_list, file);
}
-/* Cleanup the data structures associated with recursive retrieving
- (the variables above). */
+static void downloaded_files_free (void);
+
+/* Cleanup the data structures associated with this file. */
+
void
convert_cleanup (void)
{
}
if (downloaded_html_set)
string_set_free (downloaded_html_set);
- slist_free (downloaded_html_list);
- downloaded_html_list = NULL;
+ downloaded_files_free ();
+ if (converted_files)
+ string_set_free (converted_files);
}
\f
/* Book-keeping code for downloaded files that enables extension
return 0;
}
-void
+static void
downloaded_files_free (void)
{
if (downloaded_files_hash)
downloaded_files_hash = NULL;
}
}
+\f
+/* The function returns the pointer to the malloc-ed quoted version of
+   string s.  It will recognize and quote numeric and special graphic
+   entities, as per RFC1866:
+
+   `&' -> `&amp;'
+   `<' -> `&lt;'
+   `>' -> `&gt;'
+   `"' -> `&quot;'
+   SP -> `&#32;'
+
+   No other entities are recognized or replaced.  The result is
+   malloc-ed; the caller is responsible for freeing it.  */
+char *
+html_quote_string (const char *s)
+{
+  const char *b = s;
+  char *p, *res;
+  int i;
+
+  /* First pass: count the output size.  Each increment below is the
+     number of *extra* characters the entity needs beyond the single
+     character it replaces.  */
+  for (i = 0; *s; s++, i++)
+    {
+      if (*s == '&')
+        i += 4;                /* `amp;' */
+      else if (*s == '<' || *s == '>')
+        i += 3;                /* `lt;' and `gt;' */
+      else if (*s == '\"')
+        i += 5;                /* `quot;' */
+      else if (*s == ' ')
+        i += 4;                /* #32; */
+    }
+  res = xmalloc (i + 1);
+  s = b;
+  /* Second pass: emit the quoted text into the sized buffer.  */
+  for (p = res; *s; s++)
+    {
+      switch (*s)
+        {
+        case '&':
+          *p++ = '&';
+          *p++ = 'a';
+          *p++ = 'm';
+          *p++ = 'p';
+          *p++ = ';';
+          break;
+        case '<': case '>':
+          *p++ = '&';
+          *p++ = (*s == '<' ? 'l' : 'g');
+          *p++ = 't';
+          *p++ = ';';
+          break;
+        case '\"':
+          *p++ = '&';
+          *p++ = 'q';
+          *p++ = 'u';
+          *p++ = 'o';
+          *p++ = 't';
+          *p++ = ';';
+          break;
+        case ' ':
+          *p++ = '&';
+          *p++ = '#';
+          *p++ = '3';
+          *p++ = '2';
+          *p++ = ';';
+          break;
+        default:
+          *p++ = *s;
+        }
+    }
+  *p = '\0';
+  return res;
+}