/* Conversion of links to local files.
- Copyright (C) 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
+ Copyright (C) 2003-2005 Free Software Foundation, Inc.
This file is part of GNU Wget.
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
-along with Wget; if not, write to the Free Software
-Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+along with Wget; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
In addition, as a special exception, the Free Software Foundation
gives permission to link the code of its release of Wget with the
#include <stdio.h>
#include <stdlib.h>
-#ifdef HAVE_STRING_H
-# include <string.h>
-#else
-# include <strings.h>
-#endif /* HAVE_STRING_H */
+#include <string.h>
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <errno.h>
#include <assert.h>
-#include <sys/types.h>
#include "wget.h"
#include "convert.h"
#include "recur.h"
#include "utils.h"
#include "hash.h"
+#include "ptimer.h"
static struct hash_table *dl_file_url_map;
struct hash_table *dl_url_file_map;
-/* List of HTML files downloaded in this Wget run, used for link
- conversion after Wget is done. The list and the set contain the
- same information, except the list maintains the order. Perhaps I
- should get rid of the list, it's there for historical reasons. */
-static slist *downloaded_html_list;
+/* Set of HTML files downloaded in this Wget run, used for link
+ conversion after Wget is done. */
struct hash_table *downloaded_html_set;
-static void convert_links PARAMS ((const char *, struct urlpos *));
+static void convert_links (const char *, struct urlpos *);
/* This function is called when the retrieval is done to convert the
links that have been downloaded. It has to be called at the end of
void
convert_all_links (void)
{
- slist *html;
- long msecs;
+ int i;
+ double secs;
int file_count = 0;
- struct wget_timer *timer = wtimer_new ();
+ struct ptimer *timer = ptimer_new ();
- /* Destructively reverse downloaded_html_files to get it in the right order.
- recursive_retrieve() used slist_prepend() consistently. */
- downloaded_html_list = slist_nreverse (downloaded_html_list);
+ int cnt;
+ char **file_array;
- for (html = downloaded_html_list; html; html = html->next)
+ cnt = 0;
+ if (downloaded_html_set)
+ cnt = hash_table_count (downloaded_html_set);
+ if (cnt == 0)
+ return;
+ file_array = alloca_array (char *, cnt);
+ string_set_to_array (downloaded_html_set, file_array);
+
+ for (i = 0; i < cnt; i++)
{
struct urlpos *urls, *cur_url;
char *url;
- char *file = html->string;
+ char *file = file_array[i];
/* Determine the URL of the HTML file. get_urls_html will need
it. */
free_urlpos (urls);
}
- wtimer_update (timer);
- msecs = wtimer_read (timer);
- wtimer_delete (timer);
- logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
- file_count, (double)msecs / 1000);
+ secs = ptimer_measure (timer);
+ ptimer_destroy (timer);
+ logprintf (LOG_VERBOSE, _("Converted %d files in %s seconds.\n"),
+ file_count, print_decimal (secs));
}
-static void write_backup_file PARAMS ((const char *, downloaded_file_t));
-static const char *replace_attr PARAMS ((const char *, int, FILE *,
- const char *));
-static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
- const char *, int));
-static char *local_quote_string PARAMS ((const char *));
-static char *construct_relative PARAMS ((const char *, const char *));
+static void write_backup_file (const char *, downloaded_file_t);
+static const char *replace_attr (const char *, int, FILE *, const char *);
+static const char *replace_attr_refresh_hack (const char *, int, FILE *,
+ const char *, int);
+static char *local_quote_string (const char *);
+static char *construct_relative (const char *, const char *);
/* Change the links in one HTML file. LINKS is a list of links in the
document, along with their positions and the desired direction of
any URL needs to be converted in the first place. If not, just
leave the file alone. */
int dry_count = 0;
- struct urlpos *dry = links;
+ struct urlpos *dry;
for (dry = links; dry; dry = dry->next)
if (dry->convert != CO_NOCONVERT)
++dry_count;
}
/* Construct LINK as explained above. */
- link = (char *)xmalloc (3 * basedirs + strlen (linkfile) + 1);
+ link = xmalloc (3 * basedirs + strlen (linkfile) + 1);
for (i = 0; i < basedirs; i++)
memcpy (link + 3 * i, "../", 3);
strcpy (link + 3 * i, linkfile);
return link;
}
+/* Used by write_backup_file to remember which files have been
+ written. */
+static struct hash_table *converted_files;
+
static void
write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
{
clobber .orig files sitting around from previous invocations. */
/* Construct the backup filename as the original name plus ".orig". */
- size_t filename_len = strlen(file);
+ size_t filename_len = strlen (file);
char* filename_plus_orig_suffix;
- int already_wrote_backup_file = 0;
- slist* converted_file_ptr;
- static slist* converted_files = NULL;
if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
{
".html", so we need to compare vs. the original URL plus
".orig", not the original URL plus ".html.orig". */
filename_plus_orig_suffix = alloca (filename_len + 1);
- strcpy(filename_plus_orig_suffix, file);
- strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
+ strcpy (filename_plus_orig_suffix, file);
+ strcpy ((filename_plus_orig_suffix + filename_len) - 4, "orig");
}
else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
{
/* Append ".orig" to the name. */
- filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
- strcpy(filename_plus_orig_suffix, file);
- strcpy(filename_plus_orig_suffix + filename_len, ".orig");
+ filename_plus_orig_suffix = alloca (filename_len + sizeof (".orig"));
+ strcpy (filename_plus_orig_suffix, file);
+ strcpy (filename_plus_orig_suffix + filename_len, ".orig");
}
+ if (!converted_files)
+ converted_files = make_string_hash_table (0);
+
/* We can get called twice on the same URL thanks to the
convert_all_links() call in main(). If we write the .orig file
each time in such a case, it'll end up containing the first-pass
conversion, not the original file. So, see if we've already been
called on this file. */
- converted_file_ptr = converted_files;
- while (converted_file_ptr != NULL)
- if (strcmp(converted_file_ptr->string, file) == 0)
- {
- already_wrote_backup_file = 1;
- break;
- }
- else
- converted_file_ptr = converted_file_ptr->next;
-
- if (!already_wrote_backup_file)
+ if (!string_set_contains (converted_files, file))
{
/* Rename <file> to <file>.orig before former gets written over. */
- if (rename(file, filename_plus_orig_suffix) != 0)
+ if (rename (file, filename_plus_orig_suffix) != 0)
logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
file, filename_plus_orig_suffix, strerror (errno));
list.
-- Hrvoje Niksic <hniksic@xemacs.org>
*/
- converted_file_ptr = xmalloc (sizeof (*converted_file_ptr));
- converted_file_ptr->string = xstrdup (file);
- converted_file_ptr->next = converted_files;
- converted_files = converted_file_ptr;
+ string_set_add (converted_files, file);
}
}
-static int find_fragment PARAMS ((const char *, int, const char **,
- const char **));
+static bool find_fragment (const char *, int, const char **, const char **);
/* Replace an attribute's original text with NEW_TEXT. */
static const char *
replace_attr (const char *p, int size, FILE *fp, const char *new_text)
{
- int quote_flag = 0;
+ bool quote_flag = false;
char quote_char = '\"'; /* use "..." for quoting, unless the
original value is quoted, in which
case reuse its quoting char. */
if (*p == '\"' || *p == '\'')
{
quote_char = *p;
- quote_flag = 1;
+ quote_flag = true;
++p;
size -= 2; /* disregard opening and closing quote */
}
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
preceded by '&'. If the character is not found, return zero. If
- the character is found, return 1 and set BP and EP to point to the
- beginning and end of the region.
+ the character is found, return true and set BP and EP to point to
+ the beginning and end of the region.
This is used for finding the fragment identifiers in URLs. */
-static int
+static bool
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
const char *end = beg + size;
- int saw_amp = 0;
+ bool saw_amp = false;
+ /* saw_amp remembers whether the previous character was '&';
+ presumably this keeps "&#..." character references from being
+ mistaken for fragment separators -- NOTE(review): confirm. */
for (; beg < end; beg++)
{
switch (*beg)
{
case '&':
- saw_amp = 1;
+ saw_amp = true;
break;
case '#':
if (!saw_amp)
{
*bp = beg;
*ep = end;
- return 1;
+ return true;
}
/* fallthrough */
default:
- saw_amp = 0;
+ saw_amp = false;
}
}
- return 0;
+ return false;
}
/* Quote FILE for use as local reference to an HTML file.
dl_url_file_map = make_string_hash_table (0); \
} while (0)
-/* Return 1 if S1 and S2 are the same, except for "/index.html". The
- three cases in which it returns one are (substitute any substring
- for "foo"):
+/* Return true if S1 and S2 are the same, except for "/index.html".
+ The three cases in which it returns one are (substitute any
+ substring for "foo"):
m("foo/index.html", "foo/") ==> 1
m("foo/", "foo/index.html") ==> 1
m("foo", "foo/" ==> 1
m("foo", "foo") ==> 1 */
-static int
+static bool
match_except_index (const char *s1, const char *s2)
{
int i;
/* Strings differ at the very beginning -- bail out. We need to
check this explicitly to avoid `lng - 1' reading outside the
array. */
- return 0;
+ return false;
if (!*s1 && !*s2)
/* Both strings hit EOF -- strings are equal. */
- return 1;
+ return true;
else if (*s1 && *s2)
/* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux". */
- return 0;
+ return false;
else if (*s1)
/* S1 is the longer one. */
lng = s1;
if (*lng == '/' && *(lng + 1) == '\0')
/* foo */
/* foo/ */
- return 1;
+ return true;
return 0 == strcmp (lng, "/index.html");
}
{
if (!downloaded_html_set)
downloaded_html_set = make_string_hash_table (0);
- else if (hash_table_contains (downloaded_html_set, file))
- return;
-
- /* The set and the list should use the same copy of FILE, but the
- slist interface insists on strduping the string it gets. Oh
- well. */
string_set_add (downloaded_html_set, file);
- downloaded_html_list = slist_prepend (downloaded_html_list, file);
}
-/* Cleanup the data structures associated with recursive retrieving
- (the variables above). */
+static void downloaded_files_free (void);
+
+/* Cleanup the data structures associated with this file. */
+
void
convert_cleanup (void)
{
}
if (downloaded_html_set)
string_set_free (downloaded_html_set);
- slist_free (downloaded_html_list);
- downloaded_html_list = NULL;
+ downloaded_files_free ();
+ if (converted_files)
+ string_set_free (converted_files);
}
\f
/* Book-keeping code for downloaded files that enables extension
return 0;
}
-void
+static void
downloaded_files_free (void)
{
if (downloaded_files_hash)
downloaded_files_hash = NULL;
}
}
+\f
+/* The function returns the pointer to the malloc-ed quoted version of
+ string s. It will recognize and quote numeric and special graphic
+ entities, as per RFC1866:
+
+ `&' -> `&amp;'
+ `<' -> `&lt;'
+ `>' -> `&gt;'
+ `"' -> `&quot;'
+ SP -> `&#32;'
+
+ No other entities are recognized or replaced. The result is
+ allocated with xmalloc; the caller is responsible for freeing it. */
+char *
+html_quote_string (const char *s)
+{
+ const char *b = s;
+ char *p, *res;
+ int i;
+
+ /* Pass through the string, and count the new size. */
+ for (i = 0; *s; s++, i++)
+ {
+ if (*s == '&')
+ i += 4; /* `amp;' */
+ else if (*s == '<' || *s == '>')
+ i += 3; /* `lt;' and `gt;' */
+ else if (*s == '\"')
+ i += 5; /* `quot;' */
+ else if (*s == ' ')
+ i += 4; /* #32; */
+ }
+ res = xmalloc (i + 1);
+ s = b;
+ /* Second pass: copy the string, expanding each special character
+ counted above into its entity form. */
+ for (p = res; *s; s++)
+ {
+ switch (*s)
+ {
+ case '&':
+ *p++ = '&';
+ *p++ = 'a';
+ *p++ = 'm';
+ *p++ = 'p';
+ *p++ = ';';
+ break;
+ case '<': case '>':
+ *p++ = '&';
+ *p++ = (*s == '<' ? 'l' : 'g');
+ *p++ = 't';
+ *p++ = ';';
+ break;
+ case '\"':
+ *p++ = '&';
+ *p++ = 'q';
+ *p++ = 'u';
+ *p++ = 'o';
+ *p++ = 't';
+ *p++ = ';';
+ break;
+ case ' ':
+ *p++ = '&';
+ *p++ = '#';
+ *p++ = '3';
+ *p++ = '2';
+ *p++ = ';';
+ break;
+ default:
+ *p++ = *s;
+ }
+ }
+ *p = '\0';
+ return res;
+}