[svn] Include ETA information in dot progress.

[wget] / src / convert.c
diff --git a/src/convert.c b/src/convert.c

index d3e85ae83d3077a644e2e5fa383f2a63277ef54e..8afef4c557e54dfaeab8905b1f111809575499c2 100644 (file)
--- a/src/convert.c
+++ b/src/convert.c
@@ -1,5 +1,5 @@
  /* Conversion of links to local files.
-   Copyright (C) 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
+   Copyright (C) 2005 Free Software Foundation, Inc.
  
  This file is part of GNU Wget.
  
@@ -31,17 +31,12 @@ so, delete this exception statement from your version.  */
  
  #include <stdio.h>
  #include <stdlib.h>
-#ifdef HAVE_STRING_H
-# include <string.h>
-#else
-# include <strings.h>
-#endif /* HAVE_STRING_H */
+#include <string.h>
  #ifdef HAVE_UNISTD_H
  # include <unistd.h>
  #endif /* HAVE_UNISTD_H */
  #include <errno.h>
  #include <assert.h>
-#include <sys/types.h>
  
  #include "wget.h"
  #include "convert.h"
@@ -49,18 +44,16 @@ so, delete this exception statement from your version.  */
  #include "recur.h"
  #include "utils.h"
  #include "hash.h"
+#include "ptimer.h"
  
  static struct hash_table *dl_file_url_map;
  struct hash_table *dl_url_file_map;
  
-/* List of HTML files downloaded in this Wget run, used for link
-   conversion after Wget is done.  The list and the set contain the
-   same information, except the list maintains the order.  Perhaps I
-   should get rid of the list, it's there for historical reasons.  */
-static slist *downloaded_html_list;
+/* Set of HTML files downloaded in this Wget run, used for link
+   conversion after Wget is done.  */
  struct hash_table *downloaded_html_set;
  
-static void convert_links PARAMS ((const char *, struct urlpos *));
+static void convert_links (const char *, struct urlpos *);
  
  /* This function is called when the retrieval is done to convert the
     links that have been downloaded.  It has to be called at the end of
@@ -80,21 +73,28 @@ static void convert_links PARAMS ((const char *, struct urlpos *));
  void
  convert_all_links (void)
  {
-  slist *html;
-  long msecs;
+  int i;
+  double secs;
    int file_count = 0;
  
-  struct wget_timer *timer = wtimer_new ();
+  struct ptimer *timer = ptimer_new ();
+
+  int cnt;
+  char **file_array;
  
-  /* Destructively reverse downloaded_html_files to get it in the right order.
-     recursive_retrieve() used slist_prepend() consistently.  */
-  downloaded_html_list = slist_nreverse (downloaded_html_list);
+  cnt = 0;
+  if (downloaded_html_set)
+    cnt = hash_table_count (downloaded_html_set);
+  if (cnt == 0)
+    return;
+  file_array = alloca_array (char *, cnt);
+  string_set_to_array (downloaded_html_set, file_array);
  
-  for (html = downloaded_html_list; html; html = html->next)
+  for (i = 0; i < cnt; i++)
      {
        struct urlpos *urls, *cur_url;
        char *url;
-      char *file = html->string;
+      char *file = file_array[i];
  
        /* Determine the URL of the HTML file.  get_urls_html will need
          it.  */
@@ -166,19 +166,18 @@ convert_all_links (void)
        free_urlpos (urls);
      }
  
-  msecs = wtimer_elapsed (timer);
-  wtimer_delete (timer);
-  logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
-            file_count, (double)msecs / 1000);
+  secs = ptimer_measure (timer) / 1000;
+  ptimer_destroy (timer);
+  logprintf (LOG_VERBOSE, _("Converted %d files in %s seconds.\n"),
+            file_count, print_decimal (secs));
  }
  
-static void write_backup_file PARAMS ((const char *, downloaded_file_t));
-static const char *replace_attr PARAMS ((const char *, int, FILE *,
-                                        const char *));
-static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
-                                                     const char *, int));
-static char *local_quote_string PARAMS ((const char *));
-static char *construct_relative PARAMS ((const char *, const char *));
+static void write_backup_file (const char *, downloaded_file_t);
+static const char *replace_attr (const char *, int, FILE *, const char *);
+static const char *replace_attr_refresh_hack (const char *, int, FILE *,
+                                             const char *, int);
+static char *local_quote_string (const char *);
+static char *construct_relative (const char *, const char *);
  
  /* Change the links in one HTML file.  LINKS is a list of links in the
     document, along with their positions and the desired direction of
@@ -201,7 +200,7 @@ convert_links (const char *file, struct urlpos *links)
         any URL needs to be converted in the first place.  If not, just
         leave the file alone.  */
      int dry_count = 0;
-    struct urlpos *dry = links;
+    struct urlpos *dry;
      for (dry = links; dry; dry = dry->next)
        if (dry->convert != CO_NOCONVERT)
         ++dry_count;
@@ -383,13 +382,17 @@ construct_relative (const char *basefile, const char *linkfile)
      }
  
    /* Construct LINK as explained above. */
-  link = (char *)xmalloc (3 * basedirs + strlen (linkfile) + 1);
+  link = xmalloc (3 * basedirs + strlen (linkfile) + 1);
    for (i = 0; i < basedirs; i++)
      memcpy (link + 3 * i, "../", 3);
    strcpy (link + 3 * i, linkfile);
    return link;
  }
  
+/* Used by write_backup_file to remember which files it has already
+   been called on. */
+static struct hash_table *converted_files;
+
  static void
  write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
  {
@@ -399,11 +402,8 @@ write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
       clobber .orig files sitting around from previous invocations. */
  
    /* Construct the backup filename as the original name plus ".orig". */
-  size_t         filename_len = strlen(file);
+  size_t         filename_len = strlen (file);
    char*          filename_plus_orig_suffix;
-  int            already_wrote_backup_file = 0;
-  slist*         converted_file_ptr;
-  static slist*  converted_files = NULL;
  
    if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
      {
@@ -415,36 +415,29 @@ write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
          ".html", so we need to compare vs. the original URL plus
          ".orig", not the original URL plus ".html.orig". */
        filename_plus_orig_suffix = alloca (filename_len + 1);
-      strcpy(filename_plus_orig_suffix, file);
-      strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
+      strcpy (filename_plus_orig_suffix, file);
+      strcpy ((filename_plus_orig_suffix + filename_len) - 4, "orig");
      }
    else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
      {
        /* Append ".orig" to the name. */
-      filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
-      strcpy(filename_plus_orig_suffix, file);
-      strcpy(filename_plus_orig_suffix + filename_len, ".orig");
+      filename_plus_orig_suffix = alloca (filename_len + sizeof (".orig"));
+      strcpy (filename_plus_orig_suffix, file);
+      strcpy (filename_plus_orig_suffix + filename_len, ".orig");
      }
  
+  if (!converted_files)
+    converted_files = make_string_hash_table (0);
+
    /* We can get called twice on the same URL thanks to the
       convert_all_links() call in main().  If we write the .orig file
       each time in such a case, it'll end up containing the first-pass
       conversion, not the original file.  So, see if we've already been
       called on this file. */
-  converted_file_ptr = converted_files;
-  while (converted_file_ptr != NULL)
-    if (strcmp(converted_file_ptr->string, file) == 0)
-      {
-       already_wrote_backup_file = 1;
-       break;
-      }
-    else
-      converted_file_ptr = converted_file_ptr->next;
-
-  if (!already_wrote_backup_file)
+  if (!string_set_contains (converted_files, file))
      {
        /* Rename <file> to <file>.orig before former gets written over. */
-      if (rename(file, filename_plus_orig_suffix) != 0)
+      if (rename (file, filename_plus_orig_suffix) != 0)
         logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
                    file, filename_plus_orig_suffix, strerror (errno));
  
@@ -465,22 +458,18 @@ write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
           list.
          -- Hrvoje Niksic <hniksic@xemacs.org>
        */
-      converted_file_ptr = xmalloc (sizeof (*converted_file_ptr));
-      converted_file_ptr->string = xstrdup (file);
-      converted_file_ptr->next = converted_files;
-      converted_files = converted_file_ptr;
+      string_set_add (converted_files, file);
      }
  }
  
-static int find_fragment PARAMS ((const char *, int, const char **,
-                                 const char **));
+static bool find_fragment (const char *, int, const char **, const char **);
  
  /* Replace an attribute's original text with NEW_TEXT. */
  
  static const char *
  replace_attr (const char *p, int size, FILE *fp, const char *new_text)
  {
-  int quote_flag = 0;
+  bool quote_flag = false;
    char quote_char = '\"';      /* use "..." for quoting, unless the
                                    original value is quoted, in which
                                    case reuse its quoting char. */
@@ -496,7 +485,7 @@ replace_attr (const char *p, int size, FILE *fp, const char *new_text)
    if (*p == '\"' || *p == '\'')
      {
        quote_char = *p;
-      quote_flag = 1;
+      quote_flag = true;
        ++p;
        size -= 2;               /* disregard opening and closing quote */
      }
@@ -534,36 +523,36 @@ replace_attr_refresh_hack (const char *p, int size, FILE *fp,
  
  /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
     preceded by '&'.  If the character is not found, return zero.  If
-   the character is found, return 1 and set BP and EP to point to the
-   beginning and end of the region.
+   the character is found, return true and set BP and EP to point to
+   the beginning and end of the region.
  
     This is used for finding the fragment indentifiers in URLs.  */
  
-static int
+static bool
  find_fragment (const char *beg, int size, const char **bp, const char **ep)
  {
    const char *end = beg + size;
-  int saw_amp = 0;
+  bool saw_amp = false;
    for (; beg < end; beg++)
      {
        switch (*beg)
         {
         case '&':
-         saw_amp = 1;
+         saw_amp = true;
           break;
         case '#':
           if (!saw_amp)
             {
               *bp = beg;
               *ep = end;
-             return 1;
+             return true;
             }
           /* fallthrough */
         default:
-         saw_amp = 0;
+         saw_amp = false;
         }
      }
-  return 0;
+  return false;
  }
  
  /* Quote FILE for use as local reference to an HTML file.
@@ -575,49 +564,52 @@ find_fragment (const char *beg, int size, const char **bp, const char **ep)
     "index.html%3Ffoo=bar" would break local browsing, as the latter
     isn't even recognized as an HTML file!  However, converting
     "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
-   safe for both local and HTTP-served browsing.  */
+   safe for both local and HTTP-served browsing.
+
+   We always quote "#" as "%23" and "%" as "%25" because those
+   characters have special meanings in URLs.  */
  
  static char *
  local_quote_string (const char *file)
  {
-  const char *file_sans_qmark;
-  int qm;
+  const char *from;
+  char *newname, *to;
  
-  if (!opt.html_extension)
+  char *any = strpbrk (file, "?#%");
+  if (!any)
      return html_quote_string (file);
  
-  qm = count_char (file, '?');
-
-  if (qm)
-    {
-      const char *from = file;
-      char *to, *newname;
-
-      /* qm * 2 because we replace each question mark with "%3F",
-        i.e. replace one char with three, hence two more.  */
-      int fsqlen = strlen (file) + qm * 2;
-
-      to = newname = (char *)alloca (fsqlen + 1);
-      for (; *from; from++)
-       {
-         if (*from != '?')
-           *to++ = *from;
-         else
-           {
-             *to++ = '%';
-             *to++ = '3';
-             *to++ = 'F';
-           }
-       }
-      assert (to - newname == fsqlen);
-      *to = '\0';
-
-      file_sans_qmark = newname;
-    }
-  else
-    file_sans_qmark = file;
+  /* Allocate space assuming the worst-case scenario, each character
+     having to be quoted.  */
+  to = newname = (char *)alloca (3 * strlen (file) + 1);
+  for (from = file; *from; from++)
+    switch (*from)
+      {
+      case '%':
+       *to++ = '%';
+       *to++ = '2';
+       *to++ = '5';
+       break;
+      case '#':
+       *to++ = '%';
+       *to++ = '2';
+       *to++ = '3';
+       break;
+      case '?':
+       if (opt.html_extension)
+         {
+           *to++ = '%';
+           *to++ = '3';
+           *to++ = 'F';
+           break;
+         }
+       /* fallthrough */
+      default:
+       *to++ = *from;
+      }
+  *to = '\0';
  
-  return html_quote_string (file_sans_qmark);
+  return html_quote_string (newname);
  }
  \f
  /* Book-keeping code for dl_file_url_map, dl_url_file_map,
@@ -631,9 +623,9 @@ local_quote_string (const char *file)
      dl_url_file_map = make_string_hash_table (0);      \
  } while (0)
  
-/* Return 1 if S1 and S2 are the same, except for "/index.html".  The
-   three cases in which it returns one are (substitute any substring
-   for "foo"):
+/* Return true if S1 and S2 are the same, except for "/index.html".
+   The cases in which it returns true are (substitute any
+   substring for "foo"):
  
     m("foo/index.html", "foo/")  ==> 1
     m("foo/", "foo/index.html")  ==> 1
@@ -641,7 +633,7 @@ local_quote_string (const char *file)
     m("foo", "foo/"              ==> 1
     m("foo", "foo")              ==> 1  */
  
-static int
+static bool
  match_except_index (const char *s1, const char *s2)
  {
    int i;
@@ -654,14 +646,14 @@ match_except_index (const char *s1, const char *s2)
      /* Strings differ at the very beginning -- bail out.  We need to
         check this explicitly to avoid `lng - 1' reading outside the
         array.  */
-    return 0;
+    return false;
  
    if (!*s1 && !*s2)
      /* Both strings hit EOF -- strings are equal. */
-    return 1;
+    return true;
    else if (*s1 && *s2)
      /* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux". */
-    return 0;
+    return false;
    else if (*s1)
      /* S1 is the longer one. */
      lng = s1;
@@ -680,7 +672,7 @@ match_except_index (const char *s1, const char *s2)
    if (*lng == '/' && *(lng + 1) == '\0')
      /* foo  */
      /* foo/ */
-    return 1;
+    return true;
  
    return 0 == strcmp (lng, "/index.html");
  }
@@ -835,18 +827,13 @@ register_html (const char *url, const char *file)
  {
    if (!downloaded_html_set)
      downloaded_html_set = make_string_hash_table (0);
-  else if (hash_table_contains (downloaded_html_set, file))
-    return;
-
-  /* The set and the list should use the same copy of FILE, but the
-     slist interface insists on strduping the string it gets.  Oh
-     well. */
    string_set_add (downloaded_html_set, file);
-  downloaded_html_list = slist_prepend (downloaded_html_list, file);
  }
  
-/* Cleanup the data structures associated with recursive retrieving
-   (the variables above).  */
+static void downloaded_files_free (void);
+
+/* Clean up the data structures associated with this file.  */
+
  void
  convert_cleanup (void)
  {
@@ -864,8 +851,9 @@ convert_cleanup (void)
      }
    if (downloaded_html_set)
      string_set_free (downloaded_html_set);
-  slist_free (downloaded_html_list);
-  downloaded_html_list = NULL;
+  downloaded_files_free ();
+  if (converted_files)
+    string_set_free (converted_files);
  }
  \f
  /* Book-keeping code for downloaded files that enables extension
@@ -956,7 +944,7 @@ df_free_mapper (void *key, void *value, void *ignored)
    return 0;
  }
  
-void
+static void
  downloaded_files_free (void)
  {
    if (downloaded_files_hash)
@@ -966,3 +954,75 @@ downloaded_files_free (void)
        downloaded_files_hash = NULL;
      }
  }
+\f
+/* Return a pointer to a newly xmalloc-ed, HTML-quoted copy of string
+   S.  These characters are replaced with entity or numeric character
+   references, as per RFC1866:
+
+   `&' -> `&amp;'
+   `<' -> `&lt;'
+   `>' -> `&gt;'
+   `"' -> `&quot;'
+   SP  -> `&#32;'
+
+   No other characters are replaced.  */
+char *
+html_quote_string (const char *s)
+{
+  const char *b = s;
+  char *p, *res;
+  int i;
+
+  /* Pass through the string, and count the new size.  */
+  for (i = 0; *s; s++, i++)
+    {
+      if (*s == '&')
+       i += 4;                 /* `amp;' */
+      else if (*s == '<' || *s == '>')
+       i += 3;                 /* `lt;' and `gt;' */
+      else if (*s == '\"')
+       i += 5;                 /* `quot;' */
+      else if (*s == ' ')
+       i += 4;                 /* #32; */
+    }
+  res = xmalloc (i + 1);
+  s = b;
+  for (p = res; *s; s++)
+    {
+      switch (*s)
+       {
+       case '&':
+         *p++ = '&';
+         *p++ = 'a';
+         *p++ = 'm';
+         *p++ = 'p';
+         *p++ = ';';
+         break;
+       case '<': case '>':
+         *p++ = '&';
+         *p++ = (*s == '<' ? 'l' : 'g');
+         *p++ = 't';
+         *p++ = ';';
+         break;
+       case '\"':
+         *p++ = '&';
+         *p++ = 'q';
+         *p++ = 'u';
+         *p++ = 'o';
+         *p++ = 't';
+         *p++ = ';';
+         break;
+       case ' ':
+         *p++ = '&';
+         *p++ = '#';
+         *p++ = '3';
+         *p++ = '2';
+         *p++ = ';';
+         break;
+       default:
+         *p++ = *s;
+       }
+    }
+  *p = '\0';
+  return res;
+}