[svn] Use sets/hash-tables instead of "slists". Remove slist implementation from

[wget] / src / utils.c
diff --git a/src/utils.c b/src/utils.c

index 2589a233234098f390492585dc3b999930133fe4..1d2c5c539eba1d00dd0f493c2212c0bffa3853bd 100644 (file)
--- a/src/utils.c
+++ b/src/utils.c
@@ -1,5 +1,5 @@
  /* Various utility functions.
-   Copyright (C) 2003 Free Software Foundation, Inc.
+   Copyright (C) 2005 Free Software Foundation, Inc.
  
  This file is part of GNU Wget.
  
@@ -120,19 +120,6 @@ xstrdup_lower (const char *s)
    return copy;
  }
  
-/* Return a count of how many times CHR occurs in STRING. */
-
-int
-count_char (const char *string, char chr)
-{
-  const char *p;
-  int count = 0;
-  for (p = string; *p; p++)
-    if (*p == chr)
-      ++count;
-  return count;
-}
-
  /* Copy the string formed by two pointers (one on the beginning, other
     on the char after the last char) to a new, malloc-ed location.
     0-terminate it.  */
@@ -226,6 +213,54 @@ aprintf (const char *fmt, ...)
      }
    return NULL;                 /* unreached */
  }
+
+/* Concatenate the NULL-terminated list of string arguments into
+   freshly allocated space.  */
+
+char *
+concat_strings (const char *str0, ...)
+{
+  va_list args;
+  int saved_lengths[5];                /* inspired by Apache's apr_pstrcat */
+  char *ret, *p;
+
+  const char *next_str;
+  int total_length = 0;
+  int argcount;
+
+  /* Calculate the length of and allocate the resulting string. */
+
+  argcount = 0;
+  VA_START (args, str0);
+  for (next_str = str0; next_str != NULL; next_str = va_arg (args, char *))
+    {
+      int len = strlen (next_str);
+      if (argcount < countof (saved_lengths))
+       saved_lengths[argcount++] = len;
+      total_length += len;
+    }
+  va_end (args);
+  p = ret = xmalloc (total_length + 1);
+
+  /* Copy the strings into the allocated space. */
+
+  argcount = 0;
+  VA_START (args, str0);
+  for (next_str = str0; next_str != NULL; next_str = va_arg (args, char *))
+    {
+      int len;
+      if (argcount < countof (saved_lengths))
+       len = saved_lengths[argcount++];
+      else
+       len = strlen (next_str);
+      memcpy (p, next_str, len);
+      p += len;
+    }
+  va_end (args);
+  *p = '\0';
+
+  return ret;
+}
  \f
  /* Return pointer to a static char[] buffer in which zero-terminated
     string-representation of TM (in form hh:mm:ss) is printed.
@@ -283,12 +318,21 @@ fork_to_background (void)
  {
    pid_t pid;
    /* Whether we arrange our own version of opt.lfilename here.  */
-  int changedp = 0;
+  int logfile_changed = 0;
  
    if (!opt.lfilename)
      {
-      opt.lfilename = unique_name (DEFAULT_LOGFILE, 0);
-      changedp = 1;
+      /* We must create the file immediately to avoid either a race
+        condition (which arises from using unique_name and failing to
+        use fopen_excl) or lying to the user about the log file name
+        (which arises from using unique_name, printing the name, and
+        using fopen_excl later on.)  */
+      FILE *new_log_fp = unique_create (DEFAULT_LOGFILE, 0, &opt.lfilename);
+      if (new_log_fp)
+       {
+         logfile_changed = 1;
+         fclose (new_log_fp);
+       }
      }
    pid = fork ();
    if (pid < 0)
@@ -301,7 +345,7 @@ fork_to_background (void)
      {
        /* parent, no error */
        printf (_("Continuing in background, pid %d.\n"), (int)pid);
-      if (changedp)
+      if (logfile_changed)
         printf (_("Output will be written to `%s'.\n"), opt.lfilename);
        exit (0);                        /* #### should we use _exit()? */
      }
@@ -439,7 +483,7 @@ unique_name_1 (const char *prefix)
     exist at the point in time when the function was called.
     Therefore, where security matters, don't rely that the file created
     by this function exists until you open it with O_EXCL or
-   something.
+   equivalent.
  
     If ALLOW_PASSTHROUGH is 0, it always returns a freshly allocated
     string.  Otherwise, it may return FILE if the file doesn't exist
@@ -457,6 +501,74 @@ unique_name (const char *file, int allow_passthrough)
       and return it.  */
    return unique_name_1 (file);
  }
+
+/* Create a file based on NAME, except without overwriting an existing
+   file with that name.  Providing O_EXCL is correctly implemented,
+   this function does not have the race condition associated with
+   opening the file returned by unique_name.  */
+
+FILE *
+unique_create (const char *name, int binary, char **opened_name)
+{
+  /* unique file name, based on NAME */
+  char *uname = unique_name (name, 0);
+  FILE *fp;
+  while ((fp = fopen_excl (uname, binary)) == NULL && errno == EEXIST)
+    {
+      xfree (uname);
+      uname = unique_name (name, 0);
+    }
+  if (opened_name && fp != NULL)
+    {
+      if (fp)
+       *opened_name = uname;
+      else
+       {
+         *opened_name = NULL;
+         xfree (uname);
+       }
+    }
+  else
+    xfree (uname);
+  return fp;
+}
+
+/* Open the file for writing, with the addition that the file is
+   opened "exclusively".  This means that, if the file already exists,
+   this function will *fail* and errno will be set to EEXIST.  If
+   BINARY is set, the file will be opened in binary mode, equivalent
+   to fopen's "wb".
+
+   If opening the file fails for any reason, including the file having
+   previously existed, this function returns NULL and sets errno
+   appropriately.  */
+   
+FILE *
+fopen_excl (const char *fname, int binary)
+{
+  int fd;
+#ifdef O_EXCL
+  int flags = O_WRONLY | O_CREAT | O_EXCL;
+# ifdef O_BINARY
+  if (binary)
+    flags |= O_BINARY;
+# endif
+  fd = open (fname, flags, 0666);
+  if (fd < 0)
+    return NULL;
+  return fdopen (fd, binary ? "wb" : "w");
+#else  /* not O_EXCL */
+  /* Manually check whether the file exists.  This is prone to race
+     conditions, but systems without O_EXCL haven't deserved
+     better.  */
+  if (file_exists_p (fname))
+    {
+      errno = EEXIST;
+      return NULL;
+    }
+  return fopen (fname, binary ? "wb" : "w");
+#endif /* not O_EXCL */
+}
  \f
  /* Create DIRECTORY.  If some of the pathname components of DIRECTORY
     are missing, create them first.  In case any mkdir() call fails,
@@ -467,9 +579,7 @@ unique_name (const char *file, int allow_passthrough)
  int
  make_directory (const char *directory)
  {
-  int quit = 0;
-  int i;
-  int ret = 0;
+  int i, ret, quit = 0;
    char *dir;
  
    /* Make a copy of dir, to be able to write to it.  Otherwise, the
@@ -807,7 +917,7 @@ read_file (const char *file)
  {
    int fd;
    struct file_memory *fm;
-  wgint size;
+  long size;
    int inhibit_close = 0;
  
    /* Some magic in the finest tradition of Perl and its kin: if FILE
@@ -970,91 +1080,6 @@ merge_vecs (char **v1, char **v2)
    xfree (v2);
    return v1;
  }
-
-/* A set of simple-minded routines to store strings in a linked list.
-   This used to also be used for searching, but now we have hash
-   tables for that.  */
-
-/* It's a shame that these simple things like linked lists and hash
-   tables (see hash.c) need to be implemented over and over again.  It
-   would be nice to be able to use the routines from glib -- see
-   www.gtk.org for details.  However, that would make Wget depend on
-   glib, and I want to avoid dependencies to external libraries for
-   reasons of convenience and portability (I suspect Wget is more
-   portable than anything ever written for Gnome).  */
-
-/* Append an element to the list.  If the list has a huge number of
-   elements, this can get slow because it has to find the list's
-   ending.  If you think you have to call slist_append in a loop,
-   think about calling slist_prepend() followed by slist_nreverse().  */
-
-slist *
-slist_append (slist *l, const char *s)
-{
-  slist *newel = xnew (slist);
-  slist *beg = l;
-
-  newel->string = xstrdup (s);
-  newel->next = NULL;
-
-  if (!l)
-    return newel;
-  /* Find the last element.  */
-  while (l->next)
-    l = l->next;
-  l->next = newel;
-  return beg;
-}
-
-/* Prepend S to the list.  Unlike slist_append(), this is O(1).  */
-
-slist *
-slist_prepend (slist *l, const char *s)
-{
-  slist *newel = xnew (slist);
-  newel->string = xstrdup (s);
-  newel->next = l;
-  return newel;
-}
-
-/* Destructively reverse L. */
-
-slist *
-slist_nreverse (slist *l)
-{
-  slist *prev = NULL;
-  while (l)
-    {
-      slist *next = l->next;
-      l->next = prev;
-      prev = l;
-      l = next;
-    }
-  return prev;
-}
-
-/* Is there a specific entry in the list?  */
-int
-slist_contains (slist *l, const char *s)
-{
-  for (; l; l = l->next)
-    if (!strcmp (l->string, s))
-      return 1;
-  return 0;
-}
-
-/* Free the whole slist.  */
-void
-slist_free (slist *l)
-{
-  while (l)
-    {
-      slist *n = l->next;
-      xfree (l->string);
-      xfree (l);
-      l = n;
-    }
-}
  \f
  /* Sometimes it's useful to create "sets" of strings, i.e. special
     hash tables where you want to store strings as keys and merely
@@ -1085,6 +1110,22 @@ string_set_contains (struct hash_table *ht, const char *s)
    return hash_table_contains (ht, s);
  }
  
+static int
+string_set_to_array_mapper (void *key, void *value_ignored, void *arg)
+{
+  char ***arrayptr = (char ***) arg;
+  *(*arrayptr)++ = (char *) key;
+  return 0;
+}
+
+/* Convert the specified string set to array.  ARRAY should be large
+   enough to hold hash_table_count(ht) char pointers.  */
+
+void string_set_to_array (struct hash_table *ht, char **array)
+{
+  hash_table_map (ht, string_set_to_array_mapper, &array);
+}
+
  static int
  string_set_free_mapper (void *key, void *value_ignored, void *arg_ignored)
  {
@@ -1116,11 +1157,11 @@ free_keys_and_values (struct hash_table *ht)
  }
  
  \f
-/* Engine for legible and legible_large_int; add thousand separators
-   to numbers printed in strings.  */
+/* Add thousand separators to a number already in string form.  Used
+   by with_thousand_seps and with_thousand_seps_large.  */
  
  static char *
-legible_1 (const char *repr)
+add_thousand_seps (const char *repr)
  {
    static char outbuf[48];
    int i, i1, mod;
@@ -1156,54 +1197,118 @@ legible_1 (const char *repr)
    return outbuf;
  }
  
-/* Legible -- return a static pointer to the legibly printed wgint.  */
+/* Return a static pointer to the number printed with thousand
+   separators inserted at the right places.  */
  
  char *
-legible (wgint l)
+with_thousand_seps (wgint l)
  {
    char inbuf[24];
    /* Print the number into the buffer.  */
    number_to_string (inbuf, l);
-  return legible_1 (inbuf);
+  return add_thousand_seps (inbuf);
  }
  
  /* Write a string representation of LARGE_INT NUMBER into the provided
-   buffer.  The buffer should be able to accept 24 characters,
-   including the terminating zero.
+   buffer.
  
     It would be dangerous to use sprintf, because the code wouldn't
     work on a machine with gcc-provided long long support, but without
-   libc support for "%lld".  However, such platforms will typically
-   not have snprintf and will use our version, which does support
-   "%lld" where long longs are available.  */
+   libc support for "%lld".  However, such old systems platforms
+   typically lack snprintf and will end up using our version, which
+   does support "%lld" whereever long longs are available.  */
  
  static void
-large_int_to_string (char *buffer, LARGE_INT number)
+large_int_to_string (char *buffer, int bufsize, LARGE_INT number)
  {
-  snprintf (buffer, 24, LARGE_INT_FMT, number);
+  snprintf (buffer, bufsize, LARGE_INT_FMT, number);
  }
  
-/* The same as legible(), but works on LARGE_INT.  */
+/* The same as with_thousand_seps, but works on LARGE_INT.  */
  
  char *
-legible_large_int (LARGE_INT l)
+with_thousand_seps_large (LARGE_INT l)
  {
    char inbuf[48];
-  large_int_to_string (inbuf, l);
-  return legible_1 (inbuf);
+  large_int_to_string (inbuf, sizeof (inbuf), l);
+  return add_thousand_seps (inbuf);
+}
+
+/* N, a byte quantity, is converted to a human-readable abberviated
+   form a la sizes printed by `ls -lh'.  The result is written to a
+   static buffer, a pointer to which is returned.
+
+   Unlike `with_thousand_seps', this approximates to the nearest unit.
+   Quoting GNU libit: "Most people visually process strings of 3-4
+   digits effectively, but longer strings of digits are more prone to
+   misinterpretation.  Hence, converting to an abbreviated form
+   usually improves readability."
+
+   This intentionally uses kilobyte (KB), megabyte (MB), etc. in their
+   original computer science meaning of "multiples of 1024".
+   Multiples of 1000 would be useless since Wget already adds thousand
+   separators for legibility.  We don't use the "*bibyte" names
+   invented in 1998, and seldom used in practice.  Wikipedia's entry
+   on kilobyte discusses this in some detail.  */
+
+char *
+human_readable (wgint n)
+{
+  /* These suffixes are compatible with those of GNU `ls -lh'. */
+  static char powers[] =
+    {
+      'K',                     /* kilobyte, 2^10 bytes */
+      'M',                     /* megabyte, 2^20 bytes */
+      'G',                     /* gigabyte, 2^30 bytes */
+      'T',                     /* terabyte, 2^40 bytes */
+      'P',                     /* petabyte, 2^50 bytes */
+      'E',                     /* exabyte,  2^60 bytes */
+    };
+  static char buf[8];
+  int i;
+
+  /* If the quantity is smaller than 1K, just print it. */
+  if (n < 1024)
+    {
+      snprintf (buf, sizeof (buf), "%d", (int) n);
+      return buf;
+    }
+
+  /* Loop over powers, dividing N with 1024 in each iteration.  This
+     works unchanged for all sizes of wgint, while still avoiding
+     non-portable `long double' arithmetic.  */
+  for (i = 0; i < countof (powers); i++)
+    {
+      /* At each iteration N is greater than the *subsequent* power.
+        That way N/1024.0 produces a decimal number in the units of
+        *this* power.  */
+      if ((n >> 10) < 1024 || i == countof (powers) - 1)
+       {
+         /* Must cast to long first because MS VC can't directly cast
+            __int64 to double.  (This is safe because N is known to
+            be <2**20.)  */
+         double val = (double) (long) n / 1024.0;
+         /* Print values smaller than 10 with one decimal digits, and
+            others without any decimals.  */
+         snprintf (buf, sizeof (buf), "%.*f%c",
+                   val < 10 ? 1 : 0, val, powers[i]);
+         return buf;
+       }
+      n >>= 10;
+    }
+  return NULL;                 /* unreached */
  }
  
-/* Count the digits in an integer number.  */
+/* Count the digits in the provided number.  Used to allocate space
+   when printing numbers.  */
+
  int
  numdigit (wgint number)
  {
    int cnt = 1;
    if (number < 0)
-    {
-      number = -number;
-      ++cnt;
-    }
-  while ((number /= 10) > 0)
+    ++cnt;                     /* accomodate '-' */
+  while ((number /= 10) != 0)
      ++cnt;
    return cnt;
  }
@@ -1293,8 +1398,8 @@ numdigit (wgint number)
  #endif
  
  /* Print NUMBER to BUFFER in base 10.  This is equivalent to
-   `sprintf(buffer, "%lld", (long long) number)', only much faster and
-   portable to machines without long long.
+   `sprintf(buffer, "%lld", (long long) number)', only typically much
+   faster and portable to machines without long long.
  
     The speedup may make a difference in programs that frequently
     convert numbers to strings.  Some implementations of sprintf,
@@ -1449,11 +1554,7 @@ number_to_static_string (wgint number)
     only one of the above constants will be defined.  Virtually all
     modern Unix systems will define TIMER_GETTIMEOFDAY; Windows will
     use TIMER_WINDOWS.  TIMER_TIME is a catch-all method for
-   non-Windows systems without gettimeofday.
-
-   #### Perhaps we should also support ftime(), which exists on old
-   BSD 4.2-influenced systems?  (It also existed under MS DOS Borland
-   C, if memory serves me.)  */
+   non-Windows systems without gettimeofday.  */
  
  #ifdef WINDOWS
  # define TIMER_WINDOWS
@@ -1474,7 +1575,10 @@ typedef time_t wget_sys_time;
  #endif
  
  #ifdef TIMER_WINDOWS
-typedef ULARGE_INTEGER wget_sys_time;
+typedef union {
+  DWORD lores;          /* In case GetTickCount is used */
+  LARGE_INTEGER hires;  /* In case high-resolution timer is used */
+} wget_sys_time;
  #endif
  
  struct wget_timer {
@@ -1494,6 +1598,39 @@ struct wget_timer {
    double elapsed_pre_start;
  };
  
+#ifdef TIMER_WINDOWS
+
+/* Whether high-resolution timers are used.  Set by wtimer_initialize_once
+   the first time wtimer_allocate is called. */
+static int using_hires_timers;
+
+/* Frequency of high-resolution timers -- number of updates per
+   millisecond.  Calculated the first time wtimer_allocate is called
+   provided that high-resolution timers are available. */
+static double hires_millisec_freq;
+
+/* The first time a timer is created, determine whether to use
+   high-resolution timers. */
+
+static void
+wtimer_initialize_once (void)
+{
+  static int init_done;
+  if (!init_done)
+    {
+      LARGE_INTEGER freq;
+      init_done = 1;
+      freq.QuadPart = 0;
+      QueryPerformanceFrequency (&freq);
+      if (freq.QuadPart != 0)
+        {
+          using_hires_timers = 1;
+          hires_millisec_freq = (double) freq.QuadPart / 1000.0;
+        }
+     }
+}
+#endif /* TIMER_WINDOWS */
+
  /* Allocate a timer.  Calling wtimer_read on the timer will return
     zero.  It is not legal to call wtimer_update with a freshly
     allocated timer -- use wtimer_reset first.  */
@@ -1503,6 +1640,11 @@ wtimer_allocate (void)
  {
    struct wget_timer *wt = xnew (struct wget_timer);
    xzero (*wt);
+
+#ifdef TIMER_WINDOWS
+  wtimer_initialize_once ();
+#endif
+
    return wt;
  }
  
@@ -1539,32 +1681,24 @@ wtimer_sys_set (wget_sys_time *wst)
  #endif
  
  #ifdef TIMER_WINDOWS
-  /* We use GetSystemTime to get the elapsed time.  MSDN warns that
-     system clock adjustments can skew the output of GetSystemTime
-     when used as a timer and gives preference to GetTickCount and
-     high-resolution timers.  But GetTickCount can overflow, and hires
-     timers are typically used for profiling, not for regular time
-     measurement.  Since we handle clock skew anyway, we just use
-     GetSystemTime.  */
-  FILETIME ft;
-  SYSTEMTIME st;
-  GetSystemTime (&st);
-
-  /* As recommended by MSDN, we convert SYSTEMTIME to FILETIME, copy
-     FILETIME to ULARGE_INTEGER, and use regular 64-bit integer
-     arithmetic on that.  */
-  SystemTimeToFileTime (&st, &ft);
-  wst->HighPart = ft.dwHighDateTime;
-  wst->LowPart  = ft.dwLowDateTime;
+  if (using_hires_timers)
+    {
+      QueryPerformanceCounter (&wst->hires);
+    }
+  else
+    {
+      /* Where hires counters are not available, use GetTickCount rather
+         GetSystemTime, because it is unaffected by clock skew and simpler
+         to use.  Note that overflows don't affect us because we never use
+         absolute values of the ticker, only the differences.  */
+      wst->lores = GetTickCount ();
+    }
  #endif
  }
  
  /* Reset timer WT.  This establishes the starting point from which
     wtimer_elapsed() will return the number of elapsed milliseconds.
-   It is allowed to reset a previously used timer.
-
-   If a non-zero value is used as START, the timer's values will be
-   offset by START.  */
+   It is allowed to reset a previously used timer.  */
  
  void
  wtimer_reset (struct wget_timer *wt)
@@ -1589,10 +1723,10 @@ wtimer_sys_diff (wget_sys_time *wst1, wget_sys_time *wst2)
  #endif
  
  #ifdef WINDOWS
-  /* VC++ 6 doesn't support direct cast of uint64 to double.  To work
-     around this, we subtract, then convert to signed, then finally to
-     double.  */
-  return (double)(signed __int64)(wst1->QuadPart - wst2->QuadPart) / 10000;
+  if (using_hires_timers)
+    return (wst1->hires.QuadPart - wst2->hires.QuadPart) / hires_millisec_freq;
+  else
+    return wst1->lores - wst2->lores;
  #endif
  }
  
@@ -1674,9 +1808,10 @@ wtimer_granularity (void)
  #endif
  
  #ifdef TIMER_WINDOWS
-  /* According to MSDN, GetSystemTime returns a broken-down time
-     structure the smallest member of which are milliseconds.  */
-  return 1;
+  if (using_hires_timers)
+    return 1.0 / hires_millisec_freq;
+  else
+    return 10;  /* according to MSDN */
  #endif
  }
  \f
@@ -1849,40 +1984,6 @@ random_float (void)
    int rnd3 = random_number (1000);
    return rnd1 / 1000.0 + rnd2 / 1000000.0 + rnd3 / 1000000000.0;
  }
-
-#if 0
-/* A debugging function for checking whether an MD5 library works. */
-
-#include "gen-md5.h"
-
-char *
-debug_test_md5 (char *buf)
-{
-  unsigned char raw[16];
-  static char res[33];
-  unsigned char *p1;
-  char *p2;
-  int cnt;
-  ALLOCA_MD5_CONTEXT (ctx);
-
-  gen_md5_init (ctx);
-  gen_md5_update ((unsigned char *)buf, strlen (buf), ctx);
-  gen_md5_finish (ctx, raw);
-
-  p1 = raw;
-  p2 = res;
-  cnt = 16;
-  while (cnt--)
-    {
-      *p2++ = XNUM_TO_digit (*p1 >> 4);
-      *p2++ = XNUM_TO_digit (*p1 & 0xf);
-      ++p1;
-    }
-  *p2 = '\0';
-
-  return res;
-}
-#endif
  \f
  /* Implementation of run_with_timeout, a generic timeout-forcing
     routine for systems with Unix-like signal handling.  */
@@ -2053,8 +2154,9 @@ xsleep (double seconds)
  #ifdef HAVE_NANOSLEEP
    /* nanosleep is the preferred interface because it offers high
       accuracy and, more importantly, because it allows us to reliably
-     restart after having been interrupted by a signal such as
-     SIGWINCH.  */
+     restart receiving a signal such as SIGWINCH.  (There was an
+     actual Debian bug report about --limit-rate malfunctioning while
+     the terminal was being resized.)  */
    struct timespec sleep, remaining;
    sleep.tv_sec = (long) seconds;
    sleep.tv_nsec = 1000000000L * (seconds - (long) seconds);