[svn] New option --ignore-case for case-insensitive matching.

[wget] / src / utils.c
diff --git a/src/utils.c b/src/utils.c

index 6461200622e5aa0ebe67fb15467b0ef777ca0f10..991aaf4befa4fd6735646d627016df736eb06384 100644 (file)
--- a/src/utils.c
+++ b/src/utils.c
@@ -1,5 +1,5 @@
  /* Various utility functions.
-   Copyright (C) 2005 Free Software Foundation, Inc.
+   Copyright (C) 1996-2005 Free Software Foundation, Inc.
  
  This file is part of GNU Wget.
  
@@ -14,8 +14,8 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  
  You should have received a copy of the GNU General Public License
-along with Wget; if not, write to the Free Software
-Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+along with Wget; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  
  In addition, as a special exception, the Free Software Foundation
  gives permission to link the code of its release of Wget with the
@@ -58,9 +58,7 @@ so, delete this exception statement from your version.  */
  #include <fcntl.h>
  #include <assert.h>
  #include <stdarg.h>
-#ifdef HAVE_LOCALE_H
-# include <locale.h>
-#endif
+#include <locale.h>
  
  /* For TIOCGWINSZ and friends: */
  #ifdef HAVE_SYS_IOCTL_H
@@ -617,6 +615,28 @@ file_merge (const char *base, const char *file)
    return result;
  }
  \f
+/* Like fnmatch, but ignores case; returns 0 on a match, as fnmatch.  */
+
+int
+fnmatch_nocase (const char *pattern, const char *string, int flags)
+{
+#ifdef FNM_CASEFOLD
+  return fnmatch (pattern, string, flags | FNM_CASEFOLD);
+#else
+  /* No FNM_CASEFOLD: lower-case alloca'd copies and fnmatch those.  */
+  char *patcopy = (char *) alloca (strlen (pattern) + 1);
+  char *strcopy = (char *) alloca (strlen (string) + 1);
+  char *p;
+  for (p = patcopy; *pattern; pattern++, p++)
+    *p = TOLOWER (*pattern);
+  *p = '\0';
+  for (p = strcopy; *string; string++, p++)
+    *p = TOLOWER (*string);
+  *p = '\0';
+  return fnmatch (patcopy, strcopy, flags);
+#endif
+}
+
  static bool in_acclist (const char *const *, const char *, bool);
  
  /* Determine whether a file is acceptable to be followed, according to
@@ -644,28 +664,34 @@ acceptable (const char *s)
  }
  
  /* Compare S1 and S2 frontally; S2 must begin with S1.  E.g. if S1 is
-   `/something', frontcmp() will return 1 only if S2 begins with
-   `/something'.  Otherwise, 0 is returned.  */
+   `/something', frontcmp() will return true only if S2 begins with
+   `/something'.  */
  bool
  frontcmp (const char *s1, const char *s2)
  {
-  for (; *s1 && *s2 && (*s1 == *s2); ++s1, ++s2);
+  if (!opt.ignore_case)
+    for (; *s1 && *s2 && (*s1 == *s2); ++s1, ++s2);
+  else
+    for (; *s1 && *s2 && (TOLOWER (*s1) == TOLOWER (*s2)); ++s1, ++s2);
    return *s1 == '\0';
  }
  
  /* Iterate through STRLIST, and return the first element that matches
     S, through wildcards or front comparison (as appropriate).  */
  static char *
-proclist (char **strlist, const char *s, enum accd flags)
+proclist (char **strlist, const char *s)
  {
    char **x;
+  int (*matcher) (const char *, const char *, int)
+    = opt.ignore_case ? fnmatch_nocase : fnmatch;
+
    for (x = strlist; *x; x++)
      {
-      /* Remove leading '/' if ALLABS */
-      char *p = *x + ((flags & ALLABS) && (**x == '/'));
+      /* Remove leading '/' */
+      char *p = *x + (**x == '/');
        if (has_wildcards_p (p))
         {
-         if (fnmatch (p, s, FNM_PATHNAME) == 0)
+         if (matcher (p, s, FNM_PATHNAME) == 0)
             break;
         }
        else
@@ -680,22 +706,23 @@ proclist (char **strlist, const char *s, enum accd flags)
  /* Returns whether DIRECTORY is acceptable for download, wrt the
     include/exclude lists.
  
-   If FLAGS is ALLABS, the leading `/' is ignored in paths; relative
-   and absolute paths may be freely intermixed.  */
+   The leading `/' is ignored in paths; relative and absolute paths
+   may be freely intermixed.  */
+
  bool
-accdir (const char *directory, enum accd flags)
+accdir (const char *directory)
  {
    /* Remove starting '/'.  */
-  if (flags & ALLABS && *directory == '/')
+  if (*directory == '/')
      ++directory;
    if (opt.includes)
      {
-      if (!proclist (opt.includes, directory, flags))
+      if (!proclist (opt.includes, directory))
         return false;
      }
    if (opt.excludes)
      {
-      if (proclist (opt.excludes, directory, flags))
+      if (proclist (opt.excludes, directory))
         return false;
      }
    return true;
@@ -750,21 +777,24 @@ in_acclist (const char *const *accepts, const char *s, bool backward)
      {
        if (has_wildcards_p (*accepts))
         {
-         /* fnmatch returns 0 if the pattern *does* match the
-            string.  */
-         if (fnmatch (*accepts, s, 0) == 0)
+         int res = opt.ignore_case
+           ? fnmatch_nocase (*accepts, s, 0) : fnmatch (*accepts, s, 0);
+         /* fnmatch returns 0 if the pattern *does* match the string.  */
+         if (res == 0)
             return true;
         }
        else
         {
           if (backward)
             {
-             if (match_tail (s, *accepts, 0))
+             if (match_tail (s, *accepts, opt.ignore_case))
                 return true;
             }
           else
             {
-             if (!strcmp (s, *accepts))
+             int cmp = opt.ignore_case
+               ? strcasecmp (s, *accepts) : strcmp (s, *accepts);
+             if (cmp == 0)
                 return true;
             }
         }
@@ -1164,6 +1194,15 @@ free_keys_and_values (struct hash_table *ht)
  }
  
  \f
+/* Get grouping data, the separator and grouping info, by calling
+   localeconv().  The information is cached after the first call to
+   the function.
+
+   In locales that don't set a thousand separator (such as the "C"
+   locale), this forces it to be ",".  We are now only showing
+   thousand separators in one place, so this shouldn't be a problem in
+   practice.  */
+
  static void
  get_grouping_data (const char **sep, const char **grouping)
  {
@@ -1172,27 +1211,29 @@ get_grouping_data (const char **sep, const char **grouping)
    static bool initialized;
    if (!initialized)
      {
-#ifdef LC_NUMERIC
        /* Get the grouping info from the locale. */
-      struct lconv *lconv;
-      const char *oldlocale = setlocale (LC_NUMERIC, "");
-      lconv = localeconv ();
-      cached_sep = xstrdup (lconv->thousands_sep);
-      cached_grouping = xstrdup (lconv->grouping);
-      /* Restore the locale to previous settings. */
-      setlocale (LC_NUMERIC, oldlocale);
-      if (!cached_sep)
-#endif
-       /* Force separator for locales that specify no separators
-          ("C", "hr", and probably many more.) */
-       cached_sep = ",", cached_grouping = "\x03";
+      struct lconv *lconv = localeconv ();
+      cached_sep = lconv->thousands_sep;
+      cached_grouping = lconv->grouping;
+      if (!*cached_sep)
+       {
+         /* Many locales (such as "C" or "hr_HR") don't specify
+            grouping, but we still want to use it for legibility.
+            In those locales set the sep char to ',', unless that
+            character is used for decimal point, in which case set it
+            to ".".  */
+         if (*lconv->decimal_point != ',')
+           cached_sep = ",";
+         else
+           cached_sep = ".";
+         cached_grouping = "\x03";
+       }
        initialized = true;
      }
    *sep = cached_sep;
    *grouping = cached_grouping;
  }
  
-
  /* Return a printed representation of N with thousand separators.
     This should respect locale settings, with the exception of the "C"
     locale which mandates no separator, but we use one anyway.
@@ -1216,12 +1257,19 @@ with_thousand_seps (wgint n)
    int i = 0, groupsize;
    const char *atgroup;
  
+  bool negative = n < 0;
+
    /* Initialize grouping data. */
    get_grouping_data (&sep, &grouping);
    seplen = strlen (sep);
    atgroup = grouping;
    groupsize = *atgroup++;
  
+  /* This will overflow on WGINT_MIN, but we're not using this to
+     print negative numbers anyway.  */
+  if (negative)
+    n = -n;
+
    /* Write the number into the buffer, backwards, inserting the
       separators as necessary.  */
    *--p = '\0';
@@ -1243,6 +1291,9 @@ with_thousand_seps (wgint n)
             groupsize = *atgroup++;
         }
      }
+  if (negative)
+    *--p = '-';
+
    return p;
  }
  
@@ -1296,10 +1347,7 @@ human_readable (HR_NUMTYPE n)
          *this* power.  */
        if ((n / 1024) < 1024 || i == countof (powers) - 1)
         {
-         /* Must cast to long first because MS VC can't directly cast
-            __int64 to double.  (This is safe because N is known to
-            be < 1024^2, so always fits into long.)  */
-         double val = (double) (long) n / 1024.0;
+         double val = n / 1024.0;
           /* Print values smaller than 10 with one decimal digits, and
              others without any decimals.  */
           snprintf (buf, sizeof (buf), "%.*f%c",
@@ -1459,6 +1507,7 @@ number_to_string (char *buffer, wgint number)
  
  #undef PR
  #undef W
+#undef SPRINTF_WGINT
  #undef DIGITS_1
  #undef DIGITS_2
  #undef DIGITS_3
@@ -1552,68 +1601,74 @@ determine_screen_width (void)
    return 0;
  #endif /* neither TIOCGWINSZ nor WINDOWS */
  }
+\f
+/* Whether the rnd system (either rand or [dl]rand48) has been
+   seeded.  */
+static int rnd_seeded;
  
  /* Return a random number between 0 and MAX-1, inclusive.
  
-   If MAX is greater than the value of RAND_MAX+1 on the system, the
-   returned value will be in the range [0, RAND_MAX].  This may be
-   fixed in a future release.
-
+   If the system does not support lrand48 and MAX is greater than the
+   value of RAND_MAX+1 on the system, only RAND_MAX+1 distinct values
+   can be returned.  This may be fixed in a future release.
     The random number generator is seeded automatically the first time
     it is called.
  
-   This uses rand() for portability.  It has been suggested that
-   random() offers better randomness, but this is not required for
-   Wget, so I chose to go for simplicity and use rand
-   unconditionally.
-
-   DO NOT use this for cryptographic purposes.  It is only meant to be
-   used in situations where quality of the random numbers returned
-   doesn't really matter.  */
+   This uses lrand48 where available, rand elsewhere.  DO NOT use it
+   for cryptography.  It is only meant to be used in situations where
+   quality of the random numbers returned doesn't really matter.  */
  
  int
  random_number (int max)
  {
-  static int seeded;
+#ifdef HAVE_DRAND48
+  if (!rnd_seeded)
+    {
+      srand48 ((long) time (NULL) ^ (long) getpid ());
+      rnd_seeded = 1;
+    }
+  return lrand48 () % max;
+#else  /* not HAVE_DRAND48 */
+
    double bounded;
    int rnd;
-
-  if (!seeded)
+  if (!rnd_seeded)
      {
-      srand (time (NULL));
-      seeded = 1;
+      srand ((unsigned) time (NULL) ^ (unsigned) getpid ());
+      rnd_seeded = 1;
      }
    rnd = rand ();
  
-  /* On systems that don't define RAND_MAX, assume it to be 2**15 - 1,
-     and enforce that assumption by masking other bits.  */
-#ifndef RAND_MAX
-# define RAND_MAX 32767
-  rnd &= RAND_MAX;
-#endif
+  /* Like rand() % max, but uses the high-order bits for better
+     randomness on architectures where rand() is implemented using a
+     simple congruential generator.  */
  
-  /* This is equivalent to rand() % max, but uses the high-order bits
-     for better randomness on architecture where rand() is implemented
-     using a simple congruential generator.  */
+  bounded = (double) max * rnd / (RAND_MAX + 1.0);
+  return (int) bounded;
  
-  bounded = (double)max * rnd / (RAND_MAX + 1.0);
-  return (int)bounded;
+#endif /* not HAVE_DRAND48 */
  }
  
  /* Return a random uniformly distributed floating point number in the
-   [0, 1) range.  The precision of returned numbers is 9 digits.
-
-   Modify this to use erand48() where available!  */
+   [0, 1) range.  Uses drand48 where available, and a really lame
+   kludge elsewhere.  */
  
  double
  random_float (void)
  {
-  /* We can't rely on any specific value of RAND_MAX, but I'm pretty
-     sure it's greater than 1000.  */
-  int rnd1 = random_number (1000);
-  int rnd2 = random_number (1000);
-  int rnd3 = random_number (1000);
-  return rnd1 / 1000.0 + rnd2 / 1000000.0 + rnd3 / 1000000000.0;
+#ifdef HAVE_DRAND48
+  if (!rnd_seeded)
+    {
+      srand48 ((long) time (NULL) ^ (long) getpid ());
+      rnd_seeded = 1;
+    }
+  return drand48 ();
+#else  /* not HAVE_DRAND48 */
+  return (  random_number (10000) / 10000.0
+         + random_number (10000) / (10000.0 * 10000.0)
+         + random_number (10000) / (10000.0 * 10000.0 * 10000.0)
+         + random_number (10000) / (10000.0 * 10000.0 * 10000.0 * 10000.0));
+#endif /* not HAVE_DRAND48 */
  }
  \f
  /* Implementation of run_with_timeout, a generic timeout-forcing
@@ -1873,19 +1928,17 @@ base64_encode (const char *str, int length, char *b64store)
    return p - b64store;
  }
  
-#define IS_ASCII(c) (((c) & 0x80) == 0)
-#define IS_BASE64(c) ((IS_ASCII (c) && base64_char_to_value[c] >= 0) || c == '=')
+/* Store in C the next non-whitespace character from the string, or \0
+   when end of string is reached.  */
+#define NEXT_CHAR(c, p) do {                   \
+  c = (unsigned char) *p++;                    \
+} while (ISSPACE (c))
  
-/* Get next character from the string, except that non-base64
-   characters are ignored, as mandated by rfc2045.  */
-#define NEXT_BASE64_CHAR(c, p) do {                    \
-  c = *p++;                                            \
-} while (c != '\0' && !IS_BASE64 (c))
+#define IS_ASCII(c) (((c) & 0x80) == 0)
  
-/* Decode data from BASE64 (assumed to be encoded as base64) into
-   memory pointed to by TO.  TO should be large enough to accomodate
-   the decoded data, which is guaranteed to be less than
-   strlen(base64).
+/* Decode data from BASE64 (pointer to \0-terminated text) into memory
+   pointed to by TO.  TO should be large enough to accommodate the
+   decoded data, which is guaranteed to be less than strlen(base64).
  
     Since TO is assumed to contain binary data, it is not
     NUL-terminated.  The function returns the length of the data
@@ -1897,7 +1950,7 @@ base64_decode (const char *base64, char *to)
  {
    /* Table of base64 values for first 128 characters.  Note that this
       assumes ASCII (but so does Wget in other places).  */
-  static short base64_char_to_value[128] =
+  static signed char base64_char_to_value[128] =
      {
        -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1, /*   0-  9 */
        -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1, /*  10- 19 */
@@ -1913,6 +1966,8 @@ base64_decode (const char *base64, char *to)
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48, /* 110-119 */
        49,  50,  51,  -1,  -1,  -1,  -1,  -1            /* 120-127 */
      };
+#define BASE64_CHAR_TO_VALUE(c) ((int) base64_char_to_value[c])
+#define IS_BASE64(c) ((IS_ASCII (c) && BASE64_CHAR_TO_VALUE (c) >= 0) || c == '=')
  
    const char *p = base64;
    char *q = to;
@@ -1923,30 +1978,32 @@ base64_decode (const char *base64, char *to)
        unsigned long value;
  
        /* Process first byte of a quadruplet.  */
-      NEXT_BASE64_CHAR (c, p);
+      NEXT_CHAR (c, p);
        if (!c)
         break;
-      if (c == '=')
-       return -1;              /* illegal '=' while decoding base64 */
-      value = base64_char_to_value[c] << 18;
+      if (c == '=' || !IS_BASE64 (c))
+       return -1;              /* illegal char while decoding base64 */
+      value = BASE64_CHAR_TO_VALUE (c) << 18;
  
-      /* Process scond byte of a quadruplet.  */
-      NEXT_BASE64_CHAR (c, p);
+      /* Process second byte of a quadruplet.  */
+      NEXT_CHAR (c, p);
        if (!c)
         return -1;              /* premature EOF while decoding base64 */
-      if (c == '=')
-       return -1;              /* illegal `=' while decoding base64 */
-      value |= base64_char_to_value[c] << 12;
+      if (c == '=' || !IS_BASE64 (c))
+       return -1;              /* illegal char while decoding base64 */
+      value |= BASE64_CHAR_TO_VALUE (c) << 12;
        *q++ = value >> 16;
  
        /* Process third byte of a quadruplet.  */
-      NEXT_BASE64_CHAR (c, p);
+      NEXT_CHAR (c, p);
        if (!c)
         return -1;              /* premature EOF while decoding base64 */
+      if (!IS_BASE64 (c))
+       return -1;              /* illegal char while decoding base64 */
  
        if (c == '=')
         {
-         NEXT_BASE64_CHAR (c, p);
+         NEXT_CHAR (c, p);
           if (!c)
             return -1;          /* premature EOF while decoding base64 */
           if (c != '=')
@@ -1954,26 +2011,29 @@ base64_decode (const char *base64, char *to)
           continue;
         }
  
-      value |= base64_char_to_value[c] << 6;
+      value |= BASE64_CHAR_TO_VALUE (c) << 6;
        *q++ = 0xff & value >> 8;
  
        /* Process fourth byte of a quadruplet.  */
-      NEXT_BASE64_CHAR (c, p);
+      NEXT_CHAR (c, p);
        if (!c)
         return -1;              /* premature EOF while decoding base64 */
        if (c == '=')
         continue;
+      if (!IS_BASE64 (c))
+       return -1;              /* illegal char while decoding base64 */
  
-      value |= base64_char_to_value[c];
+      value |= BASE64_CHAR_TO_VALUE (c);
        *q++ = 0xff & value;
      }
+#undef IS_BASE64
+#undef BASE64_CHAR_TO_VALUE
  
    return q - to;
  }
  
  #undef IS_ASCII
-#undef IS_BASE64
-#undef NEXT_BASE64_CHAR
+#undef NEXT_CHAR
  \f
  /* Simple merge sort for use by stable_sort.  Implementation courtesy
     Zeljko Vrba with additional debugging by Nenad Barbutov.  */
@@ -2020,3 +2080,38 @@ stable_sort (void *base, size_t nmemb, size_t size,
        mergesort_internal (base, temp, size, 0, nmemb - 1, cmpfun);
      }
  }
+\f
+/* Format NUMBER for display, returning a static buffer overwritten by
+   the next call.  Magnitudes >= 9.95 are rounded to an integer, those
+   in [0.95, 9.95) get one fractional digit, and smaller ones get one
+   significant digit within three fractional digits: 0.035 is printed
+   as "0.04", 0.0091 as "0.009", and 0.0003 as simply "0".
+
+   This is useful for displaying durations because it provides
+   order-of-magnitude information without unnecessary clutter --
+   long-running downloads are shown without the fractional part, and
+   short ones still retain one significant digit.  */
+
+const char *
+print_decimal (double number)
+{
+  static char buf[32];
+  double n = number >= 0 ? number : -number;
+
+  if (n >= 9.95)
+    /* Cut off at 9.95 because the below %.1f would round 9.96 to
+       "10.0" instead of "10".  OTOH 9.94 will print as "9.9".  */
+    snprintf (buf, sizeof buf, "%.0f", number);
+  else if (n >= 0.95)
+    snprintf (buf, sizeof buf, "%.1f", number);
+  else if (n >= 0.001)
+    snprintf (buf, sizeof buf, "%.1g", number);
+  else if (n >= 0.0005)
+    /* round [0.0005, 0.001) to 0.001 */
+    snprintf (buf, sizeof buf, "%.3f", number);
+  else
+    /* print numbers close to 0 as 0, not 0.000 */
+    strcpy (buf, "0");
+
+  return buf;
+}