[svn] Don't preserve ".." at beginning of path.

[wget] / src / url.c
diff --git a/src/url.c b/src/url.c

index adf5b1d08205488906666707cc8131120f31b4b0..f97a31801ea5a6861fccc47b4a1d4d60632bf6b0 100644 (file)
--- a/src/url.c
+++ b/src/url.c
@@ -1,5 +1,5 @@
  /* URL handling.
-   Copyright (C) 2005 Free Software Foundation, Inc.
+   Copyright (C) 1996-2005 Free Software Foundation, Inc.
  
  This file is part of GNU Wget.
  
@@ -14,8 +14,8 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  
  You should have received a copy of the GNU General Public License
-along with Wget; if not, write to the Free Software
-Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+along with Wget; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  
  In addition, as a special exception, the Free Software Foundation
  gives permission to link the code of its release of Wget with the
@@ -43,6 +43,13 @@ so, delete this exception statement from your version.  */
  #include "url.h"
  #include "host.h"  /* for is_valid_ipv6_address */
  
+enum {                         /* bit flags stored in scheme_data.flags */
+  scm_disabled = 1,            /* scheme disabled, e.g. https when OpenSSL fails to init */
+  scm_has_params = 2,          /* whether scheme has ;params */
+  scm_has_query = 4,           /* whether scheme has ?query */
+  scm_has_fragment = 8         /* whether scheme has #fragment */
+};
+
  struct scheme_data
  {
    /* Short name of the scheme, such as "http" or "ftp". */
@@ -51,23 +58,18 @@ struct scheme_data
    const char *leading_string;
    /* Default port of the scheme when none is specified. */
    int default_port;
-  /* Used for disabling https when OpenSSL fails to init. */
-  bool disabled;
-  /* Allowed separators, handled by url_parse.  For example, ftp
-     doesn't support the "?query", and http/https don't support
-     ";params".  All schemes must support at least "/:".  */
-  const char *separators;
+  /* Bitwise OR of the scm_* flags (disabled, has params/query/fragment). */
    int flags;
  };
  
  /* Supported schemes: */
  static struct scheme_data supported_schemes[] =
  {
-  { "http",    "http://",  DEFAULT_HTTP_PORT,  false, "/:?#" },
+  { "http",    "http://",  DEFAULT_HTTP_PORT,  scm_has_query|scm_has_fragment },
  #ifdef HAVE_SSL
-  { "https",   "https://", DEFAULT_HTTPS_PORT, false, "/:?#" },
+  { "https",   "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
  #endif
-  { "ftp",     "ftp://",   DEFAULT_FTP_PORT,   false, "/:;#" },
+  { "ftp",     "ftp://",   DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },
  
    /* SCHEME_INVALID */
    { NULL,      NULL,       -1,                 0 }
@@ -413,7 +415,7 @@ url_scheme (const char *url)
      if (0 == strncasecmp (url, supported_schemes[i].leading_string,
                           strlen (supported_schemes[i].leading_string)))
        {
-       if (!(supported_schemes[i].disabled))
+       if (!(supported_schemes[i].flags & scm_disabled))
           return (enum url_scheme) i;
         else
           return SCHEME_INVALID;
@@ -453,7 +455,7 @@ scheme_default_port (enum url_scheme scheme)
  void
  scheme_disable (enum url_scheme scheme)
  {
-  supported_schemes[scheme].disabled = true;
+  supported_schemes[scheme].flags |= scm_disabled;
  }
  
  /* Skip the username and password, if present in the URL.  The
@@ -506,7 +508,8 @@ parse_credentials (const char *beg, const char *end, char **user, char **passwd)
  }
  
  /* Used by main.c: detect URLs written using the "shorthand" URL forms
-   popularized by Netscape and NcFTP.  HTTP shorthands look like this:
+   originally popularized by Netscape and NcFTP.  HTTP shorthands look
+   like this:
  
     www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
     www.foo.com[:port]            -> http://www.foo.com[:port]
@@ -522,78 +525,49 @@ char *
  rewrite_shorthand_url (const char *url)
  {
    const char *p;
+  char *ret;
  
    if (url_scheme (url) != SCHEME_INVALID)
      return NULL;
  
    /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
       latter Netscape.  */
-  for (p = url; *p && *p != ':' && *p != '/'; p++)
-    ;
-
+  p = strpbrk (url, ":/");
    if (p == url)
      return NULL;
  
    /* If we're looking at "://", it means the URL uses a scheme we
       don't support, which may include "https" when compiled without
       SSL support.  Don't bogusly rewrite such URLs.  */
-  if (p[0] == ':' && p[1] == '/' && p[2] == '/')
+  if (p && p[0] == ':' && p[1] == '/' && p[2] == '/')
      return NULL;
  
-  if (*p == ':')
+  if (p && *p == ':')
      {
-      const char *pp;
-      char *res;
-      /* If the characters after the colon and before the next slash
-        or end of string are all digits, it's HTTP.  */
-      int digits = 0;
-      for (pp = p + 1; ISDIGIT (*pp); pp++)
-       ++digits;
-      if (digits > 0 && (*pp == '/' || *pp == '\0'))
+      /* Colon indicates ftp, as in foo.bar.com:path.  Check for
+        special case of http port number ("localhost:10000").  */
+      int digits = strspn (p + 1, "0123456789");
+      if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0'))
         goto http;
  
-      /* Prepend "ftp://" to the entire URL... */
-      res = xmalloc (6 + strlen (url) + 1);
-      sprintf (res, "ftp://%s", url);
-      /* ...and replace ':' with '/'. */
-      res[6 + (p - url)] = '/';
-      return res;
+      /* Turn "foo.bar.com:path" into "ftp://foo.bar.com/path". */
+      ret = aprintf ("ftp://%s", url);
+      ret[6 + (p - url)] = '/';
      }
    else
      {
-      char *res;
      http:
-      /* Just prepend "http://" to what we have. */
-      res = xmalloc (7 + strlen (url) + 1);
-      sprintf (res, "http://%s", url);
-      return res;
+      /* Just prepend "http://" to URL. */
+      ret = aprintf ("http://%s", url);
      }
+  return ret;
  }
  \f
  static void split_path (const char *, char **, char **);
  
  /* Like strpbrk, with the exception that it returns the pointer to the
     terminating zero (end-of-string aka "eos") if no matching character
-   is found.
-
-   Although I normally balk at Gcc-specific optimizations, it probably
-   makes sense here: glibc has optimizations that detect strpbrk being
-   called with literal string as ACCEPT and inline the search.  That
-   optimization is defeated if strpbrk is hidden within the call to
-   another function.  (And no, making strpbrk_or_eos inline doesn't
-   help because the check for literal accept is in the
-   preprocessor.)  */
-
-#if defined(__GNUC__) && __GNUC__ >= 3
-
-#define strpbrk_or_eos(s, accept) ({           \
-  char *SOE_p = strpbrk (s, accept);           \
-  if (!SOE_p)                                  \
-    SOE_p = strchr (s, '\0');                  \
-  SOE_p;                                       \
-})
-
-#else  /* not __GNUC__ or old gcc */
+   is found.  */
  
  static inline char *
  strpbrk_or_eos (const char *s, const char *accept)
@@ -603,7 +577,6 @@ strpbrk_or_eos (const char *s, const char *accept)
      p = strchr (s, '\0');
    return p;
  }
-#endif /* not __GNUC__ or old gcc */
  
  /* Turn STR into lowercase; return true if a character was actually
     changed. */
@@ -621,6 +594,23 @@ lowercase_str (char *str)
    return changed;
  }
  
+static const char *    /* ":/" plus whichever of ";?#" SCHEME's flags allow */
+init_seps (enum url_scheme scheme)
+{
+  static char seps[8] = ":/";  /* shared buffer, overwritten by each call */
+  char *p = seps + 2;          /* append after the fixed ":/" */
+  int flags = supported_schemes[scheme].flags;
+
+  if (flags & scm_has_params)
+    *p++ = ';';
+  if (flags & scm_has_query)
+    *p++ = '?';
+  if (flags & scm_has_fragment)
+    *p++ = '#';
+  *p++ = '\0';                 /* cut off leftovers from a previous call */
+  return seps;
+}
+
  static const char *parse_errors[] = {
  #define PE_NO_ERROR                    0
    N_("No error"),
@@ -700,7 +690,7 @@ url_parse (const char *url, int *error)
    /* Initialize separators for optional parts of URL, depending on the
       scheme.  For example, FTP has params, and HTTP and HTTPS have
       query string and fragment. */
-  seps = supported_schemes[scheme].separators;
+  seps = init_seps (scheme);
  
    host_b = p;
  
@@ -805,9 +795,12 @@ url_parse (const char *url, int *error)
  } while (0)
  
    GET_URL_PART ('/', path);
-  GET_URL_PART (';', params);
-  GET_URL_PART ('?', query);
-  GET_URL_PART ('#', fragment);
+  if (supported_schemes[scheme].flags & scm_has_params)
+    GET_URL_PART (';', params);
+  if (supported_schemes[scheme].flags & scm_has_query)
+    GET_URL_PART ('?', query);
+  if (supported_schemes[scheme].flags & scm_has_fragment)
+    GET_URL_PART ('#', fragment);
  
  #undef GET_URL_PART
    assert (*p == 0);
@@ -1518,8 +1511,7 @@ path_simplify (char *path)
  {
    char *h = path;              /* hare */
    char *t = path;              /* tortoise */
-  char *beg = path;            /* boundary for backing the tortoise */
-  char *end = path + strlen (path);
+  char *end = strchr (path, '\0');
  
    while (h < end)
      {
@@ -1534,26 +1526,17 @@ path_simplify (char *path)
         {
           /* Handle "../" by retreating the tortoise by one path
-            element -- but not past beggining.  */
+            element -- but not past beginning.  */
-         if (t > beg)
+         if (t > path)
             {
               /* Move backwards until T hits the beginning of the
                  previous path element or the beginning of path. */
-             for (--t; t > beg && t[-1] != '/'; t--)
+             for (--t; t > path && t[-1] != '/'; t--)
                 ;
             }
-         else
-           {
-             /* If we're at the beginning, copy the "../" literally
-                move the beginning so a later ".." doesn't remove
-                it.  */
-             beg = t + 3;
-             goto regular;
-           }
           h += 3;
         }
        else
         {
-       regular:
           /* A regular path element.  If H hasn't advanced past T,
              simply skip to the next path element.  Otherwise, copy
              the path element until the next slash.  */
@@ -1583,14 +1566,19 @@ path_simplify (char *path)
  }
  \f
-/* Return the length of URL's path.  Path is considered to be
-   terminated by one of '?', ';', '#', or by the end of the
-   string.  */
+/* Return a pointer to the end of URL's path.  The path is considered
+   to be terminated by the first ?query, ;params or #fragment
+   separator allowed by the scheme, or by the end of the string.  */
  
-static int
-path_length (const char *url)
+static const char *
+path_end (const char *url)
  {
-  const char *q = strpbrk_or_eos (url, "?;#");
-  return q - url;
+  enum url_scheme scheme = url_scheme (url);
+  const char *seps;
+  if (scheme == SCHEME_INVALID)
+    scheme = SCHEME_HTTP;      /* use http semantics for rel links */
+  /* +2 to ignore the first two separators ':' and '/' */
+  seps = init_seps (scheme) + 2;
+  return strpbrk_or_eos (url, seps);
  }
  
  /* Find the last occurrence of character C in the range [b, e), or
@@ -1629,7 +1617,7 @@ uri_merge (const char *base, const char *link)
      return xstrdup (link);
  
    /* We may not examine BASE past END. */
-  end = base + path_length (base);
+  end = path_end (base);
    linklength = strlen (link);
  
    if (!*link)
@@ -1974,8 +1962,8 @@ test_path_simplify (void)
      { "",                      "",             false },
      { ".",                     "",             true },
      { "./",                    "",             true },
-    { "..",                    "..",           false },
-    { "../",                   "../",          false },
+    { "..",                    "",             true },
+    { "../",                   "",             true },
      { "foo",                   "foo",          false },
      { "foo/bar",               "foo/bar",      false },
      { "foo///bar",             "foo///bar",    false },
@@ -1988,9 +1976,9 @@ test_path_simplify (void)
      { "foo/bar/../x",          "foo/x",        true },
      { "foo/bar/../x/",         "foo/x/",       true },
      { "foo/..",                        "",             true },
-    { "foo/../..",             "..",           true },
-    { "foo/../../..",          "../..",        true },
-    { "foo/../../bar/../../baz", "../../baz",  true },
+    { "foo/../..",             "",             true },
+    { "foo/../../..",          "",             true },
+    { "foo/../../bar/../../baz", "baz",                true },
      { "a/b/../../c",           "c",            true },
      { "./a/../b",              "b",            true }
    };