X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Furl.c;h=f97a31801ea5a6861fccc47b4a1d4d60632bf6b0;hb=aa07e689f2c03dd25342859e7e527a13467ad219;hp=ca7179a667a45253c35252f8386b19dfef812069;hpb=74fbb03b10f6148b5a0cf5b8831b1872e55df7f6;p=wget

diff --git a/src/url.c b/src/url.c
index ca7179a6..f97a3180 100644
--- a/src/url.c
+++ b/src/url.c
@@ -1,5 +1,5 @@
 /* URL handling.
-   Copyright (C) 2005 Free Software Foundation, Inc.
+   Copyright (C) 1996-2005 Free Software Foundation, Inc.
 
 This file is part of GNU Wget.
 
@@ -14,8 +14,8 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License
-along with Wget; if not, write to the Free Software
-Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+along with Wget; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 
 In addition, as a special exception, the Free Software Foundation
 gives permission to link the code of its release of Wget with the
@@ -43,22 +43,33 @@ so, delete this exception statement from your version.  */
 #include "url.h"
 #include "host.h"  /* for is_valid_ipv6_address */
 
+enum {				/* bit flags, OR-ed into scheme_data.flags */
+  scm_disabled = 1,		/* off, e.g. https when OpenSSL init fails */
+  scm_has_params = 2,		/* scheme URLs may carry ;params */
+  scm_has_query = 4,		/* scheme URLs may carry ?query */
+  scm_has_fragment = 8		/* scheme URLs may carry #fragment */
+};
+
 struct scheme_data
 {
+  /* Short name of the scheme, such as "http" or "ftp". */
   const char *name;
+  /* Leading string that identifies the scheme, such as "https://". */
   const char *leading_string;
+  /* Default port of the scheme when none is specified. */
   int default_port;
-  bool enabled;
+  /* Bitwise OR of scm_* flags (scm_disabled, scm_has_query, ...). */
+  int flags;
 };
 
 /* Supported schemes: */
 static struct scheme_data supported_schemes[] =
 {
-  { "http",	"http://",  DEFAULT_HTTP_PORT,  1 },
+  { "http",	"http://",  DEFAULT_HTTP_PORT,  scm_has_query|scm_has_fragment },
 #ifdef HAVE_SSL
-  { "https",	"https://", DEFAULT_HTTPS_PORT, 1 },
+  { "https",	"https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
 #endif
-  { "ftp",	"ftp://",   DEFAULT_FTP_PORT,   1 },
+  { "ftp",	"ftp://",   DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },
 
   /* SCHEME_INVALID */
   { NULL,	NULL,       -1,                 0 }
@@ -404,7 +415,7 @@ url_scheme (const char *url)
     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
 			  strlen (supported_schemes[i].leading_string)))
       {
-	if (supported_schemes[i].enabled)
+	if (!(supported_schemes[i].flags & scm_disabled))
 	  return (enum url_scheme) i;
 	else
 	  return SCHEME_INVALID;
@@ -444,7 +455,7 @@ scheme_default_port (enum url_scheme scheme)
 void
 scheme_disable (enum url_scheme scheme)
 {
-  supported_schemes[scheme].enabled = false;
+  supported_schemes[scheme].flags |= scm_disabled; /* rejected by url_scheme */
 }
 
 /* Skip the username and password, if present in the URL.  The
@@ -497,7 +508,8 @@ parse_credentials (const char *beg, const char *end, char **user, char **passwd)
 }
 
 /* Used by main.c: detect URLs written using the "shorthand" URL forms
-   popularized by Netscape and NcFTP.  HTTP shorthands look like this:
+   originally popularized by Netscape and NcFTP.  HTTP shorthands look
+   like this:
 
    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
    www.foo.com[:port]            -> http://www.foo.com[:port]
@@ -513,78 +525,49 @@ char *
 rewrite_shorthand_url (const char *url)
 {
   const char *p;
+  char *ret;
 
   if (url_scheme (url) != SCHEME_INVALID)
     return NULL;
 
   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
      latter Netscape.  */
-  for (p = url; *p && *p != ':' && *p != '/'; p++)
-    ;
-
+  p = strpbrk (url, ":/");	/* NULL when URL contains neither */
   if (p == url)
     return NULL;
 
   /* If we're looking at "://", it means the URL uses a scheme we
      don't support, which may include "https" when compiled without
      SSL support.  Don't bogusly rewrite such URLs.  */
-  if (p[0] == ':' && p[1] == '/' && p[2] == '/')
+  if (p && p[0] == ':' && p[1] == '/' && p[2] == '/')
     return NULL;
 
-  if (*p == ':')
+  if (p && *p == ':')
     {
-      const char *pp;
-      char *res;
-      /* If the characters after the colon and before the next slash
-	 or end of string are all digits, it's HTTP.  */
-      int digits = 0;
-      for (pp = p + 1; ISDIGIT (*pp); pp++)
-	++digits;
-      if (digits > 0 && (*pp == '/' || *pp == '\0'))
+      /* Colon indicates ftp, as in foo.bar.com:path.  Check for
+	 special case of http port number ("localhost:10000").  */
+      int digits = strspn (p + 1, "0123456789");
+      if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0'))
 	goto http;
 
-      /* Prepend "ftp://" to the entire URL... */
-      res = xmalloc (6 + strlen (url) + 1);
-      sprintf (res, "ftp://%s", url);
-      /* ...and replace ':' with '/'. */
-      res[6 + (p - url)] = '/';
-      return res;
+      /* Turn "foo.bar.com:path" into "ftp://foo.bar.com/path". */
+      ret = aprintf ("ftp://%s", url);
+      ret[6 + (p - url)] = '/';	/* replace ':'; 6 is strlen ("ftp://") */
     }
   else
     {
-      char *res;
     http:
-      /* Just prepend "http://" to what we have. */
-      res = xmalloc (7 + strlen (url) + 1);
-      sprintf (res, "http://%s", url);
-      return res;
+      /* Just prepend "http://" to URL. */
+      ret = aprintf ("http://%s", url);
     }
+  return ret;
 }
 
 static void split_path (const char *, char **, char **);
 
 /* Like strpbrk, with the exception that it returns the pointer to the
    terminating zero (end-of-string aka "eos") if no matching character
-   is found.
-
-   Although I normally balk at Gcc-specific optimizations, it probably
-   makes sense here: glibc has optimizations that detect strpbrk being
-   called with literal string as ACCEPT and inline the search.  That
-   optimization is defeated if strpbrk is hidden within the call to
-   another function.  (And no, making strpbrk_or_eos inline doesn't
-   help because the check for literal accept is in the
-   preprocessor.)  */
-
-#if defined(__GNUC__) && __GNUC__ >= 3
-
-#define strpbrk_or_eos(s, accept) ({		\
-  char *SOE_p = strpbrk (s, accept);		\
-  if (!SOE_p)					\
-    SOE_p = strchr (s, '\0');			\
-  SOE_p;					\
-})
-
-#else  /* not __GNUC__ or old gcc */
+   is found.  */
 
 static inline char *
 strpbrk_or_eos (const char *s, const char *accept)
@@ -594,7 +577,6 @@ strpbrk_or_eos (const char *s, const char *accept)
     p = strchr (s, '\0');
   return p;
 }
-#endif /* not __GNUC__ or old gcc */
 
 /* Turn STR into lowercase; return true if a character was actually
    changed. */
@@ -612,13 +594,30 @@ lowercase_str (char *str)
   return changed;
 }
 
+static const char *		/* ":/" plus SCHEME's optional separators */
+init_seps (enum url_scheme scheme)
+{
+  static char seps[8] = ":/";	/* shared buffer, rewritten by every call */
+  char *p = seps + 2;		/* first slot after the fixed ":/" */
+  int flags = supported_schemes[scheme].flags;
+
+  if (flags & scm_has_params)	/* appended in URL order: ';' '?' '#' */
+    *p++ = ';';
+  if (flags & scm_has_query)
+    *p++ = '?';
+  if (flags & scm_has_fragment)
+    *p++ = '#';
+  *p++ = '\0';
+  return seps;			/* contents change on the next call */
+}
+
 static const char *parse_errors[] = {
 #define PE_NO_ERROR			0
   N_("No error"),
 #define PE_UNSUPPORTED_SCHEME		1
   N_("Unsupported scheme"),
-#define PE_EMPTY_HOST			2
-  N_("Empty host"),
+#define PE_INVALID_HOST_NAME		2
+  N_("Invalid host name"),
 #define PE_BAD_PORT_NUMBER		3
   N_("Bad port number"),
 #define PE_INVALID_USER_NAME		4
@@ -644,6 +643,7 @@ url_parse (const char *url, int *error)
   bool path_modified, host_modified;
 
   enum url_scheme scheme;
+  const char *seps;
 
   const char *uname_b,     *uname_e;
   const char *host_b,      *host_e;
@@ -682,10 +682,16 @@ url_parse (const char *url, int *error)
 
        scheme://host[:port][/path][;params][?query][#fragment]  */
 
+  path_b     = path_e     = NULL;
   params_b   = params_e   = NULL;
   query_b    = query_e    = NULL;
   fragment_b = fragment_e = NULL;
 
+  /* Initialize separators for optional parts of URL, depending on the
+     scheme.  For example, FTP has params and fragment, while HTTP and
+     HTTPS have query string and fragment. */
+  seps = init_seps (scheme);
+
   host_b = p;
 
   if (*p == '[')
@@ -718,16 +724,28 @@ url_parse (const char *url, int *error)
       error_code = PE_IPV6_NOT_SUPPORTED;
       goto error;
 #endif
+
+      /* The closing bracket must be followed by a separator or by the
+	 null char.  */
+      /* http://[::1]... */
+      /*             ^   */
+      if (!strchr (seps, *p))
+	{
+	  /* Trailing garbage after []-delimited IPv6 address. */
+	  error_code = PE_INVALID_HOST_NAME;
+	  goto error;
+	}
     }
   else
     {
-      p = strpbrk_or_eos (p, ":/;?#");
+      p = strpbrk_or_eos (p, seps);
       host_e = p;
     }
+  ++seps;			/* advance to '/' */
 
   if (host_b == host_e)
     {
-      error_code = PE_EMPTY_HOST;
+      error_code = PE_INVALID_HOST_NAME;
       goto error;
     }
 
@@ -740,76 +758,51 @@ url_parse (const char *url, int *error)
       /*              ^             */
       ++p;
       port_b = p;
-      p = strpbrk_or_eos (p, "/;?#");
+      p = strpbrk_or_eos (p, seps);
       port_e = p;
 
       /* Allow empty port, as per rfc2396. */
       if (port_b != port_e)
-	{
-	  for (port = 0, pp = port_b; pp < port_e; pp++)
-	    {
-	      if (!ISDIGIT (*pp))
-		{
-	 	  /* http://host:12randomgarbage/blah */
-		  /*               ^                  */
-		  error_code = PE_BAD_PORT_NUMBER;
-		  goto error;
-		}
-	      port = 10 * port + (*pp - '0');
-	      /* Check for too large port numbers here, before we have
-		 a chance to overflow on bogus port values.  */
-	      if (port > 65535)
-		{
-		  error_code = PE_BAD_PORT_NUMBER;
-		  goto error;
-		}
-	    }
-	}
+	for (port = 0, pp = port_b; pp < port_e; pp++)
+	  {
+	    if (!ISDIGIT (*pp))
+	      {
+		/* http://host:12randomgarbage/blah */
+		/*               ^                  */
+		error_code = PE_BAD_PORT_NUMBER;
+		goto error;
+	      }
+	    port = 10 * port + (*pp - '0');
+	    /* Check for too large port numbers here, before we have
+	       a chance to overflow on bogus port values.  */
+	    if (port > 0xffff)
+	      {
+		error_code = PE_BAD_PORT_NUMBER;
+		goto error;
+	      }
+	  }
     }
+  /* Advance to the first separator *after* '/' (either ';' or '?',
+     depending on the scheme).  */
+  ++seps;
+
+  /* Get the optional parts of URL, each part being delimited by
+     current location and the position of the next separator.  */
+#define GET_URL_PART(sepchar, var) do {				\
+  if (*p == sepchar)						\
+    var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps);	\
+  ++seps;							\
+} while (0)
 
-  if (*p == '/')
-    {
-      ++p;
-      path_b = p;
-      p = strpbrk_or_eos (p, ";?#");
-      path_e = p;
-    }
-  else
-    {
-      /* Path is not allowed not to exist. */
-      path_b = path_e = p;
-    }
+  GET_URL_PART ('/', path);
+  if (supported_schemes[scheme].flags & scm_has_params)
+    GET_URL_PART (';', params);
+  if (supported_schemes[scheme].flags & scm_has_query)
+    GET_URL_PART ('?', query);
+  if (supported_schemes[scheme].flags & scm_has_fragment)
+    GET_URL_PART ('#', fragment);
 
-  if (*p == ';')
-    {
-      ++p;
-      params_b = p;
-      p = strpbrk_or_eos (p, "?#");
-      params_e = p;
-    }
-  if (*p == '?')
-    {
-      ++p;
-      query_b = p;
-      p = strpbrk_or_eos (p, "#");
-      query_e = p;
-
-      /* Hack that allows users to use '?' (a wildcard character) in
-	 FTP URLs without it being interpreted as a query string
-	 delimiter.  */
-      if (scheme == SCHEME_FTP)
-	{
-	  query_b = query_e = NULL;
-	  path_e = p;
-	}
-    }
-  if (*p == '#')
-    {
-      ++p;
-      fragment_b = p;
-      p += strlen (p);
-      fragment_e = p;
-    }
+#undef GET_URL_PART
   assert (*p == 0);
 
   if (uname_b != uname_e)
@@ -1518,8 +1511,7 @@ path_simplify (char *path)
 {
   char *h = path;		/* hare */
   char *t = path;		/* tortoise */
-  char *beg = path;		/* boundary for backing the tortoise */
-  char *end = path + strlen (path);
+  char *end = strchr (path, '\0');
 
   while (h < end)
     {
@@ -1534,26 +1526,17 @@ path_simplify (char *path)
 	{
 	  /* Handle "../" by retreating the tortoise by one path
-	     element -- but not past beggining.  */
+	     element -- but not past beginning.  */
-	  if (t > beg)
+	  if (t > path)
 	    {
 	      /* Move backwards until T hits the beginning of the
 		 previous path element or the beginning of path. */
-	      for (--t; t > beg && t[-1] != '/'; t--)
+	      for (--t; t > path && t[-1] != '/'; t--)
 		;
 	    }
-	  else
-	    {
-	      /* If we're at the beginning, copy the "../" literally
-		 move the beginning so a later ".." doesn't remove
-		 it.  */
-	      beg = t + 3;
-	      goto regular;
-	    }
 	  h += 3;
 	}
       else
 	{
-	regular:
 	  /* A regular path element.  If H hasn't advanced past T,
 	     simply skip to the next path element.  Otherwise, copy
 	     the path element until the next slash.  */
@@ -1583,14 +1566,19 @@ path_simplify (char *path)
 }
 
-/* Return the length of URL's path.  Path is considered to be
-   terminated by one of '?', ';', '#', or by the end of the
-   string.  */
+/* Return a pointer to the end of URL's path.  Path is considered to
+   be terminated by the first ?query, ;params or #fragment separator
+   that URL's scheme recognizes, or by the end of the string.  */
 
-static int
-path_length (const char *url)
+static const char *
+path_end (const char *url)
 {
-  const char *q = strpbrk_or_eos (url, "?;#");
-  return q - url;
+  enum url_scheme scheme = url_scheme (url);
+  const char *seps;
+  if (scheme == SCHEME_INVALID)
+    scheme = SCHEME_HTTP;	/* e.g. relative link: use http semantics */
+  /* +2 skips the fixed ":/", leaving only the scheme's ;?# separators.  */
+  seps = init_seps (scheme) + 2;
+  return strpbrk_or_eos (url, seps);
 }
 
 /* Find the last occurrence of character C in the range [b, e), or
@@ -1629,7 +1617,7 @@ uri_merge (const char *base, const char *link)
     return xstrdup (link);
 
   /* We may not examine BASE past END. */
-  end = base + path_length (base);
+  end = path_end (base);
   linklength = strlen (link);
 
   if (!*link)
@@ -1974,8 +1962,8 @@ test_path_simplify (void)
     { "",			"",		false },
     { ".",			"",		true },
     { "./",			"",		true },
-    { "..",			"..",		false },
-    { "../",			"../",		false },
+    { "..",			"",		true },
+    { "../",			"",		true },
     { "foo",			"foo",		false },
     { "foo/bar",		"foo/bar",	false },
     { "foo///bar",		"foo///bar",	false },
@@ -1988,9 +1976,9 @@ test_path_simplify (void)
     { "foo/bar/../x",		"foo/x",	true },
     { "foo/bar/../x/",		"foo/x/",	true },
     { "foo/..",			"",		true },
-    { "foo/../..",		"..",		true },
-    { "foo/../../..",		"../..",	true },
-    { "foo/../../bar/../../baz", "../../baz",	true },
+    { "foo/../..",		"",		true },
+    { "foo/../../..",		"",		true },
+    { "foo/../../bar/../../baz", "baz",		true },
     { "a/b/../../c",		"c",		true },
     { "./a/../b",		"b",		true }
   };