/* URL handling.
- Copyright (C) 2005 Free Software Foundation, Inc.
+ Copyright (C) 1996-2005 Free Software Foundation, Inc.
This file is part of GNU Wget.
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
-along with Wget; if not, write to the Free Software
-Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+along with Wget; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
In addition, as a special exception, the Free Software Foundation
gives permission to link the code of its release of Wget with the
#include "url.h"
#include "host.h" /* for is_valid_ipv6_address */
+enum { /* Bit flags; OR-ed together in the `flags' member of struct
+ scheme_data. */
+ scm_disabled = 1, /* scheme is switched off, e.g. https when
+ OpenSSL fails to init; a URL using it is
+ then reported as SCHEME_INVALID */
+ scm_has_params = 2, /* URLs of this scheme may carry ;params */
+ scm_has_query = 4, /* URLs of this scheme may carry ?query */
+ scm_has_fragment = 8 /* URLs of this scheme may carry #fragment */
+};
+
struct scheme_data
{
+ /* Short name of the scheme, such as "http" or "ftp". */
const char *name;
+ /* Leading string that identifies the scheme, such as "https://"; it
+ is compared case-insensitively against the start of a URL. */
const char *leading_string;
+ /* Default port of the scheme when none is specified. */
int default_port;
- bool enabled;
+ /* Bitwise OR of the scm_* flags: whether the scheme is disabled and
+ which of ;params, ?query and #fragment its URLs may contain. */
+ int flags;
};
/* Supported schemes: */
static struct scheme_data supported_schemes[] =
{
- { "http", "http://", DEFAULT_HTTP_PORT, 1 },
+ { "http", "http://", DEFAULT_HTTP_PORT, scm_has_query|scm_has_fragment },
#ifdef HAVE_SSL
- { "https", "https://", DEFAULT_HTTPS_PORT, 1 },
+ { "https", "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
#endif
- { "ftp", "ftp://", DEFAULT_FTP_PORT, 1 },
+ { "ftp", "ftp://", DEFAULT_FTP_PORT, scm_has_params|scm_has_fragment },
/* SCHEME_INVALID */
{ NULL, NULL, -1, 0 }
if (0 == strncasecmp (url, supported_schemes[i].leading_string,
strlen (supported_schemes[i].leading_string)))
{
- if (supported_schemes[i].enabled)
+ if (!(supported_schemes[i].flags & scm_disabled))
return (enum url_scheme) i;
else
return SCHEME_INVALID;
void
scheme_disable (enum url_scheme scheme)
{
- supported_schemes[scheme].enabled = false;
+ supported_schemes[scheme].flags |= scm_disabled; /* keep the other
+ flag bits; the scheme matcher then answers SCHEME_INVALID for
+ URLs that start with this scheme's leading string */
}
/* Skip the username and password, if present in the URL. The
}
/* Used by main.c: detect URLs written using the "shorthand" URL forms
- popularized by Netscape and NcFTP. HTTP shorthands look like this:
+ originally popularized by Netscape and NcFTP. HTTP shorthands look
+ like this:
www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
www.foo.com[:port] -> http://www.foo.com[:port]
rewrite_shorthand_url (const char *url)
{
const char *p;
+ char *ret; /* the rewritten URL that is returned */
if (url_scheme (url) != SCHEME_INVALID)
return NULL;
/* Look for a ':' or '/'. The former signifies NcFTP syntax, the
latter Netscape. */
- for (p = url; *p && *p != ':' && *p != '/'; p++)
- ;
-
+ p = strpbrk (url, ":/"); /* NULL if URL has neither ':' nor '/' */
if (p == url)
return NULL;
/* If we're looking at "://", it means the URL uses a scheme we
don't support, which may include "https" when compiled without
SSL support. Don't bogusly rewrite such URLs. */
- if (p[0] == ':' && p[1] == '/' && p[2] == '/')
+ if (p && p[0] == ':' && p[1] == '/' && p[2] == '/')
return NULL;
- if (*p == ':')
+ if (p && *p == ':')
{
- const char *pp;
- char *res;
- /* If the characters after the colon and before the next slash
- or end of string are all digits, it's HTTP. */
- int digits = 0;
- for (pp = p + 1; ISDIGIT (*pp); pp++)
- ++digits;
- if (digits > 0 && (*pp == '/' || *pp == '\0'))
+ /* Colon indicates ftp, as in foo.bar.com:path. Check for
+ special case of http port number ("localhost:10000"): a colon
+ followed by one or more digits that run up to the next '/' or
+ to the end of the string is taken as a port, so the URL is
+ handled as http instead. */
+ int digits = strspn (p + 1, "0123456789");
+ if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0'))
goto http;
- /* Prepend "ftp://" to the entire URL... */
- res = xmalloc (6 + strlen (url) + 1);
- sprintf (res, "ftp://%s", url);
- /* ...and replace ':' with '/'. */
- res[6 + (p - url)] = '/';
- return res;
+ /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". */
+ ret = aprintf ("ftp://%s", url);
+ ret[6 + (p - url)] = '/'; /* 6 skips the "ftp://" prefix, so this
+ overwrites the ':' found above */
}
else
{
- char *res;
http:
- /* Just prepend "http://" to what we have. */
- res = xmalloc (7 + strlen (url) + 1);
- sprintf (res, "http://%s", url);
- return res;
+ /* Just prepend "http://" to URL. Besides the port case above,
+ this is reached when the first separator is '/' and when URL
+ has neither ':' nor '/' (P is NULL). */
+ ret = aprintf ("http://%s", url);
}
+ return ret;
}
\f
static void split_path (const char *, char **, char **);
/* Like strpbrk, with the exception that it returns the pointer to the
terminating zero (end-of-string aka "eos") if no matching character
- is found.
-
- Although I normally balk at Gcc-specific optimizations, it probably
- makes sense here: glibc has optimizations that detect strpbrk being
- called with literal string as ACCEPT and inline the search. That
- optimization is defeated if strpbrk is hidden within the call to
- another function. (And no, making strpbrk_or_eos inline doesn't
- help because the check for literal accept is in the
- preprocessor.) */
-
-#if defined(__GNUC__) && __GNUC__ >= 3
-
-#define strpbrk_or_eos(s, accept) ({ \
- char *SOE_p = strpbrk (s, accept); \
- if (!SOE_p) \
- SOE_p = strchr (s, '\0'); \
- SOE_p; \
-})
-
-#else /* not __GNUC__ or old gcc */
+ is found. */
static inline char *
strpbrk_or_eos (const char *s, const char *accept)
p = strchr (s, '\0');
return p;
}
-#endif /* not __GNUC__ or old gcc */
/* Turn STR into lowercase; return true if a character was actually
changed. */
return changed;
}
+static const char *
+init_seps (enum url_scheme scheme)
+{ /* Return the characters that can end a component of a URL of
+ SCHEME: always ':' and '/', followed by ';', '?' and '#' (in that
+ order) for each optional part the scheme's flags allow. The
+ string lives in a static buffer that every call rewrites, so a
+ previous result is only valid until init_seps is called again. */
+ static char seps[8] = ":/"; /* ":/;?#" plus the NUL needs 6 bytes */
+ char *p = seps + 2; /* first slot after the fixed ":/" prefix */
+ int flags = supported_schemes[scheme].flags;
+
+ if (flags & scm_has_params)
+ *p++ = ';';
+ if (flags & scm_has_query)
+ *p++ = '?';
+ if (flags & scm_has_fragment)
+ *p++ = '#';
+ *p++ = '\0'; /* cut off anything left over from an earlier call */
+ return seps;
+}
+
static const char *parse_errors[] = {
#define PE_NO_ERROR 0
N_("No error"),
#define PE_UNSUPPORTED_SCHEME 1
N_("Unsupported scheme"),
-#define PE_EMPTY_HOST 2
- N_("Empty host"),
+#define PE_INVALID_HOST_NAME 2
+ N_("Invalid host name"),
#define PE_BAD_PORT_NUMBER 3
N_("Bad port number"),
#define PE_INVALID_USER_NAME 4
bool path_modified, host_modified;
enum url_scheme scheme;
+ const char *seps;
const char *uname_b, *uname_e;
const char *host_b, *host_e;
scheme://host[:port][/path][;params][?query][#fragment] */
+ path_b = path_e = NULL;
params_b = params_e = NULL;
query_b = query_e = NULL;
fragment_b = fragment_e = NULL;
+ /* Initialize separators for optional parts of URL, depending on the
+ scheme. For example, FTP has params, and HTTP and HTTPS have
+ query string and fragment. */
+ seps = init_seps (scheme);
+
host_b = p;
if (*p == '[')
error_code = PE_IPV6_NOT_SUPPORTED;
goto error;
#endif
+
+ /* The closing bracket must be followed by a separator or by the
+ null char. */
+ /* http://[::1]... */
+ /* ^ */
+ if (!strchr (seps, *p))
+ {
+ /* Trailing garbage after []-delimited IPv6 address. */
+ error_code = PE_INVALID_HOST_NAME;
+ goto error;
+ }
}
else
{
- p = strpbrk_or_eos (p, ":/;?#");
+ p = strpbrk_or_eos (p, seps);
host_e = p;
}
+ ++seps; /* advance to '/' */
if (host_b == host_e)
{
- error_code = PE_EMPTY_HOST;
+ error_code = PE_INVALID_HOST_NAME;
goto error;
}
/* ^ */
++p;
port_b = p;
- p = strpbrk_or_eos (p, "/;?#");
+ p = strpbrk_or_eos (p, seps);
port_e = p;
/* Allow empty port, as per rfc2396. */
if (port_b != port_e)
- {
- for (port = 0, pp = port_b; pp < port_e; pp++)
- {
- if (!ISDIGIT (*pp))
- {
- /* http://host:12randomgarbage/blah */
- /* ^ */
- error_code = PE_BAD_PORT_NUMBER;
- goto error;
- }
- port = 10 * port + (*pp - '0');
- /* Check for too large port numbers here, before we have
- a chance to overflow on bogus port values. */
- if (port > 65535)
- {
- error_code = PE_BAD_PORT_NUMBER;
- goto error;
- }
- }
- }
+ for (port = 0, pp = port_b; pp < port_e; pp++)
+ {
+ if (!ISDIGIT (*pp))
+ {
+ /* http://host:12randomgarbage/blah */
+ /* ^ */
+ error_code = PE_BAD_PORT_NUMBER;
+ goto error;
+ }
+ port = 10 * port + (*pp - '0');
+ /* Check for too large port numbers here, before we have
+ a chance to overflow on bogus port values. */
+ if (port > 0xffff)
+ {
+ error_code = PE_BAD_PORT_NUMBER;
+ goto error;
+ }
+ }
}
+ /* Advance to the first separator *after* '/' (either ';' or '?',
+ depending on the scheme). */
+ ++seps;
+
+ /* Get the optional parts of URL, each part being delimited by
+ current location and the position of the next separator. */
+#define GET_URL_PART(sepchar, var) do { \
+ if (*p == sepchar) \
+ var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps); \
+ ++seps; \
+} while (0)
- if (*p == '/')
- {
- ++p;
- path_b = p;
- p = strpbrk_or_eos (p, ";?#");
- path_e = p;
- }
- else
- {
- /* Path is not allowed not to exist. */
- path_b = path_e = p;
- }
+ GET_URL_PART ('/', path);
+ if (supported_schemes[scheme].flags & scm_has_params)
+ GET_URL_PART (';', params);
+ if (supported_schemes[scheme].flags & scm_has_query)
+ GET_URL_PART ('?', query);
+ if (supported_schemes[scheme].flags & scm_has_fragment)
+ GET_URL_PART ('#', fragment);
- if (*p == ';')
- {
- ++p;
- params_b = p;
- p = strpbrk_or_eos (p, "?#");
- params_e = p;
- }
- if (*p == '?')
- {
- ++p;
- query_b = p;
- p = strpbrk_or_eos (p, "#");
- query_e = p;
-
- /* Hack that allows users to use '?' (a wildcard character) in
- FTP URLs without it being interpreted as a query string
- delimiter. */
- if (scheme == SCHEME_FTP)
- {
- query_b = query_e = NULL;
- path_e = p;
- }
- }
- if (*p == '#')
- {
- ++p;
- fragment_b = p;
- p += strlen (p);
- fragment_e = p;
- }
+#undef GET_URL_PART
assert (*p == 0);
if (uname_b != uname_e)
{
char *h = path; /* hare */
char *t = path; /* tortoise */
- char *beg = path; /* boundary for backing the tortoise */
- char *end = path + strlen (path);
+ char *end = strchr (path, '\0');
while (h < end)
{
{
/* Handle "../" by retreating the tortoise by one path
element -- but not past beggining. */
- if (t > beg)
+ if (t > path)
{
/* Move backwards until T hits the beginning of the
previous path element or the beginning of path. */
- for (--t; t > beg && t[-1] != '/'; t--)
+ for (--t; t > path && t[-1] != '/'; t--)
;
}
- else
- {
- /* If we're at the beginning, copy the "../" literally
- move the beginning so a later ".." doesn't remove
- it. */
- beg = t + 3;
- goto regular;
- }
h += 3;
}
else
{
- regular:
/* A regular path element. If H hasn't advanced past T,
simply skip to the next path element. Otherwise, copy
the path element until the next slash. */
}
\f
-/* Return the length of URL's path. Path is considered to be
- terminated by one of '?', ';', '#', or by the end of the
- string. */
+/* Return a pointer to the end of URL's path. The path is considered
+ to be terminated by the first ;params, ?query or #fragment separator
+ that the URL's scheme recognizes, or by the end of the string. */
-static int
-path_length (const char *url)
+static const char *
+path_end (const char *url)
{
- const char *q = strpbrk_or_eos (url, "?;#");
- return q - url;
+ enum url_scheme scheme = url_scheme (url);
+ const char *seps;
+ if (scheme == SCHEME_INVALID)
+ scheme = SCHEME_HTTP; /* no usable scheme, e.g. a relative link:
+ use http semantics */
+ /* +2 to ignore the first two separators ':' and '/', which
+ init_seps always puts in front; what remains is the scheme's
+ subset of ';', '?' and '#'. */
+ seps = init_seps (scheme) + 2;
+ return strpbrk_or_eos (url, seps); /* first such separator, or the
+ terminating NUL if there is none */
}
/* Find the last occurrence of character C in the range [b, e), or
return xstrdup (link);
/* We may not examine BASE past END. */
- end = base + path_length (base);
+ end = path_end (base);
linklength = strlen (link);
if (!*link)
{ "", "", false },
{ ".", "", true },
{ "./", "", true },
- { "..", "..", false },
- { "../", "../", false },
+ { "..", "", true },
+ { "../", "", true },
{ "foo", "foo", false },
{ "foo/bar", "foo/bar", false },
{ "foo///bar", "foo///bar", false },
{ "foo/bar/../x", "foo/x", true },
{ "foo/bar/../x/", "foo/x/", true },
{ "foo/..", "", true },
- { "foo/../..", "..", true },
- { "foo/../../..", "../..", true },
- { "foo/../../bar/../../baz", "../../baz", true },
+ { "foo/../..", "", true },
+ { "foo/../../..", "", true },
+ { "foo/../../bar/../../baz", "baz", true },
{ "a/b/../../c", "c", true },
{ "./a/../b", "b", true }
};