X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Furl.c;h=f97a31801ea5a6861fccc47b4a1d4d60632bf6b0;hb=aa07e689f2c03dd25342859e7e527a13467ad219;hp=ca7179a667a45253c35252f8386b19dfef812069;hpb=74fbb03b10f6148b5a0cf5b8831b1872e55df7f6;p=wget diff --git a/src/url.c b/src/url.c index ca7179a6..f97a3180 100644 --- a/src/url.c +++ b/src/url.c @@ -1,5 +1,5 @@ /* URL handling. - Copyright (C) 2005 Free Software Foundation, Inc. + Copyright (C) 1996-2005 Free Software Foundation, Inc. This file is part of GNU Wget. @@ -14,8 +14,8 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with Wget; if not, write to the Free Software -Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +along with Wget; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. In addition, as a special exception, the Free Software Foundation gives permission to link the code of its release of Wget with the @@ -43,22 +43,33 @@ so, delete this exception statement from your version. */ #include "url.h" #include "host.h" /* for is_valid_ipv6_address */ +enum { + scm_disabled = 1, /* for https when OpenSSL fails to init. */ + scm_has_params = 2, /* whether scheme has ;params */ + scm_has_query = 4, /* whether scheme has ?query */ + scm_has_fragment = 8 /* whether scheme has #fragment */ +}; + struct scheme_data { + /* Short name of the scheme, such as "http" or "ftp". */ const char *name; + /* Leading string that identifies the scheme, such as "https://". */ const char *leading_string; + /* Default port of the scheme when none is specified. */ int default_port; - bool enabled; + /* Various flags. */ + int flags; }; /* Supported schemes: */ static struct scheme_data supported_schemes[] = { - { "http", "http://", DEFAULT_HTTP_PORT, 1 }, + { "http", "http://", DEFAULT_HTTP_PORT, scm_has_query|scm_has_fragment }, #ifdef HAVE_SSL - { "https", "https://", DEFAULT_HTTPS_PORT, 1 }, + { "https", "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment }, #endif - { "ftp", "ftp://", DEFAULT_FTP_PORT, 1 }, + { "ftp", "ftp://", DEFAULT_FTP_PORT, scm_has_params|scm_has_fragment }, /* SCHEME_INVALID */ { NULL, NULL, -1, 0 } @@ -404,7 +415,7 @@ url_scheme (const char *url) if (0 == strncasecmp (url, supported_schemes[i].leading_string, strlen (supported_schemes[i].leading_string))) { - if (supported_schemes[i].enabled) + if (!(supported_schemes[i].flags & scm_disabled)) return (enum url_scheme) i; else return SCHEME_INVALID; @@ -444,7 +455,7 @@ scheme_default_port (enum url_scheme scheme) void scheme_disable (enum url_scheme scheme) { - supported_schemes[scheme].enabled = false; + supported_schemes[scheme].flags |= scm_disabled; } /* Skip the username and password, if present in the URL. The @@ -497,7 +508,8 @@ parse_credentials (const char *beg, const char *end, char **user, char **passwd) } /* Used by main.c: detect URLs written using the "shorthand" URL forms - popularized by Netscape and NcFTP. HTTP shorthands look like this: + originally popularized by Netscape and NcFTP. HTTP shorthands look + like this: www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file www.foo.com[:port] -> http://www.foo.com[:port] @@ -513,78 +525,49 @@ char * rewrite_shorthand_url (const char *url) { const char *p; + char *ret; if (url_scheme (url) != SCHEME_INVALID) return NULL; /* Look for a ':' or '/'. The former signifies NcFTP syntax, the latter Netscape. */ - for (p = url; *p && *p != ':' && *p != '/'; p++) - ; - + p = strpbrk (url, ":/"); if (p == url) return NULL; /* If we're looking at "://", it means the URL uses a scheme we don't support, which may include "https" when compiled without SSL support. Don't bogusly rewrite such URLs. */ - if (p[0] == ':' && p[1] == '/' && p[2] == '/') + if (p && p[0] == ':' && p[1] == '/' && p[2] == '/') return NULL; - if (*p == ':') + if (p && *p == ':') { - const char *pp; - char *res; - /* If the characters after the colon and before the next slash - or end of string are all digits, it's HTTP. */ - int digits = 0; - for (pp = p + 1; ISDIGIT (*pp); pp++) - ++digits; - if (digits > 0 && (*pp == '/' || *pp == '\0')) + /* Colon indicates ftp, as in foo.bar.com:path. Check for + special case of http port number ("localhost:10000"). */ + int digits = strspn (p + 1, "0123456789"); + if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0')) goto http; - /* Prepend "ftp://" to the entire URL... */ - res = xmalloc (6 + strlen (url) + 1); - sprintf (res, "ftp://%s", url); - /* ...and replace ':' with '/'. */ - res[6 + (p - url)] = '/'; - return res; + /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". */ + ret = aprintf ("ftp://%s", url); + ret[6 + (p - url)] = '/'; } else { - char *res; http: - /* Just prepend "http://" to what we have. */ - res = xmalloc (7 + strlen (url) + 1); - sprintf (res, "http://%s", url); - return res; + /* Just prepend "http://" to URL. */ + ret = aprintf ("http://%s", url); } + return ret; } static void split_path (const char *, char **, char **); /* Like strpbrk, with the exception that it returns the pointer to the terminating zero (end-of-string aka "eos") if no matching character - is found. - - Although I normally balk at Gcc-specific optimizations, it probably - makes sense here: glibc has optimizations that detect strpbrk being - called with literal string as ACCEPT and inline the search. That - optimization is defeated if strpbrk is hidden within the call to - another function. (And no, making strpbrk_or_eos inline doesn't - help because the check for literal accept is in the - preprocessor.) */ - -#if defined(__GNUC__) && __GNUC__ >= 3 - -#define strpbrk_or_eos(s, accept) ({ \ - char *SOE_p = strpbrk (s, accept); \ - if (!SOE_p) \ - SOE_p = strchr (s, '\0'); \ - SOE_p; \ -}) - -#else /* not __GNUC__ or old gcc */ + is found. */ static inline char * strpbrk_or_eos (const char *s, const char *accept) @@ -594,7 +577,6 @@ strpbrk_or_eos (const char *s, const char *accept) p = strchr (s, '\0'); return p; } -#endif /* not __GNUC__ or old gcc */ /* Turn STR into lowercase; return true if a character was actually changed. */ @@ -612,13 +594,30 @@ lowercase_str (char *str) return changed; } +static const char * +init_seps (enum url_scheme scheme) +{ + static char seps[8] = ":/"; + char *p = seps + 2; + int flags = supported_schemes[scheme].flags; + + if (flags & scm_has_params) + *p++ = ';'; + if (flags & scm_has_query) + *p++ = '?'; + if (flags & scm_has_fragment) + *p++ = '#'; + *p++ = '\0'; + return seps; +} + static const char *parse_errors[] = { #define PE_NO_ERROR 0 N_("No error"), #define PE_UNSUPPORTED_SCHEME 1 N_("Unsupported scheme"), -#define PE_EMPTY_HOST 2 - N_("Empty host"), +#define PE_INVALID_HOST_NAME 2 + N_("Invalid host name"), #define PE_BAD_PORT_NUMBER 3 N_("Bad port number"), #define PE_INVALID_USER_NAME 4 @@ -644,6 +643,7 @@ url_parse (const char *url, int *error) bool path_modified, host_modified; enum url_scheme scheme; + const char *seps; const char *uname_b, *uname_e; const char *host_b, *host_e; @@ -682,10 +682,16 @@ url_parse (const char *url, int *error) scheme://host[:port][/path][;params][?query][#fragment] */ + path_b = path_e = NULL; params_b = params_e = NULL; query_b = query_e = NULL; fragment_b = fragment_e = NULL; + /* Initialize separators for optional parts of URL, depending on the + scheme. For example, FTP has params, and HTTP and HTTPS have + query string and fragment. */ + seps = init_seps (scheme); + host_b = p; if (*p == '[') @@ -718,16 +724,28 @@ url_parse (const char *url, int *error) error_code = PE_IPV6_NOT_SUPPORTED; goto error; #endif + + /* The closing bracket must be followed by a separator or by the + null char. */ + /* http://[::1]... */ + /* ^ */ + if (!strchr (seps, *p)) + { + /* Trailing garbage after []-delimited IPv6 address. */ + error_code = PE_INVALID_HOST_NAME; + goto error; + } } else { - p = strpbrk_or_eos (p, ":/;?#"); + p = strpbrk_or_eos (p, seps); host_e = p; } + ++seps; /* advance to '/' */ if (host_b == host_e) { - error_code = PE_EMPTY_HOST; + error_code = PE_INVALID_HOST_NAME; goto error; } @@ -740,76 +758,51 @@ url_parse (const char *url, int *error) /* ^ */ ++p; port_b = p; - p = strpbrk_or_eos (p, "/;?#"); + p = strpbrk_or_eos (p, seps); port_e = p; /* Allow empty port, as per rfc2396. */ if (port_b != port_e) - { - for (port = 0, pp = port_b; pp < port_e; pp++) - { - if (!ISDIGIT (*pp)) - { - /* http://host:12randomgarbage/blah */ - /* ^ */ - error_code = PE_BAD_PORT_NUMBER; - goto error; - } - port = 10 * port + (*pp - '0'); - /* Check for too large port numbers here, before we have - a chance to overflow on bogus port values. */ - if (port > 65535) - { - error_code = PE_BAD_PORT_NUMBER; - goto error; - } - } - } + for (port = 0, pp = port_b; pp < port_e; pp++) + { + if (!ISDIGIT (*pp)) + { + /* http://host:12randomgarbage/blah */ + /* ^ */ + error_code = PE_BAD_PORT_NUMBER; + goto error; + } + port = 10 * port + (*pp - '0'); + /* Check for too large port numbers here, before we have + a chance to overflow on bogus port values. */ + if (port > 0xffff) + { + error_code = PE_BAD_PORT_NUMBER; + goto error; + } + } } + /* Advance to the first separator *after* '/' (either ';' or '?', + depending on the scheme). */ + ++seps; + + /* Get the optional parts of URL, each part being delimited by + current location and the position of the next separator. */ +#define GET_URL_PART(sepchar, var) do { \ + if (*p == sepchar) \ + var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps); \ + ++seps; \ +} while (0) - if (*p == '/') - { - ++p; - path_b = p; - p = strpbrk_or_eos (p, ";?#"); - path_e = p; - } - else - { - /* Path is not allowed not to exist. */ - path_b = path_e = p; - } + GET_URL_PART ('/', path); + if (supported_schemes[scheme].flags & scm_has_params) + GET_URL_PART (';', params); + if (supported_schemes[scheme].flags & scm_has_query) + GET_URL_PART ('?', query); + if (supported_schemes[scheme].flags & scm_has_fragment) + GET_URL_PART ('#', fragment); - if (*p == ';') - { - ++p; - params_b = p; - p = strpbrk_or_eos (p, "?#"); - params_e = p; - } - if (*p == '?') - { - ++p; - query_b = p; - p = strpbrk_or_eos (p, "#"); - query_e = p; - - /* Hack that allows users to use '?' (a wildcard character) in - FTP URLs without it being interpreted as a query string - delimiter. */ - if (scheme == SCHEME_FTP) - { - query_b = query_e = NULL; - path_e = p; - } - } - if (*p == '#') - { - ++p; - fragment_b = p; - p += strlen (p); - fragment_e = p; - } +#undef GET_URL_PART assert (*p == 0); if (uname_b != uname_e) @@ -1518,8 +1511,7 @@ path_simplify (char *path) { char *h = path; /* hare */ char *t = path; /* tortoise */ - char *beg = path; /* boundary for backing the tortoise */ - char *end = path + strlen (path); + char *end = strchr (path, '\0'); while (h < end) { @@ -1534,26 +1526,17 @@ path_simplify (char *path) { /* Handle "../" by retreating the tortoise by one path element -- but not past beggining. */ - if (t > beg) + if (t > path) { /* Move backwards until T hits the beginning of the previous path element or the beginning of path. */ - for (--t; t > beg && t[-1] != '/'; t--) + for (--t; t > path && t[-1] != '/'; t--) ; } - else - { - /* If we're at the beginning, copy the "../" literally - move the beginning so a later ".." doesn't remove - it. */ - beg = t + 3; - goto regular; - } h += 3; } else { - regular: /* A regular path element. If H hasn't advanced past T, simply skip to the next path element. Otherwise, copy the path element until the next slash. */ @@ -1583,14 +1566,19 @@ path_simplify (char *path) } /* Return the length of URL's path. Path is considered to be - terminated by one of '?', ';', '#', or by the end of the - string. */ + terminated by one or more of the ?query or ;params or #fragment, + depending on the scheme. */ -static int -path_length (const char *url) +static const char * +path_end (const char *url) { - const char *q = strpbrk_or_eos (url, "?;#"); - return q - url; + enum url_scheme scheme = url_scheme (url); + const char *seps; + if (scheme == SCHEME_INVALID) + scheme = SCHEME_HTTP; /* use http semantics for rel links */ + /* +2 to ignore the first two separators ':' and '/' */ + seps = init_seps (scheme) + 2; + return strpbrk_or_eos (url, seps); } /* Find the last occurrence of character C in the range [b, e), or @@ -1629,7 +1617,7 @@ uri_merge (const char *base, const char *link) return xstrdup (link); /* We may not examine BASE past END. */ - end = base + path_length (base); + end = path_end (base); linklength = strlen (link); if (!*link) @@ -1974,8 +1962,8 @@ test_path_simplify (void) { "", "", false }, { ".", "", true }, { "./", "", true }, - { "..", "..", false }, - { "../", "../", false }, + { "..", "", true }, + { "../", "", true }, { "foo", "foo", false }, { "foo/bar", "foo/bar", false }, { "foo///bar", "foo///bar", false }, @@ -1988,9 +1976,9 @@ test_path_simplify (void) { "foo/bar/../x", "foo/x", true }, { "foo/bar/../x/", "foo/x/", true }, { "foo/..", "", true }, - { "foo/../..", "..", true }, - { "foo/../../..", "../..", true }, - { "foo/../../bar/../../baz", "../../baz", true }, + { "foo/../..", "", true }, + { "foo/../../..", "", true }, + { "foo/../../bar/../../baz", "baz", true }, { "a/b/../../c", "c", true }, { "./a/../b", "b", true } };