/* URL handling.
- Copyright (C) 2005 Free Software Foundation, Inc.
+ Copyright (C) 1996-2005 Free Software Foundation, Inc.
This file is part of GNU Wget.
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
-along with Wget; if not, write to the Free Software
-Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+along with Wget; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
In addition, as a special exception, the Free Software Foundation
gives permission to link the code of its release of Wget with the
#include "url.h"
#include "host.h" /* for is_valid_ipv6_address */
+enum { /* bit flags kept in scheme_data.flags; each is a distinct bit so
+ several can be OR-ed together */
+ scm_disabled = 1, /* scheme is turned off; for https when OpenSSL fails to init. */
+ scm_has_params = 2, /* whether scheme has ;params */
+ scm_has_query = 4, /* whether scheme has ?query */
+ scm_has_fragment = 8 /* whether scheme has #fragment */
+};
+
struct scheme_data
{
/* Short name of the scheme, such as "http" or "ftp". */
const char *leading_string;
/* Default port of the scheme when none is specified. */
int default_port;
- /* Used for disabling https when OpenSSL fails to init. */
- bool disabled;
- /* Allowed separators, handled by url_parse. For example, ftp
- doesn't support the "?query", and http/https don't support
- ";params". All schemes must support at least "/:". */
- const char *separators;
+ /* Bitwise OR of the scm_* flags: scm_disabled, scm_has_params,
+ scm_has_query and scm_has_fragment. */
int flags;
};
/* Supported schemes: */
static struct scheme_data supported_schemes[] =
{
- { "http", "http://", DEFAULT_HTTP_PORT, false, "/:?#" },
+ { "http", "http://", DEFAULT_HTTP_PORT, scm_has_query|scm_has_fragment },
#ifdef HAVE_SSL
- { "https", "https://", DEFAULT_HTTPS_PORT, false, "/:?#" },
+ { "https", "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
#endif
- { "ftp", "ftp://", DEFAULT_FTP_PORT, false, "/:;#" },
+ { "ftp", "ftp://", DEFAULT_FTP_PORT, scm_has_params|scm_has_fragment },
/* SCHEME_INVALID */
{ NULL, NULL, -1, 0 }
if (0 == strncasecmp (url, supported_schemes[i].leading_string,
strlen (supported_schemes[i].leading_string)))
{
- if (!(supported_schemes[i].disabled))
+ if (!(supported_schemes[i].flags & scm_disabled))
return (enum url_scheme) i;
else
return SCHEME_INVALID;
void
scheme_disable (enum url_scheme scheme)
{
- supported_schemes[scheme].disabled = true;
+ supported_schemes[scheme].flags |= scm_disabled; /* OR the bit in so the
+ scheme's other flags are kept; the scheme lookup that tests
+ scm_disabled then returns SCHEME_INVALID for this scheme */
}
/* Skip the username and password, if present in the URL. The
/* Like strpbrk, with the exception that it returns the pointer to the
terminating zero (end-of-string aka "eos") if no matching character
- is found.
-
- Although I normally balk at Gcc-specific optimizations, it probably
- makes sense here: glibc has optimizations that detect strpbrk being
- called with literal string as ACCEPT and inline the search. That
- optimization is defeated if strpbrk is hidden within the call to
- another function. (And no, making strpbrk_or_eos inline doesn't
- help because the check for literal accept is in the
- preprocessor.) */
-
-#if defined(__GNUC__) && __GNUC__ >= 3
-
-#define strpbrk_or_eos(s, accept) ({ \
- char *SOE_p = strpbrk (s, accept); \
- if (!SOE_p) \
- SOE_p = strchr (s, '\0'); \
- SOE_p; \
-})
-
-#else /* not __GNUC__ or old gcc */
+ is found. */
static inline char *
strpbrk_or_eos (const char *s, const char *accept)
p = strchr (s, '\0');
return p;
}
-#endif /* not __GNUC__ or old gcc */
/* Turn STR into lowercase; return true if a character was actually
changed. */
return changed;
}
+static const char *
+init_seps (enum url_scheme scheme)
+{ /* Return the separator characters that apply to SCHEME: ':' and '/'
+ always, followed by ';', '?' and '#' for each of params, query and
+ fragment that the scheme's flags enable. */
+ static char seps[8] = ":/"; /* static: rewritten on every call, so a
+ result is only valid until the next call; the longest content
+ is ":/;?#" plus the terminating zero, 6 bytes */
+ char *p = seps + 2; /* write position just past the fixed ":/" */
+ int flags = supported_schemes[scheme].flags;
+
+ if (flags & scm_has_params)
+ *p++ = ';';
+ if (flags & scm_has_query)
+ *p++ = '?';
+ if (flags & scm_has_fragment)
+ *p++ = '#';
+ *p++ = '\0'; /* terminate, cutting off anything left by an earlier call */
+ return seps;
+}
+
static const char *parse_errors[] = {
#define PE_NO_ERROR 0
N_("No error"),
/* Initialize separators for optional parts of URL, depending on the
scheme. For example, FTP has params, and HTTP and HTTPS have
query string and fragment. */
- seps = supported_schemes[scheme].separators;
+ seps = init_seps (scheme);
host_b = p;
} while (0)
GET_URL_PART ('/', path);
- GET_URL_PART (';', params);
- GET_URL_PART ('?', query);
- GET_URL_PART ('#', fragment);
+ if (supported_schemes[scheme].flags & scm_has_params)
+ GET_URL_PART (';', params);
+ if (supported_schemes[scheme].flags & scm_has_query)
+ GET_URL_PART ('?', query);
+ if (supported_schemes[scheme].flags & scm_has_fragment)
+ GET_URL_PART ('#', fragment);
#undef GET_URL_PART
assert (*p == 0);
}
\f
-/* Return the length of URL's path. Path is considered to be
- terminated by one of '?', ';', '#', or by the end of the
- string. */
+/* Return a pointer to the end of URL's path. The path is considered
+ to be terminated by the first ?query, ;params or #fragment separator
+ that the URL's scheme supports, or by the end of the string. */
-static int
-path_length (const char *url)
+static const char *
+path_end (const char *url) /* result points at the first path-ending
+ separator in URL, or at its terminating zero if there is none */
{
- const char *q = strpbrk_or_eos (url, "?;#");
- return q - url;
+ enum url_scheme scheme = url_scheme (url);
+ const char *seps;
+ if (scheme == SCHEME_INVALID)
+ scheme = SCHEME_HTTP; /* use http semantics for rel links */
+ /* +2 to skip the leading ':' and '/' of the separator set, so that
+ only the ';', '?' and '#' enabled for the scheme end the path */
+ seps = init_seps (scheme) + 2;
+ return strpbrk_or_eos (url, seps);
}
/* Find the last occurrence of character C in the range [b, e), or
return xstrdup (link);
/* We may not examine BASE past END. */
- end = base + path_length (base);
+ end = path_end (base);
linklength = strlen (link);
if (!*link)