/* URL handling.
- Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
- 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
+ Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
+ 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc.
This file is part of GNU Wget.
shall include the source code for the parts of OpenSSL used as well
as that of the covered work. */
-#include <config.h>
+#include "wget.h"
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
-#include "wget.h"
#include "utils.h"
#include "url.h"
#include "host.h" /* for is_valid_ipv6_address */
+#ifdef __VMS
+#include "vms.h"
+#endif /* def __VMS */
+
#ifdef TESTING
#include "test.h"
#endif
/* Forward declarations: */
-static bool path_simplify (char *);
+static bool path_simplify (enum url_scheme, char *);
\f
/* Support for escaping and unescaping of URL strings. */
{
char c;
/* Do nothing if '%' is not followed by two hex digits. */
- if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
+ if (!h[1] || !h[2] || !(c_isxdigit (h[1]) && c_isxdigit (h[2])))
goto copychar;
c = X2DIGITS_TO_NUM (h[1], h[2]);
/* Don't unescape %00 because there is no way to insert it
return url_escape_1 (s, urlchr_unsafe, false);
}
+/* URL-escape the unsafe and reserved characters (see urlchr_table) in
+ a given string, returning a freshly allocated string. */
+
+char *
+url_escape_unsafe_and_reserved (const char *s)
+{
+ return url_escape_1 (s, urlchr_unsafe|urlchr_reserved, false);
+}
+
/* URL-escape the unsafe characters (see urlchr_table) in a given
string. If no characters are unsafe, S is returned. */
{
if (*p == '%')
{
- if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
+ if (c_isxdigit (*(p + 1)) && c_isxdigit (*(p + 2)))
return false;
else
/* Garbled %.. sequence: encode `%'. */
return SCHEME_INVALID;
}
-#define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')
+#define SCHEME_CHAR(ch) (c_isalnum (ch) || (ch) == '-' || (ch) == '+')
/* Return 1 if the URL begins with any "scheme", 0 otherwise. As
currently implemented, it returns true if URL begins with
{
bool changed = false;
for (; *str; str++)
- if (ISUPPER (*str))
+ if (c_isupper (*str))
{
changed = true;
- *str = TOLOWER (*str);
+ *str = c_tolower (*str);
}
return changed;
}
#define PE_NO_ERROR 0
N_("No error"),
#define PE_UNSUPPORTED_SCHEME 1
- N_("Unsupported scheme"),
-#define PE_INVALID_HOST_NAME 2
+ N_("Unsupported scheme %s"), /* support for format token only here */
+#define PE_MISSING_SCHEME 2
+ N_("Scheme missing"),
+#define PE_INVALID_HOST_NAME 3
N_("Invalid host name"),
-#define PE_BAD_PORT_NUMBER 3
+#define PE_BAD_PORT_NUMBER 4
N_("Bad port number"),
-#define PE_INVALID_USER_NAME 4
+#define PE_INVALID_USER_NAME 5
N_("Invalid user name"),
-#define PE_UNTERMINATED_IPV6_ADDRESS 5
+#define PE_UNTERMINATED_IPV6_ADDRESS 6
N_("Unterminated IPv6 numeric address"),
-#define PE_IPV6_NOT_SUPPORTED 6
+#define PE_IPV6_NOT_SUPPORTED 7
N_("IPv6 addresses not supported"),
-#define PE_INVALID_IPV6_ADDRESS 7
+#define PE_INVALID_IPV6_ADDRESS 8
N_("Invalid IPv6 numeric address")
};
error, and if ERROR is not NULL, also set *ERROR to the appropriate
error code. */
struct url *
-url_parse (const char *url, int *error)
+url_parse (const char *url, int *error, struct iri *iri, bool percent_encode)
{
struct url *u;
const char *p;
int port;
char *user = NULL, *passwd = NULL;
- char *url_encoded = NULL;
+ const char *url_encoded = NULL;
+ char *new_url = NULL;
int error_code;
scheme = url_scheme (url);
if (scheme == SCHEME_INVALID)
{
- error_code = PE_UNSUPPORTED_SCHEME;
+ if (url_has_scheme (url))
+ error_code = PE_UNSUPPORTED_SCHEME;
+ else
+ error_code = PE_MISSING_SCHEME;
goto error;
}
- url_encoded = reencode_escapes (url);
+ if (iri && iri->utf8_encode)
+ {
+ iri->utf8_encode = remote_to_utf8 (iri, iri->orig_url ? iri->orig_url : url, (const char **) &new_url);
+ if (!iri->utf8_encode)
+ new_url = NULL;
+ else
+ iri->orig_url = xstrdup (url);
+ }
+
+ /* XXX XXX Could that change introduce (security) bugs ??? XXX XXX*/
+ if (percent_encode)
+ url_encoded = reencode_escapes (new_url ? new_url : url);
+ else
+ url_encoded = new_url ? new_url : url;
+
p = url_encoded;
+ if (new_url && url_encoded != new_url)
+ xfree (new_url);
+
p += strlen (supported_schemes[scheme].leading_string);
uname_b = p;
p = url_skip_credentials (p);
if (port_b != port_e)
for (port = 0, pp = port_b; pp < port_e; pp++)
{
- if (!ISDIGIT (*pp))
+ if (!c_isdigit (*pp))
{
/* http://host:12randomgarbage/blah */
/* ^ */
u->passwd = passwd;
u->path = strdupdelim (path_b, path_e);
- path_modified = path_simplify (u->path);
+ path_modified = path_simplify (scheme, u->path);
split_path (u->path, &u->dir, &u->file);
host_modified = lowercase_str (u->host);
{
url_unescape (u->host);
host_modified = true;
+
+ /* Apply IDNA regardless of iri->utf8_encode status */
+ if (opt.enable_iri && iri)
+ {
+ char *new = idn_encode (iri, u->host);
+ if (new)
+ {
+ xfree (u->host);
+ u->host = new;
+ host_modified = true;
+ }
+ }
}
if (params_b)
if (fragment_b)
u->fragment = strdupdelim (fragment_b, fragment_e);
- if (path_modified || u->fragment || host_modified || path_b == path_e)
+ if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
{
/* If we suspect that a transformation has rendered what
url_string might return different from URL_ENCODED, rebuild
if (url_encoded == url)
u->url = xstrdup (url);
else
- u->url = url_encoded;
+ u->url = (char *) url_encoded;
}
return u;
error:
/* Cleanup in case of error: */
if (url_encoded && url_encoded != url)
- xfree (url_encoded);
+ xfree ((char *) url_encoded);
/* Transmit the error code to the caller, if the caller wants to
know. */
/* Return the error message string from ERROR_CODE, which should have
been retrieved from url_parse. The error message is translated. */
-const char *
-url_error (int error_code)
+char *
+url_error (const char *url, int error_code)
{
- assert (error_code >= 0 && error_code < countof (parse_errors));
- return _(parse_errors[error_code]);
+ assert (error_code >= 0 && ((size_t) error_code) < countof (parse_errors));
+
+ if (error_code == PE_UNSUPPORTED_SCHEME)
+ {
+ char *error, *p;
+ char *scheme = xstrdup (url);
+ assert (url_has_scheme (url));
+
+ if ((p = strchr (scheme, ':')))
+ *p = '\0';
+ if (!strcasecmp (scheme, "https"))
+ error = aprintf (_("HTTPS support not compiled in"));
+ else
+ error = aprintf (_(parse_errors[error_code]), quote (scheme));
+ xfree (scheme);
+
+ return error;
+ }
+ else
+ return xstrdup (_(parse_errors[error_code]));
}
/* Split PATH into DIR and FILE. PATH comes from the URL and is
enum {
filechr_not_unix = 1, /* unusable on Unix, / and \0 */
- filechr_not_windows = 2, /* unusable on MSDOS/Windows, one of \|/<>?:*" */
+ filechr_not_windows = 2, /* unusable on Windows, one of \|/<>?:*" */
filechr_control = 4 /* a control character, e.g. 0-31 */
};
-#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))
+#define FILE_CHAR_TEST(c, mask) \
+ ((opt.restrict_files_nonascii && !c_isascii ((unsigned char)(c))) || \
+ (filechr_table[(unsigned char)(c)] & (mask)))
/* Shorthands for the table: */
#define U filechr_not_unix
for (q = TAIL (dest); q < TAIL (dest) + outlen; ++q)
{
if (opt.restrict_files_case == restrict_lowercase)
- *q = TOLOWER (*q);
+ *q = c_tolower (*q);
else
- *q = TOUPPER (*q);
+ *q = c_toupper (*q);
}
}
const char *u_file, *u_query;
char *fname, *unique;
+ char *index_filename = "index.html"; /* The default index file is index.html */
fnres.base = NULL;
fnres.size = 0;
fnres.tail = 0;
+ /* If an alternative index file was defined, change index_filename */
+ if (opt.default_page)
+ index_filename = opt.default_page;
+
+
/* Start with the directory prefix, if specified. */
if (opt.dir_prefix)
append_string (opt.dir_prefix, &fnres);
/* Add the file name. */
if (fnres.tail)
append_char ('/', &fnres);
- u_file = *u->file ? u->file : "index.html";
+ u_file = *u->file ? u->file : index_filename;
append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres);
/* Append "?query" to the file name. */
if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
&& !(file_exists_p (fname) && !file_non_directory_p (fname)))
- return fname;
+ {
+ unique = fname;
+ }
+ else
+ {
+ unique = unique_name (fname, true);
+ if (unique != fname)
+ xfree (fname);
+ }
+
+/* On VMS, alter the name as required. */
+#ifdef __VMS
+ {
+ char *unique2;
+
+ unique2 = ods_conform( unique);
+ if (unique2 != unique)
+ {
+ xfree (unique);
+ unique = unique2;
+ }
+ }
+#endif /* def __VMS */
- unique = unique_name (fname, true);
- if (unique != fname)
- xfree (fname);
return unique;
}
\f
test case. */
static bool
-path_simplify (char *path)
+path_simplify (enum url_scheme scheme, char *path)
{
char *h = path; /* hare */
char *t = path; /* tortoise */
+ char *beg = path;
char *end = strchr (path, '\0');
while (h < end)
{
/* Handle "../" by retreating the tortoise by one path
element -- but not past beggining. */
- if (t > path)
+ if (t > beg)
{
/* Move backwards until T hits the beginning of the
previous path element or the beginning of path. */
- for (--t; t > path && t[-1] != '/'; t--)
+ for (--t; t > beg && t[-1] != '/'; t--)
;
}
+ else if (scheme == SCHEME_FTP)
+ {
+ /* If we're at the beginning, copy the "../" literally
+ and move the beginning so a later ".." doesn't remove
+ it. This violates RFC 3986; but we do it for FTP
+ anyway because there is otherwise no way to get at a
+ parent directory, when the FTP server drops us in a
+ non-root directory (which is not uncommon). */
+ beg = t + 3;
+ goto regular;
+ }
h += 3;
}
else
{
+ regular:
/* A regular path element. If H hasn't advanced past T,
simply skip to the next path element. Otherwise, copy
the path element until the next slash. */
\f
static int
getchar_from_escaped_string (const char *str, char *c)
-{
+{
const char *p = str;
assert (str && *str);
assert (c);
-
+
if (p[0] == '%')
{
- if (!ISXDIGIT(p[1]) || !ISXDIGIT(p[2]))
+ if (!c_isxdigit(p[1]) || !c_isxdigit(p[2]))
{
*c = '%';
return 1;
while (*p && *q
&& (pp = getchar_from_escaped_string (p, &ch1))
&& (qq = getchar_from_escaped_string (q, &ch2))
- && (TOLOWER(ch1) == TOLOWER(ch2)))
+ && (c_tolower(ch1) == c_tolower(ch2)))
{
p += pp;
q += qq;
}
-
+
return (*p == 0 && *q == 0 ? true : false);
}
\f
-#if 0
+#ifdef TESTING
/* Debugging and testing support for path_simplify. */
+#if 0
/* Debug: run path_simplify on PATH and return the result in a new
string. Useful for calling from the debugger. */
static char *
path_simplify (copy);
return copy;
}
+#endif
-static void
-run_test (char *test, char *expected_result, bool expected_change)
+static const char *
+run_test (char *test, char *expected_result, enum url_scheme scheme,
+ bool expected_change)
{
char *test_copy = xstrdup (test);
- bool modified = path_simplify (test_copy);
+ bool modified = path_simplify (scheme, test_copy);
if (0 != strcmp (test_copy, expected_result))
{
printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
test, expected_result, test_copy);
+ mu_assert ("", 0);
}
if (modified != expected_change)
{
test);
}
xfree (test_copy);
+ mu_assert ("", modified == expected_change);
+ return NULL;
}
-static void
+const char *
test_path_simplify (void)
{
static struct {
char *test, *result;
+ enum url_scheme scheme;
bool should_modify;
} tests[] = {
- { "", "", false },
- { ".", "", true },
- { "./", "", true },
- { "..", "", true },
- { "../", "", true },
- { "foo", "foo", false },
- { "foo/bar", "foo/bar", false },
- { "foo///bar", "foo///bar", false },
- { "foo/.", "foo/", true },
- { "foo/./", "foo/", true },
- { "foo./", "foo./", false },
- { "foo/../bar", "bar", true },
- { "foo/../bar/", "bar/", true },
- { "foo/bar/..", "foo/", true },
- { "foo/bar/../x", "foo/x", true },
- { "foo/bar/../x/", "foo/x/", true },
- { "foo/..", "", true },
- { "foo/../..", "", true },
- { "foo/../../..", "", true },
- { "foo/../../bar/../../baz", "baz", true },
- { "a/b/../../c", "c", true },
- { "./a/../b", "b", true }
+ { "", "", SCHEME_HTTP, false },
+ { ".", "", SCHEME_HTTP, true },
+ { "./", "", SCHEME_HTTP, true },
+ { "..", "", SCHEME_HTTP, true },
+ { "../", "", SCHEME_HTTP, true },
+ { "..", "..", SCHEME_FTP, false },
+ { "../", "../", SCHEME_FTP, false },
+ { "foo", "foo", SCHEME_HTTP, false },
+ { "foo/bar", "foo/bar", SCHEME_HTTP, false },
+ { "foo///bar", "foo///bar", SCHEME_HTTP, false },
+ { "foo/.", "foo/", SCHEME_HTTP, true },
+ { "foo/./", "foo/", SCHEME_HTTP, true },
+ { "foo./", "foo./", SCHEME_HTTP, false },
+ { "foo/../bar", "bar", SCHEME_HTTP, true },
+ { "foo/../bar/", "bar/", SCHEME_HTTP, true },
+ { "foo/bar/..", "foo/", SCHEME_HTTP, true },
+ { "foo/bar/../x", "foo/x", SCHEME_HTTP, true },
+ { "foo/bar/../x/", "foo/x/", SCHEME_HTTP, true },
+ { "foo/..", "", SCHEME_HTTP, true },
+ { "foo/../..", "", SCHEME_HTTP, true },
+ { "foo/../../..", "", SCHEME_HTTP, true },
+ { "foo/../../bar/../../baz", "baz", SCHEME_HTTP, true },
+ { "foo/../..", "..", SCHEME_FTP, true },
+ { "foo/../../..", "../..", SCHEME_FTP, true },
+ { "foo/../../bar/../../baz", "../../baz", SCHEME_FTP, true },
+ { "a/b/../../c", "c", SCHEME_HTTP, true },
+ { "./a/../b", "b", SCHEME_HTTP, true }
};
int i;
for (i = 0; i < countof (tests); i++)
{
+ const char *message;
char *test = tests[i].test;
char *expected_result = tests[i].result;
+ enum url_scheme scheme = tests[i].scheme;
bool expected_change = tests[i].should_modify;
- run_test (test, expected_result, expected_change);
+ message = run_test (test, expected_result, scheme, expected_change);
+ if (message) return message;
}
+ return NULL;
}
-#endif
-\f
-#ifdef TESTING
const char *
test_append_uri_pathel()
} test_array[] = {
{ "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" },
};
-
+
for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
{
struct growable dest;