/* URL handling.
- Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
- 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
+ Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
+ 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation,
+ Inc.
This file is part of GNU Wget.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#ifdef HAVE_UNISTD_H
-# include <unistd.h>
-#endif
+#include <unistd.h>
#include <errno.h>
#include <assert.h>
#include "url.h"
#include "host.h" /* for is_valid_ipv6_address */
+#ifdef __VMS
+#include "vms.h"
+#endif /* def __VMS */
+
#ifdef TESTING
#include "test.h"
#endif
The transformation is done in place. If you need the original
string intact, make a copy before calling this function. */
-static void
+void
url_unescape (char *s)
{
char *t = s; /* t - tortoise */
return url_escape_1 (s, urlchr_unsafe, false);
}
+/* URL-escape the unsafe and reserved characters (see urlchr_table) in
+ a given string, returning a freshly allocated string.
+
+ This is a thin wrapper: S is forwarded to url_escape_1 together with
+ the combined urlchr_unsafe|urlchr_reserved mask and false as the
+ last argument. */
+
+char *
+url_escape_unsafe_and_reserved (const char *s)
+{
+ return url_escape_1 (s, urlchr_unsafe|urlchr_reserved, false);
+}
+
/* URL-escape the unsafe characters (see urlchr_table) in a given
string. If no characters are unsafe, S is returned. */
return *p == ':';
}
+bool
+url_valid_scheme (const char *url)
+{ /* True iff url_scheme classifies URL as anything other than
+ SCHEME_INVALID. */
+ enum url_scheme scheme = url_scheme (url);
+ return scheme != SCHEME_INVALID;
+}
+
int
scheme_default_port (enum url_scheme scheme)
{
*p++ = '?';
if (flags & scm_has_fragment)
*p++ = '#';
- *p++ = '\0';
+ *p = '\0';
return seps;
}
#define PE_NO_ERROR 0
N_("No error"),
#define PE_UNSUPPORTED_SCHEME 1
- N_("Unsupported scheme"),
-#define PE_INVALID_HOST_NAME 2
+ N_("Unsupported scheme %s"), /* support for format token only here */
+#define PE_MISSING_SCHEME 2
+ N_("Scheme missing"),
+#define PE_INVALID_HOST_NAME 3
N_("Invalid host name"),
-#define PE_BAD_PORT_NUMBER 3
+#define PE_BAD_PORT_NUMBER 4
N_("Bad port number"),
-#define PE_INVALID_USER_NAME 4
+#define PE_INVALID_USER_NAME 5
N_("Invalid user name"),
-#define PE_UNTERMINATED_IPV6_ADDRESS 5
+#define PE_UNTERMINATED_IPV6_ADDRESS 6
N_("Unterminated IPv6 numeric address"),
-#define PE_IPV6_NOT_SUPPORTED 6
+#define PE_IPV6_NOT_SUPPORTED 7
N_("IPv6 addresses not supported"),
-#define PE_INVALID_IPV6_ADDRESS 7
+#define PE_INVALID_IPV6_ADDRESS 8
N_("Invalid IPv6 numeric address")
};
error, and if ERROR is not NULL, also set *ERROR to the appropriate
error code. */
struct url *
-url_parse (const char *url, int *error)
+url_parse (const char *url, int *error, struct iri *iri, bool percent_encode)
{
struct url *u;
const char *p;
int port;
char *user = NULL, *passwd = NULL;
- char *url_encoded = NULL;
+ const char *url_encoded = NULL;
+ char *new_url = NULL;
int error_code;
scheme = url_scheme (url);
if (scheme == SCHEME_INVALID)
{
- error_code = PE_UNSUPPORTED_SCHEME;
+ if (url_has_scheme (url))
+ error_code = PE_UNSUPPORTED_SCHEME;
+ else
+ error_code = PE_MISSING_SCHEME;
goto error;
}
- url_encoded = reencode_escapes (url);
+ if (iri && iri->utf8_encode)
+ {
+ iri->utf8_encode = remote_to_utf8 (iri, iri->orig_url ? iri->orig_url : url, (const char **) &new_url);
+ if (!iri->utf8_encode)
+ new_url = NULL;
+ else
+ {
+ iri->orig_url = xstrdup (url);
+ percent_encode = true;
+ }
+ }
+
+ /* XXX: when PERCENT_ENCODE is false the URL is used as-is, without
+ going through reencode_escapes. Could that change introduce
+ (security) bugs? */
+ if (percent_encode)
+ url_encoded = reencode_escapes (new_url ? new_url : url);
+ else
+ url_encoded = new_url ? new_url : url;
+
p = url_encoded;
+ if (new_url && url_encoded != new_url)
+ xfree (new_url);
+
p += strlen (supported_schemes[scheme].leading_string);
uname_b = p;
p = url_skip_credentials (p);
{
url_unescape (u->host);
host_modified = true;
+
+ /* Apply IDNA regardless of iri->utf8_encode status */
+ if (opt.enable_iri && iri)
+ {
+ char *new = idn_encode (iri, u->host);
+ if (new)
+ {
+ xfree (u->host);
+ u->host = new;
+ host_modified = true;
+ }
+ }
}
if (params_b)
if (fragment_b)
u->fragment = strdupdelim (fragment_b, fragment_e);
- if (path_modified || u->fragment || host_modified || path_b == path_e)
+ if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
{
/* If we suspect that a transformation has rendered what
url_string might return different from URL_ENCODED, rebuild
if (url_encoded == url)
u->url = xstrdup (url);
else
- u->url = url_encoded;
+ u->url = (char *) url_encoded;
}
return u;
error:
/* Cleanup in case of error: */
if (url_encoded && url_encoded != url)
- xfree (url_encoded);
+ xfree ((char *) url_encoded);
/* Transmit the error code to the caller, if the caller wants to
know. */
/* Return the error message string from ERROR_CODE, which should have
been retrieved from url_parse. The error message is translated. */
-const char *
-url_error (int error_code)
+char *
+url_error (const char *url, int error_code)
{
assert (error_code >= 0 && ((size_t) error_code) < countof (parse_errors));
- return _(parse_errors[error_code]);
+ /* Only the PE_UNSUPPORTED_SCHEME message is given an extra argument
+ when formatted, so it is built separately from the scheme in URL. */
+ if (error_code == PE_UNSUPPORTED_SCHEME)
+ {
+ char *error, *p;
+ char *scheme = xstrdup (url);
+ assert (url_has_scheme (url));
+ /* Cut the private copy at the first ':' so that only the scheme
+ name is left in SCHEME. */
+ if ((p = strchr (scheme, ':')))
+ *p = '\0';
+ if (!strcasecmp (scheme, "https"))
+ error = aprintf (_("HTTPS support not compiled in"));
+ else
+ error = aprintf (_(parse_errors[error_code]), quote (scheme));
+ xfree (scheme);
+ /* NOTE(review): ERROR comes from aprintf -- presumably heap
+ allocated and owned by the caller, like the xstrdup result
+ returned below; confirm against callers. */
+ return error;
+ }
+ else
+ return xstrdup (_(parse_errors[error_code]));
}
/* Split PATH into DIR and FILE. PATH comes from the URL and is
The idea is to have a convenient and efficient way to construct a
string by having various functions append data to it. Instead of
passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
- functions in questions, we pass the pointer to this struct. */
+ functions in question, we pass the pointer to this struct.
+
+ Functions that write to the members in this struct must make sure
+ that base remains NUL-terminated by calling append_null().
+ */
struct growable {
char *base;
- int size;
- int tail;
+ int size; /* memory allocated */
+ int tail; /* string length */
};
/* Ensure that the string can accept APPEND_COUNT more characters past
/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += append_count)
-/* Append the string STR to DEST. NOTICE: the string in DEST is not
- terminated. */
+/* Write a terminating NUL character at the tail of DEST, growing DEST
+ first so there is room for it. The tail position is not advanced,
+ so the next character appended overwrites the terminator. */
static void
-append_string (const char *str, struct growable *dest)
+append_null (struct growable *dest)
{
- int l = strlen (str);
- GROW (dest, l);
- memcpy (TAIL (dest), str, l);
- TAIL_INCR (dest, l);
+ GROW (dest, 1);
+ *TAIL (dest) = 0;
}
-/* Append CH to DEST. For example, append_char (0, DEST)
- zero-terminates DEST. */
-
+/* Append CH to DEST and keep DEST NUL-terminated. A CH of '\0' adds
+ no character to DEST; it only (re)writes the terminator through
+ append_null. */
static void
append_char (char ch, struct growable *dest)
{
- GROW (dest, 1);
- *TAIL (dest) = ch;
- TAIL_INCR (dest, 1);
+ if (ch)
+ {
+ GROW (dest, 1);
+ *TAIL (dest) = ch;
+ TAIL_INCR (dest, 1);
+ }
+
+ append_null (dest);
+}
+
+/* Append the string STR to DEST and keep DEST NUL-terminated. An
+ empty STR copies nothing, but the terminator is still written
+ through append_null. */
+static void
+append_string (const char *str, struct growable *dest)
+{
+ int l = strlen (str);
+
+ if (l)
+ {
+ GROW (dest, l);
+ memcpy (TAIL (dest), str, l);
+ TAIL_INCR (dest, l);
+ }
+
+ append_null (dest);
}
+
enum {
filechr_not_unix = 1, /* unusable on Unix, / and \0 */
filechr_not_windows = 2, /* unusable on Windows, one of \|/<>?:*" */
filechr_control = 4 /* a control character, e.g. 0-31 */
};
-#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))
+#define FILE_CHAR_TEST(c, mask) \
+ ((opt.restrict_files_nonascii && !c_isascii ((unsigned char)(c))) || \
+ (filechr_table[(unsigned char)(c)] & (mask)))
/* Shorthands for the table: */
#define U filechr_not_unix
query, normally '?'. Since Windows cannot handle '?' as part of
file name, we use '@' instead there. */
#define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
+#define FN_QUERY_SEP_STR (opt.restrict_files_os != restrict_windows ? "?" : "@")
/* Quote path element, characters in [b, e), as file name, and append
the quoted string to DEST. Each character is quoted as per
}
assert (q - TAIL (dest) == outlen);
}
-
+
/* Perform inline case transformation if required. */
if (opt.restrict_files_case == restrict_lowercase
|| opt.restrict_files_case == restrict_uppercase)
*q = c_toupper (*q);
}
}
-
+
TAIL_INCR (dest, outlen);
+ append_null (dest);
}
/* Append to DEST the directory structure that corresponds the
}
}
-/* Return a unique file name that matches the given URL as good as
+/* Return a unique file name that matches the given URL as well as
possible. Does not create directories on the file system. */
char *
-url_file_name (const struct url *u)
+url_file_name (const struct url *u, char *replaced_filename)
{
struct growable fnres; /* stands for "file name result" */
+ struct growable temp_fnres;
- const char *u_file, *u_query;
- char *fname, *unique;
+ const char *u_file;
+ char *fname, *unique, *fname_len_check;
+ const char *index_filename = "index.html"; /* The default index file is index.html */
+ size_t max_length;
fnres.base = NULL;
fnres.size = 0;
fnres.tail = 0;
+ temp_fnres.base = NULL;
+ temp_fnres.size = 0;
+ temp_fnres.tail = 0;
+
+ /* If an alternative index file was defined, change index_filename */
+ if (opt.default_page)
+ index_filename = opt.default_page;
+
+
/* Start with the directory prefix, if specified. */
if (opt.dir_prefix)
append_string (opt.dir_prefix, &fnres);
append_dir_structure (u, &fnres);
}
- /* Add the file name. */
- if (fnres.tail)
- append_char ('/', &fnres);
- u_file = *u->file ? u->file : "index.html";
- append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres);
+ if (!replaced_filename)
+ {
+ /* Create the filename. */
+ u_file = *u->file ? u->file : index_filename;
- /* Append "?query" to the file name. */
- u_query = u->query && *u->query ? u->query : NULL;
- if (u_query)
+ /* Append "?query" to the file name, even if empty,
+ * and create fname_len_check. */
+ if (u->query)
+ fname_len_check = concat_strings (u_file, FN_QUERY_SEP_STR, u->query, NULL);
+ else
+ fname_len_check = strdupdelim (u_file, u_file + strlen (u_file));
+ }
+ else
+ {
+ u_file = replaced_filename;
+ fname_len_check = strdupdelim (u_file, u_file + strlen (u_file));
+ }
+
+ append_uri_pathel (fname_len_check,
+ fname_len_check + strlen (fname_len_check), false, &temp_fnres);
+
+ /* Zero-terminate the temporary file name. */
+ append_char ('\0', &temp_fnres);
+
+ /* Check that the length of the file name is acceptable. */
+#ifdef WINDOWS
+ if (MAX_PATH > (fnres.tail + CHOMP_BUFFER + 2))
+ {
+ max_length = MAX_PATH - (fnres.tail + CHOMP_BUFFER + 2);
+ /* FIXME: In Windows a filename is usually limited to 255 characters.
+ To really be accurate you could call GetVolumeInformation() to get
+ lpMaximumComponentLength
+ */
+ if (max_length > 255)
+ {
+ max_length = 255;
+ }
+ }
+ else
+ {
+ max_length = 0;
+ }
+#else
+ max_length = get_max_length (fnres.base, fnres.tail, _PC_NAME_MAX) - CHOMP_BUFFER;
+#endif
+ if (max_length > 0 && strlen (temp_fnres.base) > max_length)
{
- append_char (FN_QUERY_SEP, &fnres);
- append_uri_pathel (u_query, u_query + strlen (u_query), true, &fnres);
+ logprintf (LOG_NOTQUIET, "The name is too long, %lu chars total.\n",
+ (unsigned long) strlen (temp_fnres.base));
+ logprintf (LOG_NOTQUIET, "Trying to shorten...\n");
+
+ /* Shorten the file name. */
+ temp_fnres.base[max_length] = '\0';
+
+ logprintf (LOG_NOTQUIET, "New name is %s.\n", temp_fnres.base);
}
- /* Zero-terminate the file name. */
- append_char ('\0', &fnres);
+ free (fname_len_check);
+
+ /* The filename has already been 'cleaned' by append_uri_pathel() above. So,
+ * just append it. */
+ if (fnres.tail)
+ append_char ('/', &fnres);
+ append_string (temp_fnres.base, &fnres);
fname = fnres.base;
+ /* Make a final check that the path length is acceptable? */
+ /* TODO: check fnres.base for path length problem */
+
+ free (temp_fnres.base);
+
/* Check the cases in which the unique extensions are not used:
1) Clobbering is turned off (-nc).
2) Retrieval with regetting.
3) Timestamping is used.
4) Hierarchy is built.
+ 5) Backups are specified.
The exception is the case when file does exist and is a
directory (see `mkalldirs' for explanation). */
- if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
+ if (ALLOW_CLOBBER
&& !(file_exists_p (fname) && !file_non_directory_p (fname)))
- return fname;
+ {
+ unique = fname;
+ }
+ else
+ {
+ unique = unique_name (fname, true);
+ if (unique != fname)
+ xfree (fname);
+ }
+
+/* On VMS, alter the name as required. */
+#ifdef __VMS
+ {
+ char *unique2;
+
+ unique2 = ods_conform( unique);
+ if (unique2 != unique)
+ {
+ xfree (unique);
+ unique = unique2;
+ }
+ }
+#endif /* def __VMS */
- unique = unique_name (fname, true);
- if (unique != fname)
- xfree (fname);
return unique;
}
\f
}
\f
/* Return true if scheme a is similar to scheme b.
-
+
Schemes are similar if they are equal. If SSL is supported, schemes
are also similar if one is http (SCHEME_HTTP) and the other is https
(SCHEME_HTTPS). */
\f
static int
getchar_from_escaped_string (const char *str, char *c)
-{
+{
const char *p = str;
assert (str && *str);
assert (c);
-
+
if (p[0] == '%')
{
if (!c_isxdigit(p[1]) || !c_isxdigit(p[2]))
p += pp;
q += qq;
}
-
+
return (*p == 0 && *q == 0 ? true : false);
}
\f
} test_array[] = {
{ "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" },
};
-
- for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
+
+ for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
{
struct growable dest;
const char *p = test_array[i].input;
-
+
memset (&dest, 0, sizeof (dest));
-
+
append_string (test_array[i].original_url, &dest);
append_uri_pathel (p, p + strlen(p), test_array[i].escaped, &dest);
- append_char ('\0', &dest);
- mu_assert ("test_append_uri_pathel: wrong result",
+ mu_assert ("test_append_uri_pathel: wrong result",
strcmp (dest.base, test_array[i].expected_result) == 0);
}
{ "http://www.adomain.com/longer-path/", "http://www.adomain.com/path/", false },
{ "http://www.adomain.com/path%2f", "http://www.adomain.com/path/", false },
};
-
- for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
+
+ for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
{
- mu_assert ("test_are_urls_equal: wrong result",
+ mu_assert ("test_are_urls_equal: wrong result",
are_urls_equal (test_array[i].url1, test_array[i].url2) == test_array[i].expected_result);
}