/* URL handling.
- Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003, 2003
- Free Software Foundation, Inc.
+ Copyright (C) 2005 Free Software Foundation, Inc.
This file is part of GNU Wget.
#include "wget.h"
#include "utils.h"
#include "url.h"
+#include "host.h" /* for is_valid_ipv6_address */
#ifndef errno
extern int errno;
changing the meaning of the URL. For example, you can't decode
"/foo/%2f/bar" into "/foo///bar" because the number and contents of
path components is different. Non-reserved characters can be
- changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar". Wget
- uses the rfc1738 set of reserved characters, plus "$" and ",", as
- recommended by rfc2396.
-
- An unsafe characters is the one that should be encoded when URLs
- are placed in foreign environments. E.g. space and newline are
- unsafe in HTTP contexts because HTTP uses them as separator and
+ changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar". The
+ unsafe characters are loosely based on rfc1738, plus "$" and ",",
+ as recommended by rfc2396, and minus "~", which is very frequently
+ used (and sometimes unrecognized as %7E by broken servers).
+
+ An unsafe character is the one that should be encoded when URLs are
+ placed in foreign environments. E.g. space and newline are unsafe
+ in HTTP contexts because HTTP uses them as separator and line
terminator, so they must be encoded to %20 and %0A respectively.
"*" is unsafe in shell context, etc.
#define U urlchr_unsafe
#define RU R|U
-const static unsigned char urlchr_table[256] =
+static const unsigned char urlchr_table[256] =
{
U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
- 0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
+ 0, 0, 0, U, U, U, 0, U, /* x y z { | } ~ DEL */
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
{
const char *p;
- if (url_has_scheme (url))
+ if (url_scheme (url) != SCHEME_INVALID)
return NULL;
/* Look for a ':' or '/'. The former signifies NcFTP syntax, the
N_("Invalid IPv6 numeric address")
};
-#ifdef ENABLE_IPV6
-/* The following two functions were adapted from glibc. */
-
-static int
-is_valid_ipv4_address (const char *str, const char *end)
-{
- int saw_digit = 0;
- int octets = 0;
- int val = 0;
-
- while (str < end)
- {
- int ch = *str++;
-
- if (ch >= '0' && ch <= '9')
- {
- val = val * 10 + (ch - '0');
-
- if (val > 255)
- return 0;
- if (saw_digit == 0)
- {
- if (++octets > 4)
- return 0;
- saw_digit = 1;
- }
- }
- else if (ch == '.' && saw_digit == 1)
- {
- if (octets == 4)
- return 0;
- val = 0;
- saw_digit = 0;
- }
- else
- return 0;
- }
- if (octets < 4)
- return 0;
-
- return 1;
-}
-
-static int
-is_valid_ipv6_address (const char *str, const char *end)
-{
- enum {
- NS_INADDRSZ = 4,
- NS_IN6ADDRSZ = 16,
- NS_INT16SZ = 2
- };
-
- const char *curtok;
- int tp;
- const char *colonp;
- int saw_xdigit;
- unsigned int val;
-
- tp = 0;
- colonp = NULL;
-
- if (str == end)
- return 0;
-
- /* Leading :: requires some special handling. */
- if (*str == ':')
- {
- ++str;
- if (str == end || *str != ':')
- return 0;
- }
-
- curtok = str;
- saw_xdigit = 0;
- val = 0;
-
- while (str < end)
- {
- int ch = *str++;
-
- /* if ch is a number, add it to val. */
- if (ISXDIGIT (ch))
- {
- val <<= 4;
- val |= XDIGIT_TO_NUM (ch);
- if (val > 0xffff)
- return 0;
- saw_xdigit = 1;
- continue;
- }
-
- /* if ch is a colon ... */
- if (ch == ':')
- {
- curtok = str;
- if (saw_xdigit == 0)
- {
- if (colonp != NULL)
- return 0;
- colonp = str + tp;
- continue;
- }
- else if (str == end)
- return 0;
- if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
- return 0;
- tp += NS_INT16SZ;
- saw_xdigit = 0;
- val = 0;
- continue;
- }
-
- /* if ch is a dot ... */
- if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ)
- && is_valid_ipv4_address (curtok, end) == 1)
- {
- tp += NS_INADDRSZ;
- saw_xdigit = 0;
- break;
- }
-
- return 0;
- }
-
- if (saw_xdigit == 1)
- {
- if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
- return 0;
- tp += NS_INT16SZ;
- }
-
- if (colonp != NULL)
- {
- if (tp == NS_IN6ADDRSZ)
- return 0;
- tp = NS_IN6ADDRSZ;
- }
-
- if (tp != NS_IN6ADDRSZ)
- return 0;
-
- return 1;
-}
-#endif
-
/* Parse a URL.
Return a new struct url if successful, NULL on error. In case of
if (scheme == SCHEME_INVALID)
{
error_code = PE_UNSUPPORTED_SCHEME;
- goto error;
+ goto err;
}
url_encoded = reencode_escapes (url);
if (!host_e)
{
error_code = PE_UNTERMINATED_IPV6_ADDRESS;
- goto error;
+ goto err;
}
#ifdef ENABLE_IPV6
if (!is_valid_ipv6_address(host_b, host_e))
{
error_code = PE_INVALID_IPV6_ADDRESS;
- goto error;
+ goto err;
}
/* Continue parsing after the closing ']'. */
p = host_e + 1;
#else
error_code = PE_IPV6_NOT_SUPPORTED;
- goto error;
+ goto err;
#endif
}
else
if (host_b == host_e)
{
error_code = PE_EMPTY_HOST;
- goto error;
+ goto err;
}
port = scheme_default_port (scheme);
/* http://host:12randomgarbage/blah */
/* ^ */
error_code = PE_BAD_PORT_NUMBER;
- goto error;
+ goto err;
}
port = 10 * port + (*pp - '0');
+ /* Check for too large port numbers here, before we have
+ a chance to overflow on bogus port values. */
+ if (port > 65535)
+ {
+ error_code = PE_BAD_PORT_NUMBER;
+ goto err;
+ }
}
}
}
if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
{
error_code = PE_INVALID_USER_NAME;
- goto error;
+ goto err;
}
}
return u;
- error:
+ err:
/* Cleanup in case of error: */
if (url_encoded && url_encoded != url)
xfree (url_encoded);
url_full_path (const struct url *url)
{
int length = full_path_length (url);
- char *full_path = (char *)xmalloc(length + 1);
+ char *full_path = (char *) xmalloc (length + 1);
full_path_write (url, full_path);
full_path[length] = '\0';
return full_path;
}
-/* Escape unsafe and reserved characters, except for the slash
- characters. */
+/* Unescape CHR in an otherwise escaped STR. Used to selectively
+ escaping of certain characters, such as "/" and ":". Returns a
+ count of unescaped chars. */
-static char *
-url_escape_dir (const char *dir)
+static void
+unescape_single_char (char *str, char chr)
{
- char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
- char *h, *t;
- if (newdir == dir)
- return (char *)dir;
-
- /* Unescape slashes in NEWDIR. */
-
- h = newdir; /* hare */
- t = newdir; /* tortoise */
-
+ const char c1 = XNUM_TO_DIGIT (chr >> 4);
+ const char c2 = XNUM_TO_DIGIT (chr & 0xf);
+ char *h = str; /* hare */
+ char *t = str; /* tortoise */
for (; *h; h++, t++)
{
- /* url_escape_1 having converted '/' to "%2F" exactly. */
- if (*h == '%' && h[1] == '2' && h[2] == 'F')
+ if (h[0] == '%' && h[1] == c1 && h[2] == c2)
{
- *t = '/';
+ *t = chr;
h += 2;
}
else
*t = *h;
}
*t = '\0';
+}
+/* Escape unsafe and reserved characters, except for the slash
+ characters. */
+
+static char *
+url_escape_dir (const char *dir)
+{
+ char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
+ if (newdir == dir)
+ return (char *)dir;
+
+ unescape_single_char (newdir, '/');
return newdir;
}
{
const char *p;
char *t;
- struct stat st;
+ struct_stat st;
int res;
p = path + strlen (path);
translate file name back to URL, this would become important
crucial. Right now, it's better to be minimal in escaping. */
-const static unsigned char filechr_table[256] =
+static const unsigned char filechr_table[256] =
{
UWC, C, C, C, C, C, C, C, /* NUL SOH STX ETX EOT ENQ ACK BEL */
C, C, C, C, C, C, C, C, /* BS HT LF VT FF CR SO SI */
char *
url_file_name (const struct url *u)
{
- struct growable fnres;
+ struct growable fnres; /* stands for "file name result" */
const char *u_file, *u_query;
char *fname, *unique;
{
if (fnres.tail)
append_char ('/', &fnres);
- append_string (u->host, &fnres);
+ if (0 != strcmp (u->host, ".."))
+ append_string (u->host, &fnres);
+ else
+ /* Host name can come from the network; malicious DNS may
+ allow ".." to be resolved, causing us to write to
+ "../<file>". Defang such host names. */
+ append_string ("%2E%2E", &fnres);
if (u->port != scheme_default_port (u->scheme))
{
char portstr[24];
{
int size;
char *result, *p;
- char *quoted_user = NULL, *quoted_passwd = NULL;
+ char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
int scheme_port = supported_schemes[url->scheme].default_port;
const char *scheme_str = supported_schemes[url->scheme].leading_string;
}
}
- /* Numeric IPv6 addresses can contain ':' and need to be quoted with
- brackets. */
- brackets_around_host = strchr (url->host, ':') != NULL;
+ /* In the unlikely event that the host name contains non-printable
+ characters, quote it for displaying to the user. */
+ quoted_host = url_escape_allow_passthrough (url->host);
+
+ /* Undo the quoting of colons that URL escaping performs. IPv6
+ addresses may legally contain colons, and in that case must be
+ placed in square brackets. */
+ if (quoted_host != url->host)
+ unescape_single_char (quoted_host, ':');
+ brackets_around_host = strchr (quoted_host, ':') != NULL;
size = (strlen (scheme_str)
- + strlen (url->host)
+ + strlen (quoted_host)
+ (brackets_around_host ? 2 : 0)
+ fplen
+ 1);
if (brackets_around_host)
*p++ = '[';
- APPEND (p, url->host);
+ APPEND (p, quoted_host);
if (brackets_around_host)
*p++ = ']';
if (url->port != scheme_port)
if (quoted_user && quoted_user != url->user)
xfree (quoted_user);
- if (quoted_passwd && !hide_password
- && quoted_passwd != url->passwd)
+ if (quoted_passwd && !hide_password && quoted_passwd != url->passwd)
xfree (quoted_passwd);
+ if (quoted_host != url->host)
+ xfree (quoted_host);
return result;
}