#include "wget.h"
#include "utils.h"
#include "url.h"
-#include "host.h"
-#include "hash.h"
#ifndef errno
extern int errno;
{
unsigned char c = *p1++;
*p2++ = '%';
- *p2++ = XNUM_TO_digit (c >> 4);
- *p2++ = XNUM_TO_digit (c & 0xf);
+ *p2++ = XNUM_TO_DIGIT (c >> 4);
+ *p2++ = XNUM_TO_DIGIT (c & 0xf);
}
else
*p2++ = *p1++;
static char *parse_errors[] = {
#define PE_NO_ERROR 0
- "No error",
+ N_("No error"), /* Each PE_* value is the index of its message in this
+ table; url_error looks the entry up and wraps it in _(). */
#define PE_UNSUPPORTED_SCHEME 1
- "Unsupported scheme",
+ N_("Unsupported scheme"),
#define PE_EMPTY_HOST 2
- "Empty host",
+ N_("Empty host"),
#define PE_BAD_PORT_NUMBER 3
- "Bad port number",
+ N_("Bad port number"),
#define PE_INVALID_USER_NAME 4
- "Invalid user name",
+ N_("Invalid user name"),
#define PE_UNTERMINATED_IPV6_ADDRESS 5
- "Unterminated IPv6 numeric address",
+ N_("Unterminated IPv6 numeric address"),
#define PE_IPV6_NOT_SUPPORTED 6
- "IPv6 addresses not supported",
+ N_("IPv6 addresses not supported"),
#define PE_INVALID_IPV6_ADDRESS 7
- "Invalid IPv6 numeric address"
+ N_("Invalid IPv6 numeric address")
};
-#define SETERR(p, v) do { \
- if (p) \
- *(p) = (v); \
-} while (0)
-
#ifdef ENABLE_IPV6
/* The following two functions were adapted from glibc. */
static int
is_valid_ipv4_address (const char *str, const char *end)
{
- int saw_digit, octets;
- int val;
-
- saw_digit = 0;
- octets = 0;
- val = 0;
+ int saw_digit = 0; /* 1 once the current octet has at least one digit */
+ int octets = 0; /* number of octets started so far */
+ int val = 0; /* decimal value of the octet being read */
- while (str < end) {
- int ch = *str++;
+ while (str < end) /* scan the half-open range [str, end) */
+ {
+ int ch = *str++;
- if (ch >= '0' && ch <= '9') {
- val = val * 10 + (ch - '0');
+ if (ch >= '0' && ch <= '9')
+ {
+ val = val * 10 + (ch - '0');
- if (val > 255)
- return 0;
- if (saw_digit == 0) {
- if (++octets > 4)
- return 0;
- saw_digit = 1;
- }
- } else if (ch == '.' && saw_digit == 1) {
- if (octets == 4)
- return 0;
- val = 0;
- saw_digit = 0;
- } else
- return 0;
- }
+ if (val > 255) /* an octet cannot exceed 255 */
+ return 0;
+ if (saw_digit == 0)
+ {
+ if (++octets > 4) /* first digit of a new octet; at most four */
+ return 0;
+ saw_digit = 1;
+ }
+ }
+ else if (ch == '.' && saw_digit == 1) /* a dot must follow a digit */
+ {
+ if (octets == 4) /* no dot is allowed after the fourth octet */
+ return 0;
+ val = 0;
+ saw_digit = 0;
+ }
+ else
+ return 0; /* any other character, or a misplaced dot, is invalid */
+ }
if (octets < 4)
return 0;
return 1;
}
-static const int NS_INADDRSZ = 4;
-static const int NS_IN6ADDRSZ = 16;
-static const int NS_INT16SZ = 2;
-
static int
is_valid_ipv6_address (const char *str, const char *end)
{
- static const char xdigits[] = "0123456789abcdef";
+ enum {
+ NS_INADDRSZ = 4,
+ NS_IN6ADDRSZ = 16,
+ NS_INT16SZ = 2
+ };
+
const char *curtok;
int tp;
const char *colonp;
saw_xdigit = 0;
val = 0;
- while (str < end) {
- int ch = *str++;
- const char *pch;
+ while (str < end)
+ {
+ int ch = *str++;
- /* if ch is a number, add it to val. */
- pch = strchr(xdigits, ch);
- if (pch != NULL) {
- val <<= 4;
- val |= (pch - xdigits);
- if (val > 0xffff)
- return 0;
- saw_xdigit = 1;
- continue;
+ /* if ch is a number, add it to val. */
+ if (ISXDIGIT (ch))
+ {
+ val <<= 4;
+ val |= XDIGIT_TO_NUM (ch);
+ if (val > 0xffff)
+ return 0;
+ saw_xdigit = 1;
+ continue;
+ }
+
+ /* if ch is a colon ... */
+ if (ch == ':')
+ {
+ curtok = str;
+ if (saw_xdigit == 0)
+ {
+ if (colonp != NULL)
+ return 0;
+ colonp = str + tp;
+ continue;
+ }
+ else if (str == end)
+ return 0;
+ if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
+ return 0;
+ tp += NS_INT16SZ;
+ saw_xdigit = 0;
+ val = 0;
+ continue;
+ }
+
+ /* if ch is a dot ... */
+ if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ)
+ && is_valid_ipv4_address (curtok, end) == 1)
+ {
+ tp += NS_INADDRSZ;
+ saw_xdigit = 0;
+ break;
+ }
+
+ return 0;
}
- /* if ch is a colon ... */
- if (ch == ':') {
- curtok = str;
- if (saw_xdigit == 0) {
- if (colonp != NULL)
- return 0;
- colonp = str + tp;
- continue;
- } else if (str == end) {
- return 0;
- }
- if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
+ if (saw_xdigit == 1)
+ {
+ if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
return 0;
tp += NS_INT16SZ;
- saw_xdigit = 0;
- val = 0;
- continue;
}
- /* if ch is a dot ... */
- if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ) &&
- is_valid_ipv4_address(curtok, end) == 1) {
- tp += NS_INADDRSZ;
- saw_xdigit = 0;
- break;
+ if (colonp != NULL)
+ {
+ if (tp == NS_IN6ADDRSZ)
+ return 0;
+ tp = NS_IN6ADDRSZ;
}
-
- return 0;
- }
-
- if (saw_xdigit == 1) {
- if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
- return 0;
- tp += NS_INT16SZ;
- }
-
- if (colonp != NULL) {
- if (tp == NS_IN6ADDRSZ)
- return 0;
- tp = NS_IN6ADDRSZ;
- }
if (tp != NS_IN6ADDRSZ)
return 0;
int port;
char *user = NULL, *passwd = NULL;
- char *url_encoded;
+ char *url_encoded = NULL;
+
+ int error_code;
scheme = url_scheme (url);
if (scheme == SCHEME_INVALID)
{
- SETERR (error, PE_UNSUPPORTED_SCHEME);
- return NULL;
+ error_code = PE_UNSUPPORTED_SCHEME;
+ goto error;
}
url_encoded = reencode_escapes (url);
if (!host_e)
{
- SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS);
- return NULL;
+ error_code = PE_UNTERMINATED_IPV6_ADDRESS;
+ goto error;
}
#ifdef ENABLE_IPV6
/* Check if the IPv6 address is valid. */
if (!is_valid_ipv6_address(host_b, host_e))
{
- SETERR (error, PE_INVALID_IPV6_ADDRESS);
- return NULL;
+ error_code = PE_INVALID_IPV6_ADDRESS;
+ goto error;
}
/* Continue parsing after the closing ']'. */
p = host_e + 1;
#else
- SETERR (error, PE_IPV6_NOT_SUPPORTED);
- return NULL;
+ error_code = PE_IPV6_NOT_SUPPORTED;
+ goto error;
#endif
}
else
if (host_b == host_e)
{
- SETERR (error, PE_EMPTY_HOST);
- return NULL;
+ error_code = PE_EMPTY_HOST;
+ goto error;
}
port = scheme_default_port (scheme);
{
/* http://host:/whatever */
/* ^ */
- SETERR (error, PE_BAD_PORT_NUMBER);
- return NULL;
+ error_code = PE_BAD_PORT_NUMBER;
+ goto error;
}
for (port = 0, pp = port_b; pp < port_e; pp++)
{
/* http://host:12randomgarbage/blah */
/* ^ */
- SETERR (error, PE_BAD_PORT_NUMBER);
- return NULL;
+ error_code = PE_BAD_PORT_NUMBER;
+ goto error;
}
port = 10 * port + (*pp - '0');
/* uname_b uname_e */
if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
{
- SETERR (error, PE_INVALID_USER_NAME);
- return NULL;
+ error_code = PE_INVALID_USER_NAME;
+ goto error;
}
}
- u = (struct url *)xmalloc (sizeof (struct url));
- memset (u, 0, sizeof (*u));
-
+ u = xnew0 (struct url);
u->scheme = scheme;
u->host = strdupdelim (host_b, host_e);
u->port = port;
url_encoded = NULL;
return u;
+
+ error:
+ /* Cleanup in case of error: */
+ if (url_encoded && url_encoded != url)
+ xfree (url_encoded);
+
+ /* Transmit the error code to the caller, if the caller wants to
+ know. */
+ if (error)
+ *error = error_code;
+ return NULL;
}
+/* Return the error message string from ERROR_CODE, which should have
+ been retrieved from url_parse. The error message is translated.
+
+ ERROR_CODE must be a valid index into parse_errors; this is
+ checked with an assertion rather than reported to the caller. */
+
const char *
url_error (int error_code)
{
assert (error_code >= 0 && error_code < countof (parse_errors));
- return parse_errors[error_code];
+ return _(parse_errors[error_code]);
}
/* Split PATH into DIR and FILE. PATH comes from the URL and is
for (; *h; h++, t++)
{
+ /* This relies on url_escape_1 having converted '/' to exactly "%2F". */
if (*h == '%' && h[1] == '2' && h[2] == 'F')
{
*t = '/';
enum {
filechr_not_unix = 1, /* unusable on Unix, / and \0 */
filechr_not_windows = 2, /* unusable on Windows, one of \|/<>?:*" */
- filechr_control = 4, /* a control character, e.g. 0-31 */
+ filechr_control = 4 /* a control character, e.g. 0-31. The three values
+ are distinct bits, so they can be OR-ed together into the mask that
+ FILE_CHAR_TEST checks against. */
};
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))
/* Quote path element, characters in [b, e), as file name, and append
the quoted string to DEST. Each character is quoted as per
- file_unsafe_char and the corresponding table. */
+ file_unsafe_char and the corresponding table.
+
+ If ESCAPED_P is non-zero, the path element is considered to be
+ URL-escaped and will be unescaped prior to inspection. */
static void
-append_uri_pathel (const char *b, const char *e, struct growable *dest)
+append_uri_pathel (const char *b, const char *e, int escaped_p,
+ struct growable *dest)
{
- char *pathel;
- int pathlen;
-
const char *p;
int quoted, outlen;
mask |= filechr_control;
/* Copy [b, e) to PATHEL and URL-unescape it. */
- BOUNDED_TO_ALLOCA (b, e, pathel);
- url_unescape (pathel);
- pathlen = strlen (pathel);
+ if (escaped_p)
+ {
+ char *unescaped;
+ BOUNDED_TO_ALLOCA (b, e, unescaped);
+ url_unescape (unescaped);
+ b = unescaped;
+ e = unescaped + strlen (unescaped);
+ }
- /* Go through PATHEL and check how many characters we'll need to
- add for file quoting. */
+ /* Walk the path element [b, e) and count how many of its characters
+ need file quoting. */
quoted = 0;
- for (p = pathel; *p; p++)
+ for (p = b; p < e; p++)
if (FILE_CHAR_TEST (*p, mask))
++quoted;
- /* p - pathel is the string length. Each quoted char means two
- additional characters in the string, hence 2*quoted. */
- outlen = (p - pathel) + (2 * quoted);
+ /* e-b is the string length. Each quoted char means two additional
+ characters in the string, hence 2*quoted. */
+ outlen = (e - b) + (2 * quoted);
GROW (dest, outlen);
if (!quoted)
{
/* If there's nothing to quote, we don't need to go through the
string the second time. */
- memcpy (TAIL (dest), pathel, outlen);
+ memcpy (TAIL (dest), b, outlen);
}
else
{
char *q = TAIL (dest);
- for (p = pathel; *p; p++)
+ for (p = b; p < e; p++)
{
if (!FILE_CHAR_TEST (*p, mask))
*q++ = *p;
if (cut-- > 0)
continue;
if (pathel == next)
- /* Ignore empty pathels. path_simplify should remove
- occurrences of "//" from the path, but it has special cases
- for starting / which generates an empty pathel here. */
+ /* Ignore empty pathels. */
continue;
if (dest->tail)
append_char ('/', dest);
- append_uri_pathel (pathel, next, dest);
+ append_uri_pathel (pathel, next, 1, dest);
}
}
if (fnres.tail)
append_char ('/', &fnres);
u_file = *u->file ? u->file : "index.html";
- append_uri_pathel (u_file, u_file + strlen (u_file), &fnres);
+ append_uri_pathel (u_file, u_file + strlen (u_file), 0, &fnres);
/* Append "?query" to the file name. */
u_query = u->query && *u->query ? u->query : NULL;
if (u_query)
{
append_char (FN_QUERY_SEP, &fnres);
- append_uri_pathel (u_query, u_query + strlen (u_query), &fnres);
+ append_uri_pathel (u_query, u_query + strlen (u_query), 1, &fnres);
}
/* Zero-terminate the file name. */
}
\f
/* Resolve "." and ".." elements of PATH by destructively modifying
- PATH. "." is resolved by removing that path element, and ".." is
- resolved by removing the preceding path element. Leading and
- trailing slashes are preserved.
+ PATH and return non-zero if PATH has been modified, zero otherwise.
- Return non-zero if any changes have been made.
+ The algorithm is in spirit similar to the one described in rfc1808,
+ although implemented differently, in one pass. To recap, path
+ elements containing only "." are removed, and ".." is taken to mean
+ "back up one element". Single leading and trailing slashes are
+ preserved.
+
+ This function does not handle URL escapes explicitly. If you're
+ passing paths from URLs, make sure to unquote "%2e" and "%2E" to
+ ".", so that this function can find the dots. (Wget's URL parser
+ calls reencode_escapes, which see.)
For example, "a/b/c/./../d/.." will yield "a/b/". More exhaustive
test examples are provided below. If you change anything in this
function, run test_path_simplify to make sure you haven't broken a
- test case.
-
- A previous version of this function was based on path_simplify()
- from GNU Bash, but it has been rewritten for Wget 1.8.1. */
+ test case. */
static int
path_simplify (char *path)
{
- int change = 0;
- char *p, *end;
+ char *h, *t, *end;
+ /* Preserve the leading '/': only the part after it is simplified. */
if (path[0] == '/')
- ++path; /* preserve the leading '/'. */
+ ++path;
- p = path;
- end = p + strlen (p) + 1; /* position past the terminating zero. */
+ h = path; /* hare: the read position, which scans ahead */
+ t = path; /* tortoise: the write position of the simplified result */
+ end = path + strlen (path); /* points at the terminating zero */
- while (1)
+ while (h < end)
{
- again:
- /* P should point to the beginning of a path element. */
+ /* Hare should be at the beginning of a path element. */
- if (*p == '.' && (*(p + 1) == '/' || *(p + 1) == '\0'))
+ if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
{
- /* Handle "./foo" by moving "foo" two characters to the
- left. */
- if (*(p + 1) == '/')
- {
- change = 1;
- memmove (p, p + 2, end - (p + 2));
- end -= 2;
- goto again;
- }
- else
- {
- change = 1;
- *p = '\0';
- break;
- }
+ /* Ignore "./" (and a trailing "."): only the hare advances, so
+ nothing is copied. For a trailing "." the hare ends up one
+ past END, which still terminates the loop. */
+ h += 2;
}
- else if (*p == '.' && *(p + 1) == '.'
- && (*(p + 2) == '/' || *(p + 2) == '\0'))
+ else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
{
- /* Handle "../foo" by moving "foo" one path element to the
- left. */
- char *b = p; /* not p-1 because P can equal PATH */
-
- /* Backtrack by one path element, but not past the beginning
- of PATH. */
-
- /* foo/bar/../baz */
- /* ^ p */
- /* ^ b */
-
- if (b > path)
+ /* Handle "../" by retreating the tortoise by one path
+ element -- but not past the beginning of PATH. When the
+ tortoise is already at the beginning, the ".." is simply
+ dropped. */
+ if (t > path)
{
- /* Move backwards until B hits the beginning of the
+ /* Move backwards until T hits the beginning of the
previous path element or the beginning of path. */
- for (--b; b > path && *(b - 1) != '/'; b--)
+ for (--t; t > path && t[-1] != '/'; t--)
;
}
-
- change = 1;
- if (*(p + 2) == '/')
+ h += 3;
+ }
+ else if (*h == '/')
+ {
+ /* Ignore empty path elements. Supporting them well is hard
+ (where do you save "http://x.com///y.html"?), and they
+ don't bring any practical gain. Plus, they break our
+ filesystem-influenced assumptions: allowing them would
+ make "x/y//../z" simplify to "x/y/z", whereas most people
+ would expect "x/z". */
+ ++h;
+ }
+ else
+ {
+ /* A regular path element. If H hasn't advanced past T,
+ simply skip to the next path element. Otherwise, copy
+ the path element until the next slash. */
+ if (t == h)
{
- memmove (b, p + 3, end - (p + 3));
- end -= (p + 3) - b;
- p = b;
+ /* Skip the path element, including the slash. */
+ while (h < end && *h != '/')
+ t++, h++;
+ if (h < end)
+ t++, h++;
}
else
{
- *b = '\0';
- break;
+ /* Copy the path element, including the final slash. */
+ while (h < end && *h != '/')
+ *t++ = *h++;
+ if (h < end)
+ *t++ = *h++;
}
-
- goto again;
- }
- else if (*p == '/')
- {
- /* Remove empty path elements. Not mandated by rfc1808 et
- al, but it seems like a good idea to get rid of them.
- Supporting them properly is hard (in which directory do
- you save http://x.com///y.html?) and they don't seem to
- bring much gain. */
- char *q = p;
- while (*q == '/')
- ++q;
- change = 1;
- if (*q == '\0')
- {
- *p = '\0';
- break;
- }
- memmove (p, q, end - q);
- end -= q - p;
- goto again;
}
-
- /* Skip to the next path element. */
- while (*p && *p != '/')
- ++p;
- if (*p == '\0')
- break;
-
- /* Make sure P points to the beginning of the next path element,
- which is location after the slash. */
- ++p;
}
- return change;
+ if (t != h)
+ *t = '\0'; /* terminate the shortened path */
+
+ return t != h; /* non-zero when the tortoise fell behind the hare, i.e.
+ something was removed from PATH */
}
\f
/* Merge BASE with LINK and return the resulting URI.