#include "wget.h"
#include "utils.h"
#include "url.h"
-#include "host.h"
-#include "hash.h"
#ifndef errno
extern int errno;
{
unsigned char c = *p1++;
*p2++ = '%';
- *p2++ = XNUM_TO_digit (c >> 4);
- *p2++ = XNUM_TO_digit (c & 0xf);
+ *p2++ = XNUM_TO_DIGIT (c >> 4);
+ *p2++ = XNUM_TO_DIGIT (c & 0xf);
}
else
*p2++ = *p1++;
static char *parse_errors[] = {
#define PE_NO_ERROR 0
- "No error",
+ N_("No error"),
#define PE_UNSUPPORTED_SCHEME 1
- "Unsupported scheme",
+ N_("Unsupported scheme"),
#define PE_EMPTY_HOST 2
- "Empty host",
+ N_("Empty host"),
#define PE_BAD_PORT_NUMBER 3
- "Bad port number",
+ N_("Bad port number"),
#define PE_INVALID_USER_NAME 4
- "Invalid user name",
+ N_("Invalid user name"),
#define PE_UNTERMINATED_IPV6_ADDRESS 5
- "Unterminated IPv6 numeric address",
+ N_("Unterminated IPv6 numeric address"),
#define PE_IPV6_NOT_SUPPORTED 6
- "IPv6 addresses not supported",
+ N_("IPv6 addresses not supported"),
#define PE_INVALID_IPV6_ADDRESS 7
- "Invalid IPv6 numeric address"
+ N_("Invalid IPv6 numeric address")
};
-#define SETERR(p, v) do { \
- if (p) \
- *(p) = (v); \
-} while (0)
-
#ifdef ENABLE_IPV6
/* The following two functions were adapted from glibc. */
int port;
char *user = NULL, *passwd = NULL;
- char *url_encoded;
+ char *url_encoded = NULL;
+
+ int error_code;
scheme = url_scheme (url);
if (scheme == SCHEME_INVALID)
{
- SETERR (error, PE_UNSUPPORTED_SCHEME);
- return NULL;
+ error_code = PE_UNSUPPORTED_SCHEME;
+ goto error;
}
url_encoded = reencode_escapes (url);
if (!host_e)
{
- SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS);
- return NULL;
+ error_code = PE_UNTERMINATED_IPV6_ADDRESS;
+ goto error;
}
#ifdef ENABLE_IPV6
/* Check if the IPv6 address is valid. */
if (!is_valid_ipv6_address(host_b, host_e))
{
- SETERR (error, PE_INVALID_IPV6_ADDRESS);
- return NULL;
+ error_code = PE_INVALID_IPV6_ADDRESS;
+ goto error;
}
/* Continue parsing after the closing ']'. */
p = host_e + 1;
#else
- SETERR (error, PE_IPV6_NOT_SUPPORTED);
- return NULL;
+ error_code = PE_IPV6_NOT_SUPPORTED;
+ goto error;
#endif
}
else
if (host_b == host_e)
{
- SETERR (error, PE_EMPTY_HOST);
- return NULL;
+ error_code = PE_EMPTY_HOST;
+ goto error;
}
port = scheme_default_port (scheme);
{
/* http://host:/whatever */
/* ^ */
- SETERR (error, PE_BAD_PORT_NUMBER);
- return NULL;
+ error_code = PE_BAD_PORT_NUMBER;
+ goto error;
}
for (port = 0, pp = port_b; pp < port_e; pp++)
{
/* http://host:12randomgarbage/blah */
/* ^ */
- SETERR (error, PE_BAD_PORT_NUMBER);
- return NULL;
+ error_code = PE_BAD_PORT_NUMBER;
+ goto error;
}
port = 10 * port + (*pp - '0');
/* uname_b uname_e */
if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
{
- SETERR (error, PE_INVALID_USER_NAME);
- return NULL;
+ error_code = PE_INVALID_USER_NAME;
+ goto error;
}
}
url_encoded = NULL;
return u;
+
+ error:
+ /* Cleanup in case of error: */
+ if (url_encoded && url_encoded != url)
+ xfree (url_encoded);
+
+ /* Transmit the error code to the caller, if the caller wants to
+ know. */
+ if (error)
+ *error = error_code;
+ return NULL;
}
+/* Return the error message string from ERROR_CODE, which should have
+ been retrieved from url_parse. The error message is translated. */
+
const char *
url_error (int error_code)
{
assert (error_code >= 0 && error_code < countof (parse_errors));
- return parse_errors[error_code];
+ return _(parse_errors[error_code]);
}
/* Split PATH into DIR and FILE. PATH comes from the URL and is
for (; *h; h++, t++)
{
+ /* url_escape_1 encodes '/' as exactly "%2F" (uppercase F), so
+ only that spelling has to be recognized here. */
if (*h == '%' && h[1] == '2' && h[2] == 'F')
{
*t = '/';
}
\f
/* Resolve "." and ".." elements of PATH by destructively modifying
- PATH. "." is resolved by removing that path element, and ".." is
- resolved by removing the preceding path element. Leading and
- trailing slashes are preserved.
+ PATH and return non-zero if PATH has been modified, zero otherwise.
- Return non-zero if any changes have been made.
+ The algorithm is in spirit similar to the one described in rfc1808,
+ although implemented differently, in one pass. To recap, path
+ elements containing only "." are removed, and ".." is taken to mean
+ "back up one element". Single leading and trailing slashes are
+ preserved.
+
+ This function does not handle URL escapes explicitly. If you're
+ passing paths from URLs, make sure to unquote "%2e" and "%2E" to
+ ".", so that this function can find the dots. (Wget's URL parser
+ calls reencode_escapes, which see.)
For example, "a/b/c/./../d/.." will yield "a/b/". More exhaustive
test examples are provided below. If you change anything in this
function, run test_path_simplify to make sure you haven't broken a
- test case.
-
- A previous version of this function was based on path_simplify()
- from GNU Bash, but it has been rewritten for Wget 1.8.1. */
+ test case. */
static int
path_simplify (char *path)
{
- int change = 0;
- char *p, *end;
+ char *h, *t, *end;
+ /* Preserve the leading '/'. */
if (path[0] == '/')
- ++path; /* preserve the leading '/'. */
+ ++path;
- p = path;
- end = p + strlen (p) + 1; /* position past the terminating zero. */
+ h = path; /* hare */
+ t = path; /* tortoise */
+ end = path + strlen (path);
- while (1)
+ while (h < end)
{
- again:
- /* P should point to the beginning of a path element. */
+ /* Hare should be at the beginning of a path element. */
- if (*p == '.' && (*(p + 1) == '/' || *(p + 1) == '\0'))
+ if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
{
- /* Handle "./foo" by moving "foo" two characters to the
- left. */
- if (*(p + 1) == '/')
- {
- change = 1;
- memmove (p, p + 2, end - (p + 2));
- end -= 2;
- goto again;
- }
- else
- {
- change = 1;
- *p = '\0';
- break;
- }
+ /* Ignore "./" (or a trailing "."). */
+ h += 2;
}
- else if (*p == '.' && *(p + 1) == '.'
- && (*(p + 2) == '/' || *(p + 2) == '\0'))
+ else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
{
- /* Handle "../foo" by moving "foo" one path element to the
- left. */
- char *b = p; /* not p-1 because P can equal PATH */
-
- /* Backtrack by one path element, but not past the beginning
- of PATH. */
-
- /* foo/bar/../baz */
- /* ^ p */
- /* ^ b */
-
- if (b > path)
+ /* Handle "../" by retreating the tortoise by one path
+ element -- but not past the beginning of PATH. */
+ if (t > path)
{
- /* Move backwards until B hits the beginning of the
+ /* Move backwards until T hits the beginning of the
previous path element or the beginning of path. */
- for (--b; b > path && *(b - 1) != '/'; b--)
+ for (--t; t > path && t[-1] != '/'; t--)
;
}
-
- change = 1;
- if (*(p + 2) == '/')
+ h += 3;
+ }
+ else if (*h == '/')
+ {
+ /* Ignore empty path elements. Supporting them well is hard
+ (where do you save "http://x.com///y.html"?), and they
+ don't bring any practical gain. Plus, they break our
+ filesystem-influenced assumptions: allowing them would
+ make "x/y//../z" simplify to "x/y/z", whereas most people
+ would expect "x/z". */
+ ++h;
+ }
+ else
+ {
+ /* A regular path element. If H hasn't advanced past T,
+ simply skip to the next path element. Otherwise, copy
+ the path element until the next slash. */
+ if (t == h)
{
- memmove (b, p + 3, end - (p + 3));
- end -= (p + 3) - b;
- p = b;
+ /* Skip the path element, including the slash. */
+ while (h < end && *h != '/')
+ t++, h++;
+ if (h < end)
+ t++, h++;
}
else
{
- *b = '\0';
- break;
- }
-
- goto again;
- }
- else if (*p == '/')
- {
- /* Remove empty path elements. Not mandated by rfc1808 et
- al, but it seems like a good idea to get rid of them.
- Supporting them properly is hard (in which directory do
- you save http://x.com///y.html?) and they don't seem to
- bring much gain. */
- char *q = p;
- while (*q == '/')
- ++q;
- change = 1;
- if (*q == '\0')
- {
- *p = '\0';
- break;
+ /* Copy the path element, including the final slash. */
+ while (h < end && *h != '/')
+ *t++ = *h++;
+ if (h < end)
+ *t++ = *h++;
}
- memmove (p, q, end - q);
- end -= q - p;
- goto again;
}
-
- /* Skip to the next path element. */
- while (*p && *p != '/')
- ++p;
- if (*p == '\0')
- break;
-
- /* Make sure P points to the beginning of the next path element,
- which is location after the slash. */
- ++p;
}
- return change;
+ if (t != h)
+ *t = '\0';
+
+ return t != h;
}
\f
/* Merge BASE with LINK and return the resulting URI.