X-Git-Url: http://sjero.net/git/?a=blobdiff_plain;f=src%2Furl.c;h=40482b75adf6bca8ecf3992f258f7cf26584d232;hb=5f0a2b3f0846dd4c2f72fc62e7171200d1fd6e06;hp=4f5c7f8ea4cabbbb720160caf456134e9e8e96de;hpb=7b5fb50cb1ce30fd0ddc3e77e376613a861c10aa;p=wget diff --git a/src/url.c b/src/url.c index 4f5c7f8e..40482b75 100644 --- a/src/url.c +++ b/src/url.c @@ -47,23 +47,11 @@ so, delete this exception statement from your version. */ #include "wget.h" #include "utils.h" #include "url.h" -#include "host.h" -#include "hash.h" #ifndef errno extern int errno; #endif -/* Is X "."? */ -#define DOTP(x) ((*(x) == '.') && (!*(x + 1))) -/* Is X ".."? */ -#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2))) - -static const int NS_INADDRSZ = 4; -static const int NS_IN6ADDRSZ = 16; -static const int NS_INT16SZ = 2; - - struct scheme_data { char *leading_string; @@ -86,10 +74,7 @@ static struct scheme_data supported_schemes[] = /* Forward declarations: */ -static char *construct_relative PARAMS ((const char *, const char *)); static int path_simplify PARAMS ((char *)); - - /* Support for encoding and decoding of URL strings. We determine whether a character is unsafe through static table lookup. This @@ -213,8 +198,8 @@ url_escape_1 (const char *s, unsigned char mask, int allow_passthrough) { unsigned char c = *p1++; *p2++ = '%'; - *p2++ = XNUM_TO_digit (c >> 4); - *p2++ = XNUM_TO_digit (c & 0xf); + *p2++ = XNUM_TO_DIGIT (c >> 4); + *p2++ = XNUM_TO_DIGIT (c & 0xf); } else *p2++ = *p1++; @@ -419,6 +404,7 @@ reencode_escapes (const char *s) /* Returns the scheme type if the scheme is supported, or SCHEME_INVALID if not. */ + enum url_scheme url_scheme (const char *url) { @@ -437,37 +423,25 @@ url_scheme (const char *url) return SCHEME_INVALID; } -/* Return the number of characters needed to skip the scheme part of - the URL, e.g. `http://'. If no scheme is found, returns 0. */ -int -url_skip_scheme (const char *url) -{ - const char *p = url; - - /* Skip the scheme name. 
We allow `-' and `+' because of `whois++', - etc. */ - while (ISALNUM (*p) || *p == '-' || *p == '+') - ++p; - if (*p != ':') - return 0; - /* Skip ':'. */ - ++p; - - /* Skip "//" if found. */ - if (*p == '/' && *(p + 1) == '/') - p += 2; +#define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+') - return p - url; -} +/* Return 1 if the URL begins with any "scheme", 0 otherwise. As + currently implemented, it returns true if URL begins with + [-+a-zA-Z0-9]+: . */ -/* Returns 1 if the URL begins with a scheme (supported or - unsupported), 0 otherwise. */ int url_has_scheme (const char *url) { const char *p = url; - while (ISALNUM (*p) || *p == '-' || *p == '+') + + /* The first char must be a scheme char. */ + if (!*p || !SCHEME_CHAR (*p)) + return 0; + ++p; + /* Followed by 0 or more scheme chars. */ + while (*p && SCHEME_CHAR (*p)) ++p; + /* Terminated by ':'. */ return *p == ':'; } @@ -484,57 +458,51 @@ scheme_disable (enum url_scheme scheme) } /* Skip the username and password, if present here. The function - should be called *not* with the complete URL, but with the part + should *not* be called with the complete URL, but with the part right after the scheme. If no username and password are found, return 0. */ -int -url_skip_uname (const char *url) -{ - const char *p; - /* Look for '@' that comes before '/' or '?'. */ - p = (const char *)strpbrk (url, "/?@"); +static int +url_skip_credentials (const char *url) +{ + /* Look for '@' that comes before terminators, such as '/', '?', + '#', or ';'. */ + const char *p = (const char *)strpbrk (url, "@/?#;"); if (!p || *p != '@') return 0; - - return p - url + 1; + return p + 1 - url; } +/* Parse credentials contained in [BEG, END). The region is expected + to have come from a URL and is unescaped. 
*/ + static int -parse_uname (const char *str, int len, char **user, char **passwd) +parse_credentials (const char *beg, const char *end, char **user, char **passwd) { char *colon; + const char *userend; - if (len == 0) - /* Empty user name not allowed. */ - return 0; + if (beg == end) + return 0; /* empty user name */ - colon = memchr (str, ':', len); - if (colon == str) - /* Empty user name again. */ - return 0; + colon = memchr (beg, ':', end - beg); + if (colon == beg) + return 0; /* again empty user name */ if (colon) { - int pwlen = len - (colon + 1 - str); - *passwd = xmalloc (pwlen + 1); - memcpy (*passwd, colon + 1, pwlen); - (*passwd)[pwlen] = '\0'; - len -= pwlen + 1; + *passwd = strdupdelim (colon + 1, end); + userend = colon; + url_unescape (*passwd); } else - *passwd = NULL; - - *user = xmalloc (len + 1); - memcpy (*user, str, len); - (*user)[len] = '\0'; - - if (*user) - url_unescape (*user); - if (*passwd) - url_unescape (*passwd); - + { + *passwd = NULL; + userend = end; + } + *user = strdupdelim (beg, userend); + url_unescape (*user); return 1; } @@ -550,6 +518,7 @@ parse_uname (const char *str, int len, char **user, char **passwd) foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file If the URL needs not or cannot be rewritten, return NULL. 
*/ + char * rewrite_shorthand_url (const char *url) { @@ -596,7 +565,7 @@ rewrite_shorthand_url (const char *url) } } -static void parse_path PARAMS ((const char *, char **, char **)); +static void split_path PARAMS ((const char *, char **, char **)); /* Like strpbrk, with the exception that it returns the pointer to the terminating zero (end-of-string aka "eos") if no matching character @@ -649,62 +618,60 @@ lowercase_str (char *str) static char *parse_errors[] = { #define PE_NO_ERROR 0 - "No error", + N_("No error"), #define PE_UNSUPPORTED_SCHEME 1 - "Unsupported scheme", + N_("Unsupported scheme"), #define PE_EMPTY_HOST 2 - "Empty host", + N_("Empty host"), #define PE_BAD_PORT_NUMBER 3 - "Bad port number", + N_("Bad port number"), #define PE_INVALID_USER_NAME 4 - "Invalid user name", + N_("Invalid user name"), #define PE_UNTERMINATED_IPV6_ADDRESS 5 - "Unterminated IPv6 numeric address", + N_("Unterminated IPv6 numeric address"), #define PE_IPV6_NOT_SUPPORTED 6 - "IPv6 addresses not supported", + N_("IPv6 addresses not supported"), #define PE_INVALID_IPV6_ADDRESS 7 - "Invalid IPv6 numeric address" + N_("Invalid IPv6 numeric address") }; -#define SETERR(p, v) do { \ - if (p) \ - *(p) = (v); \ -} while (0) - #ifdef ENABLE_IPV6 /* The following two functions were adapted from glibc. */ static int is_valid_ipv4_address (const char *str, const char *end) { - int saw_digit, octets; - int val; - - saw_digit = 0; - octets = 0; - val = 0; + int saw_digit = 0; + int octets = 0; + int val = 0; - while (str < end) { - int ch = *str++; + while (str < end) + { + int ch = *str++; - if (ch >= '0' && ch <= '9') { - val = val * 10 + (ch - '0'); + if (ch >= '0' && ch <= '9') + { + val = val * 10 + (ch - '0'); - if (val > 255) - return 0; - if (saw_digit == 0) { - if (++octets > 4) - return 0; - saw_digit = 1; - } - } else if (ch == '.' 
&& saw_digit == 1) { - if (octets == 4) - return 0; - val = 0; - saw_digit = 0; - } else - return 0; - } + if (val > 255) + return 0; + if (saw_digit == 0) + { + if (++octets > 4) + return 0; + saw_digit = 1; + } + } + else if (ch == '.' && saw_digit == 1) + { + if (octets == 4) + return 0; + val = 0; + saw_digit = 0; + } + else + return 0; + } if (octets < 4) return 0; @@ -714,7 +681,12 @@ is_valid_ipv4_address (const char *str, const char *end) static int is_valid_ipv6_address (const char *str, const char *end) { - static const char xdigits[] = "0123456789abcdef"; + enum { + NS_INADDRSZ = 4, + NS_IN6ADDRSZ = 16, + NS_INT16SZ = 2 + }; + const char *curtok; int tp; const char *colonp; @@ -739,62 +711,67 @@ is_valid_ipv6_address (const char *str, const char *end) saw_xdigit = 0; val = 0; - while (str < end) { - int ch = *str++; - const char *pch; + while (str < end) + { + int ch = *str++; - /* if ch is a number, add it to val. */ - pch = strchr(xdigits, ch); - if (pch != NULL) { - val <<= 4; - val |= (pch - xdigits); - if (val > 0xffff) - return 0; - saw_xdigit = 1; - continue; + /* if ch is a number, add it to val. */ + if (ISXDIGIT (ch)) + { + val <<= 4; + val |= XDIGIT_TO_NUM (ch); + if (val > 0xffff) + return 0; + saw_xdigit = 1; + continue; + } + + /* if ch is a colon ... */ + if (ch == ':') + { + curtok = str; + if (saw_xdigit == 0) + { + if (colonp != NULL) + return 0; + colonp = str + tp; + continue; + } + else if (str == end) + return 0; + if (tp > NS_IN6ADDRSZ - NS_INT16SZ) + return 0; + tp += NS_INT16SZ; + saw_xdigit = 0; + val = 0; + continue; + } + + /* if ch is a dot ... */ + if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ) + && is_valid_ipv4_address (curtok, end) == 1) + { + tp += NS_INADDRSZ; + saw_xdigit = 0; + break; + } + + return 0; } - /* if ch is a colon ... 
*/ - if (ch == ':') { - curtok = str; - if (saw_xdigit == 0) { - if (colonp != NULL) - return 0; - colonp = str + tp; - continue; - } else if (str == end) { - return 0; - } - if (tp > NS_IN6ADDRSZ - NS_INT16SZ) + if (saw_xdigit == 1) + { + if (tp > NS_IN6ADDRSZ - NS_INT16SZ) return 0; tp += NS_INT16SZ; - saw_xdigit = 0; - val = 0; - continue; } - /* if ch is a dot ... */ - if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ) && - is_valid_ipv4_address(curtok, end) == 1) { - tp += NS_INADDRSZ; - saw_xdigit = 0; - break; + if (colonp != NULL) + { + if (tp == NS_IN6ADDRSZ) + return 0; + tp = NS_IN6ADDRSZ; } - - return 0; - } - - if (saw_xdigit == 1) { - if (tp > NS_IN6ADDRSZ - NS_INT16SZ) - return 0; - tp += NS_INT16SZ; - } - - if (colonp != NULL) { - if (tp == NS_IN6ADDRSZ) - return 0; - tp = NS_IN6ADDRSZ; - } if (tp != NS_IN6ADDRSZ) return 0; @@ -827,13 +804,15 @@ url_parse (const char *url, int *error) int port; char *user = NULL, *passwd = NULL; - char *url_encoded; + char *url_encoded = NULL; + + int error_code; scheme = url_scheme (url); if (scheme == SCHEME_INVALID) { - SETERR (error, PE_UNSUPPORTED_SCHEME); - return NULL; + error_code = PE_UNSUPPORTED_SCHEME; + goto error; } url_encoded = reencode_escapes (url); @@ -841,7 +820,7 @@ url_parse (const char *url, int *error) p += strlen (supported_schemes[scheme].leading_string); uname_b = p; - p += url_skip_uname (p); + p += url_skip_credentials (p); uname_e = p; /* scheme://user:pass@host[:port]... */ @@ -870,23 +849,23 @@ url_parse (const char *url, int *error) if (!host_e) { - SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS); - return NULL; + error_code = PE_UNTERMINATED_IPV6_ADDRESS; + goto error; } #ifdef ENABLE_IPV6 /* Check if the IPv6 address is valid. */ if (!is_valid_ipv6_address(host_b, host_e)) { - SETERR (error, PE_INVALID_IPV6_ADDRESS); - return NULL; + error_code = PE_INVALID_IPV6_ADDRESS; + goto error; } /* Continue parsing after the closing ']'. 
*/ p = host_e + 1; #else - SETERR (error, PE_IPV6_NOT_SUPPORTED); - return NULL; + error_code = PE_IPV6_NOT_SUPPORTED; + goto error; #endif } else @@ -897,8 +876,8 @@ url_parse (const char *url, int *error) if (host_b == host_e) { - SETERR (error, PE_EMPTY_HOST); - return NULL; + error_code = PE_EMPTY_HOST; + goto error; } port = scheme_default_port (scheme); @@ -917,8 +896,8 @@ url_parse (const char *url, int *error) { /* http://host:/whatever */ /* ^ */ - SETERR (error, PE_BAD_PORT_NUMBER); - return NULL; + error_code = PE_BAD_PORT_NUMBER; + goto error; } for (port = 0, pp = port_b; pp < port_e; pp++) @@ -927,8 +906,8 @@ url_parse (const char *url, int *error) { /* http://host:12randomgarbage/blah */ /* ^ */ - SETERR (error, PE_BAD_PORT_NUMBER); - return NULL; + error_code = PE_BAD_PORT_NUMBER; + goto error; } port = 10 * port + (*pp - '0'); @@ -985,16 +964,14 @@ url_parse (const char *url, int *error) /* http://user:pass@host */ /* ^ ^ */ /* uname_b uname_e */ - if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd)) + if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd)) { - SETERR (error, PE_INVALID_USER_NAME); - return NULL; + error_code = PE_INVALID_USER_NAME; + goto error; } } - u = (struct url *)xmalloc (sizeof (struct url)); - memset (u, 0, sizeof (*u)); - + u = xnew0 (struct url); u->scheme = scheme; u->host = strdupdelim (host_b, host_e); u->port = port; @@ -1003,7 +980,7 @@ url_parse (const char *url, int *error) u->path = strdupdelim (path_b, path_e); path_modified = path_simplify (u->path); - parse_path (u->path, &u->dir, &u->file); + split_path (u->path, &u->dir, &u->file); host_modified = lowercase_str (u->host); @@ -1034,24 +1011,48 @@ url_parse (const char *url, int *error) url_encoded = NULL; return u; + + error: + /* Cleanup in case of error: */ + if (url_encoded && url_encoded != url) + xfree (url_encoded); + + /* Transmit the error code to the caller, if the caller wants to + know. 
*/ + if (error) + *error = error_code; + return NULL; } +/* Return the error message string from ERROR_CODE, which should have + been retrieved from url_parse. The error message is translated. */ + const char * url_error (int error_code) { assert (error_code >= 0 && error_code < countof (parse_errors)); - return parse_errors[error_code]; + return _(parse_errors[error_code]); } -/* Parse PATH into dir and file. PATH is extracted from the URL and - is URL-escaped. The function returns unescaped DIR and FILE. */ +/* Split PATH into DIR and FILE. PATH comes from the URL and is + expected to be URL-escaped. + + The path is split into directory (the part up to the last slash) + and file (the part after the last slash), which are subsequently + unescaped. Examples: + + PATH DIR FILE + "foo/bar/baz" "foo/bar" "baz" + "foo/bar/" "foo/bar" "" + "foo" "" "foo" + "foo/bar/baz%2fqux" "foo/bar" "baz/qux" (!) + + DIR and FILE are freshly allocated. */ static void -parse_path (const char *path, char **dir, char **file) +split_path (const char *path, char **dir, char **file) { - char *last_slash; - - last_slash = strrchr (path, '/'); + char *last_slash = strrchr (path, '/'); if (!last_slash) { *dir = xstrdup (""); @@ -1147,6 +1148,7 @@ url_escape_dir (const char *dir) for (; *h; h++, t++) { + /* url_escape_1 having converted '/' to "%2F" exactly. */ if (*h == '%' && h[1] == '2' && h[2] == 'F') { *t = '/'; @@ -1246,128 +1248,6 @@ url_free (struct url *url) xfree (url); } -struct urlpos * -get_urls_file (const char *file) -{ - struct file_memory *fm; - struct urlpos *head, *tail; - const char *text, *text_end; - - /* Load the file. 
*/ - fm = read_file (file); - if (!fm) - { - logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); - return NULL; - } - DEBUGP (("Loaded %s (size %ld).\n", file, fm->length)); - - head = tail = NULL; - text = fm->content; - text_end = fm->content + fm->length; - while (text < text_end) - { - const char *line_beg = text; - const char *line_end = memchr (text, '\n', text_end - text); - if (!line_end) - line_end = text_end; - else - ++line_end; - text = line_end; - - /* Strip whitespace from the beginning and end of line. */ - while (line_beg < line_end && ISSPACE (*line_beg)) - ++line_beg; - while (line_end > line_beg && ISSPACE (*(line_end - 1))) - --line_end; - - if (line_end > line_beg) - { - /* URL is in the [line_beg, line_end) region. */ - - int up_error_code; - char *url_text; - struct urlpos *entry; - struct url *url; - - /* We must copy the URL to a zero-terminated string, and we - can't use alloca because we're in a loop. *sigh*. */ - url_text = strdupdelim (line_beg, line_end); - - if (opt.base_href) - { - /* Merge opt.base_href with URL. */ - char *merged = uri_merge (opt.base_href, url_text); - xfree (url_text); - url_text = merged; - } - - url = url_parse (url_text, &up_error_code); - if (!url) - { - logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n", - file, url_text, url_error (up_error_code)); - xfree (url_text); - continue; - } - xfree (url_text); - - entry = (struct urlpos *)xmalloc (sizeof (struct urlpos)); - memset (entry, 0, sizeof (*entry)); - entry->next = NULL; - entry->url = url; - - if (!head) - head = entry; - else - tail->next = entry; - tail = entry; - } - } - read_file_free (fm); - return head; -} - -/* Free the linked list of urlpos. 
*/ -void -free_urlpos (struct urlpos *l) -{ - while (l) - { - struct urlpos *next = l->next; - if (l->url) - url_free (l->url); - FREE_MAYBE (l->local_name); - xfree (l); - l = next; - } -} - -/* Rotate FNAME opt.backups times */ -void -rotate_backups(const char *fname) -{ - int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1; - char *from = (char *)alloca (maxlen); - char *to = (char *)alloca (maxlen); - struct stat sb; - int i; - - if (stat (fname, &sb) == 0) - if (S_ISREG (sb.st_mode) == 0) - return; - - for (i = opt.backups; i > 1; i--) - { - sprintf (from, "%s.%d", fname, i - 1); - sprintf (to, "%s.%d", fname, i); - rename (from, to); - } - - sprintf (to, "%s.%d", fname, 1); - rename(fname, to); -} - /* Create all the necessary directories for PATH (a file). Calls mkdirhier() internally. */ int @@ -1477,7 +1357,7 @@ append_char (char ch, struct growable *dest) enum { filechr_not_unix = 1, /* unusable on Unix, / and \0 */ filechr_not_windows = 2, /* unusable on Windows, one of \|/<>?:*" */ - filechr_control = 4, /* a control character, e.g. 0-31 */ + filechr_control = 4 /* a control character, e.g. 0-31 */ }; #define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask)) @@ -1545,14 +1425,15 @@ UWC, C, C, C, C, C, C, C, /* NUL SOH STX ETX EOT ENQ ACK BEL */ /* Quote path element, characters in [b, e), as file name, and append the quoted string to DEST. Each character is quoted as per - file_unsafe_char and the corresponding table. */ + file_unsafe_char and the corresponding table. + + If ESCAPED_P is non-zero, the path element is considered to be + URL-escaped and will be unescaped prior to inspection. 
*/ static void -append_uri_pathel (const char *b, const char *e, struct growable *dest) +append_uri_pathel (const char *b, const char *e, int escaped_p, + struct growable *dest) { - char *pathel; - int pathlen; - const char *p; int quoted, outlen; @@ -1565,32 +1446,37 @@ append_uri_pathel (const char *b, const char *e, struct growable *dest) mask |= filechr_control; /* Copy [b, e) to PATHEL and URL-unescape it. */ - BOUNDED_TO_ALLOCA (b, e, pathel); - url_unescape (pathel); - pathlen = strlen (pathel); + if (escaped_p) + { + char *unescaped; + BOUNDED_TO_ALLOCA (b, e, unescaped); + url_unescape (unescaped); + b = unescaped; + e = unescaped + strlen (unescaped); + } - /* Go through PATHEL and check how many characters we'll need to - add for file quoting. */ + /* Walk the PATHEL string and check how many characters we'll need + to add for file quoting. */ quoted = 0; - for (p = pathel; *p; p++) + for (p = b; p < e; p++) if (FILE_CHAR_TEST (*p, mask)) ++quoted; - /* p - pathel is the string length. Each quoted char means two - additional characters in the string, hence 2*quoted. */ - outlen = (p - pathel) + (2 * quoted); + /* e-b is the string length. Each quoted char means two additional + characters in the string, hence 2*quoted. */ + outlen = (e - b) + (2 * quoted); GROW (dest, outlen); if (!quoted) { /* If there's nothing to quote, we don't need to go through the string the second time. */ - memcpy (TAIL (dest), pathel, outlen); + memcpy (TAIL (dest), b, outlen); } else { char *q = TAIL (dest); - for (p = pathel; *p; p++) + for (p = b; p < e; p++) { if (!FILE_CHAR_TEST (*p, mask)) *q++ = *p; @@ -1636,14 +1522,12 @@ append_dir_structure (const struct url *u, struct growable *dest) if (cut-- > 0) continue; if (pathel == next) - /* Ignore empty pathels. path_simplify should remove - occurrences of "//" from the path, but it has special cases - for starting / which generates an empty pathel here. */ + /* Ignore empty pathels. 
*/ continue; if (dest->tail) append_char ('/', dest); - append_uri_pathel (pathel, next, dest); + append_uri_pathel (pathel, next, 1, dest); } } @@ -1663,7 +1547,7 @@ url_file_name (const struct url *u) fnres.tail = 0; /* Start with the directory prefix, if specified. */ - if (!DOTP (opt.dir_prefix)) + if (opt.dir_prefix) append_string (opt.dir_prefix, &fnres); /* If "dirstruct" is turned on (typically the case with -r), add @@ -1692,14 +1576,14 @@ url_file_name (const struct url *u) if (fnres.tail) append_char ('/', &fnres); u_file = *u->file ? u->file : "index.html"; - append_uri_pathel (u_file, u_file + strlen (u_file), &fnres); + append_uri_pathel (u_file, u_file + strlen (u_file), 0, &fnres); /* Append "?query" to the file name. */ u_query = u->query && *u->query ? u->query : NULL; if (u_query) { append_char (FN_QUERY_SEP, &fnres); - append_uri_pathel (u_query, u_query + strlen (u_query), &fnres); + append_uri_pathel (u_query, u_query + strlen (u_query), 1, &fnres); } /* Zero-terminate the file name. */ @@ -1750,342 +1634,302 @@ find_last_char (const char *b, const char *e, char c) } /* Resolve "." and ".." elements of PATH by destructively modifying - PATH. "." is resolved by removing that path element, and ".." is - resolved by removing the preceding path element. Leading and - trailing slashes are preserved. + PATH and return non-zero if PATH has been modified, zero otherwise. + + The algorithm is in spirit similar to the one described in rfc1808, + although implemented differently, in one pass. To recap, path + elements containing only "." are removed, and ".." is taken to mean + "back up one element". Single leading and trailing slashes are + preserved. - Return non-zero if any changes have been made. + This function does not handle URL escapes explicitly. If you're + passing paths from URLs, make sure to unquote "%2e" and "%2E" to + ".", so that this function can find the dots. (Wget's URL parser + calls reencode_escapes, which see.) 
For example, "a/b/c/./../d/.." will yield "a/b/". More exhaustive test examples are provided below. If you change anything in this function, run test_path_simplify to make sure you haven't broken a - test case. - - A previous version of this function was based on path_simplify() - from GNU Bash, but it has been rewritten for Wget 1.8.1. */ + test case. */ static int path_simplify (char *path) { - int change = 0; - char *p, *end; + char *h, *t, *end; + /* Preserve the leading '/'. */ if (path[0] == '/') - ++path; /* preserve the leading '/'. */ + ++path; - p = path; - end = p + strlen (p) + 1; /* position past the terminating zero. */ + h = path; /* hare */ + t = path; /* tortoise */ + end = path + strlen (path); - while (1) + while (h < end) { - again: - /* P should point to the beginning of a path element. */ + /* Hare should be at the beginning of a path element. */ - if (*p == '.' && (*(p + 1) == '/' || *(p + 1) == '\0')) + if (h[0] == '.' && (h[1] == '/' || h[1] == '\0')) { - /* Handle "./foo" by moving "foo" two characters to the - left. */ - if (*(p + 1) == '/') - { - change = 1; - memmove (p, p + 2, end - p); - end -= 2; - goto again; - } - else - { - change = 1; - *p = '\0'; - break; - } + /* Ignore "./". */ + h += 2; } - else if (*p == '.' && *(p + 1) == '.' - && (*(p + 2) == '/' || *(p + 2) == '\0')) + else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0')) { - /* Handle "../foo" by moving "foo" one path element to the - left. */ - char *b = p; /* not p-1 because P can equal PATH */ - - /* Backtrack by one path element, but not past the beginning - of PATH. */ - - /* foo/bar/../baz */ - /* ^ p */ - /* ^ b */ - - if (b > path) + /* Handle "../" by retreating the tortoise by one path + element -- but not past beginning of PATH. */ + if (t > path) { - /* Move backwards until B hits the beginning of the + /* Move backwards until T hits the beginning of the previous path element or the beginning of path.
*/ - for (--b; b > path && *(b - 1) != '/'; b--) + for (--t; t > path && t[-1] != '/'; t--) ; } - - change = 1; - if (*(p + 2) == '/') + h += 3; + } + else if (*h == '/') + { + /* Ignore empty path elements. Supporting them well is hard + (where do you save "http://x.com///y.html"?), and they + don't bring any practical gain. Plus, they break our + filesystem-influenced assumptions: allowing them would + make "x/y//../z" simplify to "x/y/z", whereas most people + would expect "x/z". */ + ++h; + } + else + { + /* A regular path element. If H hasn't advanced past T, + simply skip to the next path element. Otherwise, copy + the path element until the next slash. */ + if (t == h) { - memmove (b, p + 3, end - (p + 3)); - end -= (p + 3) - b; - p = b; + /* Skip the path element, including the slash. */ + while (h < end && *h != '/') + t++, h++; + if (h < end) + t++, h++; } else { - *b = '\0'; - break; - } - - goto again; - } - else if (*p == '/') - { - /* Remove empty path elements. Not mandated by rfc1808 et - al, but it seems like a good idea to get rid of them. - Supporting them properly is hard (in which directory do - you save http://x.com///y.html?) and they don't seem to - bring much gain. */ - char *q = p; - while (*q == '/') - ++q; - change = 1; - if (*q == '\0') - { - *p = '\0'; - break; + /* Copy the path element, including the final slash. */ + while (h < end && *h != '/') + *t++ = *h++; + if (h < end) + *t++ = *h++; } - memmove (p, q, end - q); - end -= q - p; - goto again; } - - /* Skip to the next path element. */ - while (*p && *p != '/') - ++p; - if (*p == '\0') - break; - - /* Make sure P points to the beginning of the next path element, - which is location after the slash. */ - ++p; } - return change; + if (t != h) + *t = '\0'; + + return t != h; } -/* Resolve the result of "linking" a base URI (BASE) to a - link-specified URI (LINK). +/* Merge BASE with LINK and return the resulting URI. 
Either of the URIs may be absolute or relative, complete with the - host name, or path only. This tries to behave "reasonably" in all - foreseeable cases. It employs little specific knowledge about - schemes or URL-specific stuff -- it just works on strings. - - The parameters LINKLENGTH is useful if LINK is not zero-terminated. - See uri_merge for a gentler interface to this functionality. + host name, or path only. This tries to reasonably handle all + foreseeable cases. It only employs minimal URL parsing, without + knowledge of the specifics of schemes. Perhaps this function should call path_simplify so that the callers don't have to call url_parse unconditionally. */ -static char * -uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme) + +char * +uri_merge (const char *base, const char *link) { - char *constr; + int linklength; + const char *end; + char *merge; + + if (url_has_scheme (link)) + return xstrdup (link); - if (no_scheme) + /* We may not examine BASE past END. */ + end = base + path_length (base); + linklength = strlen (link); + + if (!*link) + { + /* Empty LINK points back to BASE, query string and all. */ + return xstrdup (base); + } + else if (*link == '?') { - const char *end = base + path_length (base); + /* LINK points to the same location, but changes the query + string. 
Examples: */ + /* uri_merge("path", "?new") -> "path?new" */ + /* uri_merge("path?foo", "?new") -> "path?new" */ + /* uri_merge("path?foo#bar", "?new") -> "path?new" */ + /* uri_merge("path#foo", "?new") -> "path?new" */ + int baselength = end - base; + merge = xmalloc (baselength + linklength + 1); + memcpy (merge, base, baselength); + memcpy (merge + baselength, link, linklength); + merge[baselength + linklength] = '\0'; + } + else if (*link == '#') + { + /* uri_merge("path", "#new") -> "path#new" */ + /* uri_merge("path#foo", "#new") -> "path#new" */ + /* uri_merge("path?foo", "#new") -> "path?foo#new" */ + /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */ + int baselength; + const char *end1 = strchr (base, '#'); + if (!end1) + end1 = base + strlen (base); + baselength = end1 - base; + merge = xmalloc (baselength + linklength + 1); + memcpy (merge, base, baselength); + memcpy (merge + baselength, link, linklength); + merge[baselength + linklength] = '\0'; + } + else if (*link == '/' && *(link + 1) == '/') + { + /* LINK begins with "//" and so is a net path: we need to + replace everything after (and including) the double slash + with LINK. */ + + /* uri_merge("foo", "//new/bar") -> "//new/bar" */ + /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */ + /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */ + + int span; + const char *slash; + const char *start_insert; + + /* Look for first slash. */ + slash = memchr (base, '/', end - base); + /* If found slash and it is a double slash, then replace + from this point, else default to replacing from the + beginning. */ + if (slash && *(slash + 1) == '/') + start_insert = slash; + else + start_insert = base; - if (!*link) - { - /* Empty LINK points back to BASE, query string and all. */ - constr = xstrdup (base); - } - else if (*link == '?') - { - /* LINK points to the same location, but changes the query - string. 
Examples: */ - /* uri_merge("path", "?new") -> "path?new" */ - /* uri_merge("path?foo", "?new") -> "path?new" */ - /* uri_merge("path?foo#bar", "?new") -> "path?new" */ - /* uri_merge("path#foo", "?new") -> "path?new" */ - int baselength = end - base; - constr = xmalloc (baselength + linklength + 1); - memcpy (constr, base, baselength); - memcpy (constr + baselength, link, linklength); - constr[baselength + linklength] = '\0'; - } - else if (*link == '#') - { - /* uri_merge("path", "#new") -> "path#new" */ - /* uri_merge("path#foo", "#new") -> "path#new" */ - /* uri_merge("path?foo", "#new") -> "path?foo#new" */ - /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */ - int baselength; - const char *end1 = strchr (base, '#'); - if (!end1) - end1 = base + strlen (base); - baselength = end1 - base; - constr = xmalloc (baselength + linklength + 1); - memcpy (constr, base, baselength); - memcpy (constr + baselength, link, linklength); - constr[baselength + linklength] = '\0'; - } - else if (linklength > 1 && *link == '/' && *(link + 1) == '/') + span = start_insert - base; + merge = (char *)xmalloc (span + linklength + 1); + if (span) + memcpy (merge, base, span); + memcpy (merge + span, link, linklength); + merge[span + linklength] = '\0'; + } + else if (*link == '/') + { + /* LINK is an absolute path: we need to replace everything + after (and including) the FIRST slash with LINK. + + So, if BASE is "http://host/whatever/foo/bar", and LINK is + "/qux/xyzzy", our result should be + "http://host/qux/xyzzy". */ + int span; + const char *slash; + const char *start_insert = NULL; /* for gcc to shut up. */ + const char *pos = base; + int seen_slash_slash = 0; + /* We're looking for the first slash, but want to ignore + double slash. 
*/ + again: + slash = memchr (pos, '/', end - pos); + if (slash && !seen_slash_slash) + if (*(slash + 1) == '/') + { + pos = slash + 2; + seen_slash_slash = 1; + goto again; + } + + /* At this point, SLASH is the location of the first / after + "//", or the first slash altogether. START_INSERT is the + pointer to the location where LINK will be inserted. When + examining the last two examples, keep in mind that LINK + begins with '/'. */ + + if (!slash && !seen_slash_slash) + /* example: "foo" */ + /* ^ */ + start_insert = base; + else if (!slash && seen_slash_slash) + /* example: "http://foo" */ + /* ^ */ + start_insert = end; + else if (slash && !seen_slash_slash) + /* example: "foo/bar" */ + /* ^ */ + start_insert = base; + else if (slash && seen_slash_slash) + /* example: "http://something/" */ + /* ^ */ + start_insert = slash; + + span = start_insert - base; + merge = (char *)xmalloc (span + linklength + 1); + if (span) + memcpy (merge, base, span); + memcpy (merge + span, link, linklength); + merge[span + linklength] = '\0'; + } + else + { + /* LINK is a relative URL: we need to replace everything + after last slash (possibly empty) with LINK. + + So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy", + our result should be "whatever/foo/qux/xyzzy". */ + int need_explicit_slash = 0; + int span; + const char *start_insert; + const char *last_slash = find_last_char (base, end, '/'); + if (!last_slash) { - /* LINK begins with "//" and so is a net path: we need to - replace everything after (and including) the double slash - with LINK. */ - - /* uri_merge("foo", "//new/bar") -> "//new/bar" */ - /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */ - /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */ - - int span; - const char *slash; - const char *start_insert; - - /* Look for first slash. 
*/ - slash = memchr (base, '/', end - base); - /* If found slash and it is a double slash, then replace - from this point, else default to replacing from the - beginning. */ - if (slash && *(slash + 1) == '/') - start_insert = slash; - else - start_insert = base; - - span = start_insert - base; - constr = (char *)xmalloc (span + linklength + 1); - if (span) - memcpy (constr, base, span); - memcpy (constr + span, link, linklength); - constr[span + linklength] = '\0'; + /* No slash found at all. Append LINK to what we have, + but we'll need a slash as a separator. + + Example: if base == "foo" and link == "qux/xyzzy", then + we cannot just append link to base, because we'd get + "fooqux/xyzzy", whereas what we want is + "foo/qux/xyzzy". + + To make sure the / gets inserted, we set + need_explicit_slash to 1. We also set start_insert + to end + 1, so that the length calculations work out + correctly for one more (slash) character. Accessing + that character is fine, since it will be the + delimiter, '\0' or '?'. */ + /* example: "foo?..." */ + /* ^ ('?' gets changed to '/') */ + start_insert = end + 1; + need_explicit_slash = 1; } - else if (*link == '/') + else if (last_slash && last_slash >= base + 2 + && last_slash[-2] == ':' && last_slash[-1] == '/') { - /* LINK is an absolute path: we need to replace everything - after (and including) the FIRST slash with LINK. - - So, if BASE is "http://host/whatever/foo/bar", and LINK is - "/qux/xyzzy", our result should be - "http://host/qux/xyzzy". */ - int span; - const char *slash; - const char *start_insert = NULL; /* for gcc to shut up. */ - const char *pos = base; - int seen_slash_slash = 0; - /* We're looking for the first slash, but want to ignore - double slash. 
*/ - again: - slash = memchr (pos, '/', end - pos); - if (slash && !seen_slash_slash) - if (*(slash + 1) == '/') - { - pos = slash + 2; - seen_slash_slash = 1; - goto again; - } - - /* At this point, SLASH is the location of the first / after - "//", or the first slash altogether. START_INSERT is the - pointer to the location where LINK will be inserted. When - examining the last two examples, keep in mind that LINK - begins with '/'. */ - - if (!slash && !seen_slash_slash) - /* example: "foo" */ - /* ^ */ - start_insert = base; - else if (!slash && seen_slash_slash) - /* example: "http://foo" */ - /* ^ */ - start_insert = end; - else if (slash && !seen_slash_slash) - /* example: "foo/bar" */ - /* ^ */ - start_insert = base; - else if (slash && seen_slash_slash) - /* example: "http://something/" */ - /* ^ */ - start_insert = slash; - - span = start_insert - base; - constr = (char *)xmalloc (span + linklength + 1); - if (span) - memcpy (constr, base, span); - if (linklength) - memcpy (constr + span, link, linklength); - constr[span + linklength] = '\0'; + /* example: http://host" */ + /* ^ */ + start_insert = end + 1; + need_explicit_slash = 1; } else { - /* LINK is a relative URL: we need to replace everything - after last slash (possibly empty) with LINK. - - So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy", - our result should be "whatever/foo/qux/xyzzy". */ - int need_explicit_slash = 0; - int span; - const char *start_insert; - const char *last_slash = find_last_char (base, end, '/'); - if (!last_slash) - { - /* No slash found at all. Append LINK to what we have, - but we'll need a slash as a separator. - - Example: if base == "foo" and link == "qux/xyzzy", then - we cannot just append link to base, because we'd get - "fooqux/xyzzy", whereas what we want is - "foo/qux/xyzzy". - - To make sure the / gets inserted, we set - need_explicit_slash to 1. 
We also set start_insert - to end + 1, so that the length calculations work out - correctly for one more (slash) character. Accessing - that character is fine, since it will be the - delimiter, '\0' or '?'. */ - /* example: "foo?..." */ - /* ^ ('?' gets changed to '/') */ - start_insert = end + 1; - need_explicit_slash = 1; - } - else if (last_slash && last_slash != base && *(last_slash - 1) == '/') - { - /* example: http://host" */ - /* ^ */ - start_insert = end + 1; - need_explicit_slash = 1; - } - else - { - /* example: "whatever/foo/bar" */ - /* ^ */ - start_insert = last_slash + 1; - } - - span = start_insert - base; - constr = (char *)xmalloc (span + linklength + 1); - if (span) - memcpy (constr, base, span); - if (need_explicit_slash) - constr[span - 1] = '/'; - if (linklength) - memcpy (constr + span, link, linklength); - constr[span + linklength] = '\0'; + /* example: "whatever/foo/bar" */ + /* ^ */ + start_insert = last_slash + 1; } + + span = start_insert - base; + merge = (char *)xmalloc (span + linklength + 1); + if (span) + memcpy (merge, base, span); + if (need_explicit_slash) + merge[span - 1] = '/'; + memcpy (merge + span, link, linklength); + merge[span + linklength] = '\0'; } - else /* !no_scheme */ - { - constr = strdupdelim (link, link + linklength); - } - return constr; -} -/* Merge BASE with LINK and return the resulting URI. This is an - interface to uri_merge_1 that assumes that LINK is a - zero-terminated string. */ -char * -uri_merge (const char *base, const char *link) -{ - return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link)); + return merge; } #define APPEND(p, s) do { \ @@ -2192,596 +2036,6 @@ url_string (const struct url *url, int hide_password) return result; } -/* Return the URL of the proxy appropriate for url U. 
*/ -char * -getproxy (struct url *u) -{ - char *proxy = NULL; - char *rewritten_url; - static char rewritten_storage[1024]; - - if (!opt.use_proxy) - return NULL; - if (!no_proxy_match (u->host, (const char **)opt.no_proxy)) - return NULL; - - switch (u->scheme) - { - case SCHEME_HTTP: - proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy"); - break; -#ifdef HAVE_SSL - case SCHEME_HTTPS: - proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy"); - break; -#endif - case SCHEME_FTP: - proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy"); - break; - case SCHEME_INVALID: - break; - } - if (!proxy || !*proxy) - return NULL; - - /* Handle shorthands. `rewritten_storage' is a kludge to allow - getproxy() to return static storage. */ - rewritten_url = rewrite_shorthand_url (proxy); - if (rewritten_url) - { - strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage)); - rewritten_storage[sizeof (rewritten_storage) - 1] = '\0'; - proxy = rewritten_storage; - } - - return proxy; -} - -/* Should a host be accessed through proxy, concerning no_proxy? */ -int -no_proxy_match (const char *host, const char **no_proxy) -{ - if (!no_proxy) - return 1; - else - return !sufmatch (no_proxy, host); -} - -/* Support for converting links for local viewing in downloaded HTML - files. This should be moved to another file, because it has - nothing to do with processing URLs. */ - -static void write_backup_file PARAMS ((const char *, downloaded_file_t)); -static const char *replace_attr PARAMS ((const char *, int, FILE *, - const char *)); -static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *, - const char *, int)); -static char *local_quote_string PARAMS ((const char *)); - -/* Change the links in one HTML file. LINKS is a list of links in the - document, along with their positions and the desired direction of - the conversion. 
*/ -void -convert_links (const char *file, struct urlpos *links) -{ - struct file_memory *fm; - FILE *fp; - const char *p; - downloaded_file_t downloaded_file_return; - - struct urlpos *link; - int to_url_count = 0, to_file_count = 0; - - logprintf (LOG_VERBOSE, _("Converting %s... "), file); - - { - /* First we do a "dry run": go through the list L and see whether - any URL needs to be converted in the first place. If not, just - leave the file alone. */ - int dry_count = 0; - struct urlpos *dry = links; - for (dry = links; dry; dry = dry->next) - if (dry->convert != CO_NOCONVERT) - ++dry_count; - if (!dry_count) - { - logputs (LOG_VERBOSE, _("nothing to do.\n")); - return; - } - } - - fm = read_file (file); - if (!fm) - { - logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"), - file, strerror (errno)); - return; - } - - downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file); - if (opt.backup_converted && downloaded_file_return) - write_backup_file (file, downloaded_file_return); - - /* Before opening the file for writing, unlink the file. This is - important if the data in FM is mmaped. In such case, nulling the - file, which is what fopen() below does, would make us read all - zeroes from the mmaped region. */ - if (unlink (file) < 0 && errno != ENOENT) - { - logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"), - file, strerror (errno)); - read_file_free (fm); - return; - } - /* Now open the file for writing. */ - fp = fopen (file, "wb"); - if (!fp) - { - logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"), - file, strerror (errno)); - read_file_free (fm); - return; - } - - /* Here we loop through all the URLs in file, replacing those of - them that are downloaded with relative references. */ - p = fm->content; - for (link = links; link; link = link->next) - { - char *url_start = fm->content + link->pos; - - if (link->pos >= fm->length) - { - DEBUGP (("Something strange is going on. 
Please investigate.")); - break; - } - /* If the URL is not to be converted, skip it. */ - if (link->convert == CO_NOCONVERT) - { - DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos)); - continue; - } - - /* Echo the file contents, up to the offending URL's opening - quote, to the outfile. */ - fwrite (p, 1, url_start - p, fp); - p = url_start; - - switch (link->convert) - { - case CO_CONVERT_TO_RELATIVE: - /* Convert absolute URL to relative. */ - { - char *newname = construct_relative (file, link->local_name); - char *quoted_newname = local_quote_string (newname); - - if (!link->link_refresh_p) - p = replace_attr (p, link->size, fp, quoted_newname); - else - p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname, - link->refresh_timeout); - - DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n", - link->url->url, newname, link->pos, file)); - xfree (newname); - xfree (quoted_newname); - ++to_file_count; - break; - } - case CO_CONVERT_TO_COMPLETE: - /* Convert the link to absolute URL. */ - { - char *newlink = link->url->url; - char *quoted_newlink = html_quote_string (newlink); - - if (!link->link_refresh_p) - p = replace_attr (p, link->size, fp, quoted_newlink); - else - p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink, - link->refresh_timeout); - - DEBUGP (("TO_COMPLETE: to %s at position %d in %s.\n", - newlink, link->pos, file)); - xfree (quoted_newlink); - ++to_url_count; - break; - } - case CO_NULLIFY_BASE: - /* Change the base href to "". */ - p = replace_attr (p, link->size, fp, ""); - break; - case CO_NOCONVERT: - abort (); - break; - } - } - - /* Output the rest of the file. 
*/ - if (p - fm->content < fm->length) - fwrite (p, 1, fm->length - (p - fm->content), fp); - fclose (fp); - read_file_free (fm); - - logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count); -} - -/* Construct and return a malloced copy of the relative link from two - pieces of information: local name S1 of the referring file and - local name S2 of the referred file. - - So, if S1 is "jagor.srce.hr/index.html" and S2 is - "jagor.srce.hr/images/news.gif", the function will return - "images/news.gif". - - Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is - "fly.cc.fer.hr/images/fly.gif", the function will return - "../images/fly.gif". - - Caveats: S1 should not begin with `/', unless S2 also begins with - '/'. S1 should not contain things like ".." and such -- - construct_relative ("fly/ioccc/../index.html", - "fly/images/fly.gif") will fail. (A workaround is to call - something like path_simplify() on S1). */ -static char * -construct_relative (const char *s1, const char *s2) -{ - int i, cnt, sepdirs1; - char *res; - - if (*s2 == '/') - return xstrdup (s2); - /* S1 should *not* be absolute, if S2 wasn't. */ - assert (*s1 != '/'); - i = cnt = 0; - /* Skip the directories common to both strings. */ - while (1) - { - while (s1[i] && s2[i] - && (s1[i] == s2[i]) - && (s1[i] != '/') - && (s2[i] != '/')) - ++i; - if (s1[i] == '/' && s2[i] == '/') - cnt = ++i; - else - break; - } - for (sepdirs1 = 0; s1[i]; i++) - if (s1[i] == '/') - ++sepdirs1; - /* Now, construct the file as of: - - ../ repeated sepdirs1 time - - all the non-mutual directories of S2. */ - res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1); - for (i = 0; i < sepdirs1; i++) - memcpy (res + 3 * i, "../", 3); - strcpy (res + 3 * i, s2 + cnt); - return res; -} - -static void -write_backup_file (const char *file, downloaded_file_t downloaded_file_return) -{ - /* Rather than just writing over the original .html file with the - converted version, save the former to *.orig. 
Note we only do - this for files we've _successfully_ downloaded, so we don't - clobber .orig files sitting around from previous invocations. */ - - /* Construct the backup filename as the original name plus ".orig". */ - size_t filename_len = strlen(file); - char* filename_plus_orig_suffix; - boolean already_wrote_backup_file = FALSE; - slist* converted_file_ptr; - static slist* converted_files = NULL; - - if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED) - { - /* Just write "orig" over "html". We need to do it this way - because when we're checking to see if we've downloaded the - file before (to see if we can skip downloading it), we don't - know if it's a text/html file. Therefore we don't know yet - at that stage that -E is going to cause us to tack on - ".html", so we need to compare vs. the original URL plus - ".orig", not the original URL plus ".html.orig". */ - filename_plus_orig_suffix = alloca (filename_len + 1); - strcpy(filename_plus_orig_suffix, file); - strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig"); - } - else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */ - { - /* Append ".orig" to the name. */ - filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig")); - strcpy(filename_plus_orig_suffix, file); - strcpy(filename_plus_orig_suffix + filename_len, ".orig"); - } - - /* We can get called twice on the same URL thanks to the - convert_all_links() call in main(). If we write the .orig file - each time in such a case, it'll end up containing the first-pass - conversion, not the original file. So, see if we've already been - called on this file. */ - converted_file_ptr = converted_files; - while (converted_file_ptr != NULL) - if (strcmp(converted_file_ptr->string, file) == 0) - { - already_wrote_backup_file = TRUE; - break; - } - else - converted_file_ptr = converted_file_ptr->next; - - if (!already_wrote_backup_file) - { - /* Rename to .orig before former gets written over. 
*/ - if (rename(file, filename_plus_orig_suffix) != 0) - logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"), - file, filename_plus_orig_suffix, strerror (errno)); - - /* Remember that we've already written a .orig backup for this file. - Note that we never free this memory since we need it till the - convert_all_links() call, which is one of the last things the - program does before terminating. BTW, I'm not sure if it would be - safe to just set 'converted_file_ptr->string' to 'file' below, - rather than making a copy of the string... Another note is that I - thought I could just add a field to the urlpos structure saying - that we'd written a .orig file for this URL, but that didn't work, - so I had to make this separate list. - -- Dan Harkless - - This [adding a field to the urlpos structure] didn't work - because convert_file() is called from convert_all_links at - the end of the retrieval with a freshly built new urlpos - list. - -- Hrvoje Niksic - */ - converted_file_ptr = xmalloc(sizeof(*converted_file_ptr)); - converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */ - converted_file_ptr->next = converted_files; - converted_files = converted_file_ptr; - } -} - -static int find_fragment PARAMS ((const char *, int, const char **, - const char **)); - -/* Replace an attribute's original text with NEW_TEXT. */ - -static const char * -replace_attr (const char *p, int size, FILE *fp, const char *new_text) -{ - int quote_flag = 0; - char quote_char = '\"'; /* use "..." for quoting, unless the - original value is quoted, in which - case reuse its quoting char. */ - const char *frag_beg, *frag_end; - - /* Structure of our string is: - "...old-contents..." - <--- size ---> (with quotes) - OR: - ...old-contents... 
- <--- size --> (no quotes) */ - - if (*p == '\"' || *p == '\'') - { - quote_char = *p; - quote_flag = 1; - ++p; - size -= 2; /* disregard opening and closing quote */ - } - putc (quote_char, fp); - fputs (new_text, fp); - - /* Look for fragment identifier, if any. */ - if (find_fragment (p, size, &frag_beg, &frag_end)) - fwrite (frag_beg, 1, frag_end - frag_beg, fp); - p += size; - if (quote_flag) - ++p; - putc (quote_char, fp); - - return p; -} - -/* The same as REPLACE_ATTR, but used when replacing - because we need to - append "timeout_value; URL=" before the next_text. */ - -static const char * -replace_attr_refresh_hack (const char *p, int size, FILE *fp, - const char *new_text, int timeout) -{ - /* "0; URL=..." */ - char *new_with_timeout = (char *)alloca (numdigit (timeout) - + 6 /* "; URL=" */ - + strlen (new_text) - + 1); - sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text); - - return replace_attr (p, size, fp, new_with_timeout); -} - -/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not - preceded by '&'. If the character is not found, return zero. If - the character is found, return 1 and set BP and EP to point to the - beginning and end of the region. - - This is used for finding the fragment indentifiers in URLs. */ - -static int -find_fragment (const char *beg, int size, const char **bp, const char **ep) -{ - const char *end = beg + size; - int saw_amp = 0; - for (; beg < end; beg++) - { - switch (*beg) - { - case '&': - saw_amp = 1; - break; - case '#': - if (!saw_amp) - { - *bp = beg; - *ep = end; - return 1; - } - /* fallthrough */ - default: - saw_amp = 0; - } - } - return 0; -} - -/* Quote FILE for use as local reference to an HTML file. - - We quote ? as %3F to avoid passing part of the file name as the - parameter when browsing the converted file through HTTP. However, - it is safe to do this only when `--html-extension' is turned on. 
- This is because converting "index.html?foo=bar" to - "index.html%3Ffoo=bar" would break local browsing, as the latter - isn't even recognized as an HTML file! However, converting - "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be - safe for both local and HTTP-served browsing. */ - -static char * -local_quote_string (const char *file) -{ - const char *file_sans_qmark; - int qm; - - if (!opt.html_extension) - return html_quote_string (file); - - qm = count_char (file, '?'); - - if (qm) - { - const char *from = file; - char *to, *newname; - - /* qm * 2 because we replace each question mark with "%3F", - i.e. replace one char with three, hence two more. */ - int fsqlen = strlen (file) + qm * 2; - - to = newname = (char *)alloca (fsqlen + 1); - for (; *from; from++) - { - if (*from != '?') - *to++ = *from; - else - { - *to++ = '%'; - *to++ = '3'; - *to++ = 'F'; - } - } - assert (to - newname == fsqlen); - *to = '\0'; - - file_sans_qmark = newname; - } - else - file_sans_qmark = file; - - return html_quote_string (file_sans_qmark); -} - -/* We're storing "modes" of type downloaded_file_t in the hash table. - However, our hash tables only accept pointers for keys and values. - So when we need a pointer, we use the address of a - downloaded_file_t variable of static storage. */ - -static downloaded_file_t * -downloaded_mode_to_ptr (downloaded_file_t mode) -{ - static downloaded_file_t - v1 = FILE_NOT_ALREADY_DOWNLOADED, - v2 = FILE_DOWNLOADED_NORMALLY, - v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED, - v4 = CHECK_FOR_FILE; - - switch (mode) - { - case FILE_NOT_ALREADY_DOWNLOADED: - return &v1; - case FILE_DOWNLOADED_NORMALLY: - return &v2; - case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED: - return &v3; - case CHECK_FOR_FILE: - return &v4; - } - return NULL; -} - -/* This should really be merged with dl_file_url_map and - downloaded_html_files in recur.c. 
This was originally a list, but - I changed it to a hash table beause it was actually taking a lot of - time to find things in it. */ - -static struct hash_table *downloaded_files_hash; - -/* Remembers which files have been downloaded. In the standard case, should be - called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually - download successfully (i.e. not for ones we have failures on or that we skip - due to -N). - - When we've downloaded a file and tacked on a ".html" extension due to -E, - call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than - FILE_DOWNLOADED_NORMALLY. - - If you just want to check if a file has been previously added without adding - it, call with mode == CHECK_FOR_FILE. Please be sure to call this function - with local filenames, not remote URLs. */ -downloaded_file_t -downloaded_file (downloaded_file_t mode, const char *file) -{ - downloaded_file_t *ptr; - - if (mode == CHECK_FOR_FILE) - { - if (!downloaded_files_hash) - return FILE_NOT_ALREADY_DOWNLOADED; - ptr = hash_table_get (downloaded_files_hash, file); - if (!ptr) - return FILE_NOT_ALREADY_DOWNLOADED; - return *ptr; - } - - if (!downloaded_files_hash) - downloaded_files_hash = make_string_hash_table (0); - - ptr = hash_table_get (downloaded_files_hash, file); - if (ptr) - return *ptr; - - ptr = downloaded_mode_to_ptr (mode); - hash_table_put (downloaded_files_hash, xstrdup (file), &ptr); - - return FILE_NOT_ALREADY_DOWNLOADED; -} - -static int -df_free_mapper (void *key, void *value, void *ignored) -{ - xfree (key); - return 0; -} - -void -downloaded_files_free (void) -{ - if (downloaded_files_hash) - { - hash_table_map (downloaded_files_hash, df_free_mapper, NULL); - hash_table_destroy (downloaded_files_hash); - downloaded_files_hash = NULL; - } -} - /* Return non-zero if scheme a is similar to scheme b. Schemes are similar if they are equal. If SSL is supported, schemes