Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003
Free Software Foundation, Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or (at
10 your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 In addition, as a special exception, the Free Software Foundation
22 gives permission to link the code of its release of Wget with the
23 OpenSSL project's "OpenSSL" library (or with modified versions of it
24 that use the same license as the "OpenSSL" library), and distribute
25 the linked executables. You must obey the GNU General Public License
26 in all respects for all of the code used other than "OpenSSL". If you
27 modify this file, you may extend this exception to your version of the
28 file, but you are not obligated to do so. If you do not wish to do
29 so, delete this exception statement from your version. */
#include <sys/types.h>
#include <ctype.h>
/* Supported schemes: */
/* NOTE(review): the initializer braces and the terminating sentinel
   entry are not visible in this excerpt.  Entry order must stay in
   sync with enum url_scheme -- url_scheme() below returns the array
   index cast to that enum.  */
static struct scheme_data supported_schemes[] =
  { "http://",  DEFAULT_HTTP_PORT,  1 },
  { "https://", DEFAULT_HTTPS_PORT, 1 },
  { "ftp://",   DEFAULT_FTP_PORT,   1 },
77 /* Forward declarations: */
79 static char *construct_relative PARAMS ((const char *, const char *));
80 static int path_simplify PARAMS ((char *));
84 /* Support for encoding and decoding of URL strings. We determine
85 whether a character is unsafe through static table lookup. This
86 code assumes ASCII character set and 8-bit chars. */
89 /* rfc1738 reserved chars, preserved from encoding. */
92 /* rfc1738 unsafe chars, plus some more. */
96 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
97 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
98 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
100 /* Shorthands for the table: */
101 #define R urlchr_reserved
102 #define U urlchr_unsafe
const static unsigned char urlchr_table[256] =
/* NOTE(review): the initializer's opening brace, the definition of
   the "RU" shorthand (presumably urlchr_reserved | urlchr_unsafe --
   confirm), and the closing "};" fall outside this excerpt.  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  U,  0,  U, RU,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0, RU,   U, RU,   U,  0,  /* X   Y   Z   [    \   ]   ^   _   */
  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */

  /* 128-255: every character with the high bit set is unsafe.  */
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,

  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
/* URL-unescape the string S.

   This is done by transforming the sequences "%HH" to the character
   represented by the hexadecimal digits HH.  If % is not followed by
   two hexadecimal digits, it is copied through literally.

   The transformation is done in place.  If you need the original
   string intact, make a copy before calling this function.  */

void
url_unescape (char *s)
{
  char *t = s;			/* t - tortoise */
  char *h = s;			/* h - hare     */

  for (; *h; h++, t++)
    {
      if (*h != '%'
	  /* Do nothing if '%' is not followed by two hex digits.  */
	  || !isxdigit ((unsigned char) h[1])
	  || !isxdigit ((unsigned char) h[2]))
	*t = *h;
      else
	{
	  /* Decode %HH; accept both upper- and lower-case digits.  */
	  int hi = (unsigned char) h[1];
	  int lo = (unsigned char) h[2];
	  hi = isdigit (hi) ? hi - '0' : tolower (hi) - 'a' + 10;
	  lo = isdigit (lo) ? lo - '0' : tolower (lo) - 'a' + 10;
	  *t = (char) ((hi << 4) | lo);
	  h += 2;		/* skip the two hex digits */
	}
    }
  /* The decoded string can only shrink; terminate it explicitly.  */
  *t = '\0';
}
/* The core of url_escape_* functions.  Escapes the characters that
   match the provided mask in urlchr_table.

   If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
   will be returned unchanged.  If ALLOW_PASSTHROUGH is zero, a
   freshly allocated string will be returned in all cases.  */
/* NOTE(review): only fragments of this function are visible in this
   excerpt -- the return type, the declarations of p1/p2/newstr/
   newlen/addition, and parts of the copy loop are missing.  */
url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
  /* First pass: count how many characters will need quoting.  */
  for (p1 = s; *p1; p1++)
    if (urlchr_test (*p1, mask))
      addition += 2;		/* Two more characters (hex digits) */
    /* Nothing needs quoting: hand back S itself or a fresh copy,
       depending on ALLOW_PASSTHROUGH.  */
    return allow_passthrough ? (char *)s : xstrdup (s);
  newlen = (p1 - s) + addition;
  newstr = (char *)xmalloc (newlen + 1);
      /* Quote the characters that match the test mask. */
      if (urlchr_test (*p1, mask))
	  unsigned char c = *p1++;
	  /* Emit "%XY" where X and Y are the hex digits of C.  */
	  *p2++ = XNUM_TO_digit (c >> 4);
	  *p2++ = XNUM_TO_digit (c & 0xf);
  /* The second pass must produce exactly the length the first pass
     computed.  */
  assert (p2 - newstr == newlen);
/* URL-escape the unsafe characters (see urlchr_table) in a given
   string, returning a freshly allocated string.  */
/* NOTE(review): the return type line and braces are not visible in
   this excerpt.  Caller owns (and must free) the result.  */
url_escape (const char *s)
  return url_escape_1 (s, urlchr_unsafe, 0);
/* URL-escape the unsafe characters (see urlchr_table) in a given
   string.  If no characters are unsafe, S is returned.  */
/* NOTE(review): return type and braces not visible.  Caller must
   compare the result against S to know whether to free it.  */
url_escape_allow_passthrough (const char *s)
  return url_escape_1 (s, urlchr_unsafe, 1);
/* Possible verdicts for a single input character: decode a %XX
   sequence, encode the character, or copy it through unchanged.  */
enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };

/* Decide whether to encode, decode, or pass through the char at P.
   This used to be a macro, but it got a little too convoluted.  */
static inline enum copy_method
decide_copy_method (const char *p)
  /* NOTE(review): the leading "if (*p == '%')" test and several
     return statements are not visible in this excerpt; the first
     branch below only makes sense inside that '%' case.  */
      if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
	  /* %xx sequence: decode it, unless it would decode to an
	     unsafe or a reserved char; in that case, leave it as
	     is.  */
	  char preempt = X2DIGITS_TO_NUM (*(p + 1), *(p + 2));
	  if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
	    return CM_PASSTHROUGH;
	/* Garbled %.. sequence: encode `%'. */
  /* Not a '%': encode iff unsafe and not reserved.  */
  else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
  return CM_PASSTHROUGH;
266 /* Translate a %-escaped (but possibly non-conformant) input string S
267 into a %-escaped (and conformant) output string. If no characters
268 are encoded or decoded, return the same string S; otherwise, return
269 a freshly allocated string with the new contents.
271 After a URL has been run through this function, the protocols that
272 use `%' as the quote character can use the resulting string as-is,
273 while those that don't call url_unescape() to get to the intended
274 data. This function is also stable: after an input string is
275 transformed the first time, all further transformations of the
276 result yield the same result string.
278 Let's discuss why this function is needed.
280 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
281 space character would mess up the HTTP request, it needs to be
284 GET /abc%20def HTTP/1.0
286 It appears that the unsafe chars need to be quoted, for example
287 with url_escape. But what if we're requested to download
288 `abc%20def'? url_escape transforms "%" to "%25", which would leave
289 us with `abc%2520def'. This is incorrect -- since %-escapes are
290 part of URL syntax, "%20" is the correct way to denote a literal
291 space on the Wget command line. This leaves us in the conclusion
292 that in that case Wget should not call url_escape, but leave the
295 And what if the requested URI is `abc%20 def'? If we call
296 url_escape, we end up with `/abc%2520%20def', which is almost
297 certainly not intended. If we don't call url_escape, we are left
298 with the embedded space and cannot complete the request. What the
299 user meant was for Wget to request `/abc%20%20def', and this is
300 where reencode_escapes kicks in.
302 Wget used to solve this by first decoding %-quotes, and then
303 encoding all the "unsafe" characters found in the resulting string.
304 This was wrong because it didn't preserve certain URL special
305 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
306 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
307 whether we considered `+' reserved (it is). One of these results
308 is inevitable because by the second step we would lose information
309 on whether the `+' was originally encoded or not. Both results
310 were wrong because in CGI parameters + means space, while %2B means
311 literal plus. reencode_escapes correctly translates the above to
312 "a%2B+b", i.e. returns the original string.
314 This function uses an algorithm proposed by Anon Sricharoenchai:
316 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
319 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
322 ...except that this code conflates the two steps, and decides
323 whether to encode, decode, or pass through each character in turn.
324 The function still uses two passes, but their logic is the same --
325 the first pass exists merely for the sake of allocation. Another
326 small difference is that we include `+' to URL_RESERVED.
330 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
332 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
336 "foo bar" -> "foo%20bar"
337 "foo%20bar" -> "foo%20bar"
338 "foo %20bar" -> "foo%20%20bar"
339 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
340 "foo%25%20bar" -> "foo%25%20bar"
341 "foo%2%20bar" -> "foo%252%20bar"
342 "foo+bar" -> "foo+bar" (plus is reserved!)
343 "foo%2b+bar" -> "foo%2b+bar" */
/* NOTE(review): see the long comment above for this function's
   contract.  Only fragments are visible in this excerpt -- the
   return type, the declarations of p1/p2/oldlen/newlen/newstr, the
   switch-case labels, and the closing return are missing.  */
reencode_escapes (const char *s)
  int encode_count = 0;
  int decode_count = 0;

  /* First, pass through the string to see if there's anything to do,
     and to calculate the new length. */
  for (p1 = s; *p1; p1++)
      switch (decide_copy_method (p1))

  if (!encode_count && !decode_count)
    /* The string is good as it is. */
    return (char *)s;		/* C const model sucks. */

  /* Each encoding adds two characters (hex digits), while each
     decoding removes two characters. */
  newlen = oldlen + 2 * (encode_count - decode_count);
  newstr = xmalloc (newlen + 1);

  /* Second pass: make the same decisions, this time emitting the
     output.  */
      switch (decide_copy_method (p1))
	    unsigned char c = *p1++;
	    /* CM_ENCODE: expand one byte to "%XY".  */
	    *p2++ = XNUM_TO_DIGIT (c >> 4);
	    *p2++ = XNUM_TO_DIGIT (c & 0xf);
	    /* CM_DECODE: collapse "%XY" to the byte it denotes.  */
	    *p2++ = X2DIGITS_TO_NUM (p1[1], p1[2]);
	    p1 += 3;		/* skip %xx */

  /* Both passes must agree on the output length.  */
  assert (p2 - newstr == newlen);
/* Returns the scheme type if the scheme is supported, or
   SCHEME_INVALID if not. */
/* NOTE(review): return type, braces, and loop-body delimiters are
   not visible in this excerpt.  */
url_scheme (const char *url)
  /* Compare URL's prefix case-insensitively against each known
     "scheme://" string.  */
  for (i = 0; supported_schemes[i].leading_string; i++)
    if (0 == strncasecmp (url, supported_schemes[i].leading_string,
			  strlen (supported_schemes[i].leading_string)))
	if (supported_schemes[i].enabled)
	  /* The scheme value is the index into supported_schemes.  */
	  return (enum url_scheme) i;
	  /* Scheme matched but was turned off via scheme_disable().  */
	  return SCHEME_INVALID;
  return SCHEME_INVALID;
/* True if CH may legally appear in a URL scheme name.  */
#define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')

/* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
   currently implemented, it returns true if URL begins with
   [-+a-zA-Z0-9]+: .  */

int
url_has_scheme (const char *url)
{
  const char *p = url;

  /* The first char must be a scheme char. */
  if (!*p || !SCHEME_CHAR (*p))
    return 0;
  ++p;
  /* Followed by 0 or more scheme chars. */
  while (*p && SCHEME_CHAR (*p))
    ++p;
  /* Terminated by ':'. */
  return *p == ':';
}
454 scheme_default_port (enum url_scheme scheme)
456 return supported_schemes[scheme].default_port;
460 scheme_disable (enum url_scheme scheme)
462 supported_schemes[scheme].enabled = 0;
/* Skip the username and password, if present here.  The function
   should *not* be called with the complete URL, but with the part
   right after the scheme.

   If no username and password are found, return 0. */
/* NOTE(review): return type and the logic that checks whether the
   found character is '@' are not visible in this excerpt.  */
url_skip_credentials (const char *url)
  /* Look for '@' that comes before terminators, such as '/', '?',
     '#' and ';'.  */
  const char *p = (const char *)strpbrk (url, "@/?#;");
/* Parse credentials contained in [BEG, END).  The region is expected
   to have come from a URL and is unescaped. */
/* NOTE(review): return type, braces, and the computation of USEREND
   are not visible in this excerpt.  On success *USER (and *PASSWD
   when a colon is present) receive freshly allocated, unescaped
   strings.  */
parse_credentials (const char *beg, const char *end, char **user, char **passwd)
    return 0;			/* empty user name */
  /* A ':' splits the region into user and password.  */
  colon = memchr (beg, ':', end - beg);
    return 0;			/* again empty user name */
      *passwd = strdupdelim (colon + 1, end);
      url_unescape (*passwd);
  *user = strdupdelim (beg, userend);
  url_unescape (*user);
/* Used by main.c: detect URLs written using the "shorthand" URL forms
   popularized by Netscape and NcFTP.  HTTP shorthands look like this:

   www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
   www.foo.com[:port]            -> http://www.foo.com[:port]

   FTP shorthands look like this:

   foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
   foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file

   If the URL needs not or cannot be rewritten, return NULL. */
/* NOTE(review): return type, braces, and several statements (the
   early "return NULL", digit counting) are not visible in this
   excerpt.  Caller owns the returned string.  */
rewrite_shorthand_url (const char *url)
  /* Already has a scheme: nothing to rewrite.  */
  if (url_has_scheme (url))

  /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
     latter HTTP.  */
  for (p = url; *p && *p != ':' && *p != '/'; p++)

      /* If the characters after the colon and before the next slash
	 or end of string are all digits, it's HTTP. */
      for (pp = p + 1; ISDIGIT (*pp); pp++)
      if (digits > 0 && (*pp == '/' || *pp == '\0'))

	  /* Prepend "ftp://" to the entire URL... */
	  res = xmalloc (6 + strlen (url) + 1);
	  sprintf (res, "ftp://%s", url);
	  /* ...and replace ':' with '/'. */
	  res[6 + (p - url)] = '/';

      /* Just prepend "http://" to what we have. */
      res = xmalloc (7 + strlen (url) + 1);
      sprintf (res, "http://%s", url);
573 static void split_path PARAMS ((const char *, char **, char **));
/* Like strpbrk, with the exception that it returns the pointer to the
   terminating zero (end-of-string aka "eos") if no matching character
   is found.

   Although I normally balk at Gcc-specific optimizations, it probably
   makes sense here: glibc has optimizations that detect strpbrk being
   called with literal string as ACCEPT and inline the search.  That
   optimization is defeated if strpbrk is hidden within the call to
   another function.  (And no, making strpbrk_or_eos inline doesn't
   help because the check for literal accept is in the
   preprocessor.)  */
/* NOTE(review): the "#ifdef __GNUC__" line, the NULL check inside
   the statement expression, and the function's return are not
   visible in this excerpt.  */
#define strpbrk_or_eos(s, accept) ({		\
  char *SOE_p = strpbrk (s, accept);		\
    SOE_p = (char *)s + strlen (s);		\

#else  /* not __GNUC__ */

strpbrk_or_eos (const char *s, const char *accept)
  char *p = strpbrk (s, accept);
    p = (char *)s + strlen (s);
/* Turn STR into lowercase; return non-zero if a character was
   actually changed.  */
/* NOTE(review): return type, braces, and the loop around the
   assignment are not visible in this excerpt.  */
lowercase_str (char *str)
      *str = TOLOWER (*str);
/* Human-readable messages for url_parse() failures, indexed by the
   PE_* constants interleaved below.  */
/* NOTE(review): the message strings for PE_NO_ERROR, PE_EMPTY_HOST,
   PE_BAD_PORT_NUMBER and PE_INVALID_USER_NAME, as well as the
   closing "};" and the body of SETERR, are not visible in this
   excerpt.  The #define value must equal its entry's array index.  */
static char *parse_errors[] = {
#define PE_NO_ERROR			0
#define PE_UNSUPPORTED_SCHEME		1
  "Unsupported scheme",
#define PE_EMPTY_HOST			2
#define PE_BAD_PORT_NUMBER		3
#define PE_INVALID_USER_NAME		4
#define PE_UNTERMINATED_IPV6_ADDRESS	5
  "Unterminated IPv6 numeric address",
#define PE_IPV6_NOT_SUPPORTED		6
  "IPv6 addresses not supported",
#define PE_INVALID_IPV6_ADDRESS	7
  "Invalid IPv6 numeric address"

/* Store error code V through pointer P, if P is non-NULL.  */
#define SETERR(p, v) do {			\
/* The following two functions were adapted from glibc. */

/* Return non-zero if [STR, END) is a valid dotted-quad IPv4 address.  */
/* NOTE(review): return type, local declarations (ch, val), the main
   loop, octet-range checks, and the final return are not visible in
   this excerpt.  */
is_valid_ipv4_address (const char *str, const char *end)
  int saw_digit, octets;

      if (ch >= '0' && ch <= '9') {
	  val = val * 10 + (ch - '0');
	  if (saw_digit == 0) {
      } else if (ch == '.' && saw_digit == 1) {
/* Sizes used by the IPv6 validator below (names follow BIND's
   nameser.h conventions): IPv4 address, IPv6 address, and one 16-bit
   IPv6 group.  */
static const int NS_INADDRSZ  = 4;
static const int NS_IN6ADDRSZ = 16;
static const int NS_INT16SZ   = 2;

/* Return non-zero if [STR, END) is a valid IPv6 numeric address,
   including the embedded-IPv4 and "::" compressed forms.  */
/* NOTE(review): large parts of this function (locals tp/colonp/
   saw_xdigit/val/curtok, several branches and the final returns) are
   not visible in this excerpt; TP counts bytes consumed rather than
   being a real pointer here.  */
is_valid_ipv6_address (const char *str, const char *end)
  static const char xdigits[] = "0123456789abcdef";

  /* Leading :: requires some special handling. */
      if (str == end || *str != ':')

      /* if ch is a number, add it to val. */
      pch = strchr(xdigits, ch);
	  val |= (pch - xdigits);

      /* if ch is a colon ... */
	  if (saw_xdigit == 0) {
	  } else if (str == end) {
	  if (tp > NS_IN6ADDRSZ - NS_INT16SZ)

      /* if ch is a dot ... */
      if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ) &&
	  is_valid_ipv4_address(curtok, end) == 1) {

  if (saw_xdigit == 1) {
      if (tp > NS_IN6ADDRSZ - NS_INT16SZ)

  if (colonp != NULL) {
      if (tp == NS_IN6ADDRSZ)
  if (tp != NS_IN6ADDRSZ)
   Return a new struct url if successful, NULL on error.  In case of
   error, and if ERROR is not NULL, also set *ERROR to the appropriate
   error code.  */
/* NOTE(review): this function is only partially visible in this
   excerpt -- return type, many braces, the "error:" cleanup path,
   and the code that records host_b/path_b etc. are missing.  The
   comments below are grounded only in the visible lines.  */
url_parse (const char *url, int *error)
  int path_modified, host_modified;

  enum url_scheme scheme;

  const char *uname_b,     *uname_e;
  const char *host_b,      *host_e;
  const char *path_b,      *path_e;
  const char *params_b,    *params_e;
  const char *query_b,     *query_e;
  const char *fragment_b,  *fragment_e;

  char *user = NULL, *passwd = NULL;

  scheme = url_scheme (url);
  if (scheme == SCHEME_INVALID)
      SETERR (error, PE_UNSUPPORTED_SCHEME);

  /* May return URL itself or a fresh allocation; see the != url
     checks at the bottom.  */
  url_encoded = reencode_escapes (url);

  p += strlen (supported_schemes[scheme].leading_string);
  p += url_skip_credentials (p);

  /* scheme://user:pass@host[:port]... */

  /* We attempt to break down the URL into the components path,
     params, query, and fragment.  They are ordered like this:

     scheme://host[:port][/path][;params][?query][#fragment]  */

  params_b   = params_e   = NULL;
  query_b    = query_e    = NULL;
  fragment_b = fragment_e = NULL;

      /* Handle IPv6 address inside square brackets.  Ideally we'd
	 just look for the terminating ']', but rfc2732 mandates
	 rejecting invalid IPv6 addresses.  */

      /* The address begins after '['. */
      host_e = strchr (host_b, ']');

	  SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS);

      /* Check if the IPv6 address is valid. */
      if (!is_valid_ipv6_address(host_b, host_e))
	  SETERR (error, PE_INVALID_IPV6_ADDRESS);

      /* Continue parsing after the closing ']'. */

      /* Built without IPv6 support: reject bracketed hosts.  */
      SETERR (error, PE_IPV6_NOT_SUPPORTED);

      p = strpbrk_or_eos (p, ":/;?#");

  if (host_b == host_e)
      SETERR (error, PE_EMPTY_HOST);

  port = scheme_default_port (scheme);
      const char *port_b, *port_e, *pp;

      /* scheme://host:port/tralala */
      p = strpbrk_or_eos (p, "/;?#");

      if (port_b == port_e)
	  /* http://host:/whatever */
	  SETERR (error, PE_BAD_PORT_NUMBER);

      /* Parse the port as a decimal number, rejecting non-digits.  */
      for (port = 0, pp = port_b; pp < port_e; pp++)
	      /* http://host:12randomgarbage/blah */
	      SETERR (error, PE_BAD_PORT_NUMBER);
	  port = 10 * port + (*pp - '0');

      p = strpbrk_or_eos (p, ";?#");

  /* Path is not allowed not to exist. */

      p = strpbrk_or_eos (p, "?#");

      p = strpbrk_or_eos (p, "#");

  /* Hack that allows users to use '?' (a wildcard character) in
     FTP URLs without it being interpreted as a query string
     delimiter.  */
  if (scheme == SCHEME_FTP)
      query_b = query_e = NULL;

  if (uname_b != uname_e)
      /* http://user:pass@host */
      /*      ^              ^  */
      /*  uname_b        uname_e */
      if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
	  SETERR (error, PE_INVALID_USER_NAME);

  u = (struct url *)xmalloc (sizeof (struct url));
  memset (u, 0, sizeof (*u));

  u->host = strdupdelim (host_b, host_e);

  u->path = strdupdelim (path_b, path_e);
  path_modified = path_simplify (u->path);
  split_path (u->path, &u->dir, &u->file);

  host_modified = lowercase_str (u->host);

    u->params = strdupdelim (params_b, params_e);
    u->query = strdupdelim (query_b, query_e);
    u->fragment = strdupdelim (fragment_b, fragment_e);

  if (path_modified || u->fragment || host_modified || path_b == path_e)
      /* If we suspect that a transformation has rendered what
	 url_string might return different from URL_ENCODED, rebuild
	 u->url using url_string. */
      u->url = url_string (u, 0);

      if (url_encoded != url)
	xfree ((char *) url_encoded);

      if (url_encoded == url)
	u->url = xstrdup (url);
	u->url = url_encoded;
/* Map a PE_* error code from url_parse() to its message string.  The
   returned pointer is static storage -- do not free it.  */
/* NOTE(review): return type and braces are not visible in this
   excerpt.  */
url_error (int error_code)
  assert (error_code >= 0 && error_code < countof (parse_errors));
  return parse_errors[error_code];
/* Split PATH into DIR and FILE.  PATH comes from the URL and is
   expected to be URL-escaped.

   The path is split into directory (the part up to the last slash)
   and file (the part after the last slash), which are subsequently
   unescaped.  Examples:

   PATH                 DIR          FILE
   "foo/bar/baz"        "foo/bar"    "baz"
   "foo/bar/"           "foo/bar"    ""
   "foo/bar/baz%2fqux"  "foo/bar"    "baz/qux" (!)

   DIR and FILE are freshly allocated.  */
/* NOTE(review): the if/else around the two assignment pairs is not
   visible in this excerpt; the first pair is the no-slash case, the
   second the has-slash case.  */
split_path (const char *path, char **dir, char **file)
  char *last_slash = strrchr (path, '/');

      /* No slash: the whole PATH is the file name.  */
      *dir = xstrdup ("");
      *file = xstrdup (path);

      *dir = strdupdelim (path, last_slash);
      *file = xstrdup (last_slash + 1);

  /* Unescape after splitting, so escaped slashes (%2f) stay inside
     FILE rather than creating new path components.  */
  url_unescape (*dir);
  url_unescape (*file);
/* Note: URL's "full path" is the path with the query string and
   params appended.  The "fragment" (#foo) is intentionally ignored,
   but that might be changed.  For example, if the original URL was
   "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
   the full path will be "/foo/bar/baz;bullshit?querystring".  */

/* Return the length of the full path, without the terminating
   zero.  */
/* NOTE(review): return type, braces, the LEN accumulator setup, the
   FROB invocations and the #undef are not visible in this excerpt.
   The +1 per element accounts for the separator (';' or '?').  */
full_path_length (const struct url *url)
#define FROB(el) if (url->el) len += 1 + strlen (url->el)
/* Write out the full path. */
/* NOTE(review): the FROB macro body is truncated in this excerpt --
   the separator character CHR emission, pointer advance, the actual
   FROB(path,...) calls and the #undef are missing.  WHERE must have
   room for full_path_length() bytes.  */
full_path_write (const struct url *url, char *where)
#define FROB(el, chr) do {			\
  char *f_el = url->el;				\
      int l = strlen (f_el);			\
      memcpy (where, f_el, l);			\
/* Public function for getting the "full path".  E.g. if u->path is
   "foo/bar" and u->query is "param=value", full_path will be
   "/foo/bar?param=value". */
/* NOTE(review): return type, braces, and the "return full_path;"
   are not visible in this excerpt.  Caller owns the result.  */
url_full_path (const struct url *url)
  int length = full_path_length (url);
  char *full_path = (char *)xmalloc(length + 1);

  full_path_write (url, full_path);
  full_path[length] = '\0';
/* Escape unsafe and reserved characters, except for the slash
   characters.  */
/* NOTE(review): return type, braces, h/t declarations, the copy
   statements inside the loop and the return are not visible.  Only
   uppercase "%2F" is matched below -- presumably safe because
   url_escape_1 emits uppercase hex; confirm.  */
url_escape_dir (const char *dir)
  char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);

  /* Unescape slashes in NEWDIR. */

  h = newdir;			/* hare */
  t = newdir;			/* tortoise */

  for (; *h; h++, t++)
      if (*h == '%' && h[1] == '2' && h[2] == 'F')
/* Sync u->path and u->url with u->dir and u->file.  Called after
   u->file or u->dir have been changed, typically by the FTP code.  */
/* NOTE(review): braces, the empty-dir branch condition, the '/'
   separator emission, frees of the old path/url and of edir, and
   the assignment of newpath to u->path are not visible in this
   excerpt.  */
sync_path (struct url *u)
  char *newpath, *efile, *edir;

  /* u->dir and u->file are not escaped.  URL-escape them before
     reassembling them into u->path.  That way, if they contain
     separators like '?' or even if u->file contains slashes, the
     path will be correctly assembled.  (u->file can contain slashes
     if the URL specifies it with %2f, or if an FTP server returns
     it.)  */
  edir = url_escape_dir (u->dir);
  efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);

    /* Empty dir: the path is just the escaped file name.  */
    newpath = xstrdup (efile);

      int dirlen = strlen (edir);
      int filelen = strlen (efile);

      /* Copy "DIR/FILE" to newpath. */
      char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
      memcpy (p, edir, dirlen);
      memcpy (p, efile, filelen);

  /* url_escape_1 with passthrough may return its argument; only free
     when a fresh string was actually allocated.  */
  if (efile != u->file)

  /* Regenerate u->url as well.  */
  u->url = url_string (u, 0);
/* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
   This way we can sync u->path and u->url when they get changed.  */
/* NOTE(review): return types, braces, the frees of the old values
   and the sync_path() calls are not visible in this excerpt.  */

url_set_dir (struct url *url, const char *newdir)
  url->dir = xstrdup (newdir);

url_set_file (struct url *url, const char *newfile)
  url->file = xstrdup (newfile);
/* Release all heap memory held by URL, including URL itself.  */
/* NOTE(review): the frees of url->host, url->path, url->url,
   url->dir, url->file and of the struct itself are not visible in
   this excerpt.  FREE_MAYBE presumably frees only non-NULL
   pointers -- confirm against its definition.  */
url_free (struct url *url)
  FREE_MAYBE (url->params);
  FREE_MAYBE (url->query);
  FREE_MAYBE (url->fragment);
  FREE_MAYBE (url->user);
  FREE_MAYBE (url->passwd);
/* Read a file containing one URL per line and return the parsed URLs
   as a linked list of struct urlpos.  */
/* NOTE(review): return type, braces, head/tail list linking, the
   NULL check after read_file, and several loop statements are not
   visible in this excerpt.  */
get_urls_file (const char *file)
  struct file_memory *fm;
  struct urlpos *head, *tail;
  const char *text, *text_end;

  /* Load the file. */
  fm = read_file (file);
      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
  DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));

  text_end = fm->content + fm->length;
  while (text < text_end)
      const char *line_beg = text;
      const char *line_end = memchr (text, '\n', text_end - text);
	/* Last line may have no trailing newline.  */
	line_end = text_end;

      /* Strip whitespace from the beginning and end of line. */
      while (line_beg < line_end && ISSPACE (*line_beg))
      while (line_end > line_beg && ISSPACE (*(line_end - 1)))

      if (line_end > line_beg)
	  /* URL is in the [line_beg, line_end) region. */

	  struct urlpos *entry;

	  /* We must copy the URL to a zero-terminated string, and we
	     can't use alloca because we're in a loop.  *sigh*.  */
	  url_text = strdupdelim (line_beg, line_end);

	      /* Merge opt.base_href with URL. */
	      char *merged = uri_merge (opt.base_href, url_text);

	  url = url_parse (url_text, &up_error_code);
	      logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
			 file, url_text, url_error (up_error_code));

	  entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
	  memset (entry, 0, sizeof (*entry));

  read_file_free (fm);
/* Free the linked list of urlpos. */
/* NOTE(review): the while loop, the url_free(l->url) call, the
   free of L itself and the advance to NEXT are not visible in this
   excerpt.  */
free_urlpos (struct urlpos *l)
      struct urlpos *next = l->next;
      FREE_MAYBE (l->local_name);
/* Rotate FNAME opt.backups times: FNAME.(n-1) -> FNAME.n for
   decreasing n, then FNAME -> FNAME.1.  */
/* NOTE(review): return type, braces, the rename() calls and the
   early return for non-regular files are not visible in this
   excerpt.  The buffers are alloca'd, so sized once up front.  */
rotate_backups(const char *fname)
  int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
  char *from = (char *)alloca (maxlen);
  char *to = (char *)alloca (maxlen);

  if (stat (fname, &sb) == 0)
    if (S_ISREG (sb.st_mode) == 0)

  for (i = opt.backups; i > 1; i--)
      sprintf (from, "%s.%d", fname, i - 1);
      sprintf (to, "%s.%d", fname, i);

  sprintf (to, "%s.%d", fname, 1);
/* Create all the necessary directories for PATH (a file).  Calls
   mkdirhier() internally.  */
/* NOTE(review): return type, braces, local declarations (p, t, st,
   res), several returns and the frees of T are not visible in this
   excerpt.  */
mkalldirs (const char *path)
  /* Find the last '/' to isolate the directory part of PATH.  */
  p = path + strlen (path);
  for (; *p != '/' && p != path; p--)

  /* Don't create if it's just a file. */
  if ((p == path) && (*p != '/'))
  t = strdupdelim (path, p);

  /* Check whether the directory exists. */
  if ((stat (t, &st) == 0))
      if (S_ISDIR (st.st_mode))
	  /* If the dir exists as a file name, remove it first.  This
	     is *only* for Wget to work with buggy old CERN http
	     servers.  Here is the scenario: When Wget tries to
	     retrieve a directory without a slash, e.g.
	     http://foo/bar (bar being a directory), CERN server will
	     not redirect it to http://foo/bar/ -- it will generate a
	     directory listing containing links to bar/file1,
	     bar/file2, etc.  Wget will lose because it saves this
	     HTML listing to a file `bar', so it cannot create the
	     directory.  To work around this, if the file of the same
	     name exists, we just remove it and create the directory
	     anew.  */
	  DEBUGP (("Removing %s because of directory danger!\n", t));

  res = make_directory (t);
    logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1411 /* Functions for constructing the file name out of URL components. */
1413 /* A growable string structure, used by url_file_name and friends.
1414 This should perhaps be moved to utils.c.
1416 The idea is to have a convenient and efficient way to construct a
1417 string by having various functions append data to it. Instead of
1418 passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
1419 functions in questions, we pass the pointer to this struct. */
1427 /* Ensure that the string can accept APPEND_COUNT more characters past
1428 the current TAIL position. If necessary, this will grow the string
1429 and update its allocated size. If the string is already large
1430 enough to take TAIL+APPEND_COUNT characters, this does nothing. */
1431 #define GROW(g, append_size) do { \
1432 struct growable *G_ = g; \
1433 DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char); \
1436 /* Return the tail position of the string. */
1437 #define TAIL(r) ((r)->base + (r)->tail)
1439 /* Move the tail position by APPEND_COUNT characters. */
1440 #define TAIL_INCR(r, append_count) ((r)->tail += append_count)
/* Append the string STR to DEST.  NOTICE: the string in DEST is not
   terminated.  */
/* NOTE(review): return type, braces, and the GROW(dest, l) call that
   must precede the memcpy are not visible in this excerpt.  */
append_string (const char *str, struct growable *dest)
  int l = strlen (str);
  memcpy (TAIL (dest), str, l);
  TAIL_INCR (dest, l);
/* Append CH to DEST.  For example, append_char (0, DEST)
   zero-terminates DEST.  */
/* NOTE(review): return type, braces, the GROW call, and the store of
   CH at the tail are not visible in this excerpt.  */
append_char (char ch, struct growable *dest)
  TAIL_INCR (dest, 1);
1466 filechr_not_unix = 1, /* unusable on Unix, / and \0 */
1467 filechr_not_windows = 2, /* unusable on Windows, one of \|/<>?:*" */
1468 filechr_control = 4, /* a control character, e.g. 0-31 */
1471 #define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))
1473 /* Shorthands for the table: */
1474 #define U filechr_not_unix
1475 #define W filechr_not_windows
1476 #define C filechr_control
/* Table of characters unsafe under various conditions (see above).

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.  */
/* NOTE(review): the initializer's opening brace, the combined
   "UW"/"UWC" shorthand definitions, and the closing "};" are not
   visible in this excerpt.  */

const static unsigned char filechr_table[256] =
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
1523 /* FN_PORT_SEP is the separator between host and port in file names
1524 for non-standard port numbers. On Unix this is normally ':', as in
1525 "www.xemacs.org:4001/index.html". Under Windows, we set it to +
1526 because Windows can't handle ':' in file names. */
1527 #define FN_PORT_SEP (opt.restrict_files_os != restrict_windows ? ':' : '+')
1529 /* FN_QUERY_SEP is the separator between the file name and the URL
1530 query, normally '?'. Since Windows cannot handle '?' as part of
1531 file name, we use '@' instead there. */
1532 #define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
/* Quote path element, characters in [b, e), as file name, and append
   the quoted string to DEST.  Each character is quoted as per
   file_unsafe_char and the corresponding table.  */
/* NOTE(review): return type, braces, the declarations of mask/p/
   quoted/outlen/pathel, the quoted++ statement, and the else around
   the restrict_windows branch are not visible in this excerpt.  */
append_uri_pathel (const char *b, const char *e, struct growable *dest)
  /* Pick the per-OS unsafe-character mask.  */
  if (opt.restrict_files_os == restrict_unix)
    mask = filechr_not_unix;
    mask = filechr_not_windows;
  if (opt.restrict_files_ctrl)
    mask |= filechr_control;

  /* Copy [b, e) to PATHEL and URL-unescape it. */
  BOUNDED_TO_ALLOCA (b, e, pathel);
  url_unescape (pathel);
  pathlen = strlen (pathel);

  /* Go through PATHEL and check how many characters we'll need to
     add for file quoting.  */
  for (p = pathel; *p; p++)
    if (FILE_CHAR_TEST (*p, mask))

  /* p - pathel is the string length.  Each quoted char means two
     additional characters in the string, hence 2*quoted.  */
  outlen = (p - pathel) + (2 * quoted);
  GROW (dest, outlen);

    /* If there's nothing to quote, we don't need to go through the
       string the second time.  */
    memcpy (TAIL (dest), pathel, outlen);

      char *q = TAIL (dest);
      for (p = pathel; *p; p++)
	  if (!FILE_CHAR_TEST (*p, mask))
	      unsigned char ch = *p;
	      /* Emit "%XY" for the unsafe byte.  */
	      *q++ = XNUM_TO_DIGIT (ch >> 4);
	      *q++ = XNUM_TO_DIGIT (ch & 0xf);
      assert (q - TAIL (dest) == outlen);
  TAIL_INCR (dest, outlen);
1598 /* Append to DEST the directory structure that corresponds the
1599 directory part of URL's path. For example, if the URL is
1600 http://server/dir1/dir2/file, this appends "/dir1/dir2".
1602 Each path element ("dir1" and "dir2" in the above example) is
1603 examined, url-unescaped, and re-escaped as file name element.
1605 Additionally, it cuts as many directories from the path as
1606 specified by opt.cut_dirs. For example, if opt.cut_dirs is 1, it
1607 will produce "bar" for the above example. For 2 or more, it will
1610 Each component of the path is quoted for use as file name. */
1613 append_dir_structure (const struct url *u, struct growable *dest)
1615 char *pathel, *next;
1616 int cut = opt.cut_dirs;
1618 /* Go through the path components, de-URL-quote them, and quote them
1619 (if necessary) as file names. */
/* Walk '/'-separated components; NEXT points at the slash terminating
   the current component, so each element is the range [pathel, next). */
1622 for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1627 /* Ignore empty pathels. path_simplify should remove
1628 occurrences of "//" from the path, but it has special cases
1629 for starting / which generates an empty pathel here. */
1633 append_char ('/', dest);
1634 append_uri_pathel (pathel, next, dest);
1638 /* Return a unique file name that matches the given URL as good as
1639 possible. Does not create directories on the file system. */
1642 url_file_name (const struct url *u)
1644 struct growable fnres;
1646 char *u_file, *u_query;
1647 char *fname, *unique;
1653 /* Start with the directory prefix, if specified. */
1655 append_string (opt.dir_prefix, &fnres);
1657 /* If "dirstruct" is turned on (typically the case with -r), add
1658 the host and port (unless those have been turned off) and
1659 directory structure. */
1662 if (opt.add_hostdir)
1665 append_char ('/', &fnres);
1666 append_string (u->host, &fnres);
/* A non-default port becomes part of the directory name, joined with
   FN_PORT_SEP (':' on Unix, '+' on Windows). */
1667 if (u->port != scheme_default_port (u->scheme))
1670 number_to_string (portstr, u->port);
1671 append_char (FN_PORT_SEP, &fnres);
1672 append_string (portstr, &fnres);
1676 append_dir_structure (u, &fnres);
1679 /* Add the file name. */
1681 append_char ('/', &fnres);
/* URLs ending in '/' have an empty file component; fall back to the
   conventional "index.html". */
1682 u_file = *u->file ? u->file : "index.html";
1683 append_uri_pathel (u_file, u_file + strlen (u_file), &fnres);
1685 /* Append "?query" to the file name. */
1686 u_query = u->query && *u->query ? u->query : NULL;
1689 append_char (FN_QUERY_SEP, &fnres);
1690 append_uri_pathel (u_query, u_query + strlen (u_query), &fnres);
1693 /* Zero-terminate the file name. */
1694 append_char ('\0', &fnres);
1698 /* Check the cases in which the unique extensions are not used:
1699 1) Clobbering is turned off (-nc).
1700 2) Retrieval with regetting.
1701 3) Timestamping is used.
1702 4) Hierarchy is built.
1704 The exception is the case when file does exist and is a
1705 directory (see `mkalldirs' for explanation). */
1707 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1708 && !(file_exists_p (fname) && !file_non_directory_p (fname))
/* Otherwise pick a unique variant (e.g. "file.1") so an existing file
   is not clobbered. */
1711 unique = unique_name (fname, 1);
1712 if (unique != fname)
1717 /* Return the length of URL's path. Path is considered to be
1718 terminated by one of '?', ';', '#', or by the end of the
/* The returned length counts chars from URL up to (not including) the
   first of '?', ';', '#', or the terminating NUL. */
1721 path_length (const char *url)
1723 const char *q = strpbrk_or_eos (url, "?;#");
1727 /* Find the last occurrence of character C in the range [b, e), or
1728 NULL, if none are present. This is equivalent to strrchr(b, c),
1729 except that it accepts an END argument instead of requiring the
1730 string to be zero-terminated. Why is there no memrchr()? */
1732 find_last_char (const char *b, const char *e, char c)
1740 /* Resolve "." and ".." elements of PATH by destructively modifying
1741 PATH. "." is resolved by removing that path element, and ".." is
1742 resolved by removing the preceding path element. Leading and
1743 trailing slashes are preserved.
1745 Return non-zero if any changes have been made.
1747 For example, "a/b/c/./../d/.." will yield "a/b/". More exhaustive
1748 test examples are provided below. If you change anything in this
1749 function, run test_path_simplify to make sure you haven't broken a
1752 A previous version of this function was based on path_simplify()
1753 from GNU Bash, but it has been rewritten for Wget 1.8.1. */
1756 path_simplify (char *path)
1762 ++path; /* preserve the leading '/'. */
/* END points one past the NUL so the memmove calls below can move the
   terminator along with the rest of the string. */
1765 end = p + strlen (p) + 1; /* position past the terminating zero. */
1770 /* P should point to the beginning of a path element. */
1772 if (*p == '.' && (*(p + 1) == '/' || *(p + 1) == '\0'))
1774 /* Handle "./foo" by moving "foo" two characters to the
1776 if (*(p + 1) == '/')
/* memmove (not memcpy): the source and destination overlap. */
1779 memmove (p, p + 2, end - p);
1790 else if (*p == '.' && *(p + 1) == '.'
1791 && (*(p + 2) == '/' || *(p + 2) == '\0'))
1793 /* Handle "../foo" by moving "foo" one path element to the
1795 char *b = p; /* not p-1 because P can equal PATH */
1797 /* Backtrack by one path element, but not past the beginning
1800 /* foo/bar/../baz */
1806 /* Move backwards until B hits the beginning of the
1807 previous path element or the beginning of path. */
1808 for (--b; b > path && *(b - 1) != '/'; b--)
1813 if (*(p + 2) == '/')
1815 memmove (b, p + 3, end - (p + 3));
1829 /* Remove empty path elements. Not mandated by rfc1808 et
1830 al, but it seems like a good idea to get rid of them.
1831 Supporting them properly is hard (in which directory do
1832 you save http://x.com///y.html?) and they don't seem to
1843 memmove (p, q, end - q);
1848 /* Skip to the next path element. */
1849 while (*p && *p != '/')
1854 /* Make sure P points to the beginning of the next path element,
1855 which is location after the slash. */
1862 /* Merge BASE with LINK and return the resulting URI.
1864 Either of the URIs may be absolute or relative, complete with the
1865 host name, or path only. This tries to reasonably handle all
1866 foreseeable cases. It only employs minimal URL parsing, without
1867 knowledge of the specifics of schemes.
1869 Perhaps this function should call path_simplify so that the callers
1870 don't have to call url_parse unconditionally. */
/* Returns a freshly malloced string; the caller owns (and frees) it. */
1873 uri_merge (const char *base, const char *link)
/* A link that already carries a scheme is absolute: return it as-is. */
1879 if (url_has_scheme (link))
1880 return xstrdup (link);
1882 /* We may not examine BASE past END. */
1883 end = base + path_length (base);
1884 linklength = strlen (link);
1888 /* Empty LINK points back to BASE, query string and all. */
1889 return xstrdup (base);
1891 else if (*link == '?')
1893 /* LINK points to the same location, but changes the query
1894 string. Examples: */
1895 /* uri_merge("path", "?new") -> "path?new" */
1896 /* uri_merge("path?foo", "?new") -> "path?new" */
1897 /* uri_merge("path?foo#bar", "?new") -> "path?new" */
1898 /* uri_merge("path#foo", "?new") -> "path?new" */
1899 int baselength = end - base;
1900 merge = xmalloc (baselength + linklength + 1);
1901 memcpy (merge, base, baselength);
1902 memcpy (merge + baselength, link, linklength);
1903 merge[baselength + linklength] = '\0';
1905 else if (*link == '#')
1907 /* uri_merge("path", "#new") -> "path#new" */
1908 /* uri_merge("path#foo", "#new") -> "path#new" */
1909 /* uri_merge("path?foo", "#new") -> "path?foo#new" */
1910 /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
/* Unlike the '?' case, the query string survives: cut BASE only at an
   existing fragment, or keep it whole. */
1912 const char *end1 = strchr (base, '#');
1914 end1 = base + strlen (base);
1915 baselength = end1 - base;
1916 merge = xmalloc (baselength + linklength + 1);
1917 memcpy (merge, base, baselength);
1918 memcpy (merge + baselength, link, linklength);
1919 merge[baselength + linklength] = '\0';
1921 else if (*link == '/' && *(link + 1) == '/')
1923 /* LINK begins with "//" and so is a net path: we need to
1924 replace everything after (and including) the double slash
1927 /* uri_merge("foo", "//new/bar") -> "//new/bar" */
1928 /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */
1929 /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
1933 const char *start_insert;
1935 /* Look for first slash. */
1936 slash = memchr (base, '/', end - base);
1937 /* If found slash and it is a double slash, then replace
1938 from this point, else default to replacing from the
1940 if (slash && *(slash + 1) == '/')
1941 start_insert = slash;
1943 start_insert = base;
1945 span = start_insert - base;
1946 merge = (char *)xmalloc (span + linklength + 1);
1948 memcpy (merge, base, span);
1949 memcpy (merge + span, link, linklength);
1950 merge[span + linklength] = '\0';
1952 else if (*link == '/')
1954 /* LINK is an absolute path: we need to replace everything
1955 after (and including) the FIRST slash with LINK.
1957 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1958 "/qux/xyzzy", our result should be
1959 "http://host/qux/xyzzy". */
1962 const char *start_insert = NULL; /* for gcc to shut up. */
1963 const char *pos = base;
1964 int seen_slash_slash = 0;
1965 /* We're looking for the first slash, but want to ignore
1968 slash = memchr (pos, '/', end - pos);
1969 if (slash && !seen_slash_slash)
1970 if (*(slash + 1) == '/')
1973 seen_slash_slash = 1;
1977 /* At this point, SLASH is the location of the first / after
1978 "//", or the first slash altogether. START_INSERT is the
1979 pointer to the location where LINK will be inserted. When
1980 examining the last two examples, keep in mind that LINK
1983 if (!slash && !seen_slash_slash)
1984 /* example: "foo" */
1986 start_insert = base;
1987 else if (!slash && seen_slash_slash)
1988 /* example: "http://foo" */
1991 else if (slash && !seen_slash_slash)
1992 /* example: "foo/bar" */
1994 start_insert = base;
1995 else if (slash && seen_slash_slash)
1996 /* example: "http://something/" */
1998 start_insert = slash;
2000 span = start_insert - base;
2001 merge = (char *)xmalloc (span + linklength + 1);
2003 memcpy (merge, base, span);
2004 memcpy (merge + span, link, linklength);
2005 merge[span + linklength] = '\0';
2009 /* LINK is a relative URL: we need to replace everything
2010 after last slash (possibly empty) with LINK.
2012 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
2013 our result should be "whatever/foo/qux/xyzzy". */
2014 int need_explicit_slash = 0;
2016 const char *start_insert;
2017 const char *last_slash = find_last_char (base, end, '/');
2020 /* No slash found at all. Append LINK to what we have,
2021 but we'll need a slash as a separator.
2023 Example: if base == "foo" and link == "qux/xyzzy", then
2024 we cannot just append link to base, because we'd get
2025 "fooqux/xyzzy", whereas what we want is
2028 To make sure the / gets inserted, we set
2029 need_explicit_slash to 1. We also set start_insert
2030 to end + 1, so that the length calculations work out
2031 correctly for one more (slash) character. Accessing
2032 that character is fine, since it will be the
2033 delimiter, '\0' or '?'. */
2034 /* example: "foo?..." */
2035 /* ^ ('?' gets changed to '/') */
2036 start_insert = end + 1;
2037 need_explicit_slash = 1;
2039 else if (last_slash && last_slash >= base + 2
2040 && last_slash[-2] == ':' && last_slash[-1] == '/')
2042 /* example: http://host" */
2044 start_insert = end + 1;
2045 need_explicit_slash = 1;
2049 /* example: "whatever/foo/bar" */
2051 start_insert = last_slash + 1;
2054 span = start_insert - base;
2055 merge = (char *)xmalloc (span + linklength + 1);
2057 memcpy (merge, base, span);
/* The extra char reserved by "end + 1" above is overwritten here with
   the separating slash. */
2058 if (need_explicit_slash)
2059 merge[span - 1] = '/';
2060 memcpy (merge + span, link, linklength);
2061 merge[span + linklength] = '\0';
/* Append string S at P without the terminating NUL.
   NOTE(review): S is evaluated twice (strlen and memcpy), so callers
   must pass side-effect-free arguments. */
2067 #define APPEND(p, s) do { \
2068 int len = strlen (s); \
2069 memcpy (p, s, len); \
2073 /* Use this instead of password when the actual password is supposed
2074 to be hidden. We intentionally use a generic string without giving
2075 away the number of characters in the password, like previous
2077 #define HIDDEN_PASSWORD "*password*"
2079 /* Recreate the URL string from the data in URL.
2081 If HIDE is non-zero (as it is when we're calling this on a URL we
2082 plan to print, but not when calling it to canonicalize a URL for
2083 use within the program), password will be hidden. Unsafe
2084 characters in the URL will be quoted. */
/* Returns a freshly malloced string.  The exact size is computed up
   front and verified with an assert after the string is built. */
2087 url_string (const struct url *url, int hide_password)
2091 char *quoted_user = NULL, *quoted_passwd = NULL;
2093 int scheme_port = supported_schemes[url->scheme].default_port;
2094 char *scheme_str = supported_schemes[url->scheme].leading_string;
2095 int fplen = full_path_length (url);
2097 int brackets_around_host = 0;
2099 assert (scheme_str != NULL);
2101 /* Make sure the user name and password are quoted. */
2104 quoted_user = url_escape_allow_passthrough (url->user);
/* When hiding, substitute the fixed HIDDEN_PASSWORD literal instead of
   escaping the real password. */
2108 quoted_passwd = HIDDEN_PASSWORD;
2110 quoted_passwd = url_escape_allow_passthrough (url->passwd);
/* A ':' in the host indicates an IPv6 literal, which must be written
   in brackets, e.g. http://[::1]/. */
2114 if (strchr (url->host, ':'))
2115 brackets_around_host = 1;
2117 size = (strlen (scheme_str)
2118 + strlen (url->host)
2119 + (brackets_around_host ? 2 : 0)
/* The default port is omitted from the canonical form. */
2122 if (url->port != scheme_port)
2123 size += 1 + numdigit (url->port);
2126 size += 1 + strlen (quoted_user);
2128 size += 1 + strlen (quoted_passwd);
2131 p = result = xmalloc (size);
2133 APPEND (p, scheme_str);
2136 APPEND (p, quoted_user);
2140 APPEND (p, quoted_passwd);
2145 if (brackets_around_host)
2147 APPEND (p, url->host);
2148 if (brackets_around_host)
2150 if (url->port != scheme_port)
2153 p = number_to_string (p, url->port);
2156 full_path_write (url, p);
2160 assert (p - result == size);
/* url_escape_allow_passthrough may have returned its argument
   unchanged; free only if a new string was actually allocated. */
2162 if (quoted_user && quoted_user != url->user)
2163 xfree (quoted_user);
2164 if (quoted_passwd && !hide_password
2165 && quoted_passwd != url->passwd)
2166 xfree (quoted_passwd);
2171 /* Return the URL of the proxy appropriate for url U. */
/* Consults opt.no_proxy first, then the per-scheme option or the
   conventional environment variable (http_proxy etc.). */
2173 getproxy (struct url *u)
2176 char *rewritten_url;
2177 static char rewritten_storage[1024];
2181 if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
2187 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
2191 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
2195 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
2197 case SCHEME_INVALID:
2200 if (!proxy || !*proxy)
2203 /* Handle shorthands. `rewritten_storage' is a kludge to allow
2204 getproxy() to return static storage. */
2205 rewritten_url = rewrite_shorthand_url (proxy);
/* strncpy may leave the buffer unterminated on truncation; the next
   line forces NUL-termination. */
2208 strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
2209 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
2210 proxy = rewritten_storage;
2216 /* Should a host be accessed through proxy, concerning no_proxy? */
/* Returns non-zero ("use proxy") when HOST does NOT suffix-match any
   entry of the NULL-terminated NO_PROXY list. */
2218 no_proxy_match (const char *host, const char **no_proxy)
2223 return !sufmatch (no_proxy, host);
2226 /* Support for converting links for local viewing in downloaded HTML
2227 files. This should be moved to another file, because it has
2228 nothing to do with processing URLs. */
/* Forward declarations for the static link-conversion helpers defined
   later in this file. */
2230 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
2231 static const char *replace_attr PARAMS ((const char *, int, FILE *,
2233 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
2234 const char *, int));
2235 static char *local_quote_string PARAMS ((const char *));
2237 /* Change the links in one HTML file. LINKS is a list of links in the
2238 document, along with their positions and the desired direction of
2241 convert_links (const char *file, struct urlpos *links)
2243 struct file_memory *fm;
2246 downloaded_file_t downloaded_file_return;
2248 struct urlpos *link;
2249 int to_url_count = 0, to_file_count = 0;
2251 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
2254 /* First we do a "dry run": go through the list L and see whether
2255 any URL needs to be converted in the first place. If not, just
2256 leave the file alone. */
2258 struct urlpos *dry = links;
2259 for (dry = links; dry; dry = dry->next)
2260 if (dry->convert != CO_NOCONVERT)
2264 logputs (LOG_VERBOSE, _("nothing to do.\n"));
/* Read the whole file into memory (possibly mmaped -- see below). */
2269 fm = read_file (file);
2272 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2273 file, strerror (errno));
2277 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
2278 if (opt.backup_converted && downloaded_file_return)
2279 write_backup_file (file, downloaded_file_return);
2281 /* Before opening the file for writing, unlink the file. This is
2282 important if the data in FM is mmaped. In such case, nulling the
2283 file, which is what fopen() below does, would make us read all
2284 zeroes from the mmaped region. */
2285 if (unlink (file) < 0 && errno != ENOENT)
2287 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
2288 file, strerror (errno));
2289 read_file_free (fm);
2292 /* Now open the file for writing. */
2293 fp = fopen (file, "wb");
2296 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2297 file, strerror (errno));
2298 read_file_free (fm);
2302 /* Here we loop through all the URLs in file, replacing those of
2303 them that are downloaded with relative references. */
2305 for (link = links; link; link = link->next)
2307 char *url_start = fm->content + link->pos;
/* A position past the buffer means the link data and the file on disk
   disagree; skip rather than read out of bounds. */
2309 if (link->pos >= fm->length)
2311 DEBUGP (("Something strange is going on. Please investigate."));
2314 /* If the URL is not to be converted, skip it. */
2315 if (link->convert == CO_NOCONVERT)
2317 DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
2321 /* Echo the file contents, up to the offending URL's opening
2322 quote, to the outfile. */
2323 fwrite (p, 1, url_start - p, fp);
2326 switch (link->convert)
2328 case CO_CONVERT_TO_RELATIVE:
2329 /* Convert absolute URL to relative. */
2331 char *newname = construct_relative (file, link->local_name);
2332 char *quoted_newname = local_quote_string (newname);
2334 if (!link->link_refresh_p)
2335 p = replace_attr (p, link->size, fp, quoted_newname);
2337 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
2338 link->refresh_timeout);
2340 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
2341 link->url->url, newname, link->pos, file));
2343 xfree (quoted_newname);
2347 case CO_CONVERT_TO_COMPLETE:
2348 /* Convert the link to absolute URL. */
2350 char *newlink = link->url->url;
2351 char *quoted_newlink = html_quote_string (newlink);
2353 if (!link->link_refresh_p)
2354 p = replace_attr (p, link->size, fp, quoted_newlink);
2356 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
2357 link->refresh_timeout);
2359 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
2360 newlink, link->pos, file));
2361 xfree (quoted_newlink);
2365 case CO_NULLIFY_BASE:
2366 /* Change the base href to "". */
2367 p = replace_attr (p, link->size, fp, "");
2375 /* Output the rest of the file. */
2376 if (p - fm->content < fm->length)
2377 fwrite (p, 1, fm->length - (p - fm->content), fp);
2379 read_file_free (fm);
2381 logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
2384 /* Construct and return a malloced copy of the relative link from two
2385 pieces of information: local name S1 of the referring file and
2386 local name S2 of the referred file.
2388 So, if S1 is "jagor.srce.hr/index.html" and S2 is
2389 "jagor.srce.hr/images/news.gif", the function will return
2392 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
2393 "fly.cc.fer.hr/images/fly.gif", the function will return
2394 "../images/fly.gif".
2396 Caveats: S1 should not begin with `/', unless S2 also begins with
2397 '/'. S1 should not contain things like ".." and such --
2398 construct_relative ("fly/ioccc/../index.html",
2399 "fly/images/fly.gif") will fail. (A workaround is to call
2400 something like path_simplify() on S1). */
2402 construct_relative (const char *s1, const char *s2)
2404 int i, cnt, sepdirs1;
2408 return xstrdup (s2);
2409 /* S1 should *not* be absolute, if S2 wasn't. */
2410 assert (*s1 != '/');
2412 /* Skip the directories common to both strings. */
/* After this loop, CNT is the index just past the last '/' common to
   both names; comparison advances only while the prefixes agree. */
2415 while (s1[i] && s2[i]
2420 if (s1[i] == '/' && s2[i] == '/')
/* Count the directory separators remaining in S1: each one costs a
   "../" in the result. */
2425 for (sepdirs1 = 0; s1[i]; i++)
2428 /* Now, construct the file as of:
2429 - ../ repeated sepdirs1 time
2430 - all the non-mutual directories of S2. */
2431 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
2432 for (i = 0; i < sepdirs1; i++)
2433 memcpy (res + 3 * i, "../", 3);
2434 strcpy (res + 3 * i, s2 + cnt);
/* Preserve the original (pre-conversion) contents of FILE by renaming
   it to FILE.orig (or replacing a -E ".html" suffix with ".orig").
   Done at most once per file per run, tracked in a static list. */
2439 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
2441 /* Rather than just writing over the original .html file with the
2442 converted version, save the former to *.orig. Note we only do
2443 this for files we've _successfully_ downloaded, so we don't
2444 clobber .orig files sitting around from previous invocations. */
2446 /* Construct the backup filename as the original name plus ".orig". */
2447 size_t filename_len = strlen(file);
2448 char* filename_plus_orig_suffix;
2449 boolean already_wrote_backup_file = FALSE;
2450 slist* converted_file_ptr;
2451 static slist* converted_files = NULL;
2453 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
2455 /* Just write "orig" over "html". We need to do it this way
2456 because when we're checking to see if we've downloaded the
2457 file before (to see if we can skip downloading it), we don't
2458 know if it's a text/html file. Therefore we don't know yet
2459 at that stage that -E is going to cause us to tack on
2460 ".html", so we need to compare vs. the original URL plus
2461 ".orig", not the original URL plus ".html.orig". */
/* "- 4" overwrites the trailing "html"; this branch assumes FILE ends
   in ".html" (guaranteed by the -E extension-added code path). */
2462 filename_plus_orig_suffix = alloca (filename_len + 1);
2463 strcpy(filename_plus_orig_suffix, file);
2464 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
2466 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
2468 /* Append ".orig" to the name. */
2469 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
2470 strcpy(filename_plus_orig_suffix, file);
2471 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
2474 /* We can get called twice on the same URL thanks to the
2475 convert_all_links() call in main(). If we write the .orig file
2476 each time in such a case, it'll end up containing the first-pass
2477 conversion, not the original file. So, see if we've already been
2478 called on this file. */
2479 converted_file_ptr = converted_files;
2480 while (converted_file_ptr != NULL)
2481 if (strcmp(converted_file_ptr->string, file) == 0)
2483 already_wrote_backup_file = TRUE;
2487 converted_file_ptr = converted_file_ptr->next;
2489 if (!already_wrote_backup_file)
2491 /* Rename <file> to <file>.orig before former gets written over. */
2492 if (rename(file, filename_plus_orig_suffix) != 0)
2493 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
2494 file, filename_plus_orig_suffix, strerror (errno));
2496 /* Remember that we've already written a .orig backup for this file.
2497 Note that we never free this memory since we need it till the
2498 convert_all_links() call, which is one of the last things the
2499 program does before terminating. BTW, I'm not sure if it would be
2500 safe to just set 'converted_file_ptr->string' to 'file' below,
2501 rather than making a copy of the string... Another note is that I
2502 thought I could just add a field to the urlpos structure saying
2503 that we'd written a .orig file for this URL, but that didn't work,
2504 so I had to make this separate list.
2505 -- Dan Harkless <wget@harkless.org>
2507 This [adding a field to the urlpos structure] didn't work
2508 because convert_file() is called from convert_all_links at
2509 the end of the retrieval with a freshly built new urlpos
2511 -- Hrvoje Niksic <hniksic@arsdigita.com>
2513 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2514 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
2515 converted_file_ptr->next = converted_files;
2516 converted_files = converted_file_ptr;
2520 static int find_fragment PARAMS ((const char *, int, const char **,
2523 /* Replace an attribute's original text with NEW_TEXT. */
/* P points at the attribute value in the source buffer (quote included
   if present); SIZE is its length.  Writes the replacement to FP and
   returns the position in the source just past the old value.  Any
   "#fragment" found in the old value is preserved after NEW_TEXT. */
2526 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
2529 char quote_char = '\"'; /* use "..." for quoting, unless the
2530 original value is quoted, in which
2531 case reuse its quoting char. */
2532 const char *frag_beg, *frag_end;
2534 /* Structure of our string is:
2535 "...old-contents..."
2536 <--- size ---> (with quotes)
2539 <--- size --> (no quotes) */
2541 if (*p == '\"' || *p == '\'')
2546 size -= 2; /* disregard opening and closing quote */
2548 putc (quote_char, fp);
2549 fputs (new_text, fp);
2551 /* Look for fragment identifier, if any. */
2552 if (find_fragment (p, size, &frag_beg, &frag_end))
2553 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
2557 putc (quote_char, fp);
2562 /* The same as REPLACE_ATTR, but used when replacing
2563 <meta http-equiv=refresh content="new_text"> because we need to
2564 append "timeout_value; URL=" before the next_text. */
2567 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
2568 const char *new_text, int timeout)
/* Buffer sized from numdigit(timeout) plus the fixed "; URL=" text;
   alloca keeps it on the stack for the duration of the call. */
2571 char *new_with_timeout = (char *)alloca (numdigit (timeout)
2575 sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
2577 return replace_attr (p, size, fp, new_with_timeout);
2580 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
2581 preceded by '&'. If the character is not found, return zero. If
2582 the character is found, return 1 and set BP and EP to point to the
2583 beginning and end of the region.
2585 This is used for finding the fragment indentifiers in URLs. */
/* The "&" exclusion avoids mistaking '#' inside an HTML entity (e.g.
   "&#38;") for a fragment separator. */
2588 find_fragment (const char *beg, int size, const char **bp, const char **ep)
2590 const char *end = beg + size;
2592 for (; beg < end; beg++)
2614 /* Quote FILE for use as local reference to an HTML file.
2616 We quote ? as %3F to avoid passing part of the file name as the
2617 parameter when browsing the converted file through HTTP. However,
2618 it is safe to do this only when `--html-extension' is turned on.
2619 This is because converting "index.html?foo=bar" to
2620 "index.html%3Ffoo=bar" would break local browsing, as the latter
2621 isn't even recognized as an HTML file! However, converting
2622 "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
2623 safe for both local and HTTP-served browsing. */
2626 local_quote_string (const char *file)
2628 const char *file_sans_qmark;
2631 if (!opt.html_extension)
2632 return html_quote_string (file);
/* QM is the number of '?' occurrences; if zero, FILE is used as-is. */
2634 qm = count_char (file, '?');
2638 const char *from = file;
2641 /* qm * 2 because we replace each question mark with "%3F",
2642 i.e. replace one char with three, hence two more. */
2643 int fsqlen = strlen (file) + qm * 2;
2645 to = newname = (char *)alloca (fsqlen + 1);
2646 for (; *from; from++)
2657 assert (to - newname == fsqlen);
2660 file_sans_qmark = newname;
2663 file_sans_qmark = file;
/* Finally apply ordinary HTML quoting to the (possibly rewritten)
   name; the result is what gets written into the converted page. */
2665 return html_quote_string (file_sans_qmark);
2668 /* We're storing "modes" of type downloaded_file_t in the hash table.
2669 However, our hash tables only accept pointers for keys and values.
2670 So when we need a pointer, we use the address of a
2671 downloaded_file_t variable of static storage. */
2673 static downloaded_file_t *
2674 downloaded_mode_to_ptr (downloaded_file_t mode)
/* One static variable per enumerator gives each mode a stable address
   that outlives the call. */
2676 static downloaded_file_t
2677 v1 = FILE_NOT_ALREADY_DOWNLOADED,
2678 v2 = FILE_DOWNLOADED_NORMALLY,
2679 v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2680 v4 = CHECK_FOR_FILE;
2684 case FILE_NOT_ALREADY_DOWNLOADED:
2686 case FILE_DOWNLOADED_NORMALLY:
2688 case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2690 case CHECK_FOR_FILE:
2696 /* This should really be merged with dl_file_url_map and
2697 downloaded_html_files in recur.c. This was originally a list, but
2698 I changed it to a hash table beause it was actually taking a lot of
2699 time to find things in it. */
2701 static struct hash_table *downloaded_files_hash;
2703 /* Remembers which files have been downloaded. In the standard case, should be
2704 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2705 download successfully (i.e. not for ones we have failures on or that we skip
2708 When we've downloaded a file and tacked on a ".html" extension due to -E,
2709 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2710 FILE_DOWNLOADED_NORMALLY.
2712 If you just want to check if a file has been previously added without adding
2713 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
2714 with local filenames, not remote URLs. */
2716 downloaded_file (downloaded_file_t mode, const char *file)
2718 downloaded_file_t *ptr;
/* Pure query: never creates the hash table, never records anything. */
2720 if (mode == CHECK_FOR_FILE)
2722 if (!downloaded_files_hash)
2723 return FILE_NOT_ALREADY_DOWNLOADED;
2724 ptr = hash_table_get (downloaded_files_hash, file);
2726 return FILE_NOT_ALREADY_DOWNLOADED;
/* Recording path: create the table lazily on first use. */
2730 if (!downloaded_files_hash)
2731 downloaded_files_hash = make_string_hash_table (0);
2733 ptr = hash_table_get (downloaded_files_hash, file);
/* The key is a copy of FILE (xstrdup), owned by the hash table. */
2737 ptr = downloaded_mode_to_ptr (mode);
2738 hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2740 return FILE_NOT_ALREADY_DOWNLOADED;
/* Hash-map callback: frees one entry of downloaded_files_hash. */
2744 df_free_mapper (void *key, void *value, void *ignored)
/* Release the whole downloaded-files registry (entries, then table). */
2751 downloaded_files_free (void)
2753 if (downloaded_files_hash)
2755 hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2756 hash_table_destroy (downloaded_files_hash);
2757 downloaded_files_hash = NULL;
2761 /* Return non-zero if scheme a is similar to scheme b.
2763 Schemes are similar if they are equal. If SSL is supported, schemes
2764 are also similar if one is http (SCHEME_HTTP) and the other is https
2767 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
/* http and https count as similar in either order. */
2772 if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2773 || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2780 /* Debugging and testing support for path_simplify. */
2782 /* Debug: run path_simplify on PATH and return the result in a new
2783 string. Useful for calling from the debugger. */
/* Works on a copy so the caller's string is left untouched. */
2787 char *copy = xstrdup (path);
2788 path_simplify (copy);
/* Run one path_simplify test case: check both the resulting string and
   whether the function reported a modification. */
2793 run_test (char *test, char *expected_result, int expected_change)
2795 char *test_copy = xstrdup (test);
2796 int modified = path_simplify (test_copy);
2798 if (0 != strcmp (test_copy, expected_result))
2800 printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
2801 test, expected_result, test_copy);
2803 if (modified != expected_change)
2805 if (expected_change == 1)
2806 printf ("Expected no modification with path_simplify(\"%s\").\n",
2809 printf ("Expected modification with path_simplify(\"%s\").\n",
/* Exercise path_simplify against a table of input/expected pairs, both
   bare and with a leading '/' prepended (to verify the leading slash is
   preserved). */
2816 test_path_simplify (void)
2819 char *test, *result;
2825 { "foo", "foo", 0 },
2826 { "foo/bar", "foo/bar", 0 },
2827 { "foo///bar", "foo/bar", 1 },
2828 { "foo/.", "foo/", 1 },
2829 { "foo/./", "foo/", 1 },
2830 { "foo./", "foo./", 0 },
2831 { "foo/../bar", "bar", 1 },
2832 { "foo/../bar/", "bar/", 1 },
2833 { "foo/bar/..", "foo/", 1 },
2834 { "foo/bar/../x", "foo/x", 1 },
2835 { "foo/bar/../x/", "foo/x/", 1 },
2836 { "foo/..", "", 1 },
2837 { "foo/../..", "", 1 },
2838 { "a/b/../../c", "c", 1 },
2839 { "./a/../b", "b", 1 }
2843 for (i = 0; i < countof (tests); i++)
2845 char *test = tests[i].test;
2846 char *expected_result = tests[i].result;
2847 int expected_change = tests[i].should_modify;
2848 run_test (test, expected_result, expected_change);
2851 /* Now run all the tests with a leading slash before the test case,
2852 to prove that the slash is being preserved. */
2853 for (i = 0; i < countof (tests); i++)
2855 char *test, *expected_result;
2856 int expected_change = tests[i].should_modify;
2858 test = xmalloc (1 + strlen (tests[i].test) + 1);
2859 sprintf (test, "/%s", tests[i].test);
2861 expected_result = xmalloc (1 + strlen (tests[i].result) + 1);
2862 sprintf (expected_result, "/%s", tests[i].result);
2864 run_test (test, expected_result, expected_change);
2867 xfree (expected_result);