2 Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003
3 Free Software Foundation, Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or (at
10 your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 In addition, as a special exception, the Free Software Foundation
22 gives permission to link the code of its release of Wget with the
23 OpenSSL project's "OpenSSL" library (or with modified versions of it
24 that use the same license as the "OpenSSL" library), and distribute
25 the linked executables. You must obey the GNU General Public License
26 in all respects for all of the code used other than "OpenSSL". If you
27 modify this file, you may extend this exception to your version of the
28 file, but you are not obligated to do so. If you do not wish to do
29 so, delete this exception statement from your version. */
40 #include <sys/types.h>
64 /* Supported schemes: */
/* NOTE(review): listing elided here -- the initializer braces and the
   terminating sentinel entry are not visible.  Each visible entry is
   { leading_string, default_port, enabled }; the array index doubles
   as the enum url_scheme value (see url_scheme below).  */
65 static struct scheme_data supported_schemes[] =
67   { "http://", DEFAULT_HTTP_PORT, 1 },
69   { "https://", DEFAULT_HTTPS_PORT, 1 },
71   { "ftp://", DEFAULT_FTP_PORT, 1 },
77 /* Forward declarations: */
79 static char *construct_relative PARAMS ((const char *, const char *));
80 static int path_simplify PARAMS ((char *));
84 /* Support for encoding and decoding of URL strings. We determine
85 whether a character is unsafe through static table lookup. This
86 code assumes ASCII character set and 8-bit chars. */
89 /* rfc1738 reserved chars, preserved from encoding. */
92 /* rfc1738 unsafe chars, plus some more. */
/* Character-class tests over urlchr_table.  The cast to unsigned char
   keeps the table index non-negative even where plain char is signed.  */
96 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
97 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
98 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
100 /* Shorthands for the table: */
101 #define R  urlchr_reserved
102 #define U  urlchr_unsafe
/* Per-character reserved/unsafe classification for all 256 byte values.
   NOTE(review): `const static' is legal but the conventional order is
   `static const'.  The initializer's braces, the RU shorthand define,
   and the trailing #undefs are elided from this view.  */
105 const static unsigned char urlchr_table[256] =
107   U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
108   U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
109   U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
110   U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
111   U, 0, U, RU, 0, U, R, 0, /* SP ! " # $ % & ' */
112   0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
113   0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
114   0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
115   RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
116   0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
117   0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
118   0, 0, 0, RU, U, RU, U, 0, /* X Y Z [ \ ] ^ _ */
119   U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
120   0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
121   0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
122   0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
/* All high-bit (non-ASCII) bytes are treated as unsafe.  */
124   U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
125   U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
126   U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
127   U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
129   U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
130   U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
131   U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
132   U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
138 /* URL-unescape the string S.
140 This is done by transforming the sequences "%HH" to the character
141 represented by the hexadecimal digits HH. If % is not followed by
142 two hexadecimal digits, it is inserted literally.
144 The transformation is done in place. If you need the original
145 string intact, make a copy before calling this function. */
148 url_unescape (char *s)
/* In-place %HH decoding using a two-pointer compaction: H scans ahead,
   T writes decoded bytes behind it.  NOTE(review): the surrounding loop
   and the copy/advance statements are elided from this view.  */
150   char *t = s; /* t - tortoise */
151   char *h = s; /* h - hare */
162 	    /* Do nothing if '%' is not followed by two hex digits. */
163 	    if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
/* Valid %HH: write the decoded byte at the tortoise position.  */
165 	    *t = X2DIGITS_TO_NUM (h[1], h[2]);
172 /* The core of url_escape_* functions. Escapes the characters that
173 match the provided mask in urlchr_table.
175 If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
176 will be returned unchanged. If ALLOW_PASSTHROUGH is zero, a
177 freshly allocated string will be returned in all cases. */
180 url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
/* Two passes: first count how many chars match MASK (each costs two
   extra hex digits), then allocate once and write the escaped copy.
   NOTE(review): the second (copy) loop and declarations are elided.  */
187   for (p1 = s; *p1; p1++)
188     if (urlchr_test (*p1, mask))
189       addition += 2;		/* Two more characters (hex digits) */
/* Nothing to escape: either hand back S itself or a fresh copy,
   depending on ALLOW_PASSTHROUGH.  */
192     return allow_passthrough ? (char *)s : xstrdup (s);
194   newlen = (p1 - s) + addition;
195   newstr = (char *)xmalloc (newlen + 1);
201       /* Quote the characters that match the test mask. */
202       if (urlchr_test (*p1, mask))
204 	  unsigned char c = *p1++;
206 	  *p2++ = XNUM_TO_digit (c >> 4);
207 	  *p2++ = XNUM_TO_digit (c & 0xf);
/* Sanity check: the write pass produced exactly the counted length.  */
212   assert (p2 - newstr == newlen);
218 /* URL-escape the unsafe characters (see urlchr_table) in a given
219 string, returning a freshly allocated string. */
/* Always returns a freshly allocated escaped copy (passthrough off).  */
222 url_escape (const char *s)
224   return url_escape_1 (s, urlchr_unsafe, 0);
227 /* URL-escape the unsafe characters (see urlchr_table) in a given
228 string. If no characters are unsafe, S is returned. */
/* May return S itself when nothing needs escaping -- caller must
   compare the result against S before freeing.  */
231 url_escape_allow_passthrough (const char *s)
233   return url_escape_1 (s, urlchr_unsafe, 1);
236 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
238 /* Decide whether to encode, decode, or pass through the char at P.
239    This used to be a macro, but it got a little too convoluted.  */
240 static inline enum copy_method
241 decide_copy_method (const char *p)
/* NOTE(review): the '%' test and several return statements are elided.
   Visible logic: a well-formed %XX that would decode to an unsafe or
   reserved char is kept as-is, so escaping stays stable/idempotent.  */
245       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
247 	  /* %xx sequence: decode it, unless it would decode to an
248 	     unsafe or a reserved char; in that case, leave it as
250 	  char preempt = X2DIGITS_TO_NUM (*(p + 1), *(p + 2));
251 	  if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
252 	    return CM_PASSTHROUGH;
257 	  /* Garbled %.. sequence: encode `%'. */
/* A bare unsafe (but not reserved) char must be %-encoded.  */
260   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
263     return CM_PASSTHROUGH;
266 /* Translate a %-escaped (but possibly non-conformant) input string S
267 into a %-escaped (and conformant) output string. If no characters
268 are encoded or decoded, return the same string S; otherwise, return
269 a freshly allocated string with the new contents.
271 After a URL has been run through this function, the protocols that
272 use `%' as the quote character can use the resulting string as-is,
273 while those that don't call url_unescape() to get to the intended
274 data. This function is also stable: after an input string is
275 transformed the first time, all further transformations of the
276 result yield the same result string.
278 Let's discuss why this function is needed.
280 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
281 space character would mess up the HTTP request, it needs to be
284 GET /abc%20def HTTP/1.0
286 It appears that the unsafe chars need to be quoted, for example
287 with url_escape. But what if we're requested to download
288 `abc%20def'? url_escape transforms "%" to "%25", which would leave
289 us with `abc%2520def'. This is incorrect -- since %-escapes are
290 part of URL syntax, "%20" is the correct way to denote a literal
291    space on the Wget command line.  This leads us to the conclusion
292 that in that case Wget should not call url_escape, but leave the
295 And what if the requested URI is `abc%20 def'? If we call
296 url_escape, we end up with `/abc%2520%20def', which is almost
297 certainly not intended. If we don't call url_escape, we are left
298 with the embedded space and cannot complete the request. What the
299 user meant was for Wget to request `/abc%20%20def', and this is
300 where reencode_escapes kicks in.
302 Wget used to solve this by first decoding %-quotes, and then
303 encoding all the "unsafe" characters found in the resulting string.
304 This was wrong because it didn't preserve certain URL special
305 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
306 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
307 whether we considered `+' reserved (it is). One of these results
308 is inevitable because by the second step we would lose information
309 on whether the `+' was originally encoded or not. Both results
310 were wrong because in CGI parameters + means space, while %2B means
311 literal plus. reencode_escapes correctly translates the above to
312 "a%2B+b", i.e. returns the original string.
314 This function uses an algorithm proposed by Anon Sricharoenchai:
316 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
319 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
322 ...except that this code conflates the two steps, and decides
323 whether to encode, decode, or pass through each character in turn.
324 The function still uses two passes, but their logic is the same --
325 the first pass exists merely for the sake of allocation. Another
326 small difference is that we include `+' to URL_RESERVED.
330 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
332 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
336 "foo bar" -> "foo%20bar"
337 "foo%20bar" -> "foo%20bar"
338 "foo %20bar" -> "foo%20%20bar"
339 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
340 "foo%25%20bar" -> "foo%25%20bar"
341 "foo%2%20bar" -> "foo%252%20bar"
342 "foo+bar" -> "foo+bar" (plus is reserved!)
343 "foo%2b+bar" -> "foo%2b+bar" */
346 reencode_escapes (const char *s)
/* Normalize %-escapes per the long comment above: one counting pass to
   size the result, one pass to encode/decode/copy each char.
   NOTE(review): switch cases, loop headers, and the final copy of the
   terminating NUL are elided from this view.  */
352   int encode_count = 0;
353   int decode_count = 0;
355   /* First, pass through the string to see if there's anything to do,
356      and to calculate the new length.  */
357   for (p1 = s; *p1; p1++)
359       switch (decide_copy_method (p1))
/* Fast path: nothing to encode or decode, return the input itself.  */
372   if (!encode_count && !decode_count)
373     /* The string is good as it is. */
374     return (char *)s;		/* C const model sucks. */
377   /* Each encoding adds two characters (hex digits), while each
378      decoding removes two characters.  */
379   newlen = oldlen + 2 * (encode_count - decode_count);
380   newstr = xmalloc (newlen + 1);
387       switch (decide_copy_method (p1))
/* CM_ENCODE: emit %XX for the current char.  */
391 	    unsigned char c = *p1++;
393 	    *p2++ = XNUM_TO_DIGIT (c >> 4);
394 	    *p2++ = XNUM_TO_DIGIT (c & 0xf);
/* CM_DECODE: collapse a %XX triple to one byte.  */
398 	  *p2++ = X2DIGITS_TO_NUM (p1[1], p1[2]);
399 	  p1 += 3;		/* skip %xx */
406   assert (p2 - newstr == newlen);
410 /* Returns the scheme type if the scheme is supported, or
411 SCHEME_INVALID if not. */
413 url_scheme (const char *url)
/* Case-insensitive prefix match of URL against each supported scheme's
   leading string; the match index is the enum value.  A matching but
   disabled scheme also yields SCHEME_INVALID.  */
417   for (i = 0; supported_schemes[i].leading_string; i++)
418     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
419 			  strlen (supported_schemes[i].leading_string)))
421 	if (supported_schemes[i].enabled)
422 	  return (enum url_scheme) i;
424 	  return SCHEME_INVALID;
427   return SCHEME_INVALID;
430 /* Return the number of characters needed to skip the scheme part of
431 the URL, e.g. `http://'. If no scheme is found, returns 0. */
433 url_skip_scheme (const char *url)
/* NOTE(review): the ':' check and the return statements are elided.
   Visible logic scans scheme-name chars, then optionally "//".  */
437   /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
439   while (ISALNUM (*p) || *p == '-' || *p == '+')
446   /* Skip "//" if found. */
447   if (*p == '/' && *(p + 1) == '/')
453 /* Returns 1 if the URL begins with a scheme (supported or
454 unsupported), 0 otherwise. */
456 url_has_scheme (const char *url)
/* Same scheme-name alphabet as url_skip_scheme; the trailing ':' test
   and return are elided from this view.  */
459   while (ISALNUM (*p) || *p == '-' || *p == '+')
/* Default port for SCHEME, straight from the scheme table.  */
465 scheme_default_port (enum url_scheme scheme)
467   return supported_schemes[scheme].default_port;
/* Mark SCHEME as disabled; url_scheme will then report it invalid.  */
471 scheme_disable (enum url_scheme scheme)
473   supported_schemes[scheme].enabled = 0;
476 /* Skip the username and password, if present here. The function
477 should be called *not* with the complete URL, but with the part
478 right after the scheme.
480 If no username and password are found, return 0. */
482 url_skip_uname (const char *url)
486   /* Look for '@' that comes before '/' or '?'. */
/* strpbrk finds the first of the three; only a hit on '@' means a
   userinfo part is present.  NOTE(review): that test is elided.  */
487   p = (const char *)strpbrk (url, "/?@");
495 parse_uname (const char *str, int len, char **user, char **passwd)
/* Split "user[:passwd]" of length LEN into freshly allocated, already
   URL-unescaped *USER and *PASSWD.  NOTE(review): the early-return
   branches and success return are elided from this view.  */
500   /* Empty user name not allowed. */
503   colon = memchr (str, ':', len);
505       /* Empty user name again. */
/* Password is everything after the colon.  */
510       int pwlen = len - (colon + 1 - str);
511       *passwd = xmalloc (pwlen + 1);
512       memcpy (*passwd, colon + 1, pwlen);
513       (*passwd)[pwlen] = '\0';
519   *user = xmalloc (len + 1);
520   memcpy (*user, str, len);
/* The components may contain %-escapes; decode them in place.  */
524   url_unescape (*user);
526     url_unescape (*passwd);
531 /* Used by main.c: detect URLs written using the "shorthand" URL forms
532 popularized by Netscape and NcFTP. HTTP shorthands look like this:
534 www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
535 www.foo.com[:port] -> http://www.foo.com[:port]
537 FTP shorthands look like this:
539 foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
540 foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
542 If the URL needs not or cannot be rewritten, return NULL. */
544 rewrite_shorthand_url (const char *url)
/* NOTE(review): NULL returns and some braces are elided.  Heuristic:
   "host:digits..." is HTTP with a port, any other "host:rest" is the
   NcFTP shorthand, otherwise plain HTTP.  */
548   if (url_has_scheme (url))
551   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
553   for (p = url; *p && *p != ':' && *p != '/'; p++)
563       /* If the characters after the colon and before the next slash
564 	 or end of string are all digits, it's HTTP.  */
566       for (pp = p + 1; ISDIGIT (*pp); pp++)
568       if (digits > 0 && (*pp == '/' || *pp == '\0'))
571 	  /* Prepend "ftp://" to the entire URL... */
572 	  res = xmalloc (6 + strlen (url) + 1);
573 	  sprintf (res, "ftp://%s", url);
574 	  /* ...and replace ':' with '/'. */
575 	  res[6 + (p - url)] = '/';
582       /* Just prepend "http://" to what we have. */
583       res = xmalloc (7 + strlen (url) + 1);
584       sprintf (res, "http://%s", url);
589 static void parse_path PARAMS ((const char *, char **, char **));
591 /* Like strpbrk, with the exception that it returns the pointer to the
592 terminating zero (end-of-string aka "eos") if no matching character
595 Although I normally balk at Gcc-specific optimizations, it probably
596 makes sense here: glibc has optimizations that detect strpbrk being
597 called with literal string as ACCEPT and inline the search. That
598 optimization is defeated if strpbrk is hidden within the call to
599 another function. (And no, making strpbrk_or_eos inline doesn't
600 help because the check for literal accept is in the
/* strpbrk that returns the end-of-string pointer instead of NULL when
   nothing matches; macro form on GCC so glibc's literal-ACCEPT strpbrk
   optimization still applies (see comment above).  */
605 #define strpbrk_or_eos(s, accept) ({		\
606   char *SOE_p = strpbrk (s, accept);		\
608     SOE_p = (char *)s + strlen (s);		\
612 #else  /* not __GNUC__ */
615 strpbrk_or_eos (const char *s, const char *accept)
617   char *p = strpbrk (s, accept);
619     p = (char *)s + strlen (s);
624 /* Turn STR into lowercase; return non-zero if a character was
628 lowercase_str (char *str)
635 	*str = TOLOWER (*str);
/* NOTE(review): loop, change-tracking flag, and return are elided; per
   the doc comment above, the result is non-zero iff a char changed.  */
/* Error messages for url_parse, indexed by the PE_* codes defined
   interleaved below so code and message cannot drift apart.
   NOTE(review): some message strings and the closing brace are elided.  */
640 static char *parse_errors[] = {
641 #define PE_NO_ERROR			0
643 #define PE_UNSUPPORTED_SCHEME		1
644   "Unsupported scheme",
645 #define PE_EMPTY_HOST			2
647 #define PE_BAD_PORT_NUMBER		3
649 #define PE_INVALID_USER_NAME		4
651 #define PE_UNTERMINATED_IPV6_ADDRESS	5
652   "Unterminated IPv6 numeric address",
653 #define PE_IPV6_NOT_SUPPORTED		6
654   "IPv6 addresses not supported",
655 #define PE_INVALID_IPV6_ADDRESS	7
656   "Invalid IPv6 numeric address"
/* Store error code V through pointer P if P is non-NULL (body elided).  */
659 #define SETERR(p, v) do {			\
665 /* The following two functions were adapted from glibc. */
668 is_valid_ipv4_address (const char *str, const char *end)
/* Dotted-quad validator adapted from glibc inet_pton: at most four
   octets, each 0..255, digits separated by single dots.
   NOTE(review): the >255 check, octet counting, and return are elided.  */
670   int saw_digit, octets;
680       if (ch >= '0' && ch <= '9') {
681 	val = val * 10 + (ch - '0');
685 	if (saw_digit == 0) {
690       } else if (ch == '.' && saw_digit == 1) {
/* Buffer sizes from <arpa/nameser.h>: IPv4 addr, IPv6 addr, 16-bit word.  */
704 static const int NS_INADDRSZ  = 4;
705 static const int NS_IN6ADDRSZ = 16;
706 static const int NS_INT16SZ   = 2;
709 is_valid_ipv6_address (const char *str, const char *end)
/* IPv6 validator adapted from glibc inet_pton6; TP counts bytes the
   address would occupy, COLONP remembers where a "::" gap occurred.
   NOTE(review): the main loop header and many branches are elided.  */
711   static const char xdigits[] = "0123456789abcdef";
724   /* Leading :: requires some special handling. */
728       if (str == end || *str != ':')
740       /* if ch is a number, add it to val. */
741       pch = strchr(xdigits, ch);
744 	  val |= (pch - xdigits);
751       /* if ch is a colon ... */
754 	  if (saw_xdigit == 0) {
759 	  } else if (str == end) {
762 	  if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
770       /* if ch is a dot ... */
/* Trailing dotted-quad (e.g. ::ffff:1.2.3.4) consumes 4 bytes.  */
771       if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ) &&
772 	  is_valid_ipv4_address(curtok, end) == 1) {
781   if (saw_xdigit == 1) {
782       if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
/* A "::" must stand for at least one zero group; a full address with
   "::" present is therefore invalid.  */
787   if (colonp != NULL) {
788       if (tp == NS_IN6ADDRSZ)
793   if (tp != NS_IN6ADDRSZ)
802 Return a new struct url if successful, NULL on error. In case of
803 error, and if ERROR is not NULL, also set *ERROR to the appropriate
806 url_parse (const char *url, int *error)
/* Parse URL into a freshly allocated struct url, or return NULL and
   set *ERROR to a PE_* code.  NOTE(review): this view elides many
   lines (returns, brace structure, *_b/*_e assignments); comments
   below describe only what is visible.  */
810   int path_modified, host_modified;
812   enum url_scheme scheme;
814   const char *uname_b, *uname_e;
815   const char *host_b, *host_e;
816   const char *path_b, *path_e;
817   const char *params_b, *params_e;
818   const char *query_b, *query_e;
819   const char *fragment_b, *fragment_e;
822   char *user = NULL, *passwd = NULL;
826   scheme = url_scheme (url);
827   if (scheme == SCHEME_INVALID)
829       SETERR (error, PE_UNSUPPORTED_SCHEME);
/* Work on a normalized copy; url_encoded == url when reencode_escapes
   had nothing to do (see ownership handling at the bottom).  */
833   url_encoded = reencode_escapes (url);
836   p += strlen (supported_schemes[scheme].leading_string);
838   p += url_skip_uname (p);
841   /* scheme://user:pass@host[:port]... */
844   /* We attempt to break down the URL into the components path,
845      params, query, and fragment.  They are ordered like this:
847        scheme://host[:port][/path][;params][?query][#fragment]  */
849   params_b   = params_e   = NULL;
850   query_b    = query_e    = NULL;
851   fragment_b = fragment_e = NULL;
857       /* Handle IPv6 address inside square brackets.  Ideally we'd
858 	 just look for the terminating ']', but rfc2732 mandates
859 	 rejecting invalid IPv6 addresses.  */
861       /* The address begins after '['. */
863       host_e = strchr (host_b, ']');
867 	  SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS);
872       /* Check if the IPv6 address is valid. */
873       if (!is_valid_ipv6_address(host_b, host_e))
875 	  SETERR (error, PE_INVALID_IPV6_ADDRESS);
879       /* Continue parsing after the closing ']'. */
/* Presumably reached when IPv6 support is compiled out -- confirm.  */
882       SETERR (error, PE_IPV6_NOT_SUPPORTED);
888       p = strpbrk_or_eos (p, ":/;?#");
892   if (host_b == host_e)
894       SETERR (error, PE_EMPTY_HOST);
898   port = scheme_default_port (scheme);
901       const char *port_b, *port_e, *pp;
903       /* scheme://host:port/tralala */
907       p = strpbrk_or_eos (p, "/;?#");
910       if (port_b == port_e)
912 	  /* http://host:/whatever */
914 	  SETERR (error, PE_BAD_PORT_NUMBER);
/* Port must be all digits; accumulate its decimal value.  */
918       for (port = 0, pp = port_b; pp < port_e; pp++)
922 	      /* http://host:12randomgarbage/blah */
924 	      SETERR (error, PE_BAD_PORT_NUMBER);
928 	  port = 10 * port + (*pp - '0');
/* Delimit path, params, query, fragment in turn; each strpbrk_or_eos
   stops at the next component's separator or at end of string.  */
936       p = strpbrk_or_eos (p, ";?#");
941   /* Path is not allowed not to exist. */
949       p = strpbrk_or_eos (p, "?#");
956       p = strpbrk_or_eos (p, "#");
959   /* Hack that allows users to use '?' (a wildcard character) in
960      FTP URLs without it being interpreted as a query string
962   if (scheme == SCHEME_FTP)
964       query_b = query_e = NULL;
977   if (uname_b != uname_e)
979       /* http://user:pass@host */
981          uname_b       uname_e */
/* The "- 1" excludes the trailing '@' from the userinfo span.  */
982       if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
984 	  SETERR (error, PE_INVALID_USER_NAME);
/* All components validated; build the result object.  */
989   u = (struct url *)xmalloc (sizeof (struct url));
990   memset (u, 0, sizeof (*u));
993   u->host = strdupdelim (host_b, host_e);
998   u->path = strdupdelim (path_b, path_e);
999   path_modified = path_simplify (u->path);
1000   parse_path (u->path, &u->dir, &u->file);
1002   host_modified = lowercase_str (u->host);
1005     u->params = strdupdelim (params_b, params_e);
1007     u->query = strdupdelim (query_b, query_e);
1009     u->fragment = strdupdelim (fragment_b, fragment_e);
1011   if (path_modified || u->fragment || host_modified || path_b == path_e)
1013       /* If we suspect that a transformation has rendered what
1014          url_string might return different from URL_ENCODED, rebuild
1015          u->url using url_string.  */
1016       u->url = url_string (u, 0);
1018       if (url_encoded != url)
1019 	xfree ((char *) url_encoded);
/* Otherwise u->url takes ownership of url_encoded (or a copy of URL
   when reencode_escapes returned the input unchanged).  */
1023       if (url_encoded == url)
1024 	u->url = xstrdup (url);
1026 	u->url = url_encoded;
/* Map a PE_* code from url_parse to its human-readable message.  */
1034 url_error (int error_code)
1036   assert (error_code >= 0 && error_code < countof (parse_errors));
1037   return parse_errors[error_code];
1040 /* Parse PATH into dir and file. PATH is extracted from the URL and
1041 is URL-escaped. The function returns unescaped DIR and FILE. */
1044 parse_path (const char *path, char **dir, char **file)
/* Split at the last '/': no slash means empty dir and the whole PATH
   as file.  Both outputs are freshly allocated and URL-unescaped.  */
1048   last_slash = strrchr (path, '/');
1051       *dir = xstrdup ("");
1052       *file = xstrdup (path);
1056       *dir = strdupdelim (path, last_slash);
1057       *file = xstrdup (last_slash + 1);
1059   url_unescape (*dir);
1060   url_unescape (*file);
1063 /* Note: URL's "full path" is the path with the query string and
1064 params appended. The "fragment" (#foo) is intentionally ignored,
1065 but that might be changed. For example, if the original URL was
1066 "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
1067 the full path will be "/foo/bar/baz;bullshit?querystring". */
1069 /* Return the length of the full path, without the terminating
1073 full_path_length (const struct url *url)
/* Each present component costs its length plus one separator char
   ('/', ';' or '?').  NOTE(review): FROB invocations are elided.  */
1077 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
1088 /* Write out the full path. */
1091 full_path_write (const struct url *url, char *where)
/* Writes path/params/query (no NUL) into WHERE, which must hold at
   least full_path_length(url) bytes.  CHR is the component's leading
   separator.  NOTE(review): the FROB invocations are elided.  */
1093 #define FROB(el, chr) do {			\
1094   char *f_el = url->el;				\
1096       int l = strlen (f_el);			\
1098       memcpy (where, f_el, l);			\
1110 /* Public function for getting the "full path". E.g. if u->path is
1111 "foo/bar" and u->query is "param=value", full_path will be
1112 "/foo/bar?param=value". */
1115 url_full_path (const struct url *url)
/* Allocate exactly, fill via full_path_write, then NUL-terminate.  */
1117   int length = full_path_length (url);
1118   char *full_path = (char *)xmalloc(length + 1);
1120   full_path_write (url, full_path);
1121   full_path[length] = '\0';
1126 /* Escape unsafe and reserved characters, except for the slash
1130 url_escape_dir (const char *dir)
/* Escape like sync_path needs, but keep '/' literal: escape everything,
   then turn any "%2F" back into '/'.  NOTE(review): the compaction loop
   body and the early return are elided from this view.  */
1132   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1137   /* Unescape slashes in NEWDIR. */
1139   h = newdir;			/* hare */
1140   t = newdir;			/* tortoise */
1142   for (; *h; h++, t++)
1144       if (*h == '%' && h[1] == '2' && h[2] == 'F')
1157 /* Sync u->path and u->url with u->dir and u->file. Called after
1158 u->file or u->dir have been changed, typically by the FTP code. */
1161 sync_path (struct url *u)
/* Rebuild u->path (and u->url) from u->dir and u->file after one of
   them was mutated.  NOTE(review): frees of the old path/url and the
   passthrough-aware frees of edir are elided from this view.  */
1163   char *newpath, *efile, *edir;
1167   /* u->dir and u->file are not escaped.  URL-escape them before
1168      reassembling them into u->path.  That way, if they contain
1169      separators like '?' or even if u->file contains slashes, the
1170      path will be correctly assembled.  (u->file can contain slashes
1171      if the URL specifies it with %2f, or if an FTP server returns
1173   edir = url_escape_dir (u->dir);
1174   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
/* Empty dir: path is just the file.  */
1177     newpath = xstrdup (efile);
1180       int dirlen = strlen (edir);
1181       int filelen = strlen (efile);
1183       /* Copy "DIR/FILE" to newpath. */
1184       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1185       memcpy (p, edir, dirlen);
1188       memcpy (p, efile, filelen);
/* efile may be u->file itself (passthrough); only free a fresh copy.  */
1197   if (efile != u->file)
1200   /* Regenerate u->url as well.  */
1202   u->url = url_string (u, 0);
1205 /* Mutators. Code in ftp.c insists on changing u->dir and u->file.
1206 This way we can sync u->path and u->url when they get changed. */
/* Replace u->dir with a copy of NEWDIR; sync_path call is elided.  */
1209 url_set_dir (struct url *url, const char *newdir)
1212   url->dir = xstrdup (newdir);
/* Replace u->file with a copy of NEWFILE; sync_path call is elided.  */
1217 url_set_file (struct url *url, const char *newfile)
1220   url->file = xstrdup (newfile);
1225 url_free (struct url *url)
/* Release every owned component; FREE_MAYBE handles NULL members.
   NOTE(review): frees of host/path/url/dir/file and of URL itself are
   elided from this view.  */
1231   FREE_MAYBE (url->params);
1232   FREE_MAYBE (url->query);
1233   FREE_MAYBE (url->fragment);
1234   FREE_MAYBE (url->user);
1235   FREE_MAYBE (url->passwd);
1244 get_urls_file (const char *file)
/* Read FILE, parse one URL per line (blank lines skipped), and return
   the head of a linked list of struct urlpos.  NOTE(review): list
   linking, cleanup on parse failure, and the return are elided.  */
1246   struct file_memory *fm;
1247   struct urlpos *head, *tail;
1248   const char *text, *text_end;
1250   /* Load the file.  */
1251   fm = read_file (file);
1254       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
1257   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
1261   text_end = fm->content + fm->length;
1262   while (text < text_end)
1264       const char *line_beg = text;
1265       const char *line_end = memchr (text, '\n', text_end - text);
/* Last line may lack a newline; treat EOF as the line end.  */
1267 	line_end = text_end;
1272       /* Strip whitespace from the beginning and end of line. */
1273       while (line_beg < line_end && ISSPACE (*line_beg))
1275       while (line_end > line_beg && ISSPACE (*(line_end - 1)))
1278       if (line_end > line_beg)
1280 	  /* URL is in the [line_beg, line_end) region. */
1284 	  struct urlpos *entry;
1287 	  /* We must copy the URL to a zero-terminated string, and we
1288 	     can't use alloca because we're in a loop.  *sigh*.  */
1289 	  url_text = strdupdelim (line_beg, line_end);
1293 	      /* Merge opt.base_href with URL. */
1294 	      char *merged = uri_merge (opt.base_href, url_text);
1299 	  url = url_parse (url_text, &up_error_code);
1302 	      logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1303 			 file, url_text, url_error (up_error_code));
1309 	  entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1310 	  memset (entry, 0, sizeof (*entry));
1321   read_file_free (fm);
1325 /* Free the linked list of urlpos. */
1327 free_urlpos (struct urlpos *l)
/* Walk the list, saving ->next before each node is released.
   NOTE(review): the loop header and remaining frees are elided.  */
1331       struct urlpos *next = l->next;
1334       FREE_MAYBE (l->local_name);
1340 /* Rotate FNAME opt.backups times */
1342 rotate_backups(const char *fname)
/* Shift fname.1 -> fname.2 -> ... up to opt.backups, then rename fname
   to fname.1.  MAXLEN covers name + '.' + widest suffix + NUL.
   NOTE(review): the rename() calls themselves are elided.  */
1344   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1345   char *from = (char *)alloca (maxlen);
1346   char *to = (char *)alloca (maxlen);
/* Only rotate regular files; anything else is left alone.  */
1350   if (stat (fname, &sb) == 0)
1351     if (S_ISREG (sb.st_mode) == 0)
1354   for (i = opt.backups; i > 1; i--)
1356       sprintf (from, "%s.%d", fname, i - 1);
1357       sprintf (to, "%s.%d", fname, i);
1361   sprintf (to, "%s.%d", fname, 1);
1365 /* Create all the necessary directories for PATH (a file). Calls
1366 mkdirhier() internally. */
1368 mkalldirs (const char *path)
/* Create every directory component of PATH (a file name).
   NOTE(review): several returns and the free of T are elided.  */
1375   p = path + strlen (path);
/* Back up to the last '/' to isolate the directory part.  */
1376   for (; *p != '/' && p != path; p--)
1379   /* Don't create if it's just a file. */
1380   if ((p == path) && (*p != '/'))
1382   t = strdupdelim (path, p);
1384   /* Check whether the directory exists. */
1385   if ((stat (t, &st) == 0))
1387       if (S_ISDIR (st.st_mode))
1394 	  /* If the dir exists as a file name, remove it first.  This
1395 	     is *only* for Wget to work with buggy old CERN http
1396 	     servers.  Here is the scenario: When Wget tries to
1397 	     retrieve a directory without a slash, e.g.
1398 	     http://foo/bar (bar being a directory), CERN server will
1399 	     not redirect it to http://foo/bar/ -- it will generate a
1400 	     directory listing containing links to bar/file1,
1401 	     bar/file2, etc.  Wget will lose because it saves this
1402 	     HTML listing to a file `bar', so it cannot create the
1403 	     directory.  To work around this, if the file of the same
1404 	     name exists, we just remove it and create the directory
1406 	  DEBUGP (("Removing %s because of directory danger!\n", t));
1410   res = make_directory (t);
1412     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1417 /* Functions for constructing the file name out of URL components. */
1419 /* A growable string structure, used by url_file_name and friends.
1420 This should perhaps be moved to utils.c.
1422 The idea is to have a convenient and efficient way to construct a
1423 string by having various functions append data to it. Instead of
1424 passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
1425 functions in questions, we pass the pointer to this struct. */
1433 /* Ensure that the string can accept APPEND_COUNT more characters past
1434 the current TAIL position. If necessary, this will grow the string
1435 and update its allocated size. If the string is already large
1436 enough to take TAIL+APPEND_COUNT characters, this does nothing. */
1437 #define GROW(g, append_size) do {					\
1438   struct growable *G_ = g;						\
1439   DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char);	\
1442 /* Return the tail position of the string. */
1443 #define TAIL(r) ((r)->base + (r)->tail)
1445 /* Move the tail position by APPEND_COUNT characters. */
1446 #define TAIL_INCR(r, append_count) ((r)->tail += append_count)
1448 /* Append the string STR to DEST. NOTICE: the string in DEST is not
1452 append_string (const char *str, struct growable *dest)
/* Grow DEST (the GROW call is elided here), copy STR's bytes without
   the NUL, and advance the tail.  */
1454   int l = strlen (str);
1456   memcpy (TAIL (dest), str, l);
1457   TAIL_INCR (dest, l);
1460 /* Append CH to DEST. For example, append_char (0, DEST)
1461 zero-terminates DEST. */
1464 append_char (char ch, struct growable *dest)
/* The GROW call and the store of CH at the tail are elided here.  */
1468   TAIL_INCR (dest, 1);
/* Bit masks classifying why a character is unusable in a file name;
   combined per-byte in filechr_table below.  */
1472   filechr_not_unix    = 1,	/* unusable on Unix, / and \0 */
1473   filechr_not_windows = 2,	/* unusable on Windows, one of \|/<>?:*" */
1474   filechr_control     = 4,	/* a control character, e.g. 0-31 */
1477 #define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))
1479 /* Shorthands for the table: */
1480 #define U filechr_not_unix
1481 #define W filechr_not_windows
1482 #define C filechr_control
1487 /* Table of characters unsafe under various conditions (see above).
1489 Arguably we could also claim `%' to be unsafe, since we use it as
1490 the escape character. If we ever want to be able to reliably
1491 translate file name back to URL, this would become important
1492    crucial.  Right now, it's better to be minimal in escaping.  */
/* NOTE(review): `const static' would conventionally be `static const';
   the initializer braces, the UW/UWC shorthands, and the closing `};'
   are elided from this view.  */
1494 const static unsigned char filechr_table[256] =
1496 UWC, C, C, C, C, C, C, C, /* NUL SOH STX ETX EOT ENQ ACK BEL */
1497 C, C, C, C, C, C, C, C, /* BS HT LF VT FF CR SO SI */
1498 C, C, C, C, C, C, C, C, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
1499 C, C, C, C, C, C, C, C, /* CAN EM SUB ESC FS GS RS US */
1500 0, 0, W, 0, 0, 0, 0, 0, /* SP ! " # $ % & ' */
1501 0, 0, W, 0, 0, 0, 0, UW, /* ( ) * + , - . / */
1502 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
1503 0, 0, W, 0, W, 0, W, W, /* 8 9 : ; < = > ? */
1504 0, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
1505 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
1506 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
1507 0, 0, 0, 0, W, 0, 0, 0, /* X Y Z [ \ ] ^ _ */
1508 0, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
1509 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
1510 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
1511 0, 0, 0, 0, 0, 0, 0, 0, /* x y z { | } ~ DEL */
/* High-bit bytes 128-159 are treated as control chars; 160-255 pass.  */
1513 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 128-143 */
1514 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 144-159 */
1515 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1516 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1518 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1519 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1520 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1521 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1529 /* FN_PORT_SEP is the separator between host and port in file names
1530 for non-standard port numbers. On Unix this is normally ':', as in
1531 "www.xemacs.org:4001/index.html". Under Windows, we set it to +
1532 because Windows can't handle ':' in file names. */
1533 #define FN_PORT_SEP  (opt.restrict_files_os != restrict_windows ? ':' : '+')
1535 /* FN_QUERY_SEP is the separator between the file name and the URL
1536    query, normally '?'.  Since Windows cannot handle '?' as part of
1537    file name, we use '@' instead there.  */
1538 #define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
1540 /* Quote path element, characters in [b, e), as file name, and append
1541 the quoted string to DEST. Each character is quoted as per
1542 file_unsafe_char and the corresponding table. */
1545 append_uri_pathel (const char *b, const char *e, struct growable *dest)
/* Unescape the path element [b, e), then re-escape (as %XX) exactly the
   characters unsafe for the target OS per filechr_table, appending the
   result to DEST.  NOTE(review): declarations and some braces elided.  */
1554   if (opt.restrict_files_os == restrict_unix)
1555     mask = filechr_not_unix;
1557     mask = filechr_not_windows;
1558   if (opt.restrict_files_ctrl)
1559     mask |= filechr_control;
1561   /* Copy [b, e) to PATHEL and URL-unescape it. */
1562   BOUNDED_TO_ALLOCA (b, e, pathel);
1563   url_unescape (pathel);
1564   pathlen = strlen (pathel);
1566   /* Go through PATHEL and check how many characters we'll need to
1567      add for file quoting.  */
1569   for (p = pathel; *p; p++)
1570     if (FILE_CHAR_TEST (*p, mask))
1573   /* p - pathel is the string length.  Each quoted char means two
1574      additional characters in the string, hence 2*quoted.  */
1575   outlen = (p - pathel) + (2 * quoted);
1576   GROW (dest, outlen);
1580       /* If there's nothing to quote, we don't need to go through the
1581 	 string the second time.  */
1582       memcpy (TAIL (dest), pathel, outlen);
1586       char *q = TAIL (dest);
1587       for (p = pathel; *p; p++)
1589 	  if (!FILE_CHAR_TEST (*p, mask))
1593 	      unsigned char ch = *p;
1595 	      *q++ = XNUM_TO_DIGIT (ch >> 4);
1596 	      *q++ = XNUM_TO_DIGIT (ch & 0xf);
/* Both branches must have produced exactly OUTLEN bytes.  */
1599       assert (q - TAIL (dest) == outlen);
1601   TAIL_INCR (dest, outlen);
1604 /* Append to DEST the directory structure that corresponds the
1605 directory part of URL's path. For example, if the URL is
1606 http://server/dir1/dir2/file, this appends "/dir1/dir2".
1608 Each path element ("dir1" and "dir2" in the above example) is
1609 examined, url-unescaped, and re-escaped as file name element.
1611 Additionally, it cuts as many directories from the path as
1612 specified by opt.cut_dirs. For example, if opt.cut_dirs is 1, it
1613 will produce "bar" for the above example. For 2 or more, it will
1616 Each component of the path is quoted for use as file name. */
1619 append_dir_structure (const struct url *u, struct growable *dest)
1621 char *pathel, *next;
  /* CUT counts how many leading path components to drop (--cut-dirs).  */
1622 int cut = opt.cut_dirs;
1624 /* Go through the path components, de-URL-quote them, and quote them
1625 (if necessary) as file names. */
  /* Iterate over '/'-separated components of the path; NEXT is the
     terminating slash of the current component.  */
1628 for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1633 /* Ignore empty pathels. path_simplify should remove
1634 occurrences of "//" from the path, but it has special cases
1635 for starting / which generates an empty pathel here. */
1639 append_char ('/', dest);
1640 append_uri_pathel (pathel, next, dest);
1644 /* Return a unique file name that matches the given URL as good as
1645 possible. Does not create directories on the file system. */
1648 url_file_name (const struct url *u)
1650 struct growable fnres;
1652 char *u_file, *u_query;
1653 char *fname, *unique;
1659 /* Start with the directory prefix, if specified. */
1661 append_string (opt.dir_prefix, &fnres);
1663 /* If "dirstruct" is turned on (typically the case with -r), add
1664 the host and port (unless those have been turned off) and
1665 directory structure. */
1668 if (opt.add_hostdir)
1671 append_char ('/', &fnres);
1672 append_string (u->host, &fnres);
  /* Only append the port when it is non-default, separated by
     FN_PORT_SEP (':' on Unix, '+' on Windows).  */
1673 if (u->port != scheme_default_port (u->scheme))
1676 number_to_string (portstr, u->port);
1677 append_char (FN_PORT_SEP, &fnres);
1678 append_string (portstr, &fnres);
1682 append_dir_structure (u, &fnres);
1685 /* Add the file name. */
1687 append_char ('/', &fnres);
  /* An empty path component maps to the conventional "index.html".  */
1688 u_file = *u->file ? u->file : "index.html";
1689 append_uri_pathel (u_file, u_file + strlen (u_file), &fnres);
1691 /* Append "?query" to the file name. */
1692 u_query = u->query && *u->query ? u->query : NULL;
1695 append_char (FN_QUERY_SEP, &fnres);
1696 append_uri_pathel (u_query, u_query + strlen (u_query), &fnres);
1699 /* Zero-terminate the file name. */
1700 append_char ('\0', &fnres);
1704 /* Check the cases in which the unique extensions are not used:
1705 1) Clobbering is turned off (-nc).
1706 2) Retrieval with regetting.
1707 3) Timestamping is used.
1708 4) Hierarchy is built.
1710 The exception is the case when file does exist and is a
1711 directory (see `mkalldirs' for explanation). */
1713 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1714 && !(file_exists_p (fname) && !file_non_directory_p (fname)))
  /* Otherwise derive a unique variant (e.g. "file.1") of FNAME.  */
1717 unique = unique_name (fname, 1);
1718 if (unique != fname)
1723 /* Return the length of URL's path. Path is considered to be
1724 terminated by one of '?', ';', '#', or by the end of the
1727 path_length (const char *url)
1729 const char *q = strpbrk_or_eos (url, "?;#");
  /* NOTE(review): the return statement is elided in this excerpt;
     presumably it returns q - url -- confirm against the full source. */
1733 /* Find the last occurrence of character C in the range [b, e), or
1734 NULL, if none are present. This is equivalent to strrchr(b, c),
1735 except that it accepts an END argument instead of requiring the
1736 string to be zero-terminated. Why is there no memrchr()? */
1738 find_last_char (const char *b, const char *e, char c)
  /* NOTE(review): the loop body is elided in this excerpt; the contract
     above (backwards scan over [b, e)) is what callers rely on.  */
1746 /* Resolve "." and ".." elements of PATH by destructively modifying
1747 PATH. "." is resolved by removing that path element, and ".." is
1748 resolved by removing the preceding path element. Leading and
1749 trailing slashes are preserved.
1751 Return non-zero if any changes have been made.
1753 For example, "a/b/c/./../d/.." will yield "a/b/". More exhaustive
1754 test examples are provided below. If you change anything in this
1755 function, run test_path_simplify to make sure you haven't broken a
1758 A previous version of this function was based on path_simplify()
1759 from GNU Bash, but it has been rewritten for Wget 1.8.1. */
static int
path_simplify (char *path)
{
  char *p, *end;
  int change = 0;

  if (*path == '/')
    ++path;			/* preserve the leading '/'. */

  p = path;
  end = p + strlen (p) + 1;	/* position past the terminating zero. */

  while (1)
    {
      /* P should point to the beginning of a path element. */

      if (*p == '.' && (*(p + 1) == '/' || *(p + 1) == '\0'))
	{
	  /* Handle "./foo" by moving "foo" two characters to the
	     left.  Note the length: copy from P+2 up to and including
	     the terminating zero, i.e. END - (P + 2) bytes.  (The
	     previous code used END - P, which read two bytes past the
	     end of the string.)  */
	  if (*(p + 1) == '/')
	    {
	      memmove (p, p + 2, end - (p + 2));
	      end -= 2;
	    }
	  else
	    {
	      /* Trailing "." -- just truncate the path here.  */
	      *p = '\0';
	      end = p + 1;
	    }
	  change = 1;
	  continue;
	}
      else if (*p == '.' && *(p + 1) == '.'
	       && (*(p + 2) == '/' || *(p + 2) == '\0'))
	{
	  /* Handle "../foo" by moving "foo" one path element to the
	     left.  */
	  char *b = p;		/* not p-1 because P can equal PATH */

	  /* Backtrack by one path element, but not past the beginning
	     of PATH.  */
	  if (b > path)
	    {
	      /* Move backwards until B hits the beginning of the
		 previous path element or the beginning of path.  */
	      for (--b; b > path && *(b - 1) != '/'; b--)
		;
	    }

	  if (*(p + 2) == '/')
	    {
	      memmove (b, p + 3, end - (p + 3));
	      end -= (p + 3) - b;
	      p = b;
	    }
	  else
	    {
	      /* Trailing ".." -- truncate at the backtracked point.  */
	      *b = '\0';
	      end = b + 1;
	    }
	  change = 1;
	  continue;
	}
      else if (*p == '/')
	{
	  /* Remove empty path elements.  Not mandated by rfc1808 et
	     al, but it seems like a good idea to get rid of them.
	     Supporting them properly is hard (in which directory do
	     you save http://x.com///y.html?) and they don't seem to
	     bring any gain.  */
	  char *q = p;
	  while (*q == '/')
	    ++q;
	  if (*q == '\0')
	    {
	      /* Only slashes remain: keep a single trailing slash.  */
	      *p = '\0';
	      end = p + 1;
	    }
	  else
	    {
	      memmove (p, q, end - q);
	      end -= q - p;
	    }
	  change = 1;
	  continue;
	}

      /* Skip to the next path element. */
      while (*p && *p != '/')
	++p;
      if (*p == '\0')
	break;

      /* Make sure P points to the beginning of the next path element,
	 which is location after the slash. */
      ++p;
    }

  return change;
}
1868 /* Resolve the result of "linking" a base URI (BASE) to a
1869 link-specified URI (LINK).
1871 Either of the URIs may be absolute or relative, complete with the
1872 host name, or path only. This tries to behave "reasonably" in all
1873 foreseeable cases. It employs little specific knowledge about
1874 schemes or URL-specific stuff -- it just works on strings.
1876 The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1877 See uri_merge for a gentler interface to this functionality.
1879 Perhaps this function should call path_simplify so that the callers
1880 don't have to call url_parse unconditionally. */
1882 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
  /* END marks the end of BASE's path (terminated by '?', ';', '#' or
     NUL).  The branches below dispatch on LINK's first character(s).  */
1888 const char *end = base + path_length (base);
1892 /* Empty LINK points back to BASE, query string and all. */
1893 constr = xstrdup (base);
1895 else if (*link == '?')
1897 /* LINK points to the same location, but changes the query
1898 string. Examples: */
1899 /* uri_merge("path", "?new") -> "path?new" */
1900 /* uri_merge("path?foo", "?new") -> "path?new" */
1901 /* uri_merge("path?foo#bar", "?new") -> "path?new" */
1902 /* uri_merge("path#foo", "?new") -> "path?new" */
1903 int baselength = end - base;
1904 constr = xmalloc (baselength + linklength + 1);
1905 memcpy (constr, base, baselength);
1906 memcpy (constr + baselength, link, linklength);
1907 constr[baselength + linklength] = '\0';
1909 else if (*link == '#')
1911 /* uri_merge("path", "#new") -> "path#new" */
1912 /* uri_merge("path#foo", "#new") -> "path#new" */
1913 /* uri_merge("path?foo", "#new") -> "path?foo#new" */
1914 /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
  /* Unlike the '?' case, the query string of BASE is kept, so the cut
     point is BASE's own '#' (or the end of BASE).  */
1916 const char *end1 = strchr (base, '#');
1918 end1 = base + strlen (base);
1919 baselength = end1 - base;
1920 constr = xmalloc (baselength + linklength + 1);
1921 memcpy (constr, base, baselength);
1922 memcpy (constr + baselength, link, linklength);
1923 constr[baselength + linklength] = '\0';
1925 else if (linklength > 1 && *link == '/' && *(link + 1) == '/')
1927 /* LINK begins with "//" and so is a net path: we need to
1928 replace everything after (and including) the double slash
1931 /* uri_merge("foo", "//new/bar") -> "//new/bar" */
1932 /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */
1933 /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
1937 const char *start_insert;
1939 /* Look for first slash. */
1940 slash = memchr (base, '/', end - base);
1941 /* If found slash and it is a double slash, then replace
1942 from this point, else default to replacing from the
1944 if (slash && *(slash + 1) == '/')
1945 start_insert = slash;
1947 start_insert = base;
1949 span = start_insert - base;
1950 constr = (char *)xmalloc (span + linklength + 1);
1952 memcpy (constr, base, span);
1953 memcpy (constr + span, link, linklength);
1954 constr[span + linklength] = '\0';
1956 else if (*link == '/')
1958 /* LINK is an absolute path: we need to replace everything
1959 after (and including) the FIRST slash with LINK.
1961 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1962 "/qux/xyzzy", our result should be
1963 "http://host/qux/xyzzy". */
1966 const char *start_insert = NULL; /* for gcc to shut up. */
1967 const char *pos = base;
1968 int seen_slash_slash = 0;
1969 /* We're looking for the first slash, but want to ignore
1972 slash = memchr (pos, '/', end - pos);
1973 if (slash && !seen_slash_slash)
1974 if (*(slash + 1) == '/')
1977 seen_slash_slash = 1;
1981 /* At this point, SLASH is the location of the first / after
1982 "//", or the first slash altogether. START_INSERT is the
1983 pointer to the location where LINK will be inserted. When
1984 examining the last two examples, keep in mind that LINK
1987 if (!slash && !seen_slash_slash)
1988 /* example: "foo" */
1990 start_insert = base;
1991 else if (!slash && seen_slash_slash)
1992 /* example: "http://foo" */
1995 else if (slash && !seen_slash_slash)
1996 /* example: "foo/bar" */
1998 start_insert = base;
1999 else if (slash && seen_slash_slash)
2000 /* example: "http://something/" */
2002 start_insert = slash;
2004 span = start_insert - base;
2005 constr = (char *)xmalloc (span + linklength + 1);
2007 memcpy (constr, base, span);
2009 memcpy (constr + span, link, linklength);
2010 constr[span + linklength] = '\0';
2014 /* LINK is a relative URL: we need to replace everything
2015 after last slash (possibly empty) with LINK.
2017 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
2018 our result should be "whatever/foo/qux/xyzzy". */
2019 int need_explicit_slash = 0;
2021 const char *start_insert;
2022 const char *last_slash = find_last_char (base, end, '/');
2025 /* No slash found at all. Append LINK to what we have,
2026 but we'll need a slash as a separator.
2028 Example: if base == "foo" and link == "qux/xyzzy", then
2029 we cannot just append link to base, because we'd get
2030 "fooqux/xyzzy", whereas what we want is
2033 To make sure the / gets inserted, we set
2034 need_explicit_slash to 1. We also set start_insert
2035 to end + 1, so that the length calculations work out
2036 correctly for one more (slash) character. Accessing
2037 that character is fine, since it will be the
2038 delimiter, '\0' or '?'. */
2039 /* example: "foo?..." */
2040 /* ^ ('?' gets changed to '/') */
2041 start_insert = end + 1;
2042 need_explicit_slash = 1;
2044 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
2046 /* example: http://host" */
2048 start_insert = end + 1;
2049 need_explicit_slash = 1;
2053 /* example: "whatever/foo/bar" */
2055 start_insert = last_slash + 1;
2058 span = start_insert - base;
2059 constr = (char *)xmalloc (span + linklength + 1);
2061 memcpy (constr, base, span);
  /* Overwrite the delimiter copied at SPAN-1 with the separator
     slash (see the need_explicit_slash comment above).  */
2062 if (need_explicit_slash)
2063 constr[span - 1] = '/';
2065 memcpy (constr + span, link, linklength);
2066 constr[span + linklength] = '\0';
  /* LINK already carries a scheme: it is absolute, ignore BASE.  */
2069 else /* !no_scheme */
2071 constr = strdupdelim (link, link + linklength);
2076 /* Merge BASE with LINK and return the resulting URI. This is an
2077 interface to uri_merge_1 that assumes that LINK is a
2078 zero-terminated string. */
2080 uri_merge (const char *base, const char *link)
  /* NO_SCHEME is true exactly when LINK lacks a "scheme:" prefix.  */
2082 return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
2085 #define APPEND(p, s) do { \
2086 int len = strlen (s); \
2087 memcpy (p, s, len); \
2091 /* Use this instead of password when the actual password is supposed
2092 to be hidden. We intentionally use a generic string without giving
2093 away the number of characters in the password, like previous
2095 #define HIDDEN_PASSWORD "*password*"
2097 /* Recreate the URL string from the data in URL.
2099 If HIDE is non-zero (as it is when we're calling this on a URL we
2100 plan to print, but not when calling it to canonicalize a URL for
2101 use within the program), password will be hidden. Unsafe
2102 characters in the URL will be quoted. */
2105 url_string (const struct url *url, int hide_password)
2109 char *quoted_user = NULL, *quoted_passwd = NULL;
2111 int scheme_port = supported_schemes[url->scheme].default_port;
2112 char *scheme_str = supported_schemes[url->scheme].leading_string;
2113 int fplen = full_path_length (url);
2115 int brackets_around_host = 0;
2117 assert (scheme_str != NULL);
2119 /* Make sure the user name and password are quoted. */
2122 quoted_user = url_escape_allow_passthrough (url->user);
  /* When hiding, substitute a fixed placeholder so the output does not
     leak the password or even its length.  */
2126 quoted_passwd = HIDDEN_PASSWORD;
2128 quoted_passwd = url_escape_allow_passthrough (url->passwd);
  /* A ':' in the host (presumably an IPv6 literal -- confirm against
     the full source) requires "[...]" around it in the URL.  */
2132 if (strchr (url->host, ':'))
2133 brackets_around_host = 1;
  /* Precompute the exact output size so one xmalloc suffices; the
     assert below verifies the computation.  */
2135 size = (strlen (scheme_str)
2136 + strlen (url->host)
2137 + (brackets_around_host ? 2 : 0)
2140 if (url->port != scheme_port)
2141 size += 1 + numdigit (url->port);
2144 size += 1 + strlen (quoted_user);
2146 size += 1 + strlen (quoted_passwd);
2149 p = result = xmalloc (size);
2151 APPEND (p, scheme_str);
2154 APPEND (p, quoted_user);
2158 APPEND (p, quoted_passwd);
2163 if (brackets_around_host)
2165 APPEND (p, url->host);
2166 if (brackets_around_host)
2168 if (url->port != scheme_port)
2171 p = number_to_string (p, url->port);
2174 full_path_write (url, p);
2178 assert (p - result == size);
  /* Free the escaped copies only when url_escape_allow_passthrough
     actually allocated them (it may return its input unchanged), and
     never free the static HIDDEN_PASSWORD placeholder.  */
2180 if (quoted_user && quoted_user != url->user)
2181 xfree (quoted_user);
2182 if (quoted_passwd && !hide_password
2183 && quoted_passwd != url->passwd)
2184 xfree (quoted_passwd);
2189 /* Return the URL of the proxy appropriate for url U. */
2191 getproxy (struct url *u)
2194 char *rewritten_url;
  /* Static buffer: the returned pointer may alias this storage, so the
     function is not reentrant and the result must be used before the
     next call.  */
2195 static char rewritten_storage[1024];
2199 if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
  /* Command-line options take precedence over the environment.  */
2205 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
2209 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
2213 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
2215 case SCHEME_INVALID:
2218 if (!proxy || !*proxy)
2221 /* Handle shorthands. `rewritten_storage' is a kludge to allow
2222 getproxy() to return static storage. */
2223 rewritten_url = rewrite_shorthand_url (proxy);
  /* strncpy alone may leave the buffer unterminated; the explicit NUL
     on the next line guarantees termination.  */
2226 strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
2227 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
2228 proxy = rewritten_storage;
2234 /* Should a host be accessed through proxy, concerning no_proxy? */
2236 no_proxy_match (const char *host, const char **no_proxy)
  /* HOST matches the no_proxy list by domain suffix; a match means
     "do not proxy", hence the negation.  */
2241 return !sufmatch (no_proxy, host);
2244 /* Support for converting links for local viewing in downloaded HTML
2245 files. This should be moved to another file, because it has
2246 nothing to do with processing URLs. */
2248 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
2249 static const char *replace_attr PARAMS ((const char *, int, FILE *,
2251 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
2252 const char *, int));
2253 static char *local_quote_string PARAMS ((const char *));
2255 /* Change the links in one HTML file. LINKS is a list of links in the
2256 document, along with their positions and the desired direction of
2259 convert_links (const char *file, struct urlpos *links)
2261 struct file_memory *fm;
2264 downloaded_file_t downloaded_file_return;
2266 struct urlpos *link;
2267 int to_url_count = 0, to_file_count = 0;
2269 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
2272 /* First we do a "dry run": go through the list L and see whether
2273 any URL needs to be converted in the first place. If not, just
2274 leave the file alone. */
2276 struct urlpos *dry = links;
2277 for (dry = links; dry; dry = dry->next)
2278 if (dry->convert != CO_NOCONVERT)
2282 logputs (LOG_VERBOSE, _("nothing to do.\n"));
  /* Slurp the whole file into memory (possibly mmaped).  */
2287 fm = read_file (file);
2290 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2291 file, strerror (errno));
2295 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
2296 if (opt.backup_converted && downloaded_file_return)
2297 write_backup_file (file, downloaded_file_return);
2299 /* Before opening the file for writing, unlink the file. This is
2300 important if the data in FM is mmaped. In such case, nulling the
2301 file, which is what fopen() below does, would make us read all
2302 zeroes from the mmaped region. */
2303 if (unlink (file) < 0 && errno != ENOENT)
2305 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
2306 file, strerror (errno));
2307 read_file_free (fm);
2310 /* Now open the file for writing. */
2311 fp = fopen (file, "wb");
2314 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2315 file, strerror (errno));
2316 read_file_free (fm);
2320 /* Here we loop through all the URLs in file, replacing those of
2321 them that are downloaded with relative references. */
2323 for (link = links; link; link = link->next)
2325 char *url_start = fm->content + link->pos;
  /* Defensive check: a link position past the end of the in-memory
     copy indicates corrupt link data.  */
2327 if (link->pos >= fm->length)
2329 DEBUGP (("Something strange is going on. Please investigate."));
2332 /* If the URL is not to be converted, skip it. */
2333 if (link->convert == CO_NOCONVERT)
2335 DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
2339 /* Echo the file contents, up to the offending URL's opening
2340 quote, to the outfile. */
2341 fwrite (p, 1, url_start - p, fp);
2344 switch (link->convert)
2346 case CO_CONVERT_TO_RELATIVE:
2347 /* Convert absolute URL to relative. */
2349 char *newname = construct_relative (file, link->local_name);
2350 char *quoted_newname = local_quote_string (newname);
  /* <meta http-equiv=refresh> attributes need special treatment
     (a "timeout; URL=" prefix) -- see replace_attr_refresh_hack.  */
2352 if (!link->link_refresh_p)
2353 p = replace_attr (p, link->size, fp, quoted_newname);
2355 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
2356 link->refresh_timeout);
2358 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
2359 link->url->url, newname, link->pos, file));
2361 xfree (quoted_newname);
2365 case CO_CONVERT_TO_COMPLETE:
2366 /* Convert the link to absolute URL. */
2368 char *newlink = link->url->url;
2369 char *quoted_newlink = html_quote_string (newlink);
2371 if (!link->link_refresh_p)
2372 p = replace_attr (p, link->size, fp, quoted_newlink);
2374 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
2375 link->refresh_timeout);
2377 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
2378 newlink, link->pos, file));
2379 xfree (quoted_newlink);
2383 case CO_NULLIFY_BASE:
2384 /* Change the base href to "". */
2385 p = replace_attr (p, link->size, fp, "");
2393 /* Output the rest of the file. */
2394 if (p - fm->content < fm->length)
2395 fwrite (p, 1, fm->length - (p - fm->content), fp);
2397 read_file_free (fm);
2399 logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
2402 /* Construct and return a malloced copy of the relative link from two
2403 pieces of information: local name S1 of the referring file and
2404 local name S2 of the referred file.
2406 So, if S1 is "jagor.srce.hr/index.html" and S2 is
2407 "jagor.srce.hr/images/news.gif", the function will return
2410 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
2411 "fly.cc.fer.hr/images/fly.gif", the function will return
2412 "../images/fly.gif".
2414 Caveats: S1 should not begin with `/', unless S2 also begins with
2415 '/'. S1 should not contain things like ".." and such --
2416 construct_relative ("fly/ioccc/../index.html",
2417 "fly/images/fly.gif") will fail. (A workaround is to call
2418 something like path_simplify() on S1). */
2420 construct_relative (const char *s1, const char *s2)
2422 int i, cnt, sepdirs1;
  /* If S2 is absolute, no relative path can be constructed; return a
     copy of S2 as-is.  */
2426 return xstrdup (s2);
2427 /* S1 should *not* be absolute, if S2 wasn't. */
2428 assert (*s1 != '/');
2430 /* Skip the directories common to both strings. */
2433 while (s1[i] && s2[i]
2438 if (s1[i] == '/' && s2[i] == '/')
  /* SEPDIRS1 counts the directories of S1 past the common prefix;
     each becomes one "../" in the result.  */
2443 for (sepdirs1 = 0; s1[i]; i++)
2446 /* Now, construct the file as of:
2447 - ../ repeated sepdirs1 time
2448 - all the non-mutual directories of S2. */
2449 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
2450 for (i = 0; i < sepdirs1; i++)
2451 memcpy (res + 3 * i, "../", 3);
2452 strcpy (res + 3 * i, s2 + cnt);
2457 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
2459 /* Rather than just writing over the original .html file with the
2460 converted version, save the former to *.orig. Note we only do
2461 this for files we've _successfully_ downloaded, so we don't
2462 clobber .orig files sitting around from previous invocations. */
2464 /* Construct the backup filename as the original name plus ".orig". */
2465 size_t filename_len = strlen(file);
2466 char* filename_plus_orig_suffix;
2467 boolean already_wrote_backup_file = FALSE;
2468 slist* converted_file_ptr;
  /* Static: the list of already-backed-up files persists across calls
     (and is deliberately never freed -- see the comment below).  */
2469 static slist* converted_files = NULL;
2471 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
2473 /* Just write "orig" over "html". We need to do it this way
2474 because when we're checking to see if we've downloaded the
2475 file before (to see if we can skip downloading it), we don't
2476 know if it's a text/html file. Therefore we don't know yet
2477 at that stage that -E is going to cause us to tack on
2478 ".html", so we need to compare vs. the original URL plus
2479 ".orig", not the original URL plus ".html.orig". */
2480 filename_plus_orig_suffix = alloca (filename_len + 1);
2481 strcpy(filename_plus_orig_suffix, file);
  /* Overwrite the trailing "html" (4 chars) with "orig".  */
2482 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
2484 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
2486 /* Append ".orig" to the name. */
2487 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
2488 strcpy(filename_plus_orig_suffix, file);
2489 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
2492 /* We can get called twice on the same URL thanks to the
2493 convert_all_links() call in main(). If we write the .orig file
2494 each time in such a case, it'll end up containing the first-pass
2495 conversion, not the original file. So, see if we've already been
2496 called on this file. */
2497 converted_file_ptr = converted_files;
2498 while (converted_file_ptr != NULL)
2499 if (strcmp(converted_file_ptr->string, file) == 0)
2501 already_wrote_backup_file = TRUE;
2505 converted_file_ptr = converted_file_ptr->next;
2507 if (!already_wrote_backup_file)
2509 /* Rename <file> to <file>.orig before former gets written over. */
2510 if (rename(file, filename_plus_orig_suffix) != 0)
2511 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
2512 file, filename_plus_orig_suffix, strerror (errno));
2514 /* Remember that we've already written a .orig backup for this file.
2515 Note that we never free this memory since we need it till the
2516 convert_all_links() call, which is one of the last things the
2517 program does before terminating. BTW, I'm not sure if it would be
2518 safe to just set 'converted_file_ptr->string' to 'file' below,
2519 rather than making a copy of the string... Another note is that I
2520 thought I could just add a field to the urlpos structure saying
2521 that we'd written a .orig file for this URL, but that didn't work,
2522 so I had to make this separate list.
2523 -- Dan Harkless <wget@harkless.org>
2525 This [adding a field to the urlpos structure] didn't work
2526 because convert_file() is called from convert_all_links at
2527 the end of the retrieval with a freshly built new urlpos
2529 -- Hrvoje Niksic <hniksic@arsdigita.com>
2531 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2532 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
2533 converted_file_ptr->next = converted_files;
2534 converted_files = converted_file_ptr;
2538 static int find_fragment PARAMS ((const char *, int, const char **,
2541 /* Replace an attribute's original text with NEW_TEXT. */
2544 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
2547 char quote_char = '\"'; /* use "..." for quoting, unless the
2548 original value is quoted, in which
2549 case reuse its quoting char. */
2550 const char *frag_beg, *frag_end;
2552 /* Structure of our string is:
2553 "...old-contents..."
2554 <--- size ---> (with quotes)
2557 <--- size --> (no quotes) */
2559 if (*p == '\"' || *p == '\'')
2564 size -= 2; /* disregard opening and closing quote */
2566 putc (quote_char, fp);
2567 fputs (new_text, fp);
2569 /* Look for fragment identifier, if any. */
  /* Preserve the original "#fragment" by copying it verbatim after
     the replacement text.  */
2570 if (find_fragment (p, size, &frag_beg, &frag_end))
2571 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
2575 putc (quote_char, fp);
2580 /* The same as REPLACE_ATTR, but used when replacing
2581 <meta http-equiv=refresh content="new_text"> because we need to
2582 append "timeout_value; URL=" before the next_text. */
2585 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
2586 const char *new_text, int timeout)
  /* Stack-allocate room for the decimal timeout plus the fixed
     "; URL=" prefix and NEW_TEXT (allocation tail elided here).  */
2589 char *new_with_timeout = (char *)alloca (numdigit (timeout)
2593 sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
2595 return replace_attr (p, size, fp, new_with_timeout);
2598 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
2599 preceded by '&'. If the character is not found, return zero. If
2600 the character is found, return 1 and set BP and EP to point to the
2601 beginning and end of the region.
2603 This is used for finding the fragment indentifiers in URLs. */
2606 find_fragment (const char *beg, int size, const char **bp, const char **ep)
2608 const char *end = beg + size;
  /* NOTE(review): the loop body ('#'/'&' handling) is elided in this
     excerpt; the contract is documented above.  */
2610 for (; beg < end; beg++)
2632 /* Quote FILE for use as local reference to an HTML file.
2634 We quote ? as %3F to avoid passing part of the file name as the
2635 parameter when browsing the converted file through HTTP. However,
2636 it is safe to do this only when `--html-extension' is turned on.
2637 This is because converting "index.html?foo=bar" to
2638 "index.html%3Ffoo=bar" would break local browsing, as the latter
2639 isn't even recognized as an HTML file! However, converting
2640 "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
2641 safe for both local and HTTP-served browsing. */
2644 local_quote_string (const char *file)
2646 const char *file_sans_qmark;
2649 if (!opt.html_extension)
2650 return html_quote_string (file);
  /* QM is the number of '?' occurrences; zero means no rewriting.  */
2652 qm = count_char (file, '?');
2656 const char *from = file;
2659 /* qm * 2 because we replace each question mark with "%3F",
2660 i.e. replace one char with three, hence two more. */
2661 int fsqlen = strlen (file) + qm * 2;
2663 to = newname = (char *)alloca (fsqlen + 1);
2664 for (; *from; from++)
  /* Sanity check that the length computed above matches what the
     copy loop actually produced.  */
2675 assert (to - newname == fsqlen);
2678 file_sans_qmark = newname;
2681 file_sans_qmark = file;
2683 return html_quote_string (file_sans_qmark);
2686 /* We're storing "modes" of type downloaded_file_t in the hash table.
2687 However, our hash tables only accept pointers for keys and values.
2688 So when we need a pointer, we use the address of a
2689 downloaded_file_t variable of static storage. */
2691 static downloaded_file_t *
2692 downloaded_mode_to_ptr (downloaded_file_t mode)
  /* One static variable per mode: their addresses are stable for the
     lifetime of the program, so they are safe to store in the hash.  */
2694 static downloaded_file_t
2695 v1 = FILE_NOT_ALREADY_DOWNLOADED,
2696 v2 = FILE_DOWNLOADED_NORMALLY,
2697 v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2698 v4 = CHECK_FOR_FILE;
2702 case FILE_NOT_ALREADY_DOWNLOADED:
2704 case FILE_DOWNLOADED_NORMALLY:
2706 case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2708 case CHECK_FOR_FILE:
2714 /* This should really be merged with dl_file_url_map and
2715 downloaded_html_files in recur.c. This was originally a list, but
2716 I changed it to a hash table beause it was actually taking a lot of
2717 time to find things in it. */
2719 static struct hash_table *downloaded_files_hash;
2721 /* Remembers which files have been downloaded. In the standard case, should be
2722 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2723 download successfully (i.e. not for ones we have failures on or that we skip
2726 When we've downloaded a file and tacked on a ".html" extension due to -E,
2727 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2728 FILE_DOWNLOADED_NORMALLY.
2730 If you just want to check if a file has been previously added without adding
2731 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
2732 with local filenames, not remote URLs. */
2734 downloaded_file (downloaded_file_t mode, const char *file)
2736 downloaded_file_t *ptr;
2738 if (mode == CHECK_FOR_FILE)
2740 if (!downloaded_files_hash)
2741 return FILE_NOT_ALREADY_DOWNLOADED;
2742 ptr = hash_table_get (downloaded_files_hash, file);
2744 return FILE_NOT_ALREADY_DOWNLOADED;
2748 if (!downloaded_files_hash)
2749 downloaded_files_hash = make_string_hash_table (0);
2751 ptr = hash_table_get (downloaded_files_hash, file);
2755 ptr = downloaded_mode_to_ptr (mode);
2756 hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2758 return FILE_NOT_ALREADY_DOWNLOADED;
2762 df_free_mapper (void *key, void *value, void *ignored)
2769 downloaded_files_free (void)
  /* Free all entries, destroy the table, and reset the global so a
     later downloaded_file() call can recreate it cleanly.  */
2771 if (downloaded_files_hash)
2773 hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2774 hash_table_destroy (downloaded_files_hash);
2775 downloaded_files_hash = NULL;
2779 /* Return non-zero if scheme a is similar to scheme b.
2781 Schemes are similar if they are equal. If SSL is supported, schemes
2782 are also similar if one is http (SCHEME_HTTP) and the other is https
2785 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
  /* http and https count as "similar" in either order.  */
2790 if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2791 || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2798 /* Debugging and testing support for path_simplify. */
2800 /* Debug: run path_simplify on PATH and return the result in a new
2801 string. Useful for calling from the debugger. */
2805 char *copy = xstrdup (path);
2806 path_simplify (copy);
/* Run path_simplify on a copy of TEST and complain if the result or
   the modified-flag differ from EXPECTED_RESULT / EXPECTED_CHANGE.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *test_copy = xstrdup (test);
  int modified = path_simplify (test_copy);

  if (0 != strcmp (test_copy, expected_result))
    {
      printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
	      test, expected_result, test_copy);
    }
  if (modified != expected_change)
    {
      /* The two diagnostics were previously swapped: EXPECTED_CHANGE
	 == 1 means a modification was expected but did not happen.  */
      if (expected_change == 1)
	printf ("Expected modification with path_simplify(\"%s\").\n",
		test);
      else
	printf ("Expected no modification with path_simplify(\"%s\").\n",
		test);
    }
  xfree (test_copy);
}
2834 test_path_simplify (void)
2837 char *test, *result;
  /* Table of (input, expected output, expected modified-flag) triples
     exercised below, first as-is and then with a leading '/'.  */
2843 { "foo", "foo", 0 },
2844 { "foo/bar", "foo/bar", 0 },
2845 { "foo///bar", "foo/bar", 1 },
2846 { "foo/.", "foo/", 1 },
2847 { "foo/./", "foo/", 1 },
2848 { "foo./", "foo./", 0 },
2849 { "foo/../bar", "bar", 1 },
2850 { "foo/../bar/", "bar/", 1 },
2851 { "foo/bar/..", "foo/", 1 },
2852 { "foo/bar/../x", "foo/x", 1 },
2853 { "foo/bar/../x/", "foo/x/", 1 },
2854 { "foo/..", "", 1 },
2855 { "foo/../..", "", 1 },
2856 { "a/b/../../c", "c", 1 },
2857 { "./a/../b", "b", 1 }
2861 for (i = 0; i < countof (tests); i++)
2863 char *test = tests[i].test;
2864 char *expected_result = tests[i].result;
2865 int expected_change = tests[i].should_modify;
2866 run_test (test, expected_result, expected_change);
2869 /* Now run all the tests with a leading slash before the test case,
2870 to prove that the slash is being preserved. */
2871 for (i = 0; i < countof (tests); i++)
2873 char *test, *expected_result;
2874 int expected_change = tests[i].should_modify;
  /* Allocate room for the leading '/', the string, and the NUL.  */
2876 test = xmalloc (1 + strlen (tests[i].test) + 1);
2877 sprintf (test, "/%s", tests[i].test);
2879 expected_result = xmalloc (1 + strlen (tests[i].result) + 1);
2880 sprintf (expected_result, "/%s", tests[i].result);
2882 run_test (test, expected_result, expected_change);
2885 xfree (expected_result);