2 Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003
3 Free Software Foundation, Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or (at
10 your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 In addition, as a special exception, the Free Software Foundation
22 gives permission to link the code of its release of Wget with the
23 OpenSSL project's "OpenSSL" library (or with modified versions of it
24 that use the same license as the "OpenSSL" library), and distribute
25 the linked executables. You must obey the GNU General Public License
26 in all respects for all of the code used other than "OpenSSL". If you
27 modify this file, you may extend this exception to your version of the
28 file, but you are not obligated to do so. If you do not wish to do
29 so, delete this exception statement from your version. */
#include <sys/types.h>
#include <ctype.h>
#include <string.h>
/* True if the NUL-terminated string X is exactly ".".  */
#define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
/* True if the NUL-terminated string X is exactly "..".  */
#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
/* Network address component sizes (as in <arpa/nameser.h>), used by
   the IPv4/IPv6 address validators below.  */
static const int NS_INADDRSZ = 4;    /* bytes in a binary IPv4 address */
static const int NS_IN6ADDRSZ = 16;  /* bytes in a binary IPv6 address */
static const int NS_INT16SZ = 2;     /* bytes in one 16-bit IPv6 group */
74 /* Supported schemes: */
75 static struct scheme_data supported_schemes[] =
77 { "http://", DEFAULT_HTTP_PORT, 1 },
79 { "https://", DEFAULT_HTTPS_PORT, 1 },
81 { "ftp://", DEFAULT_FTP_PORT, 1 },
/* Forward declarations: */
/* Build a relative link from one path to another; defined later in
   this file.  */
static char *construct_relative PARAMS ((const char *, const char *));
/* Canonicalize a path in place; returns non-zero if the path was
   modified (used by url_parse to decide whether to rebuild u->url).
   NOTE(review): definition not visible in this listing -- confirm
   semantics at the definition site.  */
static int path_simplify PARAMS ((char *));
94 /* Support for encoding and decoding of URL strings. We determine
95 whether a character is unsafe through static table lookup. This
96 code assumes ASCII character set and 8-bit chars. */
/* Bit masks stored in urlchr_table.  The enum constants were elided
   from this listing but are required by the macros below; 1 and 2 are
   the only values consistent with the R/U/RU table entries.  */
enum {
  /* rfc1738 reserved chars, preserved from encoding.  */
  urlchr_reserved = 1,
  /* rfc1738 unsafe chars, plus some more.  */
  urlchr_unsafe   = 2
};

/* Test character C against MASK via table lookup; the cast to
   unsigned char avoids negative indexing when plain char is signed.  */
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)

/* Shorthands for the table: */
#define R  urlchr_reserved
#define U  urlchr_unsafe
#define RU (R|U)                /* both reserved and unsafe */

static const unsigned char urlchr_table[256] =
{
  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  U,  0,  U, RU,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0, RU,   U, RU,   U,  0,  /* X   Y   Z   [    \   ]   ^   _   */
  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */

  /* All characters in the 128-255 range are unsafe.  */
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
};
#undef R
#undef U
#undef RU
/* URL-unescape the string S.

   This is done by transforming the sequences "%HH" to the character
   represented by the hexadecimal digits HH.  If % is not followed by
   two hexadecimal digits, it is copied through literally.

   The transformation is done in place.  If you need the original
   string intact, make a copy before calling this function.  */

/* Return the numeric value of hex digit C; C must satisfy isxdigit.
   Local replacement for the project's XCHAR_TO_XDIGIT macro.  */
static int
hex_digit_value (unsigned char c)
{
  if (c >= '0' && c <= '9')
    return c - '0';
  /* Lower-case the letter; works for both 'a'-'f' and 'A'-'F'.  */
  return (c | 0x20) - 'a' + 10;
}

void
url_unescape (char *s)
{
  char *t = s;                  /* t - tortoise */
  char *h = s;                  /* h - hare     */

  for (; *h; h++, t++)
    {
      if (*h != '%')
        *t = *h;
      /* Do nothing if '%' is not followed by two hex digits; the '%'
         is copied literally and the following chars are handled by
         subsequent iterations.  */
      else if (!*(h + 1) || !*(h + 2)
               || !(isxdigit ((unsigned char) *(h + 1))
                    && isxdigit ((unsigned char) *(h + 2))))
        *t = *h;
      else
        {
          *t = (hex_digit_value ((unsigned char) *(h + 1)) << 4)
               + hex_digit_value ((unsigned char) *(h + 2));
          h += 2;               /* skip the two hex digits */
        }
    }
  *t = '\0';                    /* result is never longer than input */
}
/* The core of url_escape_* functions.  Escapes the characters that
   match the provided mask in urlchr_table.

   If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
   will be returned unchanged.  If ALLOW_PASSTHROUGH is zero, a
   freshly allocated string will be returned in all cases.

   Caller owns the result unless it equals S (passthrough case).  */
static char *
url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
{
  const char *p1;
  char *p2, *newstr;
  int newlen;
  int addition = 0;

  /* First pass: count how much room the escapes need.  */
  for (p1 = s; *p1; p1++)
    if (urlchr_test (*p1, mask))
      addition += 2;            /* Two more characters (hex digits) */

  if (!addition)
    return allow_passthrough ? (char *)s : xstrdup (s);

  newlen = (p1 - s) + addition; /* p1 now points at the NUL */
  newstr = (char *)xmalloc (newlen + 1);

  /* Second pass: copy, quoting as we go.  */
  p1 = s;
  p2 = newstr;
  while (*p1)
    {
      /* Quote the characters that match the test mask. */
      if (urlchr_test (*p1, mask))
        {
          unsigned char c = *p1++;
          *p2++ = '%';
          *p2++ = XDIGIT_TO_XCHAR (c >> 4);
          *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
        }
      else
        *p2++ = *p1++;
    }
  *p2 = '\0';
  assert (p2 - newstr == newlen);
  return newstr;
}
229 /* URL-escape the unsafe characters (see urlchr_table) in a given
230 string, returning a freshly allocated string. */
233 url_escape (const char *s)
235 return url_escape_1 (s, urlchr_unsafe, 0);
238 /* URL-escape the unsafe characters (see urlchr_table) in a given
239 string. If no characters are unsafe, S is returned. */
242 url_escape_allow_passthrough (const char *s)
244 return url_escape_1 (s, urlchr_unsafe, 1);
/* What to do with one input character in reencode_escapes.  */
enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };

/* Decide whether to encode, decode, or pass through the char at P.
   This used to be a macro, but it got a little too convoluted.  */
static inline enum copy_method
decide_copy_method (const char *p)
{
  if (*p == '%')
    {
      if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
        {
          /* %xx sequence: decode it, unless it would decode to an
             unsafe or a reserved char; in that case, leave it as
             is. */
          char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
            XCHAR_TO_XDIGIT (*(p + 2));

          if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
            return CM_PASSTHROUGH;
          else
            return CM_DECODE;
        }
      else
        /* Garbled %.. sequence: encode `%'. */
        return CM_ENCODE;
    }
  else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
    return CM_ENCODE;
  else
    return CM_PASSTHROUGH;
}
279 /* Translate a %-escaped (but possibly non-conformant) input string S
280 into a %-escaped (and conformant) output string. If no characters
281 are encoded or decoded, return the same string S; otherwise, return
282 a freshly allocated string with the new contents.
284 After a URL has been run through this function, the protocols that
285 use `%' as the quote character can use the resulting string as-is,
286 while those that don't call url_unescape() to get to the intended
287 data. This function is also stable: after an input string is
288 transformed the first time, all further transformations of the
289 result yield the same result string.
291 Let's discuss why this function is needed.
293 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
294 space character would mess up the HTTP request, it needs to be
297 GET /abc%20def HTTP/1.0
299 It appears that the unsafe chars need to be quoted, for example
300 with url_escape. But what if we're requested to download
301 `abc%20def'? url_escape transforms "%" to "%25", which would leave
302 us with `abc%2520def'. This is incorrect -- since %-escapes are
303 part of URL syntax, "%20" is the correct way to denote a literal
304 space on the Wget command line. This leads us to the conclusion
305 that in that case Wget should not call url_escape, but leave the
308 And what if the requested URI is `abc%20 def'? If we call
309 url_escape, we end up with `/abc%2520%20def', which is almost
310 certainly not intended. If we don't call url_escape, we are left
311 with the embedded space and cannot complete the request. What the
312 user meant was for Wget to request `/abc%20%20def', and this is
313 where reencode_escapes kicks in.
315 Wget used to solve this by first decoding %-quotes, and then
316 encoding all the "unsafe" characters found in the resulting string.
317 This was wrong because it didn't preserve certain URL special
318 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
319 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
320 whether we considered `+' reserved (it is). One of these results
321 is inevitable because by the second step we would lose information
322 on whether the `+' was originally encoded or not. Both results
323 were wrong because in CGI parameters + means space, while %2B means
324 literal plus. reencode_escapes correctly translates the above to
325 "a%2B+b", i.e. returns the original string.
327 This function uses an algorithm proposed by Anon Sricharoenchai:
329 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
332 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
335 ...except that this code conflates the two steps, and decides
336 whether to encode, decode, or pass through each character in turn.
337 The function still uses two passes, but their logic is the same --
338 the first pass exists merely for the sake of allocation. Another
339 small difference is that we include `+' to URL_RESERVED.
343 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
345 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
349 "foo bar" -> "foo%20bar"
350 "foo%20bar" -> "foo%20bar"
351 "foo %20bar" -> "foo%20%20bar"
352 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
353 "foo%25%20bar" -> "foo%25%20bar"
354 "foo%2%20bar" -> "foo%252%20bar"
355 "foo+bar" -> "foo+bar" (plus is reserved!)
356 "foo%2b+bar" -> "foo%2b+bar" */
359 reencode_escapes (const char *s)
365 int encode_count = 0;
366 int decode_count = 0;
368 /* First, pass through the string to see if there's anything to do,
369 and to calculate the new length. */
370 for (p1 = s; *p1; p1++)
372 switch (decide_copy_method (p1))
385 if (!encode_count && !decode_count)
386 /* The string is good as it is. */
387 return (char *)s; /* C const model sucks. */
390 /* Each encoding adds two characters (hex digits), while each
391 decoding removes two characters. */
392 newlen = oldlen + 2 * (encode_count - decode_count);
393 newstr = xmalloc (newlen + 1);
400 switch (decide_copy_method (p1))
404 unsigned char c = *p1++;
406 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
407 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
411 *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
412 + (XCHAR_TO_XDIGIT (*(p1 + 2))));
413 p1 += 3; /* skip %xx */
420 assert (p2 - newstr == newlen);
424 /* Returns the scheme type if the scheme is supported, or
425 SCHEME_INVALID if not. */
427 url_scheme (const char *url)
431 for (i = 0; supported_schemes[i].leading_string; i++)
432 if (0 == strncasecmp (url, supported_schemes[i].leading_string,
433 strlen (supported_schemes[i].leading_string)))
435 if (supported_schemes[i].enabled)
436 return (enum url_scheme) i;
438 return SCHEME_INVALID;
441 return SCHEME_INVALID;
/* Return the number of characters needed to skip the scheme part of
   the URL, e.g. `http://'.  If no scheme is found, returns 0.  */
int
url_skip_scheme (const char *url)
{
  const char *p = url;

  /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
     etc.  (unsigned char cast avoids UB with negative chars.)  */
  while (isalnum ((unsigned char) *p) || *p == '-' || *p == '+')
    ++p;
  if (*p != ':')
    return 0;

  /* Skip ':'.  */
  ++p;

  /* Skip "//" if found. */
  if (*p == '/' && *(p + 1) == '/')
    p += 2;

  return p - url;
}
/* Returns 1 if the URL begins with a scheme (supported or
   unsupported), 0 otherwise.  A scheme is a run of alphanumerics
   (plus '-' and '+') immediately followed by ':'.  */
int
url_has_scheme (const char *url)
{
  const char *p = url;
  while (isalnum ((unsigned char) *p) || *p == '-' || *p == '+')
    ++p;
  return *p == ':';
}
479 scheme_default_port (enum url_scheme scheme)
481 return supported_schemes[scheme].default_port;
485 scheme_disable (enum url_scheme scheme)
487 supported_schemes[scheme].enabled = 0;
/* Skip the username and password, if present here.  The function
   should be called *not* with the complete URL, but with the part
   right after the scheme.

   If no username and password are found, return 0; otherwise the
   number of characters occupied by "user[:pass]@".  */
static int
url_skip_uname (const char *url)
{
  const char *p;

  /* Look for '@' that comes before '/' or '?'.  A '/' or '?' hit
     first means the '@', if any, belongs to the path or query.  */
  p = (const char *)strpbrk (url, "/?@");
  if (!p || *p != '@')
    return 0;

  /* +1 to also skip the '@' itself.  */
  return p - url + 1;
}
/* Split the "user[:password]" region STR (LEN bytes, not
   NUL-terminated) into freshly allocated, URL-unescaped *USER and
   *PASSWD.  NOTE(review): this listing is elided -- declarations,
   the empty-name checks, and return statements are missing from the
   fragment below; consult the full source before relying on it.  */
parse_uname (const char *str, int len, char **user, char **passwd)
  /* Empty user name not allowed. */
  colon = memchr (str, ':', len);
  /* Empty user name again. */
      int pwlen = len - (colon + 1 - str);
      *passwd = xmalloc (pwlen + 1);
      memcpy (*passwd, colon + 1, pwlen);
      /* NUL-terminate the copied password. */
      (*passwd)[pwlen] = '\0';
  *user = xmalloc (len + 1);
  memcpy (*user, str, len);
  /* Both components are stored in unescaped form. */
  url_unescape (*user);
    url_unescape (*passwd);
/* Used by main.c: detect URLs written using the "shorthand" URL forms
   popularized by Netscape and NcFTP.  HTTP shorthands look like this:
   www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
   www.foo.com[:port] -> http://www.foo.com[:port]
   FTP shorthands look like this:
   foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
   foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
   If the URL needs not or cannot be rewritten, return NULL.
   NOTE(review): this listing is elided -- the return type, several
   branches and returns are missing from the fragment below.  */
rewrite_shorthand_url (const char *url)
  /* Already has a scheme: nothing to rewrite. */
  if (url_has_scheme (url))
  /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
  for (p = url; *p && *p != ':' && *p != '/'; p++)
      /* If the characters after the colon and before the next slash
	 or end of string are all digits, it's HTTP. */
      for (pp = p + 1; ISDIGIT (*pp); pp++)
      if (digits > 0 && (*pp == '/' || *pp == '\0'))
	  /* Prepend "ftp://" to the entire URL... */
	  res = xmalloc (6 + strlen (url) + 1);
	  sprintf (res, "ftp://%s", url);
	  /* ...and replace ':' with '/'. */
	  res[6 + (p - url)] = '/';
  /* Just prepend "http://" to what we have. */
  res = xmalloc (7 + strlen (url) + 1);
  sprintf (res, "http://%s", url);
/* Forward declaration: parse_path splits a URL path into directory
   and file components (defined below).  */
static void parse_path PARAMS ((const char *, char **, char **));
/* Like strpbrk, with the exception that it returns the pointer to the
   terminating zero (end-of-string aka "eos") if no matching character
   is found.

   Although I normally balk at Gcc-specific optimizations, it probably
   makes sense here: glibc has optimizations that detect strpbrk being
   called with literal string as ACCEPT and inline the search.  That
   optimization is defeated if strpbrk is hidden within the call to
   another function.  (And no, making strpbrk_or_eos inline doesn't
   help because the check for literal accept is in the
   NOTE(review): listing elided -- the #ifdef __GNUC__ guard, the
   null-check lines and macro/function closers are missing below.  */
#define strpbrk_or_eos(s, accept) ({		\
  char *SOE_p = strpbrk (s, accept);		\
  SOE_p = (char *)s + strlen (s);		\
#else /* not __GNUC__ */
/* Portable fallback with identical semantics. */
strpbrk_or_eos (const char *s, const char *accept)
  char *p = strpbrk (s, accept);
  p = (char *)s + strlen (s);
/* Turn STR into lowercase in place; return non-zero if a character
   was actually changed, 0 if STR was already all-lowercase.  */
static int
lowercase_str (char *str)
{
  int changed = 0;
  for (; *str; str++)
    {
      unsigned char c = (unsigned char) *str;  /* avoid UB on negative char */
      if (isupper (c))
        {
          changed = 1;
          *str = tolower (c);
        }
    }
  return changed;
}
/* Human-readable messages for url_parse failures, indexed by the PE_*
   codes defined inline (keep each string directly under its code).
   NOTE(review): four message strings were elided from the listing and
   have been restored from upstream -- confirm exact wording.  */
static char *parse_errors[] = {
#define PE_NO_ERROR 0
  "No error",
#define PE_UNSUPPORTED_SCHEME 1
  "Unsupported scheme",
#define PE_EMPTY_HOST 2
  "Empty host",
#define PE_BAD_PORT_NUMBER 3
  "Bad port number",
#define PE_INVALID_USER_NAME 4
  "Invalid user name",
#define PE_UNTERMINATED_IPV6_ADDRESS 5
  "Unterminated IPv6 numeric address",
#define PE_IPV6_NOT_SUPPORTED 6
  "IPv6 addresses not supported",
#define PE_INVALID_IPV6_ADDRESS 7
  "Invalid IPv6 numeric address"
};
/* Store error code V through pointer P, but only if the caller
   supplied a non-NULL P (callers of url_parse may pass NULL when they
   don't care about the reason).  do/while(0) keeps it
   statement-safe.  */
#define SETERR(p, v) do {			\
  if (p)					\
    *(p) = (v);					\
} while (0)
/* The following two functions were adapted from glibc. */

/* Return 1 if [STR, END) is a well-formed dotted-quad IPv4 address,
   0 otherwise.  NOTE(review): this listing is elided -- the return
   type, loop framing, octet-count/range checks and final return are
   missing from the fragment below.  */
is_valid_ipv4_address (const char *str, const char *end)
  int saw_digit, octets;
      if (ch >= '0' && ch <= '9') {
	/* Accumulate the current octet's value. */
	val = val * 10 + (ch - '0');
	if (saw_digit == 0) {
      } else if (ch == '.' && saw_digit == 1) {
/* Return 1 if [STR, END) is a well-formed textual IPv6 address
   (including "::" compression and trailing embedded IPv4),
   0 otherwise; adapted from glibc's inet_pton.  NOTE(review): this
   listing is heavily elided -- declarations, the main loop and many
   branches are missing from the fragment below.  */
is_valid_ipv6_address (const char *str, const char *end)
  static const char xdigits[] = "0123456789abcdef";
  /* Leading :: requires some special handling. */
    if (str == end || *str != ':')
      /* if ch is a number, add it to val. */
      pch = strchr(xdigits, ch);
	val |= (pch - xdigits);
      /* if ch is a colon ... */
	if (saw_xdigit == 0) {
	} else if (str == end) {
	/* A group must fit before the end of the 16-byte address. */
	if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
      /* if ch is a dot ... */
      if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ) &&
	  is_valid_ipv4_address(curtok, end) == 1) {
  if (saw_xdigit == 1) {
    if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
  /* "::" must expand to at least one zero group. */
  if (colonp != NULL) {
    if (tp == NS_IN6ADDRSZ)
  if (tp != NS_IN6ADDRSZ)
   Return a new struct url if successful, NULL on error.  In case of
   error, and if ERROR is not NULL, also set *ERROR to the appropriate
   PE_* code from parse_errors.
   NOTE(review): this listing is heavily elided -- braces, error
   returns, several declarations and the #ifdef ENABLE_IPV6 guards
   are missing from the fragment below.  */
url_parse (const char *url, int *error)
  int path_modified, host_modified;
  enum url_scheme scheme;
  /* Begin/end pointers delimiting each URL component inside
     url_encoded; NULL pairs mean "component absent".  */
  const char *uname_b, *uname_e;
  const char *host_b, *host_e;
  const char *path_b, *path_e;
  const char *params_b, *params_e;
  const char *query_b, *query_e;
  const char *fragment_b, *fragment_e;
  char *user = NULL, *passwd = NULL;
  scheme = url_scheme (url);
  if (scheme == SCHEME_INVALID)
      SETERR (error, PE_UNSUPPORTED_SCHEME);
  /* Normalize %-escapes once; url_encoded may alias url if nothing
     needed changing. */
  url_encoded = reencode_escapes (url);
  p += strlen (supported_schemes[scheme].leading_string);
  p += url_skip_uname (p);
  /* scheme://user:pass@host[:port]... */
  /* We attempt to break down the URL into the components path,
     params, query, and fragment.  They are ordered like this:
     scheme://host[:port][/path][;params][?query][#fragment] */
  params_b = params_e = NULL;
  query_b = query_e = NULL;
  fragment_b = fragment_e = NULL;
      /* Handle IPv6 address inside square brackets.  Ideally we'd
	 just look for the terminating ']', but rfc2732 mandates
	 rejecting invalid IPv6 addresses. */
      /* The address begins after '['. */
      host_e = strchr (host_b, ']');
	  SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS);
      /* Check if the IPv6 address is valid. */
      if (!is_valid_ipv6_address(host_b, host_e))
	  SETERR (error, PE_INVALID_IPV6_ADDRESS);
      /* Continue parsing after the closing ']'. */
      SETERR (error, PE_IPV6_NOT_SUPPORTED);
  p = strpbrk_or_eos (p, ":/;?#");
  if (host_b == host_e)
      SETERR (error, PE_EMPTY_HOST);
  port = scheme_default_port (scheme);
      const char *port_b, *port_e, *pp;
      /* scheme://host:port/tralala */
      p = strpbrk_or_eos (p, "/;?#");
      if (port_b == port_e)
	  /* http://host:/whatever */
	  SETERR (error, PE_BAD_PORT_NUMBER);
      /* Manual decimal conversion so non-digits are rejected. */
      for (port = 0, pp = port_b; pp < port_e; pp++)
	      /* http://host:12randomgarbage/blah */
	      SETERR (error, PE_BAD_PORT_NUMBER);
	  port = 10 * port + (*pp - '0');
  p = strpbrk_or_eos (p, ";?#");
  /* Path is not allowed not to exist. */
  p = strpbrk_or_eos (p, "?#");
  p = strpbrk_or_eos (p, "#");
  /* Hack that allows users to use '?' (a wildcard character) in
     FTP URLs without it being interpreted as a query string
     delimiter. */
  if (scheme == SCHEME_FTP)
      query_b = query_e = NULL;
  if (uname_b != uname_e)
      /* http://user:pass@host */
      /*        uname_b uname_e */
      if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
	  SETERR (error, PE_INVALID_USER_NAME);
  u = (struct url *)xmalloc (sizeof (struct url));
  memset (u, 0, sizeof (*u));
  u->host = strdupdelim (host_b, host_e);
  u->path = strdupdelim (path_b, path_e);
  path_modified = path_simplify (u->path);
  parse_path (u->path, &u->dir, &u->file);
  host_modified = lowercase_str (u->host);
  u->params = strdupdelim (params_b, params_e);
  u->query = strdupdelim (query_b, query_e);
  u->fragment = strdupdelim (fragment_b, fragment_e);
  if (path_modified || u->fragment || host_modified || path_b == path_e)
      /* If we suspect that a transformation has rendered what
	 url_string might return different from URL_ENCODED, rebuild
	 u->url using url_string. */
      u->url = url_string (u, 0);
      if (url_encoded != url)
	xfree ((char *) url_encoded);
      if (url_encoded == url)
	u->url = xstrdup (url);
	u->url = url_encoded;
1044 url_error (int error_code)
1046 assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
1047 return parse_errors[error_code];
/* Parse PATH into dir and file.  PATH is extracted from the URL and
   is URL-escaped.  The function returns unescaped DIR and FILE, both
   freshly allocated (caller frees).  A path with no '/' yields an
   empty DIR and FILE == PATH.  */
static void
parse_path (const char *path, char **dir, char **file)
{
  const char *last_slash = strrchr (path, '/');

  if (!last_slash)
    {
      *dir = xstrdup ("");
      *file = xstrdup (path);
    }
  else
    {
      /* DIR is everything up to (excluding) the last slash.  */
      *dir = strdupdelim (path, last_slash);
      *file = xstrdup (last_slash + 1);
    }
  url_unescape (*dir);
  url_unescape (*file);
}
/* Note: URL's "full path" is the path with the query string and
   params appended.  The "fragment" (#foo) is intentionally ignored,
   but that might be changed.  For example, if the original URL was
   "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
   the full path will be "/foo/bar/baz;bullshit?querystring". */

/* Return the length of the full path, without the terminating
   NOTE(review): listing elided -- the function body (the leading-'/'
   accounting, FROB invocations for params/query and #undef) is
   missing below.  */
full_path_length (const struct url *url)
/* +1 accounts for the separator char (';' or '?') in front of EL. */
#define FROB(el) if (url->el) len += 1 + strlen (url->el)
/* Write out the full path. */
/* Writes path[;params][?query] into WHERE, which the caller must
   have sized with full_path_length.  NOTE(review): listing elided --
   the function body, the macro's separator/NULL handling and closing
   lines are missing below.  */
full_path_write (const struct url *url, char *where)
#define FROB(el, chr) do {			\
  char *f_el = url->el;				\
      int l = strlen (f_el);			\
      memcpy (where, f_el, l);			\
/* Public function for getting the "full path".  E.g. if u->path is
   "foo/bar" and u->query is "param=value", full_path will be
   "/foo/bar?param=value".  Returns a freshly allocated string the
   caller must free.  */
char *
url_full_path (const struct url *url)
{
  int length = full_path_length (url);
  char *full_path = (char *)xmalloc(length + 1);

  full_path_write (url, full_path);
  full_path[length] = '\0';

  return full_path;
}
/* Escape unsafe and reserved characters, except for the slash
   characters, which separate directory components and must stay
   literal.  NOTE(review): listing elided -- the passthrough
   early-return, the copy/skip logic inside the loop and the final
   return are missing below.  */
url_escape_dir (const char *dir)
  char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
  /* Unescape slashes in NEWDIR. */
  h = newdir;			/* hare */
  t = newdir;			/* tortoise */
  for (; *h; h++, t++)
      /* url_escape_1 emits uppercase hex, so "%2F" is the only
	 encoding of '/' that can appear here. */
      if (*h == '%' && h[1] == '2' && h[2] == 'F')
/* Sync u->path and u->url with u->dir and u->file.  Called after
   u->file or u->dir have been changed, typically by the FTP code.
   NOTE(review): listing elided -- braces, the dir-empty branch, the
   '/' separator write, frees of the old path/url and the escaped
   temporaries are missing below.  */
sync_path (struct url *u)
  char *newpath, *efile, *edir;
  /* u->dir and u->file are not escaped.  URL-escape them before
     reassembling them into u->path.  That way, if they contain
     separators like '?' or even if u->file contains slashes, the
     path will be correctly assembled.  (u->file can contain slashes
     if the URL specifies it with %2f, or if an FTP server returns
  edir = url_escape_dir (u->dir);
  efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
      /* Empty dir: the path is just the file. */
      newpath = xstrdup (efile);
      int dirlen = strlen (edir);
      int filelen = strlen (efile);
      /* Copy "DIR/FILE" to newpath. */
      char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
      memcpy (p, edir, dirlen);
      memcpy (p, efile, filelen);
  /* url_escape_1 may have passed the originals through; free only
     fresh allocations. */
  if (efile != u->file)
  /* Regenerate u->url as well. */
  u->url = url_string (u, 0);
1215 /* Mutators. Code in ftp.c insists on changing u->dir and u->file.
1216 This way we can sync u->path and u->url when they get changed. */
1219 url_set_dir (struct url *url, const char *newdir)
1222 url->dir = xstrdup (newdir);
1227 url_set_file (struct url *url, const char *newfile)
1230 url->file = xstrdup (newfile);
/* Free a struct url allocated by url_parse, including all owned
   string members.  NOTE(review): listing elided -- the unconditional
   xfree calls (host, path, url, dir, file, and the struct itself)
   are missing from the fragment below; only the optional members
   remain visible.  */
url_free (struct url *url)
  /* FREE_MAYBE: these members may be NULL when absent from the URL. */
  FREE_MAYBE (url->params);
  FREE_MAYBE (url->query);
  FREE_MAYBE (url->fragment);
  FREE_MAYBE (url->user);
  FREE_MAYBE (url->passwd);
/* Read FILE and return a linked list of urlpos entries, one per
   non-blank line; each line is optionally merged with opt.base_href
   and validated with url_parse.  NOTE(review): listing elided --
   the return type, braces, list linking, error `return NULL`, and
   several declarations are missing from the fragment below.  */
get_urls_file (const char *file)
  struct file_memory *fm;
  struct urlpos *head, *tail;
  const char *text, *text_end;
  /* Load the file. */
  fm = read_file (file);
    logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
  DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
  text_end = fm->content + fm->length;
  while (text < text_end)
      const char *line_beg = text;
      const char *line_end = memchr (text, '\n', text_end - text);
	/* Last line may lack a newline. */
	line_end = text_end;
      /* Strip whitespace from the beginning and end of line. */
      while (line_beg < line_end && ISSPACE (*line_beg))
      while (line_end > line_beg && ISSPACE (*(line_end - 1)))
      if (line_end > line_beg)
	  /* URL is in the [line_beg, line_end) region. */
	  struct urlpos *entry;
	  /* We must copy the URL to a zero-terminated string, and we
	     can't use alloca because we're in a loop.  *sigh*.  */
	  url_text = strdupdelim (line_beg, line_end);
	      /* Merge opt.base_href with URL. */
	      char *merged = uri_merge (opt.base_href, url_text);
	  url = url_parse (url_text, &up_error_code);
	      logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
			 file, url_text, url_error (up_error_code));
	  entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
	  memset (entry, 0, sizeof (*entry));
  read_file_free (fm);
/* Free the linked list of urlpos. */
/* NOTE(review): listing elided -- the loop framing, url_free call and
   xfree of the node itself are missing below.  */
free_urlpos (struct urlpos *l)
  /* Save the successor before the node is destroyed. */
  struct urlpos *next = l->next;
  FREE_MAYBE (l->local_name);
/* Rotate FNAME opt.backups times */
/* Shifts FNAME.1 -> FNAME.2 -> ... and finally FNAME -> FNAME.1,
   keeping at most opt.backups generations.  NOTE(review): listing
   elided -- the rename() calls, stat declaration and the early
   return for non-regular files are missing below.  */
rotate_backups(const char *fname)
  /* room for name + '.' + number + NUL */
  int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
  char *from = (char *)alloca (maxlen);
  char *to = (char *)alloca (maxlen);
  if (stat (fname, &sb) == 0)
    if (S_ISREG (sb.st_mode) == 0)
  /* Rotate from the oldest backup downwards. */
  for (i = opt.backups; i > 1; i--)
      sprintf (from, "%s.%d", fname, i - 1);
      sprintf (to, "%s.%d", fname, i);
  sprintf (to, "%s.%d", fname, 1);
/* Create all the necessary directories for PATH (a file).  Calls
   mkdirhier() internally. */
/* NOTE(review): listing elided -- the return type, declarations,
   several returns, the unlink call and the free of T are missing
   from the fragment below.  */
mkalldirs (const char *path)
  /* Scan backwards for the last '/' to isolate the directory part. */
  p = path + strlen (path);
  for (; *p != '/' && p != path; p--)
  /* Don't create if it's just a file. */
  if ((p == path) && (*p != '/'))
  t = strdupdelim (path, p);
  /* Check whether the directory exists. */
  if ((stat (t, &st) == 0))
      if (S_ISDIR (st.st_mode))
	  /* If the dir exists as a file name, remove it first.  This
	     is *only* for Wget to work with buggy old CERN http
	     servers.  Here is the scenario: When Wget tries to
	     retrieve a directory without a slash, e.g.
	     http://foo/bar (bar being a directory), CERN server will
	     not redirect it to http://foo/bar/ -- it will generate a
	     directory listing containing links to bar/file1,
	     bar/file2, etc.  Wget will lose because it saves this
	     HTML listing to a file `bar', so it cannot create the
	     directory.  To work around this, if the file of the same
	     name exists, we just remove it and create the directory
	  DEBUGP (("Removing %s because of directory danger!\n", t));
  res = make_directory (t);
    logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1427 /* Functions for constructing the file name out of URL components. */
1429 /* A growable string structure, used by url_file_name and friends.
1430 This should perhaps be moved to utils.c.
1432 The idea is to have a convenient and efficient way to construct a
1433 string by having various functions append data to it. Instead of
1434 passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
1435 functions in questions, we pass the pointer to this struct. */
/* Ensure that the string can accept APPEND_COUNT more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_COUNT characters, this does nothing.
   NOTE(review): listing elided -- the macro's closing `} while (0)`
   is missing below.  */
#define GROW(g, append_size) do {					\
  struct growable *G_ = g;						\
  DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char);	\

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += append_count)
/* Append the string STR to DEST.  NOTICE: the string in DEST is not
   NUL-terminated by this function; call append_char (0, dest) when
   the string is complete.  */
static void
append_string (const char *str, struct growable *dest)
{
  int l = strlen (str);
  GROW (dest, l);
  memcpy (TAIL (dest), str, l);
  TAIL_INCR (dest, l);
}
1470 /* Append CH to DEST. For example, append_char (0, DEST)
1471 zero-terminates DEST. */
1474 append_char (char ch, struct growable *dest)
1478 TAIL_INCR (dest, 1);
/* Classification bits for file-name characters.  FIX: the listing had
   filechr_unsafe_windows = 2, colliding with filechr_unsafe_shell;
   these are bit flags OR-ed together in file_unsafe_char, and the
   table below distinguishes S from W, so windows must be its own
   bit (4).  */
enum {
  filechr_unsafe_always  = 1,	/* always unsafe, e.g. / or \0 */
  filechr_unsafe_shell   = 2,	/* unsafe for shell use, e.g. control chars */
  filechr_unsafe_windows = 4	/* disallowed on Windows file system */
};

#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define A filechr_unsafe_always
#define S filechr_unsafe_shell
#define W filechr_unsafe_windows

/* Forbidden chars:

   always: '\0', '/'
   Unix shell: 0-31, 128-159
   Windows: \, |, /, <, >, ?, :

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.

   NOTE(review): the comment above lists `|' as Windows-forbidden but
   the table row for `|' below is 0, while `"' and `*' are marked W
   though unlisted -- confirm against upstream before changing.  */

static const unsigned char filechr_table[256] =
{
  A, S, S, S,  S, S, S, S,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  S, S, S, S,  S, S, S, S,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  S, S, S, S,  S, S, S, S,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  S, S, S, S,  S, S, S, S,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0, 0, W, 0,  0, 0, 0, 0,   /* SP  !   "   #    $   %   &   '   */
  0, 0, W, 0,  0, 0, 0, A,   /* (   )   *   +    ,   -   .   /   */
  0, 0, 0, 0,  0, 0, 0, 0,   /* 0   1   2   3    4   5   6   7   */
  0, 0, W, 0,  W, 0, W, W,   /* 8   9   :   ;    <   =   >   ?   */
  0, 0, 0, 0,  0, 0, 0, 0,   /* @   A   B   C    D   E   F   G   */
  0, 0, 0, 0,  0, 0, 0, 0,   /* H   I   J   K    L   M   N   O   */
  0, 0, 0, 0,  0, 0, 0, 0,   /* P   Q   R   S    T   U   V   W   */
  0, 0, 0, 0,  W, 0, 0, 0,   /* X   Y   Z   [    \   ]   ^   _   */
  0, 0, 0, 0,  0, 0, 0, 0,   /* `   a   b   c    d   e   f   g   */
  0, 0, 0, 0,  0, 0, 0, 0,   /* h   i   j   k    l   m   n   o   */
  0, 0, 0, 0,  0, 0, 0, 0,   /* p   q   r   s    t   u   v   w   */
  0, 0, 0, 0,  0, 0, 0, 0,   /* x   y   z   {    |   }   ~   DEL */

  S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 128-143 */
  S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 144-159 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
1535 /* Return non-zero if character CH is unsafe for use in file or
1536 directory name. Called by append_uri_pathel. */
1539 file_unsafe_char (char ch, int restrict)
1541 int mask = filechr_unsafe_always;
1542 if (restrict == restrict_shell)
1543 mask |= filechr_unsafe_shell;
1544 else if (restrict == restrict_windows)
1545 mask |= (filechr_unsafe_shell | filechr_unsafe_windows);
1546 return FILE_CHAR_TEST (ch, mask);
/* FN_PORT_SEP is the separator between host and port in file names
   for non-standard port numbers.  On Unix this is normally ':', as in
   "www.xemacs.org:4001/index.html".  Under Windows, we set it to +
   because Windows can't handle ':' in file names. */
#define FN_PORT_SEP (opt.restrict_file_names != restrict_windows ? ':' : '+')

/* FN_QUERY_SEP is the separator between the file name and the URL
   query, normally '?'.  Since Windows cannot handle '?' as part of
   file name, we use '@' instead there. */
#define FN_QUERY_SEP (opt.restrict_file_names != restrict_windows ? '?' : '@')
/* Quote path element, characters in [b, e), as file name, and append
   the quoted string to DEST.  Each character is quoted as per
   file_unsafe_char and the corresponding table.
   NOTE(review): listing elided -- the return type, declarations
   (pathel, p, quoted, outlen), braces and the '%' write in the
   quoting branch are missing below.  Also note the local variable
   named `restrict' -- a C99 keyword; rename when touching this
   function.  */
append_uri_pathel (const char *b, const char *e, struct growable *dest)
  /* Currently restrict_for_windows is determined at compile time
     only.  But some users download files to Windows partitions; they
     should be able to say --windows-file-names so Wget escapes
     characters invalid on Windows.  Similar run-time restrictions for
     other file systems can be implemented. */
  const int restrict = opt.restrict_file_names;
  /* Copy [b, e) to PATHEL and URL-unescape it. */
  BOUNDED_TO_ALLOCA (b, e, pathel);
  url_unescape (pathel);
  pathlen = strlen (pathel);
  /* Go through PATHEL and check how many characters we'll need to
     add for file quoting. */
  for (p = pathel; *p; p++)
    if (file_unsafe_char (*p, restrict))
  /* p - pathel is the string length.  Each quoted char means two
     additional characters in the string, hence 2*quoted. */
  outlen = (p - pathel) + (2 * quoted);
  GROW (dest, outlen);
      /* If there's nothing to quote, we don't need to go through the
	 string the second time. */
      memcpy (TAIL (dest), pathel, outlen);
      char *q = TAIL (dest);
      for (p = pathel; *p; p++)
	  if (!file_unsafe_char (*p, restrict))
	      unsigned char ch = *p;
	      *q++ = XDIGIT_TO_XCHAR (ch >> 4);
	      *q++ = XDIGIT_TO_XCHAR (ch & 0xf);
      assert (q - TAIL (dest) == outlen);
  TAIL_INCR (dest, outlen);
1623 /* Append to DEST the directory structure that corresponds the
1624 directory part of URL's path. For example, if the URL is
1625 http://server/dir1/dir2/file, this appends "/dir1/dir2".
1627 Each path element ("dir1" and "dir2" in the above example) is
1628 examined, url-unescaped, and re-escaped as file name element.
1630 Additionally, it cuts as many directories from the path as
1631 specified by opt.cut_dirs. For example, if opt.cut_dirs is 1, it
1632 will produce "bar" for the above example. For 2 or more, it will
1635 Each component of the path is quoted for use as file name. */
1638 append_dir_structure (const struct url *u, struct growable *dest)
1640 char *pathel, *next;
1641 int cut = opt.cut_dirs;
1643 /* Go through the path components, de-URL-quote them, and quote them
1644 (if necessary) as file names. */
1647 for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1652 /* Ignore empty pathels. path_simplify should remove
1653 occurrences of "//" from the path, but it has special cases
1654 for starting / which generates an empty pathel here. */
1658 append_char ('/', dest);
1659 append_uri_pathel (pathel, next, dest);
1663 /* Return a unique file name that matches the given URL as good as
1664 possible. Does not create directories on the file system. */
1667 url_file_name (const struct url *u)
1669 struct growable fnres;
1671 char *u_file, *u_query;
1672 char *fname, *unique;
1678 /* Start with the directory prefix, if specified. */
1679 if (!DOTP (opt.dir_prefix))
1680 append_string (opt.dir_prefix, &fnres);
1682 /* If "dirstruct" is turned on (typically the case with -r), add
1683 the host and port (unless those have been turned off) and
1684 directory structure. */
1687 if (opt.add_hostdir)
1690 append_char ('/', &fnres);
1691 append_string (u->host, &fnres);
1692 if (u->port != scheme_default_port (u->scheme))
1695 number_to_string (portstr, u->port);
1696 append_char (FN_PORT_SEP, &fnres);
1697 append_string (portstr, &fnres);
1701 append_dir_structure (u, &fnres);
1704 /* Add the file name. */
1706 append_char ('/', &fnres);
1707 u_file = *u->file ? u->file : "index.html";
1708 append_uri_pathel (u_file, u_file + strlen (u_file), &fnres);
1710 /* Append "?query" to the file name. */
1711 u_query = u->query && *u->query ? u->query : NULL;
1714 append_char (FN_QUERY_SEP, &fnres);
1715 append_uri_pathel (u_query, u_query + strlen (u_query), &fnres);
1718 /* Zero-terminate the file name. */
1719 append_char ('\0', &fnres);
1723 /* Check the cases in which the unique extensions are not used:
1724 1) Clobbering is turned off (-nc).
1725 2) Retrieval with regetting.
1726 3) Timestamping is used.
1727 4) Hierarchy is built.
1729 The exception is the case when file does exist and is a
1730 directory (see `mkalldirs' for explanation). */
1732 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1733 && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1736 unique = unique_name (fname, 1);
1737 if (unique != fname)
/* Return the length of URL's path.  Path is considered to be
   terminated by one of '?', ';', '#', or by the end of the
   string.  */

static int
path_length (const char *url)
{
  const char *q = strpbrk_or_eos (url, "?;#");
  return q - url;
}
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is equivalent to strrchr(b, c),
   except that it accepts an END argument instead of requiring the
   string to be zero-terminated.  Why is there no memrchr()?  */

static const char *
find_last_char (const char *b, const char *e, char c)
{
  /* Walk backwards from one-past-the-end; E is decremented before
     each dereference, so only [b, e) is ever read.  */
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}
/* Resolve "." and ".." elements of PATH by destructively modifying
   PATH.  "." is resolved by removing that path element, and ".." is
   resolved by removing the preceding path element.  Leading and
   trailing slashes are preserved.  Runs of empty elements ("//") are
   collapsed, since supporting them properly is hard and they don't
   seem useful.

   Return non-zero if any changes have been made.

   For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
   test examples are provided below (test_path_simplify).  If you
   change anything in this function, run test_path_simplify to make
   sure you haven't broken a test case.

   A previous version of this function was based on path_simplify()
   from GNU Bash, but it has been rewritten for Wget 1.8.1.  */

static int
path_simplify (char *path)
{
  char *p, *end;
  int change = 0;

  if (path[0] == '/')
    ++path;			/* preserve the leading '/'. */

  p = path;
  end = p + strlen (p) + 1;	/* position past the terminating zero. */

  while (*p)
    {
      /* P should point to the beginning of a path element. */

      if (*p == '.' && (*(p + 1) == '/' || *(p + 1) == '\0'))
	{
	  /* Handle "./foo" by moving "foo" two characters to the
	     left; a trailing "." is simply removed.  */
	  if (*(p + 1) == '/')
	    {
	      memmove (p, p + 2, end - (p + 2));
	      end -= 2;
	    }
	  else
	    {
	      *p = '\0';
	      end = p + 1;
	    }
	  change = 1;
	}
      else if (*p == '.' && *(p + 1) == '.'
	       && (*(p + 2) == '/' || *(p + 2) == '\0'))
	{
	  /* Handle "../foo" by moving "foo" one path element to the
	     left; ".." also removes the preceding element.  */
	  char *b = p;		/* not p-1 because P can equal PATH */

	  /* Backtrack by one path element, but not past the
	     beginning of PATH.  */
	  if (b > path)
	    /* Move backwards until B hits the beginning of the
	       previous path element or the beginning of path. */
	    for (--b; b > path && *(b - 1) != '/'; b--)
	      ;

	  if (*(p + 2) == '/')
	    {
	      memmove (b, p + 3, end - (p + 3));
	      end -= (p + 3) - b;
	    }
	  else
	    {
	      *b = '\0';
	      end = b + 1;
	    }
	  p = b;
	  change = 1;
	}
      else if (*p == '/')
	{
	  /* Remove empty path elements.  Not mandated by rfc1808 et
	     al, but it seems like a good idea to get rid of them.
	     Supporting them properly is hard (in which directory do
	     you save http://x.com///y.html?) and they don't seem
	     useful.  */
	  char *q = p;
	  while (*q == '/')
	    ++q;
	  change = 1;
	  if (*q == '\0')
	    {
	      *p = '\0';
	      end = p + 1;
	    }
	  else
	    {
	      memmove (p, q, end - q);
	      end -= q - p;
	    }
	}
      else
	{
	  /* Skip to the next path element. */
	  while (*p && *p != '/')
	    ++p;
	  /* Make sure P points to the beginning of the next path
	     element, which is the location after the slash.  */
	  if (*p == '/')
	    ++p;
	}
    }

  return change;
}
/* Resolve the result of "linking" a base URI (BASE) to a
   link-specified URI (LINK).

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to behave "reasonably" in all
   foreseeable cases.  It employs little specific knowledge about
   schemes or URL-specific stuff -- it just works on strings.

   The parameter LINKLENGTH is useful if LINK is not zero-terminated.
   See uri_merge for a gentler interface to this functionality.

   Returns a freshly allocated string which the caller must free.

   Perhaps this function should call path_simplify so that the callers
   don't have to call url_parse unconditionally.  */

static char *
uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
{
  char *constr;

  if (no_scheme)
    {
      const char *end = base + path_length (base);

      if (!linklength)
	{
	  /* Empty LINK points back to BASE, query string and all. */
	  constr = xstrdup (base);
	}
      else if (*link == '?')
	{
	  /* LINK points to the same location, but changes the query
	     string.  Examples: */
	  /* uri_merge("path",         "?new") -> "path?new"     */
	  /* uri_merge("path?foo",     "?new") -> "path?new"     */
	  /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
	  /* uri_merge("path#foo",     "?new") -> "path?new"     */
	  int baselength = end - base;
	  constr = xmalloc (baselength + linklength + 1);
	  memcpy (constr, base, baselength);
	  memcpy (constr + baselength, link, linklength);
	  constr[baselength + linklength] = '\0';
	}
      else if (*link == '#')
	{
	  /* uri_merge("path",         "#new") -> "path#new"     */
	  /* uri_merge("path#foo",     "#new") -> "path#new"     */
	  /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
	  /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
	  int baselength;
	  const char *end1 = strchr (base, '#');
	  if (!end1)
	    end1 = base + strlen (base);
	  baselength = end1 - base;
	  constr = xmalloc (baselength + linklength + 1);
	  memcpy (constr, base, baselength);
	  memcpy (constr + baselength, link, linklength);
	  constr[baselength + linklength] = '\0';
	}
      else if (linklength > 1 && *link == '/' && *(link + 1) == '/')
	{
	  /* LINK begins with "//" and so is a net path: we need to
	     replace everything after (and including) the double slash
	     with LINK.

	     uri_merge("foo", "//new/bar")            -> "//new/bar"
	     uri_merge("//old/foo", "//new/bar")      -> "//new/bar"
	     uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

	  int span;
	  const char *slash;
	  const char *start_insert;

	  /* Look for first slash. */
	  slash = memchr (base, '/', end - base);
	  /* If found slash and it is a double slash, then replace
	     from this point, else default to replacing from the
	     beginning.  */
	  if (slash && *(slash + 1) == '/')
	    start_insert = slash;
	  else
	    start_insert = base;

	  span = start_insert - base;
	  constr = (char *)xmalloc (span + linklength + 1);
	  if (span)
	    memcpy (constr, base, span);
	  memcpy (constr + span, link, linklength);
	  constr[span + linklength] = '\0';
	}
      else if (*link == '/')
	{
	  /* LINK is an absolute path: we need to replace everything
	     after (and including) the FIRST slash with LINK.

	     So, if BASE is "http://host/whatever/foo/bar", and LINK is
	     "/qux/xyzzy", our result should be
	     "http://host/qux/xyzzy".  */

	  int span;
	  const char *slash;
	  const char *start_insert = NULL; /* for gcc to shut up. */
	  const char *pos = base;
	  int seen_slash_slash = 0;
	  /* We're looking for the first slash, but want to ignore
	     the double slash that introduces the authority part.  */
	again:
	  slash = memchr (pos, '/', end - pos);
	  if (slash && !seen_slash_slash)
	    if (*(slash + 1) == '/')
	      {
		pos = slash + 2;
		seen_slash_slash = 1;
		goto again;
	      }

	  /* At this point, SLASH is the location of the first / after
	     "//", or the first slash altogether.  START_INSERT is the
	     pointer to the location where LINK will be inserted.  When
	     examining the last two examples, keep in mind that LINK
	     begins with '/'. */

	  if (!slash && !seen_slash_slash)
	    /* example: "foo" */
	    /*           ^    */
	    start_insert = base;
	  else if (!slash && seen_slash_slash)
	    /* example: "http://foo" */
	    /*                     ^ */
	    start_insert = end;
	  else if (slash && !seen_slash_slash)
	    /* example: "foo/bar" */
	    /*           ^        */
	    start_insert = base;
	  else if (slash && seen_slash_slash)
	    /* example: "http://something/" */
	    /*                            ^ */
	    start_insert = slash;

	  span = start_insert - base;
	  constr = (char *)xmalloc (span + linklength + 1);
	  if (span)
	    memcpy (constr, base, span);
	  memcpy (constr + span, link, linklength);
	  constr[span + linklength] = '\0';
	}
      else
	{
	  /* LINK is a relative URL: we need to replace everything
	     after last slash (possibly empty) with LINK.

	     So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
	     our result should be "whatever/foo/qux/xyzzy".  */
	  int need_explicit_slash = 0;
	  int span;
	  const char *start_insert;
	  const char *last_slash = find_last_char (base, end, '/');
	  if (!last_slash)
	    {
	      /* No slash found at all.  Append LINK to what we have,
		 but we'll need a slash as a separator.

		 Example: if base == "foo" and link == "qux/xyzzy", then
		 we cannot just append link to base, because we'd get
		 "fooqux/xyzzy", whereas what we want is
		 "foo/qux/xyzzy".

		 To make sure the / gets inserted, we set
		 need_explicit_slash to 1.  We also set start_insert
		 to end + 1, so that the length calculations work out
		 correctly for one more (slash) character.  Accessing
		 that character is fine, since it will be the
		 delimiter, '\0' or '?'.  */
	      /* example: "foo?..." */
	      /*               ^    ('?' gets changed to '/') */
	      start_insert = end + 1;
	      need_explicit_slash = 1;
	    }
	  else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
	    {
	      /* example: http://host"  */
	      /*                      ^ */
	      start_insert = end + 1;
	      need_explicit_slash = 1;
	    }
	  else
	    {
	      /* example: "whatever/foo/bar" */
	      /*                        ^    */
	      start_insert = last_slash + 1;
	    }

	  span = start_insert - base;
	  constr = (char *)xmalloc (span + linklength + 1);
	  if (span)
	    memcpy (constr, base, span);
	  if (need_explicit_slash)
	    constr[span - 1] = '/';
	  memcpy (constr + span, link, linklength);
	  constr[span + linklength] = '\0';
	}
    }
  else /* !no_scheme */
    {
      /* LINK is already absolute (has a scheme); just copy it.  */
      constr = strdupdelim (link, link + linklength);
    }
  return constr;
}
/* Merge BASE with LINK and return the resulting URI.  This is an
   interface to uri_merge_1 that assumes that LINK is a
   zero-terminated string.  The result is heap-allocated; the caller
   must free it.  */

char *
uri_merge (const char *base, const char *link)
{
  return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
}
/* Append the string S at the location pointed to by P and advance P
   past the copied bytes.  Multi-statement macro wrapped in
   do/while(0) so it behaves as a single statement.  (The `p += len;'
   advance is what makes successive APPENDs write consecutively.)  */
#define APPEND(p, s) do {			\
  int len = strlen (s);				\
  memcpy (p, s, len);				\
  p += len;					\
} while (0)

/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  */
#define HIDDEN_PASSWORD "*password*"
2116 /* Recreate the URL string from the data in URL.
2118 If HIDE is non-zero (as it is when we're calling this on a URL we
2119 plan to print, but not when calling it to canonicalize a URL for
2120 use within the program), password will be hidden. Unsafe
2121 characters in the URL will be quoted. */
2124 url_string (const struct url *url, int hide_password)
2128 char *quoted_user = NULL, *quoted_passwd = NULL;
2130 int scheme_port = supported_schemes[url->scheme].default_port;
2131 char *scheme_str = supported_schemes[url->scheme].leading_string;
2132 int fplen = full_path_length (url);
2134 int brackets_around_host = 0;
2136 assert (scheme_str != NULL);
2138 /* Make sure the user name and password are quoted. */
2141 quoted_user = url_escape_allow_passthrough (url->user);
2145 quoted_passwd = HIDDEN_PASSWORD;
2147 quoted_passwd = url_escape_allow_passthrough (url->passwd);
2151 if (strchr (url->host, ':'))
2152 brackets_around_host = 1;
2154 size = (strlen (scheme_str)
2155 + strlen (url->host)
2156 + (brackets_around_host ? 2 : 0)
2159 if (url->port != scheme_port)
2160 size += 1 + numdigit (url->port);
2163 size += 1 + strlen (quoted_user);
2165 size += 1 + strlen (quoted_passwd);
2168 p = result = xmalloc (size);
2170 APPEND (p, scheme_str);
2173 APPEND (p, quoted_user);
2177 APPEND (p, quoted_passwd);
2182 if (brackets_around_host)
2184 APPEND (p, url->host);
2185 if (brackets_around_host)
2187 if (url->port != scheme_port)
2190 p = number_to_string (p, url->port);
2193 full_path_write (url, p);
2197 assert (p - result == size);
2199 if (quoted_user && quoted_user != url->user)
2200 xfree (quoted_user);
2201 if (quoted_passwd && !hide_password
2202 && quoted_passwd != url->passwd)
2203 xfree (quoted_passwd);
2208 /* Return the URL of the proxy appropriate for url U. */
2210 getproxy (struct url *u)
2213 char *rewritten_url;
2214 static char rewritten_storage[1024];
2218 if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
2224 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
2228 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
2232 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
2234 case SCHEME_INVALID:
2237 if (!proxy || !*proxy)
2240 /* Handle shorthands. `rewritten_storage' is a kludge to allow
2241 getproxy() to return static storage. */
2242 rewritten_url = rewrite_shorthand_url (proxy);
2245 strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
2246 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
2247 proxy = rewritten_storage;
/* Should a host be accessed through proxy, concerning no_proxy?
   Returns non-zero when HOST is NOT matched by the NO_PROXY suffix
   list (i.e. the proxy should be used).  A NULL list matches
   nothing.  */

int
no_proxy_match (const char *host, const char **no_proxy)
{
  if (!no_proxy)
    return 1;
  else
    return !sufmatch (no_proxy, host);
}
2263 /* Support for converting links for local viewing in downloaded HTML
2264 files. This should be moved to another file, because it has
2265 nothing to do with processing URLs. */
2267 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
2268 static const char *replace_attr PARAMS ((const char *, int, FILE *,
2270 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
2271 const char *, int));
2272 static char *local_quote_string PARAMS ((const char *));
2274 /* Change the links in one HTML file. LINKS is a list of links in the
2275 document, along with their positions and the desired direction of
2278 convert_links (const char *file, struct urlpos *links)
2280 struct file_memory *fm;
2283 downloaded_file_t downloaded_file_return;
2285 struct urlpos *link;
2286 int to_url_count = 0, to_file_count = 0;
2288 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
2291 /* First we do a "dry run": go through the list L and see whether
2292 any URL needs to be converted in the first place. If not, just
2293 leave the file alone. */
2295 struct urlpos *dry = links;
2296 for (dry = links; dry; dry = dry->next)
2297 if (dry->convert != CO_NOCONVERT)
2301 logputs (LOG_VERBOSE, _("nothing to do.\n"));
2306 fm = read_file (file);
2309 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2310 file, strerror (errno));
2314 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
2315 if (opt.backup_converted && downloaded_file_return)
2316 write_backup_file (file, downloaded_file_return);
2318 /* Before opening the file for writing, unlink the file. This is
2319 important if the data in FM is mmaped. In such case, nulling the
2320 file, which is what fopen() below does, would make us read all
2321 zeroes from the mmaped region. */
2322 if (unlink (file) < 0 && errno != ENOENT)
2324 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
2325 file, strerror (errno));
2326 read_file_free (fm);
2329 /* Now open the file for writing. */
2330 fp = fopen (file, "wb");
2333 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2334 file, strerror (errno));
2335 read_file_free (fm);
2339 /* Here we loop through all the URLs in file, replacing those of
2340 them that are downloaded with relative references. */
2342 for (link = links; link; link = link->next)
2344 char *url_start = fm->content + link->pos;
2346 if (link->pos >= fm->length)
2348 DEBUGP (("Something strange is going on. Please investigate."));
2351 /* If the URL is not to be converted, skip it. */
2352 if (link->convert == CO_NOCONVERT)
2354 DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
2358 /* Echo the file contents, up to the offending URL's opening
2359 quote, to the outfile. */
2360 fwrite (p, 1, url_start - p, fp);
2363 switch (link->convert)
2365 case CO_CONVERT_TO_RELATIVE:
2366 /* Convert absolute URL to relative. */
2368 char *newname = construct_relative (file, link->local_name);
2369 char *quoted_newname = local_quote_string (newname);
2371 if (!link->link_refresh_p)
2372 p = replace_attr (p, link->size, fp, quoted_newname);
2374 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
2375 link->refresh_timeout);
2377 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
2378 link->url->url, newname, link->pos, file));
2380 xfree (quoted_newname);
2384 case CO_CONVERT_TO_COMPLETE:
2385 /* Convert the link to absolute URL. */
2387 char *newlink = link->url->url;
2388 char *quoted_newlink = html_quote_string (newlink);
2390 if (!link->link_refresh_p)
2391 p = replace_attr (p, link->size, fp, quoted_newlink);
2393 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
2394 link->refresh_timeout);
2396 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
2397 newlink, link->pos, file));
2398 xfree (quoted_newlink);
2402 case CO_NULLIFY_BASE:
2403 /* Change the base href to "". */
2404 p = replace_attr (p, link->size, fp, "");
2412 /* Output the rest of the file. */
2413 if (p - fm->content < fm->length)
2414 fwrite (p, 1, fm->length - (p - fm->content), fp);
2416 read_file_free (fm);
2418 logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
/* Construct and return a malloced copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.

   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   "images/news.gif".

   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */

static char *
construct_relative (const char *s1, const char *s2)
{
  int i, cnt, sepdirs1;
  char *res;

  if (*s2 == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');
  i = cnt = 0;
  /* Skip the directories common to both strings.  */
  while (1)
    {
      while (s1[i] && s2[i]
	     && (s1[i] == s2[i])
	     && (s1[i] != '/')
	     && (s2[i] != '/'))
	++i;
      if (s1[i] == '/' && s2[i] == '/')
	cnt = ++i;
      else
	break;
    }
  /* Count the remaining directory separators in S1; each one needs a
     "../" in the result.  */
  for (sepdirs1 = 0; s1[i]; i++)
    if (s1[i] == '/')
      ++sepdirs1;
  /* Now, construct the file as of:
     - ../ repeated sepdirs1 time
     - all the non-mutual directories of S2.  */
  res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
  for (i = 0; i < sepdirs1; i++)
    memcpy (res + 3 * i, "../", 3);
  strcpy (res + 3 * i, s2 + cnt);
  return res;
}
2476 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
2478 /* Rather than just writing over the original .html file with the
2479 converted version, save the former to *.orig. Note we only do
2480 this for files we've _successfully_ downloaded, so we don't
2481 clobber .orig files sitting around from previous invocations. */
2483 /* Construct the backup filename as the original name plus ".orig". */
2484 size_t filename_len = strlen(file);
2485 char* filename_plus_orig_suffix;
2486 boolean already_wrote_backup_file = FALSE;
2487 slist* converted_file_ptr;
2488 static slist* converted_files = NULL;
2490 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
2492 /* Just write "orig" over "html". We need to do it this way
2493 because when we're checking to see if we've downloaded the
2494 file before (to see if we can skip downloading it), we don't
2495 know if it's a text/html file. Therefore we don't know yet
2496 at that stage that -E is going to cause us to tack on
2497 ".html", so we need to compare vs. the original URL plus
2498 ".orig", not the original URL plus ".html.orig". */
2499 filename_plus_orig_suffix = alloca (filename_len + 1);
2500 strcpy(filename_plus_orig_suffix, file);
2501 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
2503 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
2505 /* Append ".orig" to the name. */
2506 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
2507 strcpy(filename_plus_orig_suffix, file);
2508 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
2511 /* We can get called twice on the same URL thanks to the
2512 convert_all_links() call in main(). If we write the .orig file
2513 each time in such a case, it'll end up containing the first-pass
2514 conversion, not the original file. So, see if we've already been
2515 called on this file. */
2516 converted_file_ptr = converted_files;
2517 while (converted_file_ptr != NULL)
2518 if (strcmp(converted_file_ptr->string, file) == 0)
2520 already_wrote_backup_file = TRUE;
2524 converted_file_ptr = converted_file_ptr->next;
2526 if (!already_wrote_backup_file)
2528 /* Rename <file> to <file>.orig before former gets written over. */
2529 if (rename(file, filename_plus_orig_suffix) != 0)
2530 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
2531 file, filename_plus_orig_suffix, strerror (errno));
2533 /* Remember that we've already written a .orig backup for this file.
2534 Note that we never free this memory since we need it till the
2535 convert_all_links() call, which is one of the last things the
2536 program does before terminating. BTW, I'm not sure if it would be
2537 safe to just set 'converted_file_ptr->string' to 'file' below,
2538 rather than making a copy of the string... Another note is that I
2539 thought I could just add a field to the urlpos structure saying
2540 that we'd written a .orig file for this URL, but that didn't work,
2541 so I had to make this separate list.
2542 -- Dan Harkless <wget@harkless.org>
2544 This [adding a field to the urlpos structure] didn't work
2545 because convert_file() is called from convert_all_links at
2546 the end of the retrieval with a freshly built new urlpos
2548 -- Hrvoje Niksic <hniksic@arsdigita.com>
2550 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2551 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
2552 converted_file_ptr->next = converted_files;
2553 converted_files = converted_file_ptr;
2557 static int find_fragment PARAMS ((const char *, int, const char **,
/* Replace an attribute's original text with NEW_TEXT.  P points at
   the original value (possibly quoted), SIZE is its length including
   any quotes.  Writes the replacement to FP, preserving any fragment
   identifier from the original value, and returns the position in the
   input just past the original value.  */

static const char *
replace_attr (const char *p, int size, FILE *fp, const char *new_text)
{
  int quote_flag = 0;
  char quote_char = '\"';	/* use "..." for quoting, unless the
				   original value is quoted, in which
				   case reuse its quoting char. */
  const char *frag_beg, *frag_end;

  /* Structure of our string is:
       "...old-contents..."
       <---    size    --->  (with quotes)
     OR:
       ...old-contents...
       <--- size -->         (no quotes)   */

  if (*p == '\"' || *p == '\'')
    {
      quote_char = *p;
      quote_flag = 1;
      ++p;
      size -= 2;		/* disregard opening and closing quote */
    }
  putc (quote_char, fp);
  fputs (new_text, fp);

  /* Look for fragment identifier, if any. */
  if (find_fragment (p, size, &frag_beg, &frag_end))
    fwrite (frag_beg, 1, frag_end - frag_beg, fp);
  p += size;
  if (quote_flag)
    ++p;			/* skip the closing quote */
  putc (quote_char, fp);

  return p;
}
/* The same as replace_attr, but used when replacing
   <meta http-equiv=refresh content="new_text"> because we need to
   prepend "TIMEOUT; URL=" to NEW_TEXT.  */

static const char *
replace_attr_refresh_hack (const char *p, int size, FILE *fp,
			   const char *new_text, int timeout)
{
  /* Worst-case length: digits of TIMEOUT + "; URL=" + NEW_TEXT + NUL. */
  char *new_with_timeout = (char *)alloca (numdigit (timeout)
					   + 6 /* "; URL=" */
					   + strlen (new_text)
					   + 1);
  sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);

  return replace_attr (p, size, fp, new_with_timeout);
}
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
   preceded by '&'.  If the character is not found, return zero.  If
   the character is found, return 1 and set BP and EP to point to the
   beginning and end of the region.

   This is used for finding the fragment identifiers in URLs.  (The
   '&'-guard avoids mistaking numeric character references such as
   "&#38;" for fragments.)  */

static int
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *end = beg + size;
  int saw_amp = 0;
  for (; beg < end; beg++)
    {
      switch (*beg)
	{
	case '&':
	  saw_amp = 1;
	  break;
	case '#':
	  if (!saw_amp)
	    {
	      *bp = beg;
	      *ep = end;
	      return 1;
	    }
	  /* fallthrough */
	default:
	  saw_amp = 0;
	}
    }
  return 0;
}
2651 /* Quote FILE for use as local reference to an HTML file.
2653 We quote ? as %3F to avoid passing part of the file name as the
2654 parameter when browsing the converted file through HTTP. However,
2655 it is safe to do this only when `--html-extension' is turned on.
2656 This is because converting "index.html?foo=bar" to
2657 "index.html%3Ffoo=bar" would break local browsing, as the latter
2658 isn't even recognized as an HTML file! However, converting
2659 "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
2660 safe for both local and HTTP-served browsing. */
2663 local_quote_string (const char *file)
2665 const char *file_sans_qmark;
2668 if (!opt.html_extension)
2669 return html_quote_string (file);
2671 qm = count_char (file, '?');
2675 const char *from = file;
2678 /* qm * 2 because we replace each question mark with "%3F",
2679 i.e. replace one char with three, hence two more. */
2680 int fsqlen = strlen (file) + qm * 2;
2682 to = newname = (char *)alloca (fsqlen + 1);
2683 for (; *from; from++)
2694 assert (to - newname == fsqlen);
2697 file_sans_qmark = newname;
2700 file_sans_qmark = file;
2702 return html_quote_string (file_sans_qmark);
2705 /* We're storing "modes" of type downloaded_file_t in the hash table.
2706 However, our hash tables only accept pointers for keys and values.
2707 So when we need a pointer, we use the address of a
2708 downloaded_file_t variable of static storage. */
2710 static downloaded_file_t *
2711 downloaded_mode_to_ptr (downloaded_file_t mode)
2713 static downloaded_file_t
2714 v1 = FILE_NOT_ALREADY_DOWNLOADED,
2715 v2 = FILE_DOWNLOADED_NORMALLY,
2716 v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2717 v4 = CHECK_FOR_FILE;
2721 case FILE_NOT_ALREADY_DOWNLOADED:
2723 case FILE_DOWNLOADED_NORMALLY:
2725 case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2727 case CHECK_FOR_FILE:
2733 /* This should really be merged with dl_file_url_map and
2734 downloaded_html_files in recur.c. This was originally a list, but
2735 I changed it to a hash table beause it was actually taking a lot of
2736 time to find things in it. */
2738 static struct hash_table *downloaded_files_hash;
2740 /* Remembers which files have been downloaded. In the standard case, should be
2741 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2742 download successfully (i.e. not for ones we have failures on or that we skip
2745 When we've downloaded a file and tacked on a ".html" extension due to -E,
2746 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2747 FILE_DOWNLOADED_NORMALLY.
2749 If you just want to check if a file has been previously added without adding
2750 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
2751 with local filenames, not remote URLs. */
2753 downloaded_file (downloaded_file_t mode, const char *file)
2755 downloaded_file_t *ptr;
2757 if (mode == CHECK_FOR_FILE)
2759 if (!downloaded_files_hash)
2760 return FILE_NOT_ALREADY_DOWNLOADED;
2761 ptr = hash_table_get (downloaded_files_hash, file);
2763 return FILE_NOT_ALREADY_DOWNLOADED;
2767 if (!downloaded_files_hash)
2768 downloaded_files_hash = make_string_hash_table (0);
2770 ptr = hash_table_get (downloaded_files_hash, file);
2774 ptr = downloaded_mode_to_ptr (mode);
2775 hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2777 return FILE_NOT_ALREADY_DOWNLOADED;
/* Hash-table mapper: free the xstrdup'ed KEY of each entry.  The
   VALUE points at static storage (see downloaded_mode_to_ptr) and is
   not freed.  Returns 0 so the traversal continues.  */
static int
df_free_mapper (void *key, void *value, void *ignored)
{
  xfree (key);
  return 0;
}
2788 downloaded_files_free (void)
2790 if (downloaded_files_hash)
2792 hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2793 hash_table_destroy (downloaded_files_hash);
2794 downloaded_files_hash = NULL;
2798 /* Return non-zero if scheme a is similar to scheme b.
2800 Schemes are similar if they are equal. If SSL is supported, schemes
2801 are also similar if one is http (SCHEME_HTTP) and the other is https
2804 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2809 if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2810 || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
/* Debugging and testing support for path_simplify. */

/* Debug: run path_simplify on PATH and return the result in a new
   string.  Useful for calling from the debugger.  NOTE(review): the
   function name was lost in extraction; reconstructed as `ps' --
   confirm against upstream before relying on it.  */
static char *
ps (char *path)
{
  char *copy = xstrdup (path);
  path_simplify (copy);
  return copy;
}
/* Run path_simplify on TEST (a copy is modified) and report to stdout
   if the result differs from EXPECTED_RESULT, or if the
   modified-or-not return value differs from EXPECTED_CHANGE.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *test_copy = xstrdup (test);
  int modified = path_simplify (test_copy);

  if (0 != strcmp (test_copy, expected_result))
    {
      printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
	      test, expected_result, test_copy);
    }
  if (modified != expected_change)
    {
      if (expected_change == 1)
	/* Modification was expected but not reported.  (These two
	   messages were previously swapped.)  */
	printf ("Expected modification with path_simplify(\"%s\").\n",
		test);
      else
	printf ("Expected no modification with path_simplify(\"%s\").\n",
		test);
    }
  xfree (test_copy);
}
/* Exercise path_simplify against a table of known inputs and
   expected outputs, once as-is and once with a leading slash to prove
   the slash is preserved.  Failures are reported by run_test.  */
static void
test_path_simplify (void)
{
  static struct {
    char *test, *result;
    int should_modify;
  } tests[] = {
    { "foo",		"foo",		0 },
    { "foo/bar",	"foo/bar",	0 },
    { "foo///bar",	"foo/bar",	1 },
    { "foo/.",		"foo/",		1 },
    { "foo/./",		"foo/",		1 },
    { "foo./",		"foo./",	0 },
    { "foo/../bar",	"bar",		1 },
    { "foo/../bar/",	"bar/",		1 },
    { "foo/bar/..",	"foo/",		1 },
    { "foo/bar/../x",	"foo/x",	1 },
    { "foo/bar/../x/",	"foo/x/",	1 },
    { "foo/..",		"",		1 },
    { "foo/../..",	"",		1 },
    { "a/b/../../c",	"c",		1 },
    { "./a/../b",	"b",		1 }
  };
  int i;

  for (i = 0; i < ARRAY_SIZE (tests); i++)
    {
      char *test = tests[i].test;
      char *expected_result = tests[i].result;
      int expected_change = tests[i].should_modify;
      run_test (test, expected_result, expected_change);
    }

  /* Now run all the tests with a leading slash before the test case,
     to prove that the slash is being preserved.  */
  for (i = 0; i < ARRAY_SIZE (tests); i++)
    {
      char *test, *expected_result;
      int expected_change = tests[i].should_modify;

      test = xmalloc (1 + strlen (tests[i].test) + 1);
      sprintf (test, "/%s", tests[i].test);

      expected_result = xmalloc (1 + strlen (tests[i].result) + 1);
      sprintf (expected_result, "/%s", tests[i].result);

      run_test (test, expected_result, expected_change);

      xfree (test);
      xfree (expected_result);
    }
}