   Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003
   Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

In addition, as a special exception, the Free Software Foundation
gives permission to link the code of its release of Wget with the
OpenSSL project's "OpenSSL" library (or with modified versions of it
that use the same license as the "OpenSSL" library), and distribute
the linked executables.  You must obey the GNU General Public License
in all respects for all of the code used other than "OpenSSL".  If you
modify this file, you may extend this exception to your version of the
file, but you are not obligated to do so.  If you do not wish to do
so, delete this exception statement from your version. */
40 #include <sys/types.h>
/* Is X the string "." (a single dot)?  X is evaluated more than
   once, so it must be free of side effects.  */
#define DOTP(x) ((*(x) == '.') && (!*(x + 1)))

/* Is X the string ".." (two dots)?  */
#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
/* Sizes, in bytes, of an IPv4 address, an IPv6 address, and a 16-bit
   IPv6 address group; used by the IPv6 address validator below.  */
static const int NS_INADDRSZ  = 4;
static const int NS_IN6ADDRSZ = 16;
static const int NS_INT16SZ   = 2;
74 /* Supported schemes: */
75 static struct scheme_data supported_schemes[] =
77 { "http://", DEFAULT_HTTP_PORT, 1 },
79 { "https://", DEFAULT_HTTPS_PORT, 1 },
81 { "ftp://", DEFAULT_FTP_PORT, 1 },
87 /* Forward declarations: */
89 static char *construct_relative PARAMS ((const char *, const char *));
90 static int path_simplify PARAMS ((char *));
/* Support for encoding and decoding of URL strings.  We determine
   whether a character is unsafe through static table lookup.  This
   code assumes ASCII character set and 8-bit chars.  */

enum {
  /* rfc1738 reserved chars, preserved from encoding.  */
  urlchr_reserved = 1,

  /* rfc1738 unsafe chars, plus some more.  */
  urlchr_unsafe   = 2
};

#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)

/* Shorthands for the table: */
#define R  urlchr_reserved
#define U  urlchr_unsafe
#define RU R|U

const static unsigned char urlchr_table[256] =
{
  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  U,  0,  U, RU,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0, RU,   U, RU,   U,  0,  /* X   Y   Z   [    \   ]   ^   _   */
  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */

  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,

  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
};
#undef R
#undef U
#undef RU
/* URL-unescape the string S.

   This is done by transforming the sequences "%HH" to the character
   represented by the hexadecimal digits HH.  If % is not followed by
   two hexadecimal digits, it is inserted literally.

   The transformation is done in place.  If you need the original
   string intact, make a copy before calling this function.  */

static void
url_unescape (char *s)
{
  char *t = s;			/* t - tortoise */
  char *h = s;			/* h - hare     */

  for (; *h; h++, t++)
    {
      if (*h != '%')
	{
	copychar:
	  *t = *h;
	}
      else
	{
	  /* Do nothing if '%' is not followed by two hex digits. */
	  if (!*(h + 1) || !*(h + 2)
	      || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
	    goto copychar;
	  *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
	  h += 2;
	}
    }
  *t = '\0';
}
/* The core of url_escape_* functions.  Escapes the characters that
   match the provided mask in urlchr_table.

   If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
   will be returned unchanged.  If ALLOW_PASSTHROUGH is zero, a
   freshly allocated string will be returned in all cases.  */

static char *
url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
{
  const char *p1;
  char *p2, *newstr;
  int newlen;
  int addition = 0;

  for (p1 = s; *p1; p1++)
    if (urlchr_test (*p1, mask))
      addition += 2;		/* Two more characters (hex digits) */

  if (!addition)
    return allow_passthrough ? (char *)s : xstrdup (s);

  newlen = (p1 - s) + addition;
  newstr = (char *)xmalloc (newlen + 1);

  p1 = s;
  p2 = newstr;
  while (*p1)
    {
      /* Quote the characters that match the test mask. */
      if (urlchr_test (*p1, mask))
	{
	  unsigned char c = *p1++;
	  *p2++ = '%';
	  *p2++ = XDIGIT_TO_XCHAR (c >> 4);
	  *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
	}
      else
	*p2++ = *p1++;
    }
  assert (p2 - newstr == newlen);
  *p2 = '\0';

  return newstr;
}
229 /* URL-escape the unsafe characters (see urlchr_table) in a given
230 string, returning a freshly allocated string. */
233 url_escape (const char *s)
235 return url_escape_1 (s, urlchr_unsafe, 0);
238 /* URL-escape the unsafe characters (see urlchr_table) in a given
239 string. If no characters are unsafe, S is returned. */
242 url_escape_allow_passthrough (const char *s)
244 return url_escape_1 (s, urlchr_unsafe, 1);
enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };

/* Decide whether to encode, decode, or pass through the char at P.
   This used to be a macro, but it got a little too convoluted.  */

static inline enum copy_method
decide_copy_method (const char *p)
{
  if (*p == '%')
    {
      if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
	{
	  /* %xx sequence: decode it, unless it would decode to an
	     unsafe or a reserved char; in that case, leave it as
	     is. */
	  char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
	    XCHAR_TO_XDIGIT (*(p + 2));

	  if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
	    return CM_PASSTHROUGH;
	  else
	    return CM_DECODE;
	}
      else
	/* Garbled %.. sequence: encode `%'. */
	return CM_ENCODE;
    }
  else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
    return CM_ENCODE;
  else
    return CM_PASSTHROUGH;
}
279 /* Translate a %-escaped (but possibly non-conformant) input string S
280 into a %-escaped (and conformant) output string. If no characters
281 are encoded or decoded, return the same string S; otherwise, return
282 a freshly allocated string with the new contents.
284 After a URL has been run through this function, the protocols that
285 use `%' as the quote character can use the resulting string as-is,
286 while those that don't call url_unescape() to get to the intended
287 data. This function is also stable: after an input string is
288 transformed the first time, all further transformations of the
289 result yield the same result string.
291 Let's discuss why this function is needed.
293 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
294 space character would mess up the HTTP request, it needs to be
297 GET /abc%20def HTTP/1.0
299 It appears that the unsafe chars need to be quoted, for example
300 with url_escape. But what if we're requested to download
301 `abc%20def'? url_escape transforms "%" to "%25", which would leave
302 us with `abc%2520def'. This is incorrect -- since %-escapes are
303 part of URL syntax, "%20" is the correct way to denote a literal
304 space on the Wget command line. This leaves us in the conclusion
305 that in that case Wget should not call url_escape, but leave the
308 And what if the requested URI is `abc%20 def'? If we call
309 url_escape, we end up with `/abc%2520%20def', which is almost
310 certainly not intended. If we don't call url_escape, we are left
311 with the embedded space and cannot complete the request. What the
312 user meant was for Wget to request `/abc%20%20def', and this is
313 where reencode_escapes kicks in.
315 Wget used to solve this by first decoding %-quotes, and then
316 encoding all the "unsafe" characters found in the resulting string.
317 This was wrong because it didn't preserve certain URL special
318 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
319 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
320 whether we considered `+' reserved (it is). One of these results
321 is inevitable because by the second step we would lose information
322 on whether the `+' was originally encoded or not. Both results
323 were wrong because in CGI parameters + means space, while %2B means
324 literal plus. reencode_escapes correctly translates the above to
325 "a%2B+b", i.e. returns the original string.
327 This function uses an algorithm proposed by Anon Sricharoenchai:
329 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
332 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
335 ...except that this code conflates the two steps, and decides
336 whether to encode, decode, or pass through each character in turn.
337 The function still uses two passes, but their logic is the same --
338 the first pass exists merely for the sake of allocation. Another
339 small difference is that we include `+' to URL_RESERVED.
343 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
345 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
349 "foo bar" -> "foo%20bar"
350 "foo%20bar" -> "foo%20bar"
351 "foo %20bar" -> "foo%20%20bar"
352 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
353 "foo%25%20bar" -> "foo%25%20bar"
354 "foo%2%20bar" -> "foo%252%20bar"
355 "foo+bar" -> "foo+bar" (plus is reserved!)
356 "foo%2b+bar" -> "foo%2b+bar" */
359 reencode_escapes (const char *s)
365 int encode_count = 0;
366 int decode_count = 0;
368 /* First, pass through the string to see if there's anything to do,
369 and to calculate the new length. */
370 for (p1 = s; *p1; p1++)
372 switch (decide_copy_method (p1))
385 if (!encode_count && !decode_count)
386 /* The string is good as it is. */
387 return (char *)s; /* C const model sucks. */
390 /* Each encoding adds two characters (hex digits), while each
391 decoding removes two characters. */
392 newlen = oldlen + 2 * (encode_count - decode_count);
393 newstr = xmalloc (newlen + 1);
400 switch (decide_copy_method (p1))
404 unsigned char c = *p1++;
406 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
407 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
411 *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
412 + (XCHAR_TO_XDIGIT (*(p1 + 2))));
413 p1 += 3; /* skip %xx */
420 assert (p2 - newstr == newlen);
424 /* Returns the scheme type if the scheme is supported, or
425 SCHEME_INVALID if not. */
427 url_scheme (const char *url)
431 for (i = 0; supported_schemes[i].leading_string; i++)
432 if (0 == strncasecmp (url, supported_schemes[i].leading_string,
433 strlen (supported_schemes[i].leading_string)))
435 if (supported_schemes[i].enabled)
436 return (enum url_scheme) i;
438 return SCHEME_INVALID;
441 return SCHEME_INVALID;
/* Return the number of characters needed to skip the scheme part of
   the URL, e.g. `http://'.  If no scheme is found, returns 0.  */

int
url_skip_scheme (const char *url)
{
  const char *p = url;

  /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
     etc. */
  while (ISALNUM (*p) || *p == '-' || *p == '+')
    ++p;
  if (*p != ':')
    return 0;

  /* Skip ':'. */
  ++p;

  /* Skip "//" if found. */
  if (*p == '/' && *(p + 1) == '/')
    p += 2;

  return p - url;
}
/* Returns 1 if the URL begins with a scheme (supported or
   unsupported), 0 otherwise.  */

int
url_has_scheme (const char *url)
{
  const char *p = url;
  while (ISALNUM (*p) || *p == '-' || *p == '+')
    ++p;
  return *p == ':';
}
479 scheme_default_port (enum url_scheme scheme)
481 return supported_schemes[scheme].default_port;
485 scheme_disable (enum url_scheme scheme)
487 supported_schemes[scheme].enabled = 0;
/* Skip the username and password, if present here.  The function
   should be called *not* with the complete URL, but with the part
   right after the scheme.

   If no username and password are found, return 0.  */

static int
url_skip_uname (const char *url)
{
  const char *p;

  /* Look for '@' that comes before '/' or '?'. */
  p = (const char *)strpbrk (url, "/?@");
  if (!p || *p != '@')
    return 0;

  /* Skip past the '@' as well. */
  return p - url + 1;
}
/* Parse the "user[:password]" part of a URL, [STR, STR+LEN), into
   freshly allocated *USER and *PASSWD (the latter NULL if no ':' is
   present).  Both are URL-unescaped.  Returns 0 on failure (empty
   user name), non-zero on success.  */
static int
parse_uname (const char *str, int len, char **user, char **passwd)
{
  char *colon;

  /* Empty user name not allowed. */
  if (len == 0)
    return 0;

  colon = memchr (str, ':', len);
  if (colon == str)
    /* Empty user name again. */
    return 0;

  if (colon)
    {
      int pwlen = len - (colon + 1 - str);
      *passwd = xmalloc (pwlen + 1);
      memcpy (*passwd, colon + 1, pwlen);
      (*passwd)[pwlen] = '\0';
      /* Only the part before the colon is the user name. */
      len -= pwlen + 1;
    }
  else
    *passwd = NULL;

  *user = xmalloc (len + 1);
  memcpy (*user, str, len);
  (*user)[len] = '\0';

  url_unescape (*user);
  if (*passwd)
    url_unescape (*passwd);

  return 1;
}
/* Used by main.c: detect URLs written using the "shorthand" URL forms
   popularized by Netscape and NcFTP.  HTTP shorthands look like this:

   www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
   www.foo.com[:port]            -> http://www.foo.com[:port]

   FTP shorthands look like this:

   foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
   foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file

   If the URL needs not or cannot be rewritten, return NULL.  */

char *
rewrite_shorthand_url (const char *url)
{
  const char *p;

  if (url_has_scheme (url))
    return NULL;

  /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
     latter Netscape.  */
  for (p = url; *p && *p != ':' && *p != '/'; p++)
    ;

  if (p == url)
    return NULL;

  if (*p == ':')
    {
      const char *pp;
      char *res;
      /* If the characters after the colon and before the next slash
	 or end of string are all digits, it's HTTP.  */
      int digits = 0;
      for (pp = p + 1; ISDIGIT (*pp); pp++)
	++digits;
      if (digits > 0 && (*pp == '/' || *pp == '\0'))
	goto http;

      /* Prepend "ftp://" to the entire URL... */
      res = xmalloc (6 + strlen (url) + 1);
      sprintf (res, "ftp://%s", url);
      /* ...and replace ':' with '/'. */
      res[6 + (p - url)] = '/';
      return res;
    }
  else
    {
      char *res;
    http:
      /* Just prepend "http://" to what we have. */
      res = xmalloc (7 + strlen (url) + 1);
      sprintf (res, "http://%s", url);
      return res;
    }
}
603 static void parse_path PARAMS ((const char *, char **, char **));
605 /* Like strpbrk, with the exception that it returns the pointer to the
606 terminating zero (end-of-string aka "eos") if no matching character
609 Although I normally balk at Gcc-specific optimizations, it probably
610 makes sense here: glibc has optimizations that detect strpbrk being
611 called with literal string as ACCEPT and inline the search. That
612 optimization is defeated if strpbrk is hidden within the call to
613 another function. (And no, making strpbrk_or_eos inline doesn't
614 help because the check for literal accept is in the
619 #define strpbrk_or_eos(s, accept) ({ \
620 char *SOE_p = strpbrk (s, accept); \
622 SOE_p = (char *)s + strlen (s); \
626 #else /* not __GNUC__ */
629 strpbrk_or_eos (const char *s, const char *accept)
631 char *p = strpbrk (s, accept);
633 p = (char *)s + strlen (s);
/* Turn STR into lowercase; return non-zero if a character was
   actually changed.  */

static int
lowercase_str (char *str)
{
  int change = 0;
  for (; *str; str++)
    if (ISUPPER (*str))
      {
	change = 1;
	*str = TOLOWER (*str);
      }
  return change;
}
/* Error strings for url_parse; indexed by the PE_* constants defined
   alongside each entry.  NOTE(review): the strings for indices 0, 2,
   3 and 4 were reconstructed — confirm against the original.  */
static char *parse_errors[] = {
#define PE_NO_ERROR 0
  "No error",
#define PE_UNSUPPORTED_SCHEME 1
  "Unsupported scheme",
#define PE_EMPTY_HOST 2
  "Empty host",
#define PE_BAD_PORT_NUMBER 3
  "Bad port number",
#define PE_INVALID_USER_NAME 4
  "Invalid user name",
#define PE_UNTERMINATED_IPV6_ADDRESS 5
  "Unterminated IPv6 numeric address",
#define PE_IPV6_NOT_SUPPORTED 6
  "IPv6 addresses not supported",
#define PE_INVALID_IPV6_ADDRESS 7
  "Invalid IPv6 numeric address"
};

/* If P (an int pointer, possibly NULL) is non-NULL, store the error
   code V through it.  */
#define SETERR(p, v) do {			\
  if (p)					\
    *(p) = (v);					\
} while (0)
/* The following two functions were adapted from glibc. */

/* Return 1 if [STR, END) is a valid dotted-quad IPv4 address
   (exactly four octets, each 0-255), 0 otherwise.  */
static int
is_valid_ipv4_address (const char *str, const char *end)
{
  int saw_digit, octets;
  int val;

  saw_digit = 0;
  octets = 0;
  val = 0;

  while (str < end) {
    int ch = *str++;

    if (ch >= '0' && ch <= '9') {
      val = val * 10 + (ch - '0');

      if (val > 255)
	return 0;
      if (saw_digit == 0) {
	if (++octets > 4)
	  return 0;
	saw_digit = 1;
      }
    } else if (ch == '.' && saw_digit == 1) {
      if (octets == 4)
	return 0;
      val = 0;
      saw_digit = 0;
    } else
      return 0;
  }
  if (octets < 4)
    return 0;

  return 1;
}
719 is_valid_ipv6_address (const char *str, const char *end)
721 static const char xdigits[] = "0123456789abcdef";
734 /* Leading :: requires some special handling. */
738 if (str == end || *str != ':')
750 /* if ch is a number, add it to val. */
751 pch = strchr(xdigits, ch);
754 val |= (pch - xdigits);
761 /* if ch is a colon ... */
764 if (saw_xdigit == 0) {
769 } else if (str == end) {
772 if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
780 /* if ch is a dot ... */
781 if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ) &&
782 is_valid_ipv4_address(curtok, end) == 1) {
791 if (saw_xdigit == 1) {
792 if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
797 if (colonp != NULL) {
798 if (tp == NS_IN6ADDRSZ)
803 if (tp != NS_IN6ADDRSZ)
812 Return a new struct url if successful, NULL on error. In case of
813 error, and if ERROR is not NULL, also set *ERROR to the appropriate
816 url_parse (const char *url, int *error)
820 int path_modified, host_modified;
822 enum url_scheme scheme;
824 const char *uname_b, *uname_e;
825 const char *host_b, *host_e;
826 const char *path_b, *path_e;
827 const char *params_b, *params_e;
828 const char *query_b, *query_e;
829 const char *fragment_b, *fragment_e;
832 char *user = NULL, *passwd = NULL;
836 scheme = url_scheme (url);
837 if (scheme == SCHEME_INVALID)
839 SETERR (error, PE_UNSUPPORTED_SCHEME);
843 url_encoded = reencode_escapes (url);
846 p += strlen (supported_schemes[scheme].leading_string);
848 p += url_skip_uname (p);
851 /* scheme://user:pass@host[:port]... */
854 /* We attempt to break down the URL into the components path,
855 params, query, and fragment. They are ordered like this:
857 scheme://host[:port][/path][;params][?query][#fragment] */
859 params_b = params_e = NULL;
860 query_b = query_e = NULL;
861 fragment_b = fragment_e = NULL;
867 /* Handle IPv6 address inside square brackets. Ideally we'd
868 just look for the terminating ']', but rfc2732 mandates
869 rejecting invalid IPv6 addresses. */
871 /* The address begins after '['. */
873 host_e = strchr (host_b, ']');
877 SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS);
882 /* Check if the IPv6 address is valid. */
883 if (!is_valid_ipv6_address(host_b, host_e))
885 SETERR (error, PE_INVALID_IPV6_ADDRESS);
889 /* Continue parsing after the closing ']'. */
892 SETERR (error, PE_IPV6_NOT_SUPPORTED);
898 p = strpbrk_or_eos (p, ":/;?#");
902 if (host_b == host_e)
904 SETERR (error, PE_EMPTY_HOST);
908 port = scheme_default_port (scheme);
911 const char *port_b, *port_e, *pp;
913 /* scheme://host:port/tralala */
917 p = strpbrk_or_eos (p, "/;?#");
920 if (port_b == port_e)
922 /* http://host:/whatever */
924 SETERR (error, PE_BAD_PORT_NUMBER);
928 for (port = 0, pp = port_b; pp < port_e; pp++)
932 /* http://host:12randomgarbage/blah */
934 SETERR (error, PE_BAD_PORT_NUMBER);
938 port = 10 * port + (*pp - '0');
946 p = strpbrk_or_eos (p, ";?#");
951 /* Path is not allowed not to exist. */
959 p = strpbrk_or_eos (p, "?#");
966 p = strpbrk_or_eos (p, "#");
969 /* Hack that allows users to use '?' (a wildcard character) in
970 FTP URLs without it being interpreted as a query string
972 if (scheme == SCHEME_FTP)
974 query_b = query_e = NULL;
987 if (uname_b != uname_e)
989 /* http://user:pass@host */
991 /* uname_b uname_e */
992 if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
994 SETERR (error, PE_INVALID_USER_NAME);
999 u = (struct url *)xmalloc (sizeof (struct url));
1000 memset (u, 0, sizeof (*u));
1003 u->host = strdupdelim (host_b, host_e);
1008 u->path = strdupdelim (path_b, path_e);
1009 path_modified = path_simplify (u->path);
1010 parse_path (u->path, &u->dir, &u->file);
1012 host_modified = lowercase_str (u->host);
1015 u->params = strdupdelim (params_b, params_e);
1017 u->query = strdupdelim (query_b, query_e);
1019 u->fragment = strdupdelim (fragment_b, fragment_e);
1021 if (path_modified || u->fragment || host_modified || path_b == path_e)
1023 /* If we suspect that a transformation has rendered what
1024 url_string might return different from URL_ENCODED, rebuild
1025 u->url using url_string. */
1026 u->url = url_string (u, 0);
1028 if (url_encoded != url)
1029 xfree ((char *) url_encoded);
1033 if (url_encoded == url)
1034 u->url = xstrdup (url);
1036 u->url = url_encoded;
1044 url_error (int error_code)
1046 assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
1047 return parse_errors[error_code];
/* Parse PATH into dir and file.  PATH is extracted from the URL and
   is URL-escaped.  The function returns unescaped DIR and FILE,
   both freshly allocated.  */

static void
parse_path (const char *path, char **dir, char **file)
{
  const char *last_slash;

  last_slash = strrchr (path, '/');
  if (!last_slash)
    {
      /* No directory component at all. */
      *dir = xstrdup ("");
      *file = xstrdup (path);
    }
  else
    {
      *dir = strdupdelim (path, last_slash);
      *file = xstrdup (last_slash + 1);
    }
  url_unescape (*dir);
  url_unescape (*file);
}
1073 /* Note: URL's "full path" is the path with the query string and
1074 params appended. The "fragment" (#foo) is intentionally ignored,
1075 but that might be changed. For example, if the original URL was
1076 "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
1077 the full path will be "/foo/bar/baz;bullshit?querystring". */
1079 /* Return the length of the full path, without the terminating
1083 full_path_length (const struct url *url)
1087 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
1098 /* Write out the full path. */
1101 full_path_write (const struct url *url, char *where)
1103 #define FROB(el, chr) do { \
1104 char *f_el = url->el; \
1106 int l = strlen (f_el); \
1108 memcpy (where, f_el, l); \
1120 /* Public function for getting the "full path". E.g. if u->path is
1121 "foo/bar" and u->query is "param=value", full_path will be
1122 "/foo/bar?param=value". */
1125 url_full_path (const struct url *url)
1127 int length = full_path_length (url);
1128 char *full_path = (char *)xmalloc(length + 1);
1130 full_path_write (url, full_path);
1131 full_path[length] = '\0';
1136 /* Escape unsafe and reserved characters, except for the slash
1140 url_escape_dir (const char *dir)
1142 char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1147 /* Unescape slashes in NEWDIR. */
1149 h = newdir; /* hare */
1150 t = newdir; /* tortoise */
1152 for (; *h; h++, t++)
1154 if (*h == '%' && h[1] == '2' && h[2] == 'F')
1167 /* Sync u->path and u->url with u->dir and u->file. Called after
1168 u->file or u->dir have been changed, typically by the FTP code. */
1171 sync_path (struct url *u)
1173 char *newpath, *efile, *edir;
1177 /* u->dir and u->file are not escaped. URL-escape them before
1178 reassembling them into u->path. That way, if they contain
1179 separators like '?' or even if u->file contains slashes, the
1180 path will be correctly assembled. (u->file can contain slashes
1181 if the URL specifies it with %2f, or if an FTP server returns
1183 edir = url_escape_dir (u->dir);
1184 efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1187 newpath = xstrdup (efile);
1190 int dirlen = strlen (edir);
1191 int filelen = strlen (efile);
1193 /* Copy "DIR/FILE" to newpath. */
1194 char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1195 memcpy (p, edir, dirlen);
1198 memcpy (p, efile, filelen);
1207 if (efile != u->file)
1210 /* Regenerate u->url as well. */
1212 u->url = url_string (u, 0);
1215 /* Mutators. Code in ftp.c insists on changing u->dir and u->file.
1216 This way we can sync u->path and u->url when they get changed. */
1219 url_set_dir (struct url *url, const char *newdir)
1222 url->dir = xstrdup (newdir);
1227 url_set_file (struct url *url, const char *newfile)
1230 url->file = xstrdup (newfile);
1235 url_free (struct url *url)
1241 FREE_MAYBE (url->params);
1242 FREE_MAYBE (url->query);
1243 FREE_MAYBE (url->fragment);
1244 FREE_MAYBE (url->user);
1245 FREE_MAYBE (url->passwd);
1254 get_urls_file (const char *file)
1256 struct file_memory *fm;
1257 struct urlpos *head, *tail;
1258 const char *text, *text_end;
1260 /* Load the file. */
1261 fm = read_file (file);
1264 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
1267 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
1271 text_end = fm->content + fm->length;
1272 while (text < text_end)
1274 const char *line_beg = text;
1275 const char *line_end = memchr (text, '\n', text_end - text);
1277 line_end = text_end;
1282 /* Strip whitespace from the beginning and end of line. */
1283 while (line_beg < line_end && ISSPACE (*line_beg))
1285 while (line_end > line_beg && ISSPACE (*(line_end - 1)))
1288 if (line_end > line_beg)
1290 /* URL is in the [line_beg, line_end) region. */
1294 struct urlpos *entry;
1297 /* We must copy the URL to a zero-terminated string, and we
1298 can't use alloca because we're in a loop. *sigh*. */
1299 url_text = strdupdelim (line_beg, line_end);
1303 /* Merge opt.base_href with URL. */
1304 char *merged = uri_merge (opt.base_href, url_text);
1309 url = url_parse (url_text, &up_error_code);
1312 logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1313 file, url_text, url_error (up_error_code));
1319 entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1320 memset (entry, 0, sizeof (*entry));
1331 read_file_free (fm);
1335 /* Free the linked list of urlpos. */
1337 free_urlpos (struct urlpos *l)
1341 struct urlpos *next = l->next;
1344 FREE_MAYBE (l->local_name);
1350 /* Rotate FNAME opt.backups times */
1352 rotate_backups(const char *fname)
1354 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1355 char *from = (char *)alloca (maxlen);
1356 char *to = (char *)alloca (maxlen);
1360 if (stat (fname, &sb) == 0)
1361 if (S_ISREG (sb.st_mode) == 0)
1364 for (i = opt.backups; i > 1; i--)
1366 sprintf (from, "%s.%d", fname, i - 1);
1367 sprintf (to, "%s.%d", fname, i);
1371 sprintf (to, "%s.%d", fname, 1);
1375 /* Create all the necessary directories for PATH (a file). Calls
1376 mkdirhier() internally. */
1378 mkalldirs (const char *path)
1385 p = path + strlen (path);
1386 for (; *p != '/' && p != path; p--)
1389 /* Don't create if it's just a file. */
1390 if ((p == path) && (*p != '/'))
1392 t = strdupdelim (path, p);
1394 /* Check whether the directory exists. */
1395 if ((stat (t, &st) == 0))
1397 if (S_ISDIR (st.st_mode))
1404 /* If the dir exists as a file name, remove it first. This
1405 is *only* for Wget to work with buggy old CERN http
1406 servers. Here is the scenario: When Wget tries to
1407 retrieve a directory without a slash, e.g.
1408 http://foo/bar (bar being a directory), CERN server will
1409 not redirect it too http://foo/bar/ -- it will generate a
1410 directory listing containing links to bar/file1,
1411 bar/file2, etc. Wget will lose because it saves this
1412 HTML listing to a file `bar', so it cannot create the
1413 directory. To work around this, if the file of the same
1414 name exists, we just remove it and create the directory
1416 DEBUGP (("Removing %s because of directory danger!\n", t));
1420 res = make_directory (t);
1422 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1427 /* Functions for constructing the file name out of URL components. */
1429 /* A growable string structure, used by url_file_name and friends.
1430 This should perhaps be moved to utils.c.
1432 The idea is to have a convenient and efficient way to construct a
1433 string by having various functions append data to it. Instead of
1434 passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
1435 functions in questions, we pass the pointer to this struct. */
1443 /* Ensure that the string can accept APPEND_COUNT more characters past
1444 the current TAIL position. If necessary, this will grow the string
1445 and update its allocated size. If the string is already large
1446 enough to take TAIL+APPEND_COUNT characters, this does nothing. */
1447 #define GROW(g, append_size) do { \
1448 struct growable *G_ = g; \
1449 DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char); \
1452 /* Return the tail position of the string. */
1453 #define TAIL(r) ((r)->base + (r)->tail)
1455 /* Move the tail position by APPEND_COUNT characters. */
1456 #define TAIL_INCR(r, append_count) ((r)->tail += append_count)
1458 /* Append the string STR to DEST. NOTICE: the string in DEST is not
1462 append_string (const char *str, struct growable *dest)
1464 int l = strlen (str);
1466 memcpy (TAIL (dest), str, l);
1467 TAIL_INCR (dest, l);
1470 /* Append CH to DEST. For example, append_char (0, DEST)
1471 zero-terminates DEST. */
1474 append_char (char ch, struct growable *dest)
1478 TAIL_INCR (dest, 1);
enum {
  filechr_not_unix    = 1,	/* unusable on Unix, / and \0 */
  filechr_not_windows = 2,	/* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4	/* a control character, e.g. 0-31 */
};

#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))

/* Shorthands for the table: */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW U|W
#define UWC U|W|C

/* Table of characters unsafe under various conditions (see above).

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become important
   crucial.  Right now, it's better to be minimal in escaping.  */

const static unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
/* FN_PORT_SEP is the separator between host and port in file names
   for non-standard port numbers.  On Unix this is normally ':', as in
   "www.xemacs.org:4001/index.html".  Under Windows, we set it to +
   because Windows can't handle ':' in file names.  */
#define FN_PORT_SEP (opt.restrict_files_os != restrict_windows ? ':' : '+')

/* FN_QUERY_SEP is the separator between the file name and the URL
   query, normally '?'.  Since Windows cannot handle '?' as part of
   file name, we use '@' instead there.  */
#define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
/* Quote path element, characters in [b, e), as file name, and append
   the quoted string to DEST.  Each character is quoted as per
   file_unsafe_char and the corresponding table.  */
append_uri_pathel (const char *b, const char *e, struct growable *dest)
  /* Choose the "unsafe for file names" character class based on the
     OS selected with --restrict-file-names; optionally also treat
     control characters as unsafe.  */
  if (opt.restrict_files_os == restrict_unix)
    mask = filechr_not_unix;
    mask = filechr_not_windows;
  if (opt.restrict_files_ctrl)
    mask |= filechr_control;
  /* Copy [b, e) to PATHEL and URL-unescape it. */
  BOUNDED_TO_ALLOCA (b, e, pathel);
  url_unescape (pathel);
  pathlen = strlen (pathel);
  /* Go through PATHEL and check how many characters we'll need to
     add for file quoting. */
  for (p = pathel; *p; p++)
    if (FILE_CHAR_TEST (*p, mask))
  /* p - pathel is the string length.  Each quoted char means two
     additional characters in the string, hence 2*quoted. */
  outlen = (p - pathel) + (2 * quoted);
  GROW (dest, outlen);
  /* If there's nothing to quote, we don't need to go through the
     string the second time. */
      memcpy (TAIL (dest), pathel, outlen);
      /* Second pass: copy safe characters verbatim and expand each
	 unsafe character to a three-character %XY escape.  */
      char *q = TAIL (dest);
      for (p = pathel; *p; p++)
	  if (!FILE_CHAR_TEST (*p, mask))
	      unsigned char ch = *p;
	      *q++ = XDIGIT_TO_XCHAR (ch >> 4);
	      *q++ = XDIGIT_TO_XCHAR (ch & 0xf);
      /* Both passes must agree on the output length.  */
      assert (q - TAIL (dest) == outlen);
  TAIL_INCR (dest, outlen);
/* Append to DEST the directory structure that corresponds the
   directory part of URL's path.  For example, if the URL is
   http://server/dir1/dir2/file, this appends "/dir1/dir2".
   Each path element ("dir1" and "dir2" in the above example) is
   examined, url-unescaped, and re-escaped as file name element.
   Additionally, it cuts as many directories from the path as
   specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
   will produce "bar" for the above example.  For 2 or more, it will
   Each component of the path is quoted for use as file name.  */
append_dir_structure (const struct url *u, struct growable *dest)
  char *pathel, *next;
  /* CUT counts how many leading path components remain to be dropped
     (--cut-dirs); presumably decremented inside the loop — confirm
     against the full function body.  */
  int cut = opt.cut_dirs;
  /* Go through the path components, de-URL-quote them, and quote them
     (if necessary) as file names.  */
  for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
      /* Ignore empty pathels.  path_simplify should remove
	 occurrences of "//" from the path, but it has special cases
	 for starting / which generates an empty pathel here.  */
      append_char ('/', dest);
      append_uri_pathel (pathel, next, dest);
/* Return a unique file name that matches the given URL as good as
   possible.  Does not create directories on the file system.  */
url_file_name (const struct url *u)
  struct growable fnres;
  char *u_file, *u_query;
  char *fname, *unique;
  /* Start with the directory prefix, if specified. */
  if (!DOTP (opt.dir_prefix))
    append_string (opt.dir_prefix, &fnres);
  /* If "dirstruct" is turned on (typically the case with -r), add
     the host and port (unless those have been turned off) and
     directory structure.  */
      if (opt.add_hostdir)
	  append_char ('/', &fnres);
	  append_string (u->host, &fnres);
	  /* Non-default port is encoded into the file name, separated
	     by FN_PORT_SEP (':' on Unix, '+' on Windows).  */
	  if (u->port != scheme_default_port (u->scheme))
	      number_to_string (portstr, u->port);
	      append_char (FN_PORT_SEP, &fnres);
	      append_string (portstr, &fnres);
      append_dir_structure (u, &fnres);
  /* Add the file name. */
    append_char ('/', &fnres);
  u_file = *u->file ? u->file : "index.html";
  append_uri_pathel (u_file, u_file + strlen (u_file), &fnres);
  /* Append "?query" to the file name. */
  u_query = u->query && *u->query ? u->query : NULL;
      append_char (FN_QUERY_SEP, &fnres);
      append_uri_pathel (u_query, u_query + strlen (u_query), &fnres);
  /* Zero-terminate the file name. */
  append_char ('\0', &fnres);
  /* Check the cases in which the unique extensions are not used:
     1) Clobbering is turned off (-nc).
     2) Retrieval with regetting.
     3) Timestamping is used.
     4) Hierarchy is built.
     The exception is the case when file does exist and is a
     directory (see `mkalldirs' for explanation).  */
  if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
      && !(file_exists_p (fname) && !file_non_directory_p (fname)))
      /* Otherwise, construct a name that does not collide with an
	 existing file.  */
      unique = unique_name (fname, 1);
      if (unique != fname)
/* Return the length of URL's path.  Path is considered to be
   terminated by one of '?', ';', '#', or by the end of the
path_length (const char *url)
  /* strpbrk_or_eos points Q at the first delimiter or at the
     terminating NUL, so the path length is presumably Q - URL.  */
  const char *q = strpbrk_or_eos (url, "?;#");
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is equivalent to strrchr(b, c),
   except that it accepts an END argument instead of requiring the
   string to be zero-terminated.  Why is there no memrchr()?  */
/* Scans backwards from E-1 toward B; used by uri_merge_1 to locate
   the last '/' of BASE's path.  */
find_last_char (const char *b, const char *e, char c)
/* Resolve "." and ".." elements of PATH by destructively modifying
   PATH.  "." is resolved by removing that path element, and ".." is
   resolved by removing the preceding path element.  Leading and
   trailing slashes are preserved.
   Return non-zero if any changes have been made.
   For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
   test examples are provided below.  If you change anything in this
   function, run test_path_simplify to make sure you haven't broken a
   A previous version of this function was based on path_simplify()
   from GNU Bash, but it has been rewritten for Wget 1.8.1.  */
path_simplify (char *path)
    ++path;			/* preserve the leading '/'. */
  end = p + strlen (p) + 1;	/* position past the terminating zero. */
      /* P should point to the beginning of a path element. */
      if (*p == '.' && (*(p + 1) == '/' || *(p + 1) == '\0'))
	  /* Handle "./foo" by moving "foo" two characters to the
	  if (*(p + 1) == '/')
	      /* memmove, not memcpy: source and destination overlap.  */
	      memmove (p, p + 2, end - p);
      else if (*p == '.' && *(p + 1) == '.'
	       && (*(p + 2) == '/' || *(p + 2) == '\0'))
	  /* Handle "../foo" by moving "foo" one path element to the
	  char *b = p;		/* not p-1 because P can equal PATH */
	  /* Backtrack by one path element, but not past the beginning
	  /* foo/bar/../baz */
	  /* Move backwards until B hits the beginning of the
	     previous path element or the beginning of path. */
	  for (--b; b > path && *(b - 1) != '/'; b--)
	  if (*(p + 2) == '/')
	      memmove (b, p + 3, end - (p + 3));
	  /* Remove empty path elements.  Not mandated by rfc1808 et
	     al, but it seems like a good idea to get rid of them.
	     Supporting them properly is hard (in which directory do
	     you save http://x.com///y.html?) and they don't seem to
	  memmove (p, q, end - q);
      /* Skip to the next path element. */
      while (*p && *p != '/')
      /* Make sure P points to the beginning of the next path element,
	 which is location after the slash. */
/* Resolve the result of "linking" a base URI (BASE) to a
   link-specified URI (LINK).
   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to behave "reasonably" in all
   foreseeable cases.  It employs little specific knowledge about
   schemes or URL-specific stuff -- it just works on strings.
   The parameters LINKLENGTH is useful if LINK is not zero-terminated.
   See uri_merge for a gentler interface to this functionality.
   Perhaps this function should call path_simplify so that the callers
   don't have to call url_parse unconditionally.  */
uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
      /* END marks the end of BASE's path; any query/fragment tail
	 begins there (see path_length).  */
      const char *end = base + path_length (base);
	  /* Empty LINK points back to BASE, query string and all. */
	  constr = xstrdup (base);
      else if (*link == '?')
	  /* LINK points to the same location, but changes the query
	     string.  Examples: */
	  /* uri_merge("path", "?new") -> "path?new" */
	  /* uri_merge("path?foo", "?new") -> "path?new" */
	  /* uri_merge("path?foo#bar", "?new") -> "path?new" */
	  /* uri_merge("path#foo", "?new") -> "path?new" */
	  int baselength = end - base;
	  constr = xmalloc (baselength + linklength + 1);
	  memcpy (constr, base, baselength);
	  memcpy (constr + baselength, link, linklength);
	  constr[baselength + linklength] = '\0';
      else if (*link == '#')
	  /* uri_merge("path", "#new") -> "path#new" */
	  /* uri_merge("path#foo", "#new") -> "path#new" */
	  /* uri_merge("path?foo", "#new") -> "path?foo#new" */
	  /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
	  const char *end1 = strchr (base, '#');
	    end1 = base + strlen (base);
	  baselength = end1 - base;
	  constr = xmalloc (baselength + linklength + 1);
	  memcpy (constr, base, baselength);
	  memcpy (constr + baselength, link, linklength);
	  constr[baselength + linklength] = '\0';
      else if (linklength > 1 && *link == '/' && *(link + 1) == '/')
	  /* LINK begins with "//" and so is a net path: we need to
	     replace everything after (and including) the double slash
	  /* uri_merge("foo", "//new/bar") -> "//new/bar" */
	  /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */
	  /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
	  const char *start_insert;
	  /* Look for first slash. */
	  slash = memchr (base, '/', end - base);
	  /* If found slash and it is a double slash, then replace
	     from this point, else default to replacing from the
	  if (slash && *(slash + 1) == '/')
	    start_insert = slash;
	    start_insert = base;
	  span = start_insert - base;
	  constr = (char *)xmalloc (span + linklength + 1);
	    memcpy (constr, base, span);
	  memcpy (constr + span, link, linklength);
	  constr[span + linklength] = '\0';
      else if (*link == '/')
	  /* LINK is an absolute path: we need to replace everything
	     after (and including) the FIRST slash with LINK.
	     So, if BASE is "http://host/whatever/foo/bar", and LINK is
	     "/qux/xyzzy", our result should be
	     "http://host/qux/xyzzy".  */
	  const char *start_insert = NULL; /* for gcc to shut up. */
	  const char *pos = base;
	  int seen_slash_slash = 0;
	  /* We're looking for the first slash, but want to ignore
	  slash = memchr (pos, '/', end - pos);
	  if (slash && !seen_slash_slash)
	    if (*(slash + 1) == '/')
		seen_slash_slash = 1;
	  /* At this point, SLASH is the location of the first / after
	     "//", or the first slash altogether.  START_INSERT is the
	     pointer to the location where LINK will be inserted.  When
	     examining the last two examples, keep in mind that LINK
	  if (!slash && !seen_slash_slash)
	    /* example: "foo" */
	    start_insert = base;
	  else if (!slash && seen_slash_slash)
	    /* example: "http://foo" */
	  else if (slash && !seen_slash_slash)
	    /* example: "foo/bar" */
	    start_insert = base;
	  else if (slash && seen_slash_slash)
	    /* example: "http://something/" */
	    start_insert = slash;
	  span = start_insert - base;
	  constr = (char *)xmalloc (span + linklength + 1);
	  memcpy (constr, base, span);
	  memcpy (constr + span, link, linklength);
	  constr[span + linklength] = '\0';
	  /* LINK is a relative URL: we need to replace everything
	     after last slash (possibly empty) with LINK.
	     So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
	     our result should be "whatever/foo/qux/xyzzy".  */
	  int need_explicit_slash = 0;
	  const char *start_insert;
	  const char *last_slash = find_last_char (base, end, '/');
	      /* No slash found at all.  Append LINK to what we have,
		 but we'll need a slash as a separator.
		 Example: if base == "foo" and link == "qux/xyzzy", then
		 we cannot just append link to base, because we'd get
		 "fooqux/xyzzy", whereas what we want is
		 To make sure the / gets inserted, we set
		 need_explicit_slash to 1.  We also set start_insert
		 to end + 1, so that the length calculations work out
		 correctly for one more (slash) character.  Accessing
		 that character is fine, since it will be the
		 delimiter, '\0' or '?'.  */
	      /* example: "foo?..." */
	      /*               ^    ('?' gets changed to '/') */
	      start_insert = end + 1;
	      need_explicit_slash = 1;
	  else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
	      /* example: "http://host" */
	      start_insert = end + 1;
	      need_explicit_slash = 1;
	      /* example: "whatever/foo/bar" */
	      start_insert = last_slash + 1;
	  span = start_insert - base;
	  constr = (char *)xmalloc (span + linklength + 1);
	  memcpy (constr, base, span);
	  if (need_explicit_slash)
	    constr[span - 1] = '/';
	  memcpy (constr + span, link, linklength);
	  constr[span + linklength] = '\0';
  else /* !no_scheme */
      /* LINK carries its own scheme and is therefore absolute: return
	 a copy of LINK unchanged.  */
      constr = strdupdelim (link, link + linklength);
/* Merge BASE with LINK and return the resulting URI.  This is an
   interface to uri_merge_1 that assumes that LINK is a
   zero-terminated string.  */
/* Caller owns (and must free) the returned string.  */
uri_merge (const char *base, const char *link)
  return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
2090 #define APPEND(p, s) do { \
2091 int len = strlen (s); \
2092 memcpy (p, s, len); \
2096 /* Use this instead of password when the actual password is supposed
2097 to be hidden. We intentionally use a generic string without giving
2098 away the number of characters in the password, like previous
2100 #define HIDDEN_PASSWORD "*password*"
/* Recreate the URL string from the data in URL.
   If HIDE is non-zero (as it is when we're calling this on a URL we
   plan to print, but not when calling it to canonicalize a URL for
   use within the program), password will be hidden.  Unsafe
   characters in the URL will be quoted.  */
url_string (const struct url *url, int hide_password)
  char *quoted_user = NULL, *quoted_passwd = NULL;
  int scheme_port = supported_schemes[url->scheme].default_port;
  char *scheme_str = supported_schemes[url->scheme].leading_string;
  int fplen = full_path_length (url);
  int brackets_around_host = 0;
  assert (scheme_str != NULL);
  /* Make sure the user name and password are quoted. */
      quoted_user = url_escape_allow_passthrough (url->user);
	  /* When hiding, substitute a fixed placeholder so the output
	     does not leak the password or its length.  */
	  quoted_passwd = HIDDEN_PASSWORD;
	  quoted_passwd = url_escape_allow_passthrough (url->passwd);
  /* A ':' in the host (IPv6 numeric address) requires [] brackets
     to disambiguate it from the port separator.  */
  if (strchr (url->host, ':'))
    brackets_around_host = 1;
  /* Precompute the exact output size so a single xmalloc suffices;
     verified by the assert below.  */
  size = (strlen (scheme_str)
	  + strlen (url->host)
	  + (brackets_around_host ? 2 : 0)
  if (url->port != scheme_port)
    size += 1 + numdigit (url->port);
      size += 1 + strlen (quoted_user);
	size += 1 + strlen (quoted_passwd);
  p = result = xmalloc (size);
  APPEND (p, scheme_str);
      APPEND (p, quoted_user);
	  APPEND (p, quoted_passwd);
  if (brackets_around_host)
  APPEND (p, url->host);
  if (brackets_around_host)
  if (url->port != scheme_port)
      p = number_to_string (p, url->port);
  full_path_write (url, p);
  assert (p - result == size);
  /* Free the escaped copies, but only when escaping actually
     allocated (url_escape_allow_passthrough may return its argument)
     and the password is not the static HIDDEN_PASSWORD literal.  */
  if (quoted_user && quoted_user != url->user)
    xfree (quoted_user);
  if (quoted_passwd && !hide_password
      && quoted_passwd != url->passwd)
    xfree (quoted_passwd);
/* Return the URL of the proxy appropriate for url U.  */
/* NOTE(review): may return REWRITTEN_STORAGE, a static buffer, so the
   result is only valid until the next call and the function is not
   reentrant.  */
getproxy (struct url *u)
  char *rewritten_url;
  static char rewritten_storage[1024];
  /* Respect --no-proxy / no_proxy matches for this host.  */
  if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
      proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
      proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
      proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
    case SCHEME_INVALID:
  if (!proxy || !*proxy)
  /* Handle shorthands.  `rewritten_storage' is a kludge to allow
     getproxy() to return static storage. */
  rewritten_url = rewrite_shorthand_url (proxy);
      strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
      /* strncpy does not guarantee termination; force it.  */
      rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
      proxy = rewritten_storage;
/* Should a host be accessed through proxy, concerning no_proxy?  */
/* Returns non-zero when HOST should go through the proxy, i.e. when
   no suffix in the NO_PROXY list matches it.  */
no_proxy_match (const char *host, const char **no_proxy)
  return !sufmatch (no_proxy, host);
2249 /* Support for converting links for local viewing in downloaded HTML
2250 files. This should be moved to another file, because it has
2251 nothing to do with processing URLs. */
2253 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
2254 static const char *replace_attr PARAMS ((const char *, int, FILE *,
2256 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
2257 const char *, int));
2258 static char *local_quote_string PARAMS ((const char *));
/* Change the links in one HTML file.  LINKS is a list of links in the
   document, along with their positions and the desired direction of
convert_links (const char *file, struct urlpos *links)
  struct file_memory *fm;
  downloaded_file_t downloaded_file_return;
  struct urlpos *link;
  int to_url_count = 0, to_file_count = 0;
  logprintf (LOG_VERBOSE, _("Converting %s... "), file);
  /* First we do a "dry run": go through the list L and see whether
     any URL needs to be converted in the first place.  If not, just
     leave the file alone.  */
    struct urlpos *dry = links;
    for (dry = links; dry; dry = dry->next)
      if (dry->convert != CO_NOCONVERT)
      logputs (LOG_VERBOSE, _("nothing to do.\n"));
  /* Read the whole file into memory (possibly mmaped).  */
  fm = read_file (file);
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
		 file, strerror (errno));
  downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
  if (opt.backup_converted && downloaded_file_return)
    write_backup_file (file, downloaded_file_return);
  /* Before opening the file for writing, unlink the file.  This is
     important if the data in FM is mmaped.  In such case, nulling the
     file, which is what fopen() below does, would make us read all
     zeroes from the mmaped region.  */
  if (unlink (file) < 0 && errno != ENOENT)
      logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
		 file, strerror (errno));
      read_file_free (fm);
  /* Now open the file for writing.  */
  fp = fopen (file, "wb");
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
		 file, strerror (errno));
      read_file_free (fm);
  /* Here we loop through all the URLs in file, replacing those of
     them that are downloaded with relative references.  */
  for (link = links; link; link = link->next)
      char *url_start = fm->content + link->pos;
      /* Sanity check: a link position past the buffer indicates a
	 stale or corrupted LINKS list.  */
      if (link->pos >= fm->length)
	  DEBUGP (("Something strange is going on.  Please investigate."));
      /* If the URL is not to be converted, skip it.  */
      if (link->convert == CO_NOCONVERT)
	  DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
      /* Echo the file contents, up to the offending URL's opening
	 quote, to the outfile.  */
      fwrite (p, 1, url_start - p, fp);
      switch (link->convert)
	case CO_CONVERT_TO_RELATIVE:
	  /* Convert absolute URL to relative. */
	    char *newname = construct_relative (file, link->local_name);
	    char *quoted_newname = local_quote_string (newname);
	    if (!link->link_refresh_p)
	      p = replace_attr (p, link->size, fp, quoted_newname);
	      p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
					     link->refresh_timeout);
	    DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
		     link->url->url, newname, link->pos, file));
	    xfree (quoted_newname);
	case CO_CONVERT_TO_COMPLETE:
	  /* Convert the link to absolute URL. */
	    char *newlink = link->url->url;
	    char *quoted_newlink = html_quote_string (newlink);
	    if (!link->link_refresh_p)
	      p = replace_attr (p, link->size, fp, quoted_newlink);
	      p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
					     link->refresh_timeout);
	    DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
		     newlink, link->pos, file));
	    xfree (quoted_newlink);
	case CO_NULLIFY_BASE:
	  /* Change the base href to "". */
	  p = replace_attr (p, link->size, fp, "");
  /* Output the rest of the file. */
  if (p - fm->content < fm->length)
    fwrite (p, 1, fm->length - (p - fm->content), fp);
  read_file_free (fm);
  logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
/* Construct and return a malloced copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.
   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".
   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */
construct_relative (const char *s1, const char *s2)
  int i, cnt, sepdirs1;
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't. */
  assert (*s1 != '/');
  /* Skip the directories common to both strings. */
  while (s1[i] && s2[i]
	 /* CNT presumably tracks the index just past the last common
	    '/' — confirm against the elided loop body.  */
	 if (s1[i] == '/' && s2[i] == '/')
  /* Count the remaining directory separators in S1; each one becomes
     a "../" in the result.  */
  for (sepdirs1 = 0; s1[i]; i++)
  /* Now, construct the file as of:
     - ../ repeated sepdirs1 time
     - all the non-mutual directories of S2. */
  res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
  for (i = 0; i < sepdirs1; i++)
    memcpy (res + 3 * i, "../", 3);
  strcpy (res + 3 * i, s2 + cnt);
write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
  /* Rather than just writing over the original .html file with the
     converted version, save the former to *.orig.  Note we only do
     this for files we've _successfully_ downloaded, so we don't
     clobber .orig files sitting around from previous invocations. */
  /* Construct the backup filename as the original name plus ".orig". */
  size_t filename_len = strlen(file);
  char* filename_plus_orig_suffix;
  boolean already_wrote_backup_file = FALSE;
  slist* converted_file_ptr;
  static slist* converted_files = NULL;
  if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
      /* Just write "orig" over "html".  We need to do it this way
	 because when we're checking to see if we've downloaded the
	 file before (to see if we can skip downloading it), we don't
	 know if it's a text/html file.  Therefore we don't know yet
	 at that stage that -E is going to cause us to tack on
	 ".html", so we need to compare vs. the original URL plus
	 ".orig", not the original URL plus ".html.orig". */
      /* NOTE(review): the "- 4" overwrite assumes FILE ends in "html"
	 (which the -E branch implies) — confirm callers guarantee it.  */
      filename_plus_orig_suffix = alloca (filename_len + 1);
      strcpy(filename_plus_orig_suffix, file);
      strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
  else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
      /* Append ".orig" to the name. */
      filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
      strcpy(filename_plus_orig_suffix, file);
      strcpy(filename_plus_orig_suffix + filename_len, ".orig");
  /* We can get called twice on the same URL thanks to the
     convert_all_links() call in main().  If we write the .orig file
     each time in such a case, it'll end up containing the first-pass
     conversion, not the original file.  So, see if we've already been
     called on this file. */
  converted_file_ptr = converted_files;
  while (converted_file_ptr != NULL)
    if (strcmp(converted_file_ptr->string, file) == 0)
	already_wrote_backup_file = TRUE;
      converted_file_ptr = converted_file_ptr->next;
  if (!already_wrote_backup_file)
      /* Rename <file> to <file>.orig before former gets written over. */
      if (rename(file, filename_plus_orig_suffix) != 0)
	logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
		   file, filename_plus_orig_suffix, strerror (errno));
      /* Remember that we've already written a .orig backup for this file.
	 Note that we never free this memory since we need it till the
	 convert_all_links() call, which is one of the last things the
	 program does before terminating.  BTW, I'm not sure if it would be
	 safe to just set 'converted_file_ptr->string' to 'file' below,
	 rather than making a copy of the string...  Another note is that I
	 thought I could just add a field to the urlpos structure saying
	 that we'd written a .orig file for this URL, but that didn't work,
	 so I had to make this separate list.
	 -- Dan Harkless <wget@harkless.org>
	 This [adding a field to the urlpos structure] didn't work
	 because convert_file() is called from convert_all_links at
	 the end of the retrieval with a freshly built new urlpos
	 -- Hrvoje Niksic <hniksic@arsdigita.com>
      converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
      converted_file_ptr->string = xstrdup(file);   /* die on out-of-mem. */
      converted_file_ptr->next = converted_files;
      converted_files = converted_file_ptr;
2543 static int find_fragment PARAMS ((const char *, int, const char **,
/* Replace an attribute's original text with NEW_TEXT. */
/* P points at the attribute value in the source buffer (opening quote
   included, if any); SIZE is its length.  Writes the replacement to
   FP, preserving the original quoting style and any #fragment, and
   returns the position in the source buffer just past the value.  */
replace_attr (const char *p, int size, FILE *fp, const char *new_text)
  char quote_char = '\"';	/* use "..." for quoting, unless the
				   original value is quoted, in which
				   case reuse its quoting char. */
  const char *frag_beg, *frag_end;
  /* Structure of our string is:
       "...old-contents..."
       <---    size    --->  (with quotes)
     or:
       <---  size -->        (no quotes)   */
  if (*p == '\"' || *p == '\'')
      size -= 2;		/* disregard opening and closing quote */
  putc (quote_char, fp);
  fputs (new_text, fp);
  /* Look for fragment identifier, if any. */
  if (find_fragment (p, size, &frag_beg, &frag_end))
    fwrite (frag_beg, 1, frag_end - frag_beg, fp);
  putc (quote_char, fp);
/* The same as REPLACE_ATTR, but used when replacing
   <meta http-equiv=refresh content="new_text"> because we need to
   append "timeout_value; URL=" before the next_text.  */
replace_attr_refresh_hack (const char *p, int size, FILE *fp,
			   const char *new_text, int timeout)
  /* Buffer sized for the decimal timeout plus the "; URL=" glue and
     NEW_TEXT (remaining size terms not shown here).  */
  char *new_with_timeout = (char *)alloca (numdigit (timeout)
  sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
  return replace_attr (p, size, fp, new_with_timeout);
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
   preceded by '&'.  If the character is not found, return zero.  If
   the character is found, return 1 and set BP and EP to point to the
   beginning and end of the region.
   This is used for finding the fragment identifiers in URLs.  (The
   '&' guard avoids mistaking SGML entities like "&#32;" for
   fragments.)  */
find_fragment (const char *beg, int size, const char **bp, const char **ep)
  const char *end = beg + size;
  for (; beg < end; beg++)
/* Quote FILE for use as local reference to an HTML file.
   We quote ? as %3F to avoid passing part of the file name as the
   parameter when browsing the converted file through HTTP.  However,
   it is safe to do this only when `--html-extension' is turned on.
   This is because converting "index.html?foo=bar" to
   "index.html%3Ffoo=bar" would break local browsing, as the latter
   isn't even recognized as an HTML file!  However, converting
   "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
   safe for both local and HTTP-served browsing.  */
local_quote_string (const char *file)
  const char *file_sans_qmark;
  if (!opt.html_extension)
    return html_quote_string (file);
  /* Count the question marks; each one expands to the 3-char "%3F".  */
  qm = count_char (file, '?');
      const char *from = file;
      /* qm * 2 because we replace each question mark with "%3F",
	 i.e. replace one char with three, hence two more. */
      int fsqlen = strlen (file) + qm * 2;
      to = newname = (char *)alloca (fsqlen + 1);
      for (; *from; from++)
      assert (to - newname == fsqlen);
      file_sans_qmark = newname;
    file_sans_qmark = file;
  return html_quote_string (file_sans_qmark);
/* We're storing "modes" of type downloaded_file_t in the hash table.
   However, our hash tables only accept pointers for keys and values.
   So when we need a pointer, we use the address of a
   downloaded_file_t variable of static storage.  */
/* Maps each enum value to a stable pointer suitable for storing in
   (and comparing out of) the hash table.  */
static downloaded_file_t *
downloaded_mode_to_ptr (downloaded_file_t mode)
  static downloaded_file_t
    v1 = FILE_NOT_ALREADY_DOWNLOADED,
    v2 = FILE_DOWNLOADED_NORMALLY,
    v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
    v4 = CHECK_FOR_FILE;
    case FILE_NOT_ALREADY_DOWNLOADED:
    case FILE_DOWNLOADED_NORMALLY:
    case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
    case CHECK_FOR_FILE:
2719 /* This should really be merged with dl_file_url_map and
2720 downloaded_html_files in recur.c. This was originally a list, but
2721 I changed it to a hash table beause it was actually taking a lot of
2722 time to find things in it. */
2724 static struct hash_table *downloaded_files_hash;
2726 /* Remembers which files have been downloaded. In the standard case, should be
2727 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2728 download successfully (i.e. not for ones we have failures on or that we skip
2731 When we've downloaded a file and tacked on a ".html" extension due to -E,
2732 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2733 FILE_DOWNLOADED_NORMALLY.
2735 If you just want to check if a file has been previously added without adding
2736 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
2737 with local filenames, not remote URLs. */
2739 downloaded_file (downloaded_file_t mode, const char *file)
2741 downloaded_file_t *ptr;
2743 if (mode == CHECK_FOR_FILE)
2745 if (!downloaded_files_hash)
2746 return FILE_NOT_ALREADY_DOWNLOADED;
2747 ptr = hash_table_get (downloaded_files_hash, file);
2749 return FILE_NOT_ALREADY_DOWNLOADED;
2753 if (!downloaded_files_hash)
2754 downloaded_files_hash = make_string_hash_table (0);
2756 ptr = hash_table_get (downloaded_files_hash, file);
2760 ptr = downloaded_mode_to_ptr (mode);
2761 hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2763 return FILE_NOT_ALREADY_DOWNLOADED;
/* hash_table_map callback: frees one key (the xstrdup'ed file name);
   values are pointers to static storage and are not freed.  */
df_free_mapper (void *key, void *value, void *ignored)
/* Dispose of the downloaded-files table and everything it owns.  */
downloaded_files_free (void)
  if (downloaded_files_hash)
      hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
      hash_table_destroy (downloaded_files_hash);
      downloaded_files_hash = NULL;
/* Return non-zero if scheme a is similar to scheme b.
   Schemes are similar if they are equal.  If SSL is supported, schemes
   are also similar if one is http (SCHEME_HTTP) and the other is https
schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
  /* http and https are considered interchangeable for the purpose of
     this comparison (only compiled in with SSL support).  */
  if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
      || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
/* Debugging and testing support for path_simplify.  */
/* Debug: run path_simplify on PATH and return the result in a new
   string.  Useful for calling from the debugger.  */
  /* Works on a copy so the caller's PATH is left untouched.  */
  char *copy = xstrdup (path);
  path_simplify (copy);
/* Run path_simplify on a copy of TEST and verify that the result
   equals EXPECTED_RESULT and that the returned modification flag
   matches EXPECTED_CHANGE.  Prints a diagnostic for each mismatch;
   used only by test_path_simplify.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *test_copy = xstrdup (test);
  int modified = path_simplify (test_copy);

  if (0 != strcmp (test_copy, expected_result))
    {
      printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
	      test, expected_result, test_copy);
    }
  if (modified != expected_change)
    {
      /* Fix: these two diagnostics were swapped.  When a modification
	 was expected (expected_change == 1) but path_simplify reported
	 none, the old code printed "Expected no modification", and
	 vice versa.  */
      if (expected_change == 1)
	printf ("Expected modification with path_simplify(\"%s\").\n",
		test);
      else
	printf ("Expected no modification with path_simplify(\"%s\").\n",
		test);
    }
  xfree (test_copy);
}
/* Exercise path_simplify on a table of fixed cases, twice: once as
   given, and once with a leading '/' to prove the slash is kept.  */
test_path_simplify (void)
  char *test, *result;
    /* Each entry: input, expected output, expected "modified" flag.  */
    { "foo",			"foo",		0 },
    { "foo/bar",		"foo/bar",	0 },
    { "foo///bar",		"foo/bar",	1 },
    { "foo/.",			"foo/",		1 },
    { "foo/./",			"foo/",		1 },
    { "foo./",			"foo./",	0 },
    { "foo/../bar",		"bar",		1 },
    { "foo/../bar/",		"bar/",		1 },
    { "foo/bar/..",		"foo/",		1 },
    { "foo/bar/../x",		"foo/x",	1 },
    { "foo/bar/../x/",		"foo/x/",	1 },
    { "foo/..",			"",		1 },
    { "foo/../..",		"",		1 },
    { "a/b/../../c",		"c",		1 },
    { "./a/../b",		"b",		1 }
  for (i = 0; i < ARRAY_SIZE (tests); i++)
      char *test = tests[i].test;
      char *expected_result = tests[i].result;
      int expected_change = tests[i].should_modify;
      run_test (test, expected_result, expected_change);
  /* Now run all the tests with a leading slash before the test case,
     to prove that the slash is being preserved.  */
  for (i = 0; i < ARRAY_SIZE (tests); i++)
      char *test, *expected_result;
      int expected_change = tests[i].should_modify;
      /* 1 extra byte for the leading '/', 1 for the NUL.  */
      test = xmalloc (1 + strlen (tests[i].test) + 1);
      sprintf (test, "/%s", tests[i].test);
      expected_result = xmalloc (1 + strlen (tests[i].result) + 1);
      sprintf (expected_result, "/%s", tests[i].result);
      run_test (test, expected_result, expected_change);
      xfree (expected_result);