2 Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003
3 Free Software Foundation, Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or (at
10 your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 In addition, as a special exception, the Free Software Foundation
22 gives permission to link the code of its release of Wget with the
23 OpenSSL project's "OpenSSL" library (or with modified versions of it
24 that use the same license as the "OpenSSL" library), and distribute
25 the linked executables. You must obey the GNU General Public License
26 in all respects for all of the code used other than "OpenSSL". If you
27 modify this file, you may extend this exception to your version of the
28 file, but you are not obligated to do so. If you do not wish to do
29 so, delete this exception statement from your version. */
40 #include <sys/types.h>
/* NOTE(review): the embedded original line numbers are discontinuous in
   this excerpt -- lines have been elided; the definitions below may be
   incomplete as shown. */
/* DOTP(x): true iff the string X is exactly ".". */
58 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
/* DDOTP(x): true iff the string X is exactly "..". */
60 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
/* Byte sizes of an IPv4 address, an IPv6 address, and one 16-bit IPv6
   group, used by the address validators below. */
62 static const int NS_INADDRSZ = 4;
63 static const int NS_IN6ADDRSZ = 16;
64 static const int NS_INT16SZ = 2;
74 /* Supported schemes: */
/* Each entry: leading string ("http://"), default port, enabled flag.
   NOTE(review): struct scheme_data's declaration and the array's
   braces/terminator are on elided lines -- presumably the array is
   terminated by a NULL leading_string entry (url_scheme iterates until
   leading_string is NULL); confirm against the full source. */
75 static struct scheme_data supported_schemes[] =
77 { "http://", DEFAULT_HTTP_PORT, 1 },
79 { "https://", DEFAULT_HTTPS_PORT, 1 },
81 { "ftp://", DEFAULT_FTP_PORT, 1 },
87 /* Forward declarations: */
89 static char *construct_relative PARAMS ((const char *, const char *));
90 static int path_simplify PARAMS ((char *));
94 /* Support for encoding and decoding of URL strings. We determine
95 whether a character is unsafe through static table lookup. This
96 code assumes ASCII character set and 8-bit chars. */
99 /* rfc1738 reserved chars, preserved from encoding. */
102 /* rfc1738 unsafe chars, plus some more. */
/* True iff character C has any of the bits in MASK set in urlchr_table. */
106 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
107 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
108 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
110 /* Shorthands for the table: */
111 #define R urlchr_reserved
112 #define U urlchr_unsafe
/* NOTE(review): the table below uses "RU", whose #define sits on an
   elided line -- presumably R|U (both reserved and unsafe); confirm.
   The table's opening/closing braces and the #undef of the shorthands
   are also elided from this excerpt. */
115 const static unsigned char urlchr_table[256] =
117 U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
118 U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
119 U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
120 U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
121 U, 0, U, RU, 0, U, R, 0, /* SP ! " # $ % & ' */
122 0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
123 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
124 0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
125 RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
126 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
127 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
128 0, 0, 0, RU, U, RU, U, 0, /* X Y Z [ \ ] ^ _ */
129 U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
130 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
131 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
132 0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
/* Non-ASCII bytes 128-255 are all treated as unsafe. */
134 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
135 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
136 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
137 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
139 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
140 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
141 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
142 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
148 /* URL-unescape the string S.
150 This is done by transforming the sequences "%HH" to the character
151 represented by the hexadecimal digits HH. If % is not followed by
152 two hexadecimal digits, it is inserted literally.
154 The transformation is done in place. If you need the original
155 string intact, make a copy before calling this function. */
/* Return the numeric value of hex digit C, or -1 if C is not a hex
   digit.  Behaves like the project's XCHAR_TO_XDIGIT on valid input,
   but is safe to call on any byte. */
static int
url_xdigit_value (unsigned char c)
{
  if (c >= '0' && c <= '9')
    return c - '0';
  if (c >= 'a' && c <= 'f')
    return c - 'a' + 10;
  if (c >= 'A' && c <= 'F')
    return c - 'A' + 10;
  return -1;
}

void
url_unescape (char *s)
{
  char *t = s;			/* t - tortoise (write position) */
  const char *h = s;		/* h - hare (read position) */

  for (; *h; h++, t++)
    {
      int hi, lo;

      /* Decode "%HH" only when both hex digits are present and valid;
	 otherwise copy the character literally.  The short-circuit on
	 HI guarantees h[2] is never read when h[1] is the terminating
	 NUL, so we cannot read past the end of the string. */
      if (*h == '%'
	  && (hi = url_xdigit_value ((unsigned char) h[1])) >= 0
	  && (lo = url_xdigit_value ((unsigned char) h[2])) >= 0)
	{
	  *t = (char) ((hi << 4) + lo);
	  h += 2;		/* skip the two digits just consumed */
	}
      else
	*t = *h;
    }
  /* The string can only shrink, so writing the terminator at T is
     always in bounds. */
  *t = '\0';
}
183 /* The core of url_escape_* functions. Escapes the characters that
184 match the provided mask in urlchr_table.
186 If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
187 will be returned unchanged. If ALLOW_PASSTHROUGH is zero, a
188 freshly allocated string will be returned in all cases. */
/* NOTE(review): lines are elided from this excerpt (return type,
   braces, the copy loop); the fragments below show the two-pass
   shape: first count how much the string grows, then either return
   S / a copy, or build the escaped string. */
191 url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
198 for (p1 = s; *p1; p1++)
199 if (urlchr_test (*p1, mask))
200 addition += 2; /* Two more characters (hex digits) */
/* No characters to escape: hand back S itself (passthrough) or a
   heap copy, per the contract above. */
203 return allow_passthrough ? (char *)s : xstrdup (s);
205 newlen = (p1 - s) + addition;
206 newstr = (char *)xmalloc (newlen + 1);
212 /* Quote the characters that match the test mask. */
213 if (urlchr_test (*p1, mask))
215 unsigned char c = *p1++;
217 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
218 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
/* Sanity check: the second pass produced exactly the length the
   first pass predicted. */
223 assert (p2 - newstr == newlen);
229 /* URL-escape the unsafe characters (see urlchr_table) in a given
230 string, returning a freshly allocated string. */
233 url_escape (const char *s)
235 return url_escape_1 (s, urlchr_unsafe, 0);
238 /* URL-escape the unsafe characters (see urlchr_table) in a given
239 string. If no characters are unsafe, S is returned. */
242 url_escape_allow_passthrough (const char *s)
244 return url_escape_1 (s, urlchr_unsafe, 1);
/* Per-character action used by reencode_escapes: decode a %xx
   sequence, encode the character, or copy it through unchanged. */
247 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
249 /* Decide whether to encode, decode, or pass through the char at P.
250 This used to be a macro, but it got a little too convoluted. */
251 static inline enum copy_method
252 decide_copy_method (const char *p)
/* NOTE(review): the surrounding control flow (the test for '%' and
   the CM_DECODE/CM_ENCODE returns) sits on elided lines; only the
   fragments below survive in this excerpt. */
256 if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
258 /* %xx sequence: decode it, unless it would decode to an
259 unsafe or a reserved char; in that case, leave it as
261 char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
262 XCHAR_TO_XDIGIT (*(p + 2));
264 if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
265 return CM_PASSTHROUGH;
270 /* Garbled %.. sequence: encode `%'. */
/* Plain character: encode it only if it is unsafe and not reserved. */
273 else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
276 return CM_PASSTHROUGH;
279 /* Translate a %-escaped (but possibly non-conformant) input string S
280 into a %-escaped (and conformant) output string. If no characters
281 are encoded or decoded, return the same string S; otherwise, return
282 a freshly allocated string with the new contents.
284 After a URL has been run through this function, the protocols that
285 use `%' as the quote character can use the resulting string as-is,
286 while those that don't call url_unescape() to get to the intended
287 data. This function is also stable: after an input string is
288 transformed the first time, all further transformations of the
289 result yield the same result string.
291 Let's discuss why this function is needed.
293 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
294 space character would mess up the HTTP request, it needs to be
297 GET /abc%20def HTTP/1.0
299 It appears that the unsafe chars need to be quoted, for example
300 with url_escape. But what if we're requested to download
301 `abc%20def'? url_escape transforms "%" to "%25", which would leave
302 us with `abc%2520def'. This is incorrect -- since %-escapes are
303 part of URL syntax, "%20" is the correct way to denote a literal
304 space on the Wget command line. This leaves us in the conclusion
305 that in that case Wget should not call url_escape, but leave the
308 And what if the requested URI is `abc%20 def'? If we call
309 url_escape, we end up with `/abc%2520%20def', which is almost
310 certainly not intended. If we don't call url_escape, we are left
311 with the embedded space and cannot complete the request. What the
312 user meant was for Wget to request `/abc%20%20def', and this is
313 where reencode_escapes kicks in.
315 Wget used to solve this by first decoding %-quotes, and then
316 encoding all the "unsafe" characters found in the resulting string.
317 This was wrong because it didn't preserve certain URL special
318 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
319 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
320 whether we considered `+' reserved (it is). One of these results
321 is inevitable because by the second step we would lose information
322 on whether the `+' was originally encoded or not. Both results
323 were wrong because in CGI parameters + means space, while %2B means
324 literal plus. reencode_escapes correctly translates the above to
325 "a%2B+b", i.e. returns the original string.
327 This function uses an algorithm proposed by Anon Sricharoenchai:
329 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
332 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
335 ...except that this code conflates the two steps, and decides
336 whether to encode, decode, or pass through each character in turn.
337 The function still uses two passes, but their logic is the same --
338 the first pass exists merely for the sake of allocation. Another
339 small difference is that we include `+' to URL_RESERVED.
343 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
345 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
349 "foo bar" -> "foo%20bar"
350 "foo%20bar" -> "foo%20bar"
351 "foo %20bar" -> "foo%20%20bar"
352 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
353 "foo%25%20bar" -> "foo%25%20bar"
354 "foo%2%20bar" -> "foo%252%20bar"
355 "foo+bar" -> "foo+bar" (plus is reserved!)
356 "foo%2b+bar" -> "foo%2b+bar" */
/* NOTE(review): the excerpt below elides the return type, braces, the
   switch case labels, and the default copy path; the surviving lines
   show the two-pass structure (count, then transform). */
359 reencode_escapes (const char *s)
365 int encode_count = 0;
366 int decode_count = 0;
368 /* First, pass through the string to see if there's anything to do,
369 and to calculate the new length. */
370 for (p1 = s; *p1; p1++)
372 switch (decide_copy_method (p1))
385 if (!encode_count && !decode_count)
386 /* The string is good as it is. */
387 return (char *)s; /* C const model sucks. */
390 /* Each encoding adds two characters (hex digits), while each
391 decoding removes two characters. */
392 newlen = oldlen + 2 * (encode_count - decode_count);
393 newstr = xmalloc (newlen + 1);
/* Second pass: apply the per-character decision to build NEWSTR. */
400 switch (decide_copy_method (p1))
404 unsigned char c = *p1++;
406 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
407 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
411 *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
412 + (XCHAR_TO_XDIGIT (*(p1 + 2))));
413 p1 += 3; /* skip %xx */
420 assert (p2 - newstr == newlen);
424 /* Returns the scheme type if the scheme is supported, or
425 SCHEME_INVALID if not. */
/* NOTE(review): return types, braces and some statements are elided
   throughout this excerpt; the fragments below are incomplete. */
427 url_scheme (const char *url)
/* Case-insensitive match of URL's prefix against each supported
   scheme's leading string ("http://", etc.). */
431 for (i = 0; supported_schemes[i].leading_string; i++)
432 if (0 == strncasecmp (url, supported_schemes[i].leading_string,
433 strlen (supported_schemes[i].leading_string)))
435 if (supported_schemes[i].enabled)
436 return (enum url_scheme) i;
/* Matched a known scheme that has been disabled (scheme_disable). */
438 return SCHEME_INVALID;
441 return SCHEME_INVALID;
444 /* Return the number of characters needed to skip the scheme part of
445 the URL, e.g. `http://'. If no scheme is found, returns 0. */
447 url_skip_scheme (const char *url)
451 /* Skip the scheme name. We allow `-' and `+' because of `whois++',
453 while (ISALNUM (*p) || *p == '-' || *p == '+')
460 /* Skip "//" if found. */
461 if (*p == '/' && *(p + 1) == '/')
467 /* Returns 1 if the URL begins with a scheme (supported or
468 unsupported), 0 otherwise. */
470 url_has_scheme (const char *url)
473 while (ISALNUM (*p) || *p == '-' || *p == '+')
/* Return the default port for SCHEME (index into supported_schemes). */
479 scheme_default_port (enum url_scheme scheme)
481 return supported_schemes[scheme].default_port;
/* Mark SCHEME as disabled so url_scheme rejects it. */
485 scheme_disable (enum url_scheme scheme)
487 supported_schemes[scheme].enabled = 0;
/* Skip the username and password, if present here.  The function
   should be called *not* with the complete URL, but with the part
   right after the scheme.

   If a "user[:password]@" prefix terminated by '@' occurs before any
   '/' or '?', return the number of characters to skip, including the
   '@' itself; otherwise return 0. */

static int
url_skip_uname (const char *url)
{
  const char *p;

  /* Look for '@' that comes before '/' or '?'.  If '/' or '?' (or
     the end of the string) comes first, there is no user:password
     section in this URL. */
  p = strpbrk (url, "/?@");
  if (!p || *p != '@')
    return 0;

  /* Skip everything up to and including the '@'. */
  return p - url + 1;
}
/* Split the LEN-byte region STR ("user" or "user:password") into
   freshly allocated, URL-unescaped *USER and *PASSWD.
   NOTE(review): the return type, braces and the failure returns are
   elided from this excerpt; judging by the caller in url_parse, it
   presumably returns 0 on failure -- confirm against full source. */
509 parse_uname (const char *str, int len, char **user, char **passwd)
514 /* Empty user name not allowed. */
/* The colon, if any, separates user from password within [str, str+len). */
517 colon = memchr (str, ':', len);
519 /* Empty user name again. */
/* Copy out the password part after the colon. */
524 int pwlen = len - (colon + 1 - str);
525 *passwd = xmalloc (pwlen + 1);
526 memcpy (*passwd, colon + 1, pwlen);
527 (*passwd)[pwlen] = '\0';
/* Copy out the user part; LEN has been adjusted on an elided line
   when a colon was found -- do not assume it is still the full span. */
533 *user = xmalloc (len + 1);
534 memcpy (*user, str, len);
/* Both components are stored %-decoded. */
538 url_unescape (*user);
540 url_unescape (*passwd);
545 /* Used by main.c: detect URLs written using the "shorthand" URL forms
546 popularized by Netscape and NcFTP. HTTP shorthands look like this:
548 www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
549 www.foo.com[:port] -> http://www.foo.com[:port]
551 FTP shorthands look like this:
553 foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
554 foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
556 If the URL needs not or cannot be rewritten, return NULL. */
/* NOTE(review): return type, braces and several statements are elided
   from this excerpt. */
558 rewrite_shorthand_url (const char *url)
/* Already has a scheme: nothing to rewrite (returns NULL on an
   elided line, per the contract above). */
562 if (url_has_scheme (url))
565 /* Look for a ':' or '/'. The former signifies NcFTP syntax, the
567 for (p = url; *p && *p != ':' && *p != '/'; p++)
577 /* If the characters after the colon and before the next slash
578 or end of string are all digits, it's HTTP. */
580 for (pp = p + 1; ISDIGIT (*pp); pp++)
582 if (digits > 0 && (*pp == '/' || *pp == '\0'))
585 /* Prepend "ftp://" to the entire URL... */
586 res = xmalloc (6 + strlen (url) + 1);
587 sprintf (res, "ftp://%s", url);
588 /* ...and replace ':' with '/'. */
589 res[6 + (p - url)] = '/';
596 /* Just prepend "http://" to what we have. */
597 res = xmalloc (7 + strlen (url) + 1);
598 sprintf (res, "http://%s", url);
603 static void parse_path PARAMS ((const char *, char **, char **));
605 /* Like strpbrk, with the exception that it returns the pointer to the
606 terminating zero (end-of-string aka "eos") if no matching character
609 Although I normally balk at Gcc-specific optimizations, it probably
610 makes sense here: glibc has optimizations that detect strpbrk being
611 called with literal string as ACCEPT and inline the search. That
612 optimization is defeated if strpbrk is hidden within the call to
613 another function. (And no, making strpbrk_or_eos inline doesn't
614 help because the check for literal accept is in the
#ifdef __GNUC__
/* GCC statement expression keeps a literal ACCEPT argument visible to
   glibc's strpbrk optimization (see the comment above). */
#define strpbrk_or_eos(s, accept) ({		\
  char *SOE_p = strpbrk (s, accept);		\
  if (!SOE_p)					\
    SOE_p = (char *)s + strlen (s);		\
  SOE_p;					\
})
#else  /* not __GNUC__ */
/* Like strpbrk, but return a pointer to the terminating NUL instead
   of NULL when no character from ACCEPT occurs in S. */
static char *
strpbrk_or_eos (const char *s, const char *accept)
{
  char *p = strpbrk (s, accept);
  if (!p)
    p = (char *)s + strlen (s);
  return p;
}
#endif /* not __GNUC__ */
/* Turn STR into lowercase in place; return non-zero if at least one
   character was changed, zero if STR was already all-lowercase. */
static int
lowercase_str (char *str)
{
  int changed = 0;

  for (; *str; str++)
    {
      /* Cast to unsigned char before the <ctype.h> calls: passing a
	 negative char value is undefined behavior. */
      unsigned char c = (unsigned char) *str;
      if (isupper (c))
	{
	  changed = 1;
	  *str = (char) tolower (c);
	}
    }
  return changed;
}
/* Human-readable messages for url_parse failures, indexed by the
   PE_* codes interleaved below (url_error does the lookup).
   NOTE(review): several message strings and the array's closing
   brace are on elided lines in this excerpt. */
654 static char *parse_errors[] = {
655 #define PE_NO_ERROR 0
657 #define PE_UNSUPPORTED_SCHEME 1
658 "Unsupported scheme",
659 #define PE_EMPTY_HOST 2
661 #define PE_BAD_PORT_NUMBER 3
663 #define PE_INVALID_USER_NAME 4
665 #define PE_UNTERMINATED_IPV6_ADDRESS 5
666 "Unterminated IPv6 numeric address",
667 #define PE_IPV6_NOT_SUPPORTED 6
668 "IPv6 addresses not supported",
669 #define PE_INVALID_IPV6_ADDRESS 7
670 "Invalid IPv6 numeric address"
/* Store error code V through pointer P if P is non-NULL; the macro
   body continues on elided lines. */
673 #define SETERR(p, v) do { \
679 /* The following two functions were adapted from glibc. */
/* Return 1 if [STR, END) is a valid dotted-quad IPv4 address: exactly
   four decimal octets, each in 0-255, separated by single dots.
   Adapted from glibc's inet_pton4 parsing logic. */
static int
is_valid_ipv4_address (const char *str, const char *end)
{
  int saw_digit = 0;		/* inside an octet's digit run? */
  int octets = 0;		/* octets started so far */
  int val = 0;			/* value of the current octet */

  while (str < end) {
    int ch = *str++;

    if (ch >= '0' && ch <= '9') {
      val = val * 10 + (ch - '0');

      if (val > 255)
	return 0;
      if (saw_digit == 0) {
	/* First digit of a new octet. */
	if (++octets > 4)
	  return 0;
	saw_digit = 1;
      }
    } else if (ch == '.' && saw_digit == 1) {
      /* A dot must follow a digit and may not follow the 4th octet. */
      if (octets == 4)
	return 0;
      val = 0;
      saw_digit = 0;
    } else
      return 0;
  }
  /* Reject trailing dots and too-few octets. */
  if (octets < 4 || saw_digit == 0)
    return 0;

  return 1;
}
/* Validate the IPv6 numeric address in [STR, END); adapted from
   glibc (see comment above).  NOTE(review): most of this function's
   body -- declarations, the main loop, braces, returns -- is on
   elided lines in this excerpt; only fragments survive. */
719 is_valid_ipv6_address (const char *str, const char *end)
721 static const char xdigits[] = "0123456789abcdef";
734 /* Leading :: requires some special handling. */
738 if (str == end || *str != ':')
750 /* if ch is a number, add it to val. */
/* strchr into the xdigits table doubles as hex-digit test and
   value lookup (pch - xdigits is the digit's value). */
751 pch = strchr(xdigits, ch);
754 val |= (pch - xdigits);
761 /* if ch is a colon ... */
764 if (saw_xdigit == 0) {
769 } else if (str == end) {
/* TP tracks how many address bytes have been consumed; it may not
   exceed the 16-byte IPv6 size minus one 16-bit group. */
772 if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
780 /* if ch is a dot ... */
/* Trailing dotted-quad (IPv4-mapped) form: the last 4 bytes may be
   given as an IPv4 address. */
781 if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ) &&
782 is_valid_ipv4_address(curtok, end) == 1) {
791 if (saw_xdigit == 1) {
792 if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
/* COLONP records the position of a "::"; if present, the address is
   valid only when it is actually shorter than 16 bytes. */
797 if (colonp != NULL) {
798 if (tp == NS_IN6ADDRSZ)
803 if (tp != NS_IN6ADDRSZ)
812 Return a new struct url if successful, NULL on error. In case of
813 error, and if ERROR is not NULL, also set *ERROR to the appropriate
/* NOTE(review): large portions of url_parse (return type, braces,
   error-path gotos/returns, several assignments) are on elided lines
   in this excerpt; the fragments below sketch the overall flow:
   scheme -> reencode -> uname -> host -> port -> path/params/query/
   fragment -> build struct url. */
816 url_parse (const char *url, int *error)
820 int path_modified, host_modified;
822 enum url_scheme scheme;
/* Begin/end pointer pairs delimiting each component inside the
   (re-encoded) URL string. */
824 const char *uname_b, *uname_e;
825 const char *host_b, *host_e;
826 const char *path_b, *path_e;
827 const char *params_b, *params_e;
828 const char *query_b, *query_e;
829 const char *fragment_b, *fragment_e;
832 char *user = NULL, *passwd = NULL;
836 scheme = url_scheme (url);
837 if (scheme == SCHEME_INVALID)
839 SETERR (error, PE_UNSUPPORTED_SCHEME);
/* Canonicalize %-escapes first; URL_ENCODED may alias URL when
   nothing changed (see reencode_escapes). */
843 url_encoded = reencode_escapes (url);
846 p += strlen (supported_schemes[scheme].leading_string);
848 p += url_skip_uname (p);
851 /* scheme://user:pass@host[:port]... */
854 /* We attempt to break down the URL into the components path,
855 params, query, and fragment. They are ordered like this:
857 scheme://host[:port][/path][;params][?query][#fragment] */
859 params_b = params_e = NULL;
860 query_b = query_e = NULL;
861 fragment_b = fragment_e = NULL;
867 /* Handle IPv6 address inside square brackets. Ideally we'd
868 just look for the terminating ']', but rfc2732 mandates
869 rejecting invalid IPv6 addresses. */
871 /* The address begins after '['. */
873 host_e = strchr (host_b, ']');
877 SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS);
882 /* Check if the IPv6 address is valid. */
883 if (!is_valid_ipv6_address(host_b, host_e))
885 SETERR (error, PE_INVALID_IPV6_ADDRESS);
889 /* Continue parsing after the closing ']'. */
/* Reached when Wget is built without IPv6 support (the guarding
   #ifdef is on an elided line -- confirm). */
892 SETERR (error, PE_IPV6_NOT_SUPPORTED);
898 p = strpbrk_or_eos (p, ":/;?#");
902 if (host_b == host_e)
904 SETERR (error, PE_EMPTY_HOST);
908 port = scheme_default_port (scheme);
911 const char *port_b, *port_e, *pp;
913 /* scheme://host:port/tralala */
917 p = strpbrk_or_eos (p, "/;?#");
920 if (port_b == port_e)
922 /* http://host:/whatever */
924 SETERR (error, PE_BAD_PORT_NUMBER);
/* Manual decimal parse of the port; any non-digit is an error. */
928 for (port = 0, pp = port_b; pp < port_e; pp++)
932 /* http://host:12randomgarbage/blah */
934 SETERR (error, PE_BAD_PORT_NUMBER);
938 port = 10 * port + (*pp - '0');
946 p = strpbrk_or_eos (p, ";?#");
951 /* Path is not allowed not to exist. */
959 p = strpbrk_or_eos (p, "?#");
966 p = strpbrk_or_eos (p, "#");
969 /* Hack that allows users to use '?' (a wildcard character) in
970 FTP URLs without it being interpreted as a query string
972 if (scheme == SCHEME_FTP)
974 query_b = query_e = NULL;
987 if (uname_b != uname_e)
989 /* http://user:pass@host */
991 /* uname_b uname_e */
/* The "- 1" presumably excludes the trailing '@' counted by
   url_skip_uname -- confirm against full source. */
992 if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
994 SETERR (error, PE_INVALID_USER_NAME);
999 u = (struct url *)xmalloc (sizeof (struct url));
1000 memset (u, 0, sizeof (*u));
1003 u->host = strdupdelim (host_b, host_e);
1008 u->path = strdupdelim (path_b, path_e);
1009 path_modified = path_simplify (u->path);
1010 parse_path (u->path, &u->dir, &u->file);
1012 host_modified = lowercase_str (u->host);
1015 u->params = strdupdelim (params_b, params_e);
1017 u->query = strdupdelim (query_b, query_e);
1019 u->fragment = strdupdelim (fragment_b, fragment_e);
1021 if (path_modified || u->fragment || host_modified || path_b == path_e)
1023 /* If we suspect that a transformation has rendered what
1024 url_string might return different from URL_ENCODED, rebuild
1025 u->url using url_string. */
1026 u->url = url_string (u, 0);
1028 if (url_encoded != url)
1029 xfree ((char *) url_encoded);
/* Otherwise keep the re-encoded string; copy it only if it still
   aliases the caller's URL argument. */
1033 if (url_encoded == url)
1034 u->url = xstrdup (url);
1036 u->url = url_encoded;
/* Map a PE_* error code to its message in parse_errors.
   NOTE(review): return type and braces are on elided lines. */
1044 url_error (int error_code)
1046 assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
1047 return parse_errors[error_code];
1050 /* Parse PATH into dir and file. PATH is extracted from the URL and
1051 is URL-escaped. The function returns unescaped DIR and FILE. */
1054 parse_path (const char *path, char **dir, char **file)
/* Split at the last '/': everything before it is DIR, after is FILE;
   with no slash, DIR is empty and FILE is the whole path. */
1058 last_slash = strrchr (path, '/');
1061 *dir = xstrdup ("");
1062 *file = xstrdup (path);
1066 *dir = strdupdelim (path, last_slash);
1067 *file = xstrdup (last_slash + 1);
/* Both results are handed back %-decoded, per the contract above. */
1069 url_unescape (*dir);
1070 url_unescape (*file);
1073 /* Note: URL's "full path" is the path with the query string and
1074 params appended. The "fragment" (#foo) is intentionally ignored,
1075 but that might be changed. For example, if the original URL was
1076 "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
1077 the full path will be "/foo/bar/baz;bullshit?querystring". */
1079 /* Return the length of the full path, without the terminating
/* NOTE(review): bodies below are partially elided (braces, FROB
   invocations, #undef lines). */
1083 full_path_length (const struct url *url)
/* FROB(el): count 1 (separator char) plus the element's length when
   the element is present. */
1087 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
1098 /* Write out the full path. */
1101 full_path_write (const struct url *url, char *where)
/* FROB(el, chr): emit separator CHR followed by element EL, when
   present; mirrors full_path_length's FROB so lengths agree. */
1103 #define FROB(el, chr) do { \
1104 char *f_el = url->el; \
1106 int l = strlen (f_el); \
1108 memcpy (where, f_el, l); \
1120 /* Public function for getting the "full path". E.g. if u->path is
1121 "foo/bar" and u->query is "param=value", full_path will be
1122 "/foo/bar?param=value". */
1125 url_full_path (const struct url *url)
1127 int length = full_path_length (url);
1128 char *full_path = (char *)xmalloc(length + 1);
1130 full_path_write (url, full_path);
1131 full_path[length] = '\0';
1136 /* Escape unsafe and reserved characters, except for the slash
1140 url_escape_dir (const char *dir)
1142 char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1147 /* Unescape slashes in NEWDIR. */
1149 h = newdir; /* hare */
1150 t = newdir; /* tortoise */
1152 for (; *h; h++, t++)
/* Turn "%2F" back into a literal '/' so directory separators survive
   the reserved-character escaping above. */
1154 if (*h == '%' && h[1] == '2' && h[2] == 'F')
1167 /* Sync u->path and u->url with u->dir and u->file. Called after
1168 u->file or u->dir have been changed, typically by the FTP code. */
/* NOTE(review): braces, frees and several statements are elided from
   the fragments below. */
1171 sync_path (struct url *u)
1173 char *newpath, *efile, *edir;
1177 /* u->dir and u->file are not escaped. URL-escape them before
1178 reassembling them into u->path. That way, if they contain
1179 separators like '?' or even if u->file contains slashes, the
1180 path will be correctly assembled. (u->file can contain slashes
1181 if the URL specifies it with %2f, or if an FTP server returns
1183 edir = url_escape_dir (u->dir);
1184 efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
/* Empty dir: the new path is just the (escaped) file name. */
1187 newpath = xstrdup (efile);
1190 int dirlen = strlen (edir);
1191 int filelen = strlen (efile);
1193 /* Copy "DIR/FILE" to newpath. */
1194 char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1195 memcpy (p, edir, dirlen);
1198 memcpy (p, efile, filelen);
/* url_escape_1 with passthrough may have returned u->file itself;
   only free when a new string was actually allocated. */
1207 if (efile != u->file)
1210 /* Regenerate u->url as well. */
1212 u->url = url_string (u, 0);
1215 /* Mutators. Code in ftp.c insists on changing u->dir and u->file.
1216 This way we can sync u->path and u->url when they get changed. */
/* Replace u->dir with a copy of NEWDIR (old value freed on an elided
   line), then resync path/url. */
1219 url_set_dir (struct url *url, const char *newdir)
1222 url->dir = xstrdup (newdir);
/* Replace u->file with a copy of NEWFILE, then resync path/url. */
1227 url_set_file (struct url *url, const char *newfile)
1230 url->file = xstrdup (newfile);
/* Release every string owned by URL and the struct itself (several
   frees are on elided lines). */
1235 url_free (struct url *url)
1241 FREE_MAYBE (url->params);
1242 FREE_MAYBE (url->query);
1243 FREE_MAYBE (url->fragment);
1244 FREE_MAYBE (url->user);
1245 FREE_MAYBE (url->passwd);
/* Read FILE and return a linked list of urlpos entries, one per
   non-blank line.  NOTE(review): return type, braces, list-linking
   code and the return statement are on elided lines. */
1254 get_urls_file (const char *file)
1256 struct file_memory *fm;
1257 struct urlpos *head, *tail;
1258 const char *text, *text_end;
1260 /* Load the file. */
1261 fm = read_file (file);
1264 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
1267 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
/* Walk the in-memory buffer line by line. */
1271 text_end = fm->content + fm->length;
1272 while (text < text_end)
1274 const char *line_beg = text;
1275 const char *line_end = memchr (text, '\n', text_end - text);
/* No final newline: the last line runs to end of buffer. */
1277 line_end = text_end;
1282 /* Strip whitespace from the beginning and end of line. */
1283 while (line_beg < line_end && ISSPACE (*line_beg))
1285 while (line_end > line_beg && ISSPACE (*(line_end - 1)))
1288 if (line_end > line_beg)
1290 /* URL is in the [line_beg, line_end) region. */
1294 struct urlpos *entry;
1297 /* We must copy the URL to a zero-terminated string, and we
1298 can't use alloca because we're in a loop. *sigh*. */
1299 url_text = strdupdelim (line_beg, line_end);
1303 /* Merge opt.base_href with URL. */
1304 char *merged = uri_merge (opt.base_href, url_text);
1309 url = url_parse (url_text, &up_error_code);
/* Bad URLs are reported and skipped, not fatal. */
1312 logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1313 file, url_text, url_error (up_error_code));
1319 entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1320 memset (entry, 0, sizeof (*entry));
1331 read_file_free (fm);
1335 /* Free the linked list of urlpos. */
/* Iterates the list, freeing each node's url/local_name and the node
   itself (loop shell is on elided lines). */
1337 free_urlpos (struct urlpos *l)
1341 struct urlpos *next = l->next;
1344 FREE_MAYBE (l->local_name);
1350 /* Rotate FNAME opt.backups times */
/* Renames FNAME.1 -> FNAME.2 -> ... so a new FNAME.1 slot is free.
   NOTE(review): return type, braces and the rename() calls are on
   elided lines in this excerpt. */
1352 rotate_backups(const char *fname)
/* Room for "FNAME" + "." + digits of opt.backups + NUL. */
1354 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1355 char *from = (char *)alloca (maxlen);
1356 char *to = (char *)alloca (maxlen);
/* Only regular files are rotated. */
1360 if (stat (fname, &sb) == 0)
1361 if (S_ISREG (sb.st_mode) == 0)
/* Shift highest-numbered backups first so nothing is clobbered. */
1364 for (i = opt.backups; i > 1; i--)
1366 sprintf (from, "%s.%d", fname, i - 1);
1367 sprintf (to, "%s.%d", fname, i);
1371 sprintf (to, "%s.%d", fname, 1);
1375 /* Create all the necessary directories for PATH (a file). Calls
1376 mkdirhier() internally. */
/* NOTE(review): return type, braces and several statements (frees,
   returns, the unlink call) are on elided lines below. */
1378 mkalldirs (const char *path)
/* Scan backwards from the end to the last '/', isolating the
   directory prefix of PATH. */
1385 p = path + strlen (path);
1386 for (; *p != '/' && p != path; p--)
1389 /* Don't create if it's just a file. */
1390 if ((p == path) && (*p != '/'))
1392 t = strdupdelim (path, p);
1394 /* Check whether the directory exists. */
1395 if ((stat (t, &st) == 0))
1397 if (S_ISDIR (st.st_mode))
1404 /* If the dir exists as a file name, remove it first. This
1405 is *only* for Wget to work with buggy old CERN http
1406 servers. Here is the scenario: When Wget tries to
1407 retrieve a directory without a slash, e.g.
1408 http://foo/bar (bar being a directory), CERN server will
1409 not redirect it to http://foo/bar/ -- it will generate a
1410 directory listing containing links to bar/file1,
1411 bar/file2, etc. Wget will lose because it saves this
1412 HTML listing to a file `bar', so it cannot create the
1413 directory. To work around this, if the file of the same
1414 name exists, we just remove it and create the directory
1416 DEBUGP (("Removing %s because of directory danger!\n", t));
1420 res = make_directory (t);
1422 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1427 /* Functions for constructing the file name out of URL components. */
1429 /* A growable string structure, used by url_file_name and friends.
1430 This should perhaps be moved to utils.c.
1432 The idea is to have a convenient and efficient way to construct a
1433 string by having various functions append data to it. Instead of
1434 passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
1435 functions in questions, we pass the pointer to this struct. */
/* NOTE(review): the struct growable definition itself and parts of
   the GROW macro body are on elided lines in this excerpt;
   presumably the struct has members base, size, and tail -- confirm. */
1443 /* Ensure that the string can accept APPEND_COUNT more characters past
1444 the current TAIL position. If necessary, this will grow the string
1445 and update its allocated size. If the string is already large
1446 enough to take TAIL+APPEND_COUNT characters, this does nothing. */
1447 #define GROW(g, append_size) do { \
1448 struct growable *G_ = g; \
1449 DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char); \
1452 /* Return the tail position of the string. */
1453 #define TAIL(r) ((r)->base + (r)->tail)
1455 /* Move the tail position by APPEND_COUNT characters. */
1456 #define TAIL_INCR(r, append_count) ((r)->tail += append_count)
1458 /* Append the string STR to DEST. NOTICE: the string in DEST is not
/* Grows DEST, copies STR (without its NUL), and advances the tail;
   the GROW call sits on an elided line. */
1462 append_string (const char *str, struct growable *dest)
1464 int l = strlen (str);
1466 memcpy (TAIL (dest), str, l);
1467 TAIL_INCR (dest, l);
1470 /* Append CH to DEST. For example, append_char (0, DEST)
1471 zero-terminates DEST. */
1474 append_char (char ch, struct growable *dest)
1478 TAIL_INCR (dest, 1);
/* Bit flags classifying characters for file-name quoting; combined
   into a mask and tested via FILE_CHAR_TEST.  The enum's opening
   line is elided. */
1482 filechr_not_unix = 1, /* unusable on Unix, / and \0 */
1483 filechr_not_windows = 2, /* unusable on Windows, one of \|/<>?:*" */
1484 filechr_control = 4, /* a control character, e.g. 0-31 */
1487 #define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))
1489 /* Shorthands for the table: */
1490 #define U filechr_not_unix
1491 #define W filechr_not_windows
1492 #define C filechr_control
1497 /* Table of characters unsafe under various conditions (see above).
1499 Arguably we could also claim `%' to be unsafe, since we use it as
1500 the escape character. If we ever want to be able to reliably
1501 translate file name back to URL, this would become
1502 crucial. Right now, it's better to be minimal in escaping. */
/* NOTE(review): the table's braces, the UWC shorthand definition and
   the trailing #undef lines are elided from this excerpt; UWC is
   presumably U|W|C -- confirm. */
1504 const static unsigned char filechr_table[256] =
1506 UWC, C, C, C, C, C, C, C, /* NUL SOH STX ETX EOT ENQ ACK BEL */
1507 C, C, C, C, C, C, C, C, /* BS HT LF VT FF CR SO SI */
1508 C, C, C, C, C, C, C, C, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
1509 C, C, C, C, C, C, C, C, /* CAN EM SUB ESC FS GS RS US */
1510 0, 0, W, 0, 0, 0, 0, 0, /* SP ! " # $ % & ' */
1511 0, 0, W, 0, 0, 0, 0, UW, /* ( ) * + , - . / */
1512 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
1513 0, 0, W, 0, W, 0, W, W, /* 8 9 : ; < = > ? */
1514 0, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
1515 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
1516 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
1517 0, 0, 0, 0, W, 0, 0, 0, /* X Y Z [ \ ] ^ _ */
1518 0, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
1519 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
1520 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
1521 0, 0, 0, 0, 0, 0, 0, 0, /* x y z { | } ~ DEL */
/* 128-159 are treated as control characters; 160-255 pass through. */
1523 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 128-143 */
1524 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 144-159 */
1525 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1526 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1528 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1529 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1530 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1531 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1539 /* FN_PORT_SEP is the separator between host and port in file names
1540 for non-standard port numbers. On Unix this is normally ':', as in
1541 "www.xemacs.org:4001/index.html". Under Windows, we set it to +
1542 because Windows can't handle ':' in file names. */
1543 #define FN_PORT_SEP (opt.restrict_files_os != restrict_windows ? ':' : '+')
1545 /* FN_QUERY_SEP is the separator between the file name and the URL
1546 query, normally '?'. Since Windows cannot handle '?' as part of
1547 file name, we use '@' instead there. */
1548 #define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
1550 /* Quote path element, characters in [b, e), as file name, and append
1551 the quoted string to DEST. Each character is quoted as per
1552 file_unsafe_char and the corresponding table. */
/* NOTE(review): return type, declarations, braces and the quoting
   branch's '%' emission are on elided lines in this excerpt. */
1555 append_uri_pathel (const char *b, const char *e, struct growable *dest)
/* Build the quoting mask from the user's OS/control restrictions. */
1564 if (opt.restrict_files_os == restrict_unix)
1565 mask = filechr_not_unix;
1567 mask = filechr_not_windows;
1568 if (opt.restrict_files_ctrl)
1569 mask |= filechr_control;
1571 /* Copy [b, e) to PATHEL and URL-unescape it. */
1572 BOUNDED_TO_ALLOCA (b, e, pathel);
1573 url_unescape (pathel);
1574 pathlen = strlen (pathel);
1576 /* Go through PATHEL and check how many characters we'll need to
1577 add for file quoting. */
1579 for (p = pathel; *p; p++)
1580 if (FILE_CHAR_TEST (*p, mask))
1583 /* p - pathel is the string length. Each quoted char means two
1584 additional characters in the string, hence 2*quoted. */
1585 outlen = (p - pathel) + (2 * quoted);
1586 GROW (dest, outlen);
1590 /* If there's nothing to quote, we don't need to go through the
1591 string the second time. */
1592 memcpy (TAIL (dest), pathel, outlen);
/* Second pass: copy plain characters, %HH-quote the unsafe ones. */
1596 char *q = TAIL (dest);
1597 for (p = pathel; *p; p++)
1599 if (!FILE_CHAR_TEST (*p, mask))
1603 unsigned char ch = *p;
1605 *q++ = XDIGIT_TO_XCHAR (ch >> 4);
1606 *q++ = XDIGIT_TO_XCHAR (ch & 0xf);
1609 assert (q - TAIL (dest) == outlen);
1611 TAIL_INCR (dest, outlen);
1614 /* Append to DEST the directory structure that corresponds the
1615 directory part of URL's path. For example, if the URL is
1616 http://server/dir1/dir2/file, this appends "/dir1/dir2".
1618 Each path element ("dir1" and "dir2" in the above example) is
1619 examined, url-unescaped, and re-escaped as file name element.
1621 Additionally, it cuts as many directories from the path as
1622 specified by opt.cut_dirs. For example, if opt.cut_dirs is 1, it
1623 will produce "bar" for the above example. For 2 or more, it will
1626 Each component of the path is quoted for use as file name. */
/* NOTE(review): the initialization of PATHEL (presumably u->path) and
   the decrement of CUT fall in listing gaps -- confirm upstream. */
1629 append_dir_structure (const struct url *u, struct growable *dest)
1631 char *pathel, *next;
1632 int cut = opt.cut_dirs;
1634 /* Go through the path components, de-URL-quote them, and quote them
1635 (if necessary) as file names. */
1638 for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1643 /* Ignore empty pathels. path_simplify should remove
1644 occurrences of "//" from the path, but it has special cases
1645 for starting / which generates an empty pathel here. */
1649 append_char ('/', dest);
1650 append_uri_pathel (pathel, next, dest);
1654 /* Return a unique file name that matches the given URL as good as
1655 possible. Does not create directories on the file system. */
/* NOTE(review): listing gaps hide the initialization of FNRES and the
   assignment of FNAME (presumably fnres.base after termination), plus
   the function's return statement -- confirm against upstream. */
1658 url_file_name (const struct url *u)
1660 struct growable fnres;
1662 char *u_file, *u_query;
1663 char *fname, *unique;
1669 /* Start with the directory prefix, if specified. */
1670 if (!DOTP (opt.dir_prefix))
1671 append_string (opt.dir_prefix, &fnres);
1673 /* If "dirstruct" is turned on (typically the case with -r), add
1674 the host and port (unless those have been turned off) and
1675 directory structure. */
1678 if (opt.add_hostdir)
1681 append_char ('/', &fnres);
1682 append_string (u->host, &fnres);
/* Only non-default ports become part of the file name. */
1683 if (u->port != scheme_default_port (u->scheme))
1686 number_to_string (portstr, u->port);
1687 append_char (FN_PORT_SEP, &fnres);
1688 append_string (portstr, &fnres);
1692 append_dir_structure (u, &fnres);
1695 /* Add the file name. */
1697 append_char ('/', &fnres);
1698 u_file = *u->file ? u->file : "index.html";
1699 append_uri_pathel (u_file, u_file + strlen (u_file), &fnres);
1701 /* Append "?query" to the file name. */
1702 u_query = u->query && *u->query ? u->query : NULL;
1705 append_char (FN_QUERY_SEP, &fnres);
1706 append_uri_pathel (u_query, u_query + strlen (u_query), &fnres);
1709 /* Zero-terminate the file name. */
1710 append_char ('\0', &fnres);
1714 /* Check the cases in which the unique extensions are not used:
1715 1) Clobbering is turned off (-nc).
1716 2) Retrieval with regetting.
1717 3) Timestamping is used.
1718 4) Hierarchy is built.
1720 The exception is the case when file does exist and is a
1721 directory (see `mkalldirs' for explanation). */
1723 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1724 && !(file_exists_p (fname) && !file_non_directory_p (fname))
1727 unique = unique_name (fname, 1);
1728 if (unique != fname)
1733 /* Return the length of URL's path. Path is considered to be
1734 terminated by one of '?', ';', '#', or by the end of the
/* NOTE(review): the return type and the return statement (presumably
   `return q - url;`) fall in listing gaps -- confirm upstream. */
1737 path_length (const char *url)
1739 const char *q = strpbrk_or_eos (url, "?;#");
1743 /* Find the last occurrence of character C in the range [b, e), or
1744 NULL, if none are present. This is equivalent to strrchr(b, c),
1745 except that it accepts an END argument instead of requiring the
1746 string to be zero-terminated. Why is there no memrchr()? */
/* NOTE(review): the entire body of this helper falls in a listing gap
   (original lines 1749-1755); only the signature survives here. */
1748 find_last_char (const char *b, const char *e, char c)
1756 /* Resolve "." and ".." elements of PATH by destructively modifying
1757 PATH. "." is resolved by removing that path element, and ".." is
1758 resolved by removing the preceding path element. Leading and
1759 trailing slashes are preserved.
1761 Return non-zero if any changes have been made.
1763 For example, "a/b/c/./../d/.." will yield "a/b/". More exhaustive
1764 test examples are provided below. If you change anything in this
1765 function, run test_path_simplify to make sure you haven't broken a
1768 A previous version of this function was based on path_simplify()
1769 from GNU Bash, but it has been rewritten for Wget 1.8.1. */
/* NOTE(review): many lines are missing from this listing (the CHANGE
   flag, the main loop header, END adjustments after the memmoves, and
   most braces).  Do not edit without the complete upstream text. */
1772 path_simplify (char *path)
1778 ++path; /* preserve the leading '/'. */
1781 end = p + strlen (p) + 1; /* position past the terminating zero. */
1786 /* P should point to the beginning of a path element. */
1788 if (*p == '.' && (*(p + 1) == '/' || *(p + 1) == '\0'))
1790 /* Handle "./foo" by moving "foo" two characters to the
1792 if (*(p + 1) == '/')
/* NOTE(review): the length `end - p` looks two bytes too long for a
   source of p + 2 (reads past END); upstream presumably uses
   end - (p + 2) -- verify. */
1795 memmove (p, p + 2, end - p);
1806 else if (*p == '.' && *(p + 1) == '.'
1807 && (*(p + 2) == '/' || *(p + 2) == '\0'))
1809 /* Handle "../foo" by moving "foo" one path element to the
1811 char *b = p; /* not p-1 because P can equal PATH */
1813 /* Backtrack by one path element, but not past the beginning
1816 /* foo/bar/../baz */
1822 /* Move backwards until B hits the beginning of the
1823 previous path element or the beginning of path. */
1824 for (--b; b > path && *(b - 1) != '/'; b--)
1829 if (*(p + 2) == '/')
1831 memmove (b, p + 3, end - (p + 3));
1845 /* Remove empty path elements. Not mandated by rfc1808 et
1846 al, but it seems like a good idea to get rid of them.
1847 Supporting them properly is hard (in which directory do
1848 you save http://x.com///y.html?) and they don't seem to
1859 memmove (p, q, end - q);
1864 /* Skip to the next path element. */
1865 while (*p && *p != '/')
1870 /* Make sure P points to the beginning of the next path element,
1871 which is location after the slash. */
1878 /* Resolve the result of "linking" a base URI (BASE) to a
1879 link-specified URI (LINK).
1881 Either of the URIs may be absolute or relative, complete with the
1882 host name, or path only. This tries to behave "reasonably" in all
1883 foreseeable cases. It employs little specific knowledge about
1884 schemes or URL-specific stuff -- it just works on strings.
1886 The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1887 See uri_merge for a gentler interface to this functionality.
1889 Perhaps this function should call path_simplify so that the callers
1890 don't have to call url_parse unconditionally. */
/* NOTE(review): listing gaps hide the return type, the declaration of
   CONSTR/SLASH/SPAN, the leading `if (no_scheme)` / empty-LINK tests,
   the loop that scans for "//" (around original lines 1979-1991), and
   the final `return constr;`.  The visible logic implements the
   rfc1808-style relative-reference cases: "?query", "#frag", "//net
   path", "/abs path", and plain relative. */
1892 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
1898 const char *end = base + path_length (base);
1902 /* Empty LINK points back to BASE, query string and all. */
1903 constr = xstrdup (base);
1905 else if (*link == '?')
1907 /* LINK points to the same location, but changes the query
1908 string. Examples: */
1909 /* uri_merge("path", "?new") -> "path?new" */
1910 /* uri_merge("path?foo", "?new") -> "path?new" */
1911 /* uri_merge("path?foo#bar", "?new") -> "path?new" */
1912 /* uri_merge("path#foo", "?new") -> "path?new" */
1913 int baselength = end - base;
1914 constr = xmalloc (baselength + linklength + 1);
1915 memcpy (constr, base, baselength);
1916 memcpy (constr + baselength, link, linklength);
1917 constr[baselength + linklength] = '\0';
1919 else if (*link == '#')
1921 /* uri_merge("path", "#new") -> "path#new" */
1922 /* uri_merge("path#foo", "#new") -> "path#new" */
1923 /* uri_merge("path?foo", "#new") -> "path?foo#new" */
1924 /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
1926 const char *end1 = strchr (base, '#');
1928 end1 = base + strlen (base);
1929 baselength = end1 - base;
1930 constr = xmalloc (baselength + linklength + 1);
1931 memcpy (constr, base, baselength);
1932 memcpy (constr + baselength, link, linklength);
1933 constr[baselength + linklength] = '\0';
1935 else if (linklength > 1 && *link == '/' && *(link + 1) == '/')
1937 /* LINK begins with "//" and so is a net path: we need to
1938 replace everything after (and including) the double slash
1941 /* uri_merge("foo", "//new/bar") -> "//new/bar" */
1942 /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */
1943 /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
1947 const char *start_insert;
1949 /* Look for first slash. */
1950 slash = memchr (base, '/', end - base);
1951 /* If found slash and it is a double slash, then replace
1952 from this point, else default to replacing from the
1954 if (slash && *(slash + 1) == '/')
1955 start_insert = slash;
1957 start_insert = base;
1959 span = start_insert - base;
1960 constr = (char *)xmalloc (span + linklength + 1);
1962 memcpy (constr, base, span);
1963 memcpy (constr + span, link, linklength);
1964 constr[span + linklength] = '\0';
1966 else if (*link == '/')
1968 /* LINK is an absolute path: we need to replace everything
1969 after (and including) the FIRST slash with LINK.
1971 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1972 "/qux/xyzzy", our result should be
1973 "http://host/qux/xyzzy". */
1976 const char *start_insert = NULL; /* for gcc to shut up. */
1977 const char *pos = base;
1978 int seen_slash_slash = 0;
1979 /* We're looking for the first slash, but want to ignore
1982 slash = memchr (pos, '/', end - pos);
1983 if (slash && !seen_slash_slash)
1984 if (*(slash + 1) == '/')
1987 seen_slash_slash = 1;
1991 /* At this point, SLASH is the location of the first / after
1992 "//", or the first slash altogether. START_INSERT is the
1993 pointer to the location where LINK will be inserted. When
1994 examining the last two examples, keep in mind that LINK
1997 if (!slash && !seen_slash_slash)
1998 /* example: "foo" */
2000 start_insert = base;
2001 else if (!slash && seen_slash_slash)
2002 /* example: "http://foo" */
2005 else if (slash && !seen_slash_slash)
2006 /* example: "foo/bar" */
2008 start_insert = base;
2009 else if (slash && seen_slash_slash)
2010 /* example: "http://something/" */
2012 start_insert = slash;
2014 span = start_insert - base;
2015 constr = (char *)xmalloc (span + linklength + 1);
2017 memcpy (constr, base, span);
2019 memcpy (constr + span, link, linklength);
2020 constr[span + linklength] = '\0';
2024 /* LINK is a relative URL: we need to replace everything
2025 after last slash (possibly empty) with LINK.
2027 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
2028 our result should be "whatever/foo/qux/xyzzy". */
2029 int need_explicit_slash = 0;
2031 const char *start_insert;
2032 const char *last_slash = find_last_char (base, end, '/');
2035 /* No slash found at all. Append LINK to what we have,
2036 but we'll need a slash as a separator.
2038 Example: if base == "foo" and link == "qux/xyzzy", then
2039 we cannot just append link to base, because we'd get
2040 "fooqux/xyzzy", whereas what we want is
2043 To make sure the / gets inserted, we set
2044 need_explicit_slash to 1. We also set start_insert
2045 to end + 1, so that the length calculations work out
2046 correctly for one more (slash) character. Accessing
2047 that character is fine, since it will be the
2048 delimiter, '\0' or '?'. */
2049 /* example: "foo?..." */
2050 /* ^ ('?' gets changed to '/') */
2051 start_insert = end + 1;
2052 need_explicit_slash = 1;
2054 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
2056 /* example: http://host" */
2058 start_insert = end + 1;
2059 need_explicit_slash = 1;
2063 /* example: "whatever/foo/bar" */
2065 start_insert = last_slash + 1;
2068 span = start_insert - base;
2069 constr = (char *)xmalloc (span + linklength + 1);
2071 memcpy (constr, base, span);
2072 if (need_explicit_slash)
2073 constr[span - 1] = '/';
2075 memcpy (constr + span, link, linklength);
2076 constr[span + linklength] = '\0';
2079 else /* !no_scheme */
2081 constr = strdupdelim (link, link + linklength);
2086 /* Merge BASE with LINK and return the resulting URI. This is an
2087 interface to uri_merge_1 that assumes that LINK is a
2088 zero-terminated string. */
/* Returns freshly allocated memory; the caller owns (and frees) it. */
2090 uri_merge (const char *base, const char *link)
2092 return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
/* Copy string S to pointer P and advance P past it; used by
   url_string below.  NOTE(review): the advance of P (presumably
   `p += len;`) and the closing `} while (0)` fall in a listing gap. */
2095 #define APPEND(p, s) do { \
2096 int len = strlen (s); \
2097 memcpy (p, s, len); \
2101 /* Use this instead of password when the actual password is supposed
2102 to be hidden. We intentionally use a generic string without giving
2103 away the number of characters in the password, like previous
2105 #define HIDDEN_PASSWORD "*password*"
2107 /* Recreate the URL string from the data in URL.
2109 If HIDE is non-zero (as it is when we're calling this on a URL we
2110 plan to print, but not when calling it to canonicalize a URL for
2111 use within the program), password will be hidden. Unsafe
2112 characters in the URL will be quoted. */
/* Returns freshly allocated memory owned by the caller.
   NOTE(review): listing gaps hide the declarations of SIZE/P/RESULT,
   several separator APPENDs (':' '@' '[' ']'), and the final return. */
2115 url_string (const struct url *url, int hide_password)
2119 char *quoted_user = NULL, *quoted_passwd = NULL;
2121 int scheme_port = supported_schemes[url->scheme].default_port;
2122 char *scheme_str = supported_schemes[url->scheme].leading_string;
2123 int fplen = full_path_length (url);
2125 int brackets_around_host = 0;
2127 assert (scheme_str != NULL);
2129 /* Make sure the user name and password are quoted. */
2132 quoted_user = url_escape_allow_passthrough (url->user);
/* When hiding, the password is replaced by the static literal
   HIDDEN_PASSWORD; that is why the xfree at the bottom is guarded by
   !hide_password. */
2136 quoted_passwd = HIDDEN_PASSWORD;
2138 quoted_passwd = url_escape_allow_passthrough (url->passwd);
/* A ':' in the host means an IPv6 numeric address, which must be
   written in brackets. */
2142 if (strchr (url->host, ':'))
2143 brackets_around_host = 1;
2145 size = (strlen (scheme_str)
2146 + strlen (url->host)
2147 + (brackets_around_host ? 2 : 0)
2150 if (url->port != scheme_port)
2151 size += 1 + numdigit (url->port);
2154 size += 1 + strlen (quoted_user);
2156 size += 1 + strlen (quoted_passwd);
2159 p = result = xmalloc (size);
2161 APPEND (p, scheme_str);
2164 APPEND (p, quoted_user);
2168 APPEND (p, quoted_passwd);
2173 if (brackets_around_host)
2175 APPEND (p, url->host);
2176 if (brackets_around_host)
2178 if (url->port != scheme_port)
2181 p = number_to_string (p, url->port);
2184 full_path_write (url, p);
2188 assert (p - result == size);
/* url_escape_allow_passthrough may return its argument unchanged;
   only free the escaped copies we actually allocated. */
2190 if (quoted_user && quoted_user != url->user)
2191 xfree (quoted_user);
2192 if (quoted_passwd && !hide_password
2193 && quoted_passwd != url->passwd)
2194 xfree (quoted_passwd);
2199 /* Return the URL of the proxy appropriate for url U. */
/* Returns NULL when no proxy applies.  The result may point to the
   static buffer REWRITTEN_STORAGE, so this function is not reentrant
   and the caller must not free or retain the string across calls.
   NOTE(review): the declaration of PROXY, the switch head on
   u->scheme, and the returns fall in listing gaps. */
2201 getproxy (struct url *u)
2204 char *rewritten_url;
2205 static char rewritten_storage[1024];
2209 if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
2215 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
2219 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
2223 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
2225 case SCHEME_INVALID:
2228 if (!proxy || !*proxy)
2231 /* Handle shorthands. `rewritten_storage' is a kludge to allow
2232 getproxy() to return static storage. */
2233 rewritten_url = rewrite_shorthand_url (proxy);
/* strncpy does not guarantee NUL-termination; the next line adds it
   explicitly. */
2236 strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
2237 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
2238 proxy = rewritten_storage;
2244 /* Should a host be accessed through proxy, concerning no_proxy? */
/* Returns non-zero when HOST does NOT match any suffix in the
   NO_PROXY list, i.e. when the proxy should be used. */
2246 no_proxy_match (const char *host, const char **no_proxy)
2251 return !sufmatch (no_proxy, host);
2254 /* Support for converting links for local viewing in downloaded HTML
2255 files. This should be moved to another file, because it has
2256 nothing to do with processing URLs. */
/* Forward declarations for the static link-conversion helpers defined
   further down in this file. */
2258 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
2259 static const char *replace_attr PARAMS ((const char *, int, FILE *,
2261 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
2262 const char *, int));
2263 static char *local_quote_string PARAMS ((const char *));
2265 /* Change the links in one HTML file. LINKS is a list of links in the
2266 document, along with their positions and the desired direction of
/* NOTE(review): listing gaps hide the declarations of FP and P (the
   read cursor into fm->content), the early returns after errors, the
   increments of to_url_count/to_file_count, fclose(fp), and most
   braces.  The visible flow is: dry run -> read file into memory ->
   optional .orig backup -> unlink -> rewrite file with converted
   links. */
2269 convert_links (const char *file, struct urlpos *links)
2271 struct file_memory *fm;
2274 downloaded_file_t downloaded_file_return;
2276 struct urlpos *link;
2277 int to_url_count = 0, to_file_count = 0;
2279 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
2282 /* First we do a "dry run": go through the list L and see whether
2283 any URL needs to be converted in the first place. If not, just
2284 leave the file alone. */
2286 struct urlpos *dry = links;
2287 for (dry = links; dry; dry = dry->next)
2288 if (dry->convert != CO_NOCONVERT)
2292 logputs (LOG_VERBOSE, _("nothing to do.\n"));
2297 fm = read_file (file);
2300 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2301 file, strerror (errno));
2305 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
2306 if (opt.backup_converted && downloaded_file_return)
2307 write_backup_file (file, downloaded_file_return);
2309 /* Before opening the file for writing, unlink the file. This is
2310 important if the data in FM is mmaped. In such case, nulling the
2311 file, which is what fopen() below does, would make us read all
2312 zeroes from the mmaped region. */
2313 if (unlink (file) < 0 && errno != ENOENT)
2315 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
2316 file, strerror (errno));
2317 read_file_free (fm);
2320 /* Now open the file for writing. */
2321 fp = fopen (file, "wb");
2324 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2325 file, strerror (errno));
2326 read_file_free (fm);
2330 /* Here we loop through all the URLs in file, replacing those of
2331 them that are downloaded with relative references. */
2333 for (link = links; link; link = link->next)
2335 char *url_start = fm->content + link->pos;
/* Sanity check: a position beyond the file length means the position
   data is stale or corrupt. */
2337 if (link->pos >= fm->length)
2339 DEBUGP (("Something strange is going on. Please investigate."));
2342 /* If the URL is not to be converted, skip it. */
2343 if (link->convert == CO_NOCONVERT)
2345 DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
2349 /* Echo the file contents, up to the offending URL's opening
2350 quote, to the outfile. */
2351 fwrite (p, 1, url_start - p, fp);
2354 switch (link->convert)
2356 case CO_CONVERT_TO_RELATIVE:
2357 /* Convert absolute URL to relative. */
2359 char *newname = construct_relative (file, link->local_name);
2360 char *quoted_newname = local_quote_string (newname);
2362 if (!link->link_refresh_p)
2363 p = replace_attr (p, link->size, fp, quoted_newname);
2365 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
2366 link->refresh_timeout);
2368 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
2369 link->url->url, newname, link->pos, file));
2371 xfree (quoted_newname);
2375 case CO_CONVERT_TO_COMPLETE:
2376 /* Convert the link to absolute URL. */
2378 char *newlink = link->url->url;
2379 char *quoted_newlink = html_quote_string (newlink);
2381 if (!link->link_refresh_p)
2382 p = replace_attr (p, link->size, fp, quoted_newlink);
2384 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
2385 link->refresh_timeout);
2387 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
2388 newlink, link->pos, file));
2389 xfree (quoted_newlink);
2393 case CO_NULLIFY_BASE:
2394 /* Change the base href to "". */
2395 p = replace_attr (p, link->size, fp, "");
2403 /* Output the rest of the file. */
2404 if (p - fm->content < fm->length)
2405 fwrite (p, 1, fm->length - (p - fm->content), fp);
2407 read_file_free (fm);
2409 logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
2412 /* Construct and return a malloced copy of the relative link from two
2413 pieces of information: local name S1 of the referring file and
2414 local name S2 of the referred file.
2416 So, if S1 is "jagor.srce.hr/index.html" and S2 is
2417 "jagor.srce.hr/images/news.gif", the function will return
2420 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
2421 "fly.cc.fer.hr/images/fly.gif", the function will return
2422 "../images/fly.gif".
2424 Caveats: S1 should not begin with `/', unless S2 also begins with
2425 '/'. S1 should not contain things like ".." and such --
2426 construct_relative ("fly/ioccc/../index.html",
2427 "fly/images/fly.gif") will fail. (A workaround is to call
2428 something like path_simplify() on S1). */
/* NOTE(review): the declaration of RES, the early `if (*s2 == '/')`
   test, the initialization of I/CNT, the loop bodies, and the return
   statement fall in listing gaps -- confirm against upstream. */
2430 construct_relative (const char *s1, const char *s2)
2432 int i, cnt, sepdirs1;
2436 return xstrdup (s2);
2437 /* S1 should *not* be absolute, if S2 wasn't. */
2438 assert (*s1 != '/');
2440 /* Skip the directories common to both strings. */
2443 while (s1[i] && s2[i]
2448 if (s1[i] == '/' && s2[i] == '/')
/* SEPDIRS1 counts how many directories remain in S1 past the common
   prefix; each one becomes a "../" in the result. */
2453 for (sepdirs1 = 0; s1[i]; i++)
2456 /* Now, construct the file as of:
2457 - ../ repeated sepdirs1 time
2458 - all the non-mutual directories of S2. */
2459 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
2460 for (i = 0; i < sepdirs1; i++)
2461 memcpy (res + 3 * i, "../", 3);
2462 strcpy (res + 3 * i, s2 + cnt);
/* Rename FILE to FILE.orig (or, for -E downloads, FILE with its
   trailing "html" replaced by "orig") before FILE is rewritten by
   convert_links.  Keeps a static list of files already backed up so a
   second conversion pass does not clobber the original. */
2467 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
2469 /* Rather than just writing over the original .html file with the
2470 converted version, save the former to *.orig. Note we only do
2471 this for files we've _successfully_ downloaded, so we don't
2472 clobber .orig files sitting around from previous invocations. */
2474 /* Construct the backup filename as the original name plus ".orig". */
2475 size_t filename_len = strlen(file);
2476 char* filename_plus_orig_suffix;
2477 boolean already_wrote_backup_file = FALSE;
2478 slist* converted_file_ptr;
2479 static slist* converted_files = NULL;
2481 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
2483 /* Just write "orig" over "html". We need to do it this way
2484 because when we're checking to see if we've downloaded the
2485 file before (to see if we can skip downloading it), we don't
2486 know if it's a text/html file. Therefore we don't know yet
2487 at that stage that -E is going to cause us to tack on
2488 ".html", so we need to compare vs. the original URL plus
2489 ".orig", not the original URL plus ".html.orig". */
2490 filename_plus_orig_suffix = alloca (filename_len + 1);
2491 strcpy(filename_plus_orig_suffix, file);
/* NOTE(review): the "- 4" overwrite assumes FILE ends in "html" and
   that filename_len >= 4 -- guaranteed only because this branch is
   taken for -E downloads; verify at the call site. */
2492 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
2494 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
2496 /* Append ".orig" to the name. */
2497 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
2498 strcpy(filename_plus_orig_suffix, file);
2499 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
2502 /* We can get called twice on the same URL thanks to the
2503 convert_all_links() call in main(). If we write the .orig file
2504 each time in such a case, it'll end up containing the first-pass
2505 conversion, not the original file. So, see if we've already been
2506 called on this file. */
2507 converted_file_ptr = converted_files;
2508 while (converted_file_ptr != NULL)
2509 if (strcmp(converted_file_ptr->string, file) == 0)
2511 already_wrote_backup_file = TRUE;
2515 converted_file_ptr = converted_file_ptr->next;
2517 if (!already_wrote_backup_file)
2519 /* Rename <file> to <file>.orig before former gets written over. */
2520 if (rename(file, filename_plus_orig_suffix) != 0)
2521 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
2522 file, filename_plus_orig_suffix, strerror (errno));
2524 /* Remember that we've already written a .orig backup for this file.
2525 Note that we never free this memory since we need it till the
2526 convert_all_links() call, which is one of the last things the
2527 program does before terminating. BTW, I'm not sure if it would be
2528 safe to just set 'converted_file_ptr->string' to 'file' below,
2529 rather than making a copy of the string... Another note is that I
2530 thought I could just add a field to the urlpos structure saying
2531 that we'd written a .orig file for this URL, but that didn't work,
2532 so I had to make this separate list.
2533 -- Dan Harkless <wget@harkless.org>
2535 This [adding a field to the urlpos structure] didn't work
2536 because convert_file() is called from convert_all_links at
2537 the end of the retrieval with a freshly built new urlpos
2539 -- Hrvoje Niksic <hniksic@arsdigita.com>
2541 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2542 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
2543 converted_file_ptr->next = converted_files;
2544 converted_files = converted_file_ptr;
2548 static int find_fragment PARAMS ((const char *, int, const char **,
2551 /* Replace an attribute's original text with NEW_TEXT. */
/* P points at the attribute value (possibly quoted) in the in-memory
   copy of the document; SIZE is its length; FP is the output file.
   Returns the position in the input just past the replaced value.
   NOTE(review): the quote-detection adjustments of P and the final
   return fall in listing gaps. */
2554 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
2557 char quote_char = '\"'; /* use "..." for quoting, unless the
2558 original value is quoted, in which
2559 case reuse its quoting char. */
2560 const char *frag_beg, *frag_end;
2562 /* Structure of our string is:
2563 "...old-contents..."
2564 <--- size ---> (with quotes)
2567 <--- size --> (no quotes) */
2569 if (*p == '\"' || *p == '\'')
2574 size -= 2; /* disregard opening and closing quote */
2576 putc (quote_char, fp);
2577 fputs (new_text, fp);
/* Preserve any fragment ("#anchor") from the original value, since
   NEW_TEXT does not carry it. */
2579 /* Look for fragment identifier, if any. */
2580 if (find_fragment (p, size, &frag_beg, &frag_end))
2581 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
2585 putc (quote_char, fp);
2590 /* The same as REPLACE_ATTR, but used when replacing
2591 <meta http-equiv=refresh content="new_text"> because we need to
2592 append "timeout_value; URL=" before the new_text. */
/* NOTE(review): the rest of the alloca size expression (room for
   "; URL=", NEW_TEXT and the terminator) falls in a listing gap. */
2595 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
2596 const char *new_text, int timeout)
2599 char *new_with_timeout = (char *)alloca (numdigit (timeout)
2603 sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
2605 return replace_attr (p, size, fp, new_with_timeout);
2608 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
2609 preceded by '&'. If the character is not found, return zero. If
2610 the character is found, return 1 and set BP and EP to point to the
2611 beginning and end of the region.
2613 This is used for finding the fragment identifiers in URLs. */
/* NOTE(review): the loop body (the '#'/'&' tests, the *bp/*ep stores
   and the returns) falls in a listing gap; only the scan loop header
   survives here. */
2616 find_fragment (const char *beg, int size, const char **bp, const char **ep)
2618 const char *end = beg + size;
2620 for (; beg < end; beg++)
2642 /* Quote FILE for use as local reference to an HTML file.
2644 We quote ? as %3F to avoid passing part of the file name as the
2645 parameter when browsing the converted file through HTTP. However,
2646 it is safe to do this only when `--html-extension' is turned on.
2647 This is because converting "index.html?foo=bar" to
2648 "index.html%3Ffoo=bar" would break local browsing, as the latter
2649 isn't even recognized as an HTML file! However, converting
2650 "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
2651 safe for both local and HTTP-served browsing. */
/* NOTE(review): the declarations of QM/TO/NEWNAME, the copy loop body
   (writing "%3F" or the plain character), and the terminating NUL
   store fall in listing gaps -- confirm against upstream. */
2654 local_quote_string (const char *file)
2656 const char *file_sans_qmark;
2659 if (!opt.html_extension)
2660 return html_quote_string (file);
2662 qm = count_char (file, '?');
2666 const char *from = file;
2669 /* qm * 2 because we replace each question mark with "%3F",
2670 i.e. replace one char with three, hence two more. */
2671 int fsqlen = strlen (file) + qm * 2;
2673 to = newname = (char *)alloca (fsqlen + 1);
2674 for (; *from; from++)
2685 assert (to - newname == fsqlen);
2688 file_sans_qmark = newname;
2691 file_sans_qmark = file;
2693 return html_quote_string (file_sans_qmark);
2696 /* We're storing "modes" of type downloaded_file_t in the hash table.
2697 However, our hash tables only accept pointers for keys and values.
2698 So when we need a pointer, we use the address of a
2699 downloaded_file_t variable of static storage. */
/* The returned pointer has static lifetime, so it can safely be stored
   in the hash table; callers must not free it.  NOTE(review): the
   switch head and the `return &vN;` statements fall in listing gaps. */
2701 static downloaded_file_t *
2702 downloaded_mode_to_ptr (downloaded_file_t mode)
2704 static downloaded_file_t
2705 v1 = FILE_NOT_ALREADY_DOWNLOADED,
2706 v2 = FILE_DOWNLOADED_NORMALLY,
2707 v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2708 v4 = CHECK_FOR_FILE;
2712 case FILE_NOT_ALREADY_DOWNLOADED:
2714 case FILE_DOWNLOADED_NORMALLY:
2716 case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2718 case CHECK_FOR_FILE:
2724 /* This should really be merged with dl_file_url_map and
2725 downloaded_html_files in recur.c. This was originally a list, but
2726 I changed it to a hash table beause it was actually taking a lot of
2727 time to find things in it. */
/* Maps local file name (malloced string) -> downloaded_file_t mode
   pointer; created lazily by downloaded_file below. */
2729 static struct hash_table *downloaded_files_hash;
2731 /* Remembers which files have been downloaded. In the standard case, should be
2732 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2733 download successfully (i.e. not for ones we have failures on or that we skip
2736 When we've downloaded a file and tacked on a ".html" extension due to -E,
2737 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2738 FILE_DOWNLOADED_NORMALLY.
2740 If you just want to check if a file has been previously added without adding
2741 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
2742 with local filenames, not remote URLs. */
2744 downloaded_file (downloaded_file_t mode, const char *file)
2746 downloaded_file_t *ptr;
2748 if (mode == CHECK_FOR_FILE)
2750 if (!downloaded_files_hash)
2751 return FILE_NOT_ALREADY_DOWNLOADED;
2752 ptr = hash_table_get (downloaded_files_hash, file);
2754 return FILE_NOT_ALREADY_DOWNLOADED;
2758 if (!downloaded_files_hash)
2759 downloaded_files_hash = make_string_hash_table (0);
2761 ptr = hash_table_get (downloaded_files_hash, file);
2765 ptr = downloaded_mode_to_ptr (mode);
2766 hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2768 return FILE_NOT_ALREADY_DOWNLOADED;
/* hash_table_map callback used by downloaded_files_free: disposes of
   each hash entry.  NOTE(review): the body falls in a listing gap;
   presumably it frees KEY (the malloced file name) only, since VALUE
   points to static storage -- confirm upstream. */
2772 df_free_mapper (void *key, void *value, void *ignored)
/* Release the downloaded-files registry: free all entries, destroy
   the table, and reset the static pointer so a later call to
   downloaded_file can rebuild it. */
2779 downloaded_files_free (void)
2781 if (downloaded_files_hash)
2783 hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2784 hash_table_destroy (downloaded_files_hash);
2785 downloaded_files_hash = NULL;
2789 /* Return non-zero if scheme a is similar to scheme b.
2791 Schemes are similar if they are equal. If SSL is supported, schemes
2792 are also similar if one is http (SCHEME_HTTP) and the other is https
/* NOTE(review): the `a == b` equality test, the HAVE_SSL guards, and
   the return statements fall in listing gaps; only the http/https
   cross-check survives here. */
2795 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2800 if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2801 || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2808 /* Debugging and testing support for path_simplify. */
2810 /* Debug: run path_simplify on PATH and return the result in a new
2811 string. Useful for calling from the debugger. */
/* NOTE(review): the function's signature and return statement fall in
   a listing gap; only the body's first two statements survive here. */
2815 char *copy = xstrdup (path);
2816 path_simplify (copy);
/* Run one path_simplify test case: simplify TEST (on a heap copy so
   the caller's string is untouched), compare the result against
   EXPECTED_RESULT, and check that path_simplify's modified/unmodified
   report matches EXPECTED_CHANGE (non-zero when a change is
   expected).  Prints a diagnostic on any mismatch.  */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *test_copy = xstrdup (test);
  int modified = path_simplify (test_copy);

  if (0 != strcmp (test_copy, expected_result))
    {
      printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
              test, expected_result, test_copy);
    }
  if (modified != expected_change)
    {
      /* The two diagnostics below were swapped in the previous
         version: when a change was expected but none happened, it
         claimed "Expected no modification", and vice versa.  */
      if (expected_change == 1)
        printf ("Expected modification with path_simplify(\"%s\").\n",
                test);
      else
        printf ("Expected no modification with path_simplify(\"%s\").\n",
                test);
    }
  xfree (test_copy);
}
2844 test_path_simplify (void)
2847 char *test, *result;
2853 { "foo", "foo", 0 },
2854 { "foo/bar", "foo/bar", 0 },
2855 { "foo///bar", "foo/bar", 1 },
2856 { "foo/.", "foo/", 1 },
2857 { "foo/./", "foo/", 1 },
2858 { "foo./", "foo./", 0 },
2859 { "foo/../bar", "bar", 1 },
2860 { "foo/../bar/", "bar/", 1 },
2861 { "foo/bar/..", "foo/", 1 },
2862 { "foo/bar/../x", "foo/x", 1 },
2863 { "foo/bar/../x/", "foo/x/", 1 },
2864 { "foo/..", "", 1 },
2865 { "foo/../..", "", 1 },
2866 { "a/b/../../c", "c", 1 },
2867 { "./a/../b", "b", 1 }
2871 for (i = 0; i < ARRAY_SIZE (tests); i++)
2873 char *test = tests[i].test;
2874 char *expected_result = tests[i].result;
2875 int expected_change = tests[i].should_modify;
2876 run_test (test, expected_result, expected_change);
2879 /* Now run all the tests with a leading slash before the test case,
2880 to prove that the slash is being preserved. */
2881 for (i = 0; i < ARRAY_SIZE (tests); i++)
2883 char *test, *expected_result;
2884 int expected_change = tests[i].should_modify;
2886 test = xmalloc (1 + strlen (tests[i].test) + 1);
2887 sprintf (test, "/%s", tests[i].test);
2889 expected_result = xmalloc (1 + strlen (tests[i].result) + 1);
2890 sprintf (expected_result, "/%s", tests[i].result);
2892 run_test (test, expected_result, expected_change);
2895 xfree (expected_result);