2 Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003, 2003
3 Free Software Foundation, Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or (at
10 your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 In addition, as a special exception, the Free Software Foundation
22 gives permission to link the code of its release of Wget with the
23 OpenSSL project's "OpenSSL" library (or with modified versions of it
24 that use the same license as the "OpenSSL" library), and distribute
25 the linked executables. You must obey the GNU General Public License
26 in all respects for all of the code used other than "OpenSSL". If you
27 modify this file, you may extend this exception to your version of the
28 file, but you are not obligated to do so. If you do not wish to do
29 so, delete this exception statement from your version. */
40 #include <sys/types.h>
/* NOTE(review): the numbers embedded at the start of each line indicate
   that many original source lines are elided from this view; all code
   below is kept verbatim.  */
/* DOTP(x): non-zero iff string X is exactly ".".  */
58 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
/* DDOTP(x): non-zero iff string X is exactly "..".  */
60 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
/* Byte sizes used by the IP-address validators below (IPv4 address,
   IPv6 address, and one 16-bit IPv6 group, respectively).  */
62 static const int NS_INADDRSZ = 4;
63 static const int NS_IN6ADDRSZ = 16;
64 static const int NS_INT16SZ = 2;
74 /* Supported schemes: */
/* Each entry: leading string, default port, enabled flag.  The table
   is indexed by enum url_scheme; scheme_disable() clears the flag.
   NOTE(review): the struct declaration and array braces are elided
   from this view.  */
75 static struct scheme_data supported_schemes[] =
77   { "http://", DEFAULT_HTTP_PORT, 1 },
79   { "https://", DEFAULT_HTTPS_PORT, 1 },
81   { "ftp://", DEFAULT_FTP_PORT, 1 },
87 /* Forward declarations: */
/* PARAMS is the pre-ANSI prototype-compatibility wrapper used
   throughout Wget.  */
89 static char *construct_relative PARAMS ((const char *, const char *));
90 static int path_simplify PARAMS ((char *));
94 /* Support for encoding and decoding of URL strings.  We determine
95    whether a character is unsafe through static table lookup.  This
96    code assumes ASCII character set and 8-bit chars.  */
99 /* rfc1738 reserved chars, preserved from encoding.  */
102 /* rfc1738 unsafe chars, plus some more.  */
/* urlchr_test: true iff character C has any of the bits in MASK set
   in urlchr_table.  The (unsigned char) cast guards against negative
   plain-char values being used as an index.  */
106 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
107 #define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
108 #define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
110 /* Shorthands for the table: */
111 #define R urlchr_reserved
112 #define U urlchr_unsafe
/* Per-character classification table: U = unsafe (must be %-escaped),
   R = reserved (has URL syntax meaning, preserved from escaping),
   RU = both.  Rows of eight entries follow the ASCII order shown in
   the trailing comments; everything >= 0x80 is unsafe.  */
115 const static unsigned char urlchr_table[256] =
117   U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
118   U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
119   U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
120   U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
121   U, 0, U, RU, 0, U, R, 0, /* SP ! " # $ % & ' */
122   0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
123   0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
124   0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
125   RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
126   0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
127   0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
128   0, 0, 0, RU, U, RU, U, 0, /* X Y Z [ \ ] ^ _ */
129   U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
130   0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
131   0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
132   0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
134   U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
135   U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
136   U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
137   U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
139   U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
140   U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
141   U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
142   U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
148 /* URL-unescape the string S.
150    This is done by transforming the sequences "%HH" to the character
151    represented by the hexadecimal digits HH.  If % is not followed by
152    two hexadecimal digits, it is inserted literally.
154    The transformation is done in place.  If you need the original
155    string intact, make a copy before calling this function.  */
/* Two-pointer in-place rewrite: H scans ahead, T writes the (never
   longer) decoded result.  NOTE(review): the surrounding loop and the
   copy/advance statements are elided from this view.  */
158 url_unescape (char *s)
160   char *t = s;			/* t - tortoise */
161   char *h = s;			/* h - hare     */
172 	  /* Do nothing if '%' is not followed by two hex digits. */
173 	  if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
/* Decode %HH into the single byte it represents.  */
175 	  *t = X2DIGITS_TO_NUM (h[1], h[2]);
182 /* The core of url_escape_* functions.  Escapes the characters that
183    match the provided mask in urlchr_table.
185    If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
186    will be returned unchanged.  If ALLOW_PASSTHROUGH is zero, a
187    freshly allocated string will be returned in all cases.  */
190 url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
/* Pass 1: count how many characters need quoting so the output can be
   allocated exactly (each quoted char becomes "%XX", i.e. +2).  */
197   for (p1 = s; *p1; p1++)
198     if (urlchr_test (*p1, mask))
199       addition += 2;		/* Two more characters (hex digits) */
/* Nothing to escape: either hand back S itself or a fresh copy,
   depending on ALLOW_PASSTHROUGH.  Caller must know which case it is
   before freeing.  */
202     return allow_passthrough ? (char *)s : xstrdup (s);
204   newlen = (p1 - s) + addition;
205   newstr = (char *)xmalloc (newlen + 1);
/* Pass 2 (loop elided from this view): copy, expanding matches.  */
211       /* Quote the characters that match the test mask. */
212       if (urlchr_test (*p1, mask))
214 	  unsigned char c = *p1++;
216 	  *p2++ = XNUM_TO_digit (c >> 4);
217 	  *p2++ = XNUM_TO_digit (c & 0xf);
222   assert (p2 - newstr == newlen);
228 /* URL-escape the unsafe characters (see urlchr_table) in a given
229    string, returning a freshly allocated string.  */
/* Caller owns and must free the result.  */
232 url_escape (const char *s)
234   return url_escape_1 (s, urlchr_unsafe, 0);
237 /* URL-escape the unsafe characters (see urlchr_table) in a given
238    string.  If no characters are unsafe, S is returned.  */
/* Caller must compare the result against S to know whether to free.  */
241 url_escape_allow_passthrough (const char *s)
243   return url_escape_1 (s, urlchr_unsafe, 1);
246 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
248 /* Decide whether to encode, decode, or pass through the char at P.
249    This used to be a macro, but it got a little too convoluted.  */
250 static inline enum copy_method
251 decide_copy_method (const char *p)
/* NOTE(review): the leading test for '%' at *p is elided from this
   view; the branch below handles the char after a '%'.  */
255       if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
257 	  /* %xx sequence: decode it, unless it would decode to an
258 	     unsafe or a reserved char; in that case, leave it as
/* PREEMPT is the byte the %xx would decode to.  */
260 	  char preempt = X2DIGITS_TO_NUM (*(p + 1), *(p + 2));
261 	  if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
262 	    return CM_PASSTHROUGH;
267 	  /* Garbled %.. sequence: encode `%'. */
/* Ordinary char: encode only if unsafe and not reserved.  */
270   else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
273     return CM_PASSTHROUGH;
276 /* Translate a %-escaped (but possibly non-conformant) input string S
277    into a %-escaped (and conformant) output string.  If no characters
278    are encoded or decoded, return the same string S; otherwise, return
279    a freshly allocated string with the new contents.
281    After a URL has been run through this function, the protocols that
282    use `%' as the quote character can use the resulting string as-is,
283    while those that don't call url_unescape() to get to the intended
284    data.  This function is also stable: after an input string is
285    transformed the first time, all further transformations of the
286    result yield the same result string.
288    Let's discuss why this function is needed.
290    Imagine Wget is to retrieve `http://abc.xyz/abc def'.  Since a raw
291    space character would mess up the HTTP request, it needs to be
294    GET /abc%20def HTTP/1.0
296    It appears that the unsafe chars need to be quoted, for example
297    with url_escape.  But what if we're requested to download
298    `abc%20def'?  url_escape transforms "%" to "%25", which would leave
299    us with `abc%2520def'.  This is incorrect -- since %-escapes are
300    part of URL syntax, "%20" is the correct way to denote a literal
301    space on the Wget command line.  This leaves us in the conclusion
302    that in that case Wget should not call url_escape, but leave the
305    And what if the requested URI is `abc%20 def'?  If we call
306    url_escape, we end up with `/abc%2520%20def', which is almost
307    certainly not intended.  If we don't call url_escape, we are left
308    with the embedded space and cannot complete the request.  What the
309    user meant was for Wget to request `/abc%20%20def', and this is
310    where reencode_escapes kicks in.
312    Wget used to solve this by first decoding %-quotes, and then
313    encoding all the "unsafe" characters found in the resulting string.
314    This was wrong because it didn't preserve certain URL special
315    (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
316    == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
317    whether we considered `+' reserved (it is).  One of these results
318    is inevitable because by the second step we would lose information
319    on whether the `+' was originally encoded or not.  Both results
320    were wrong because in CGI parameters + means space, while %2B means
321    literal plus.  reencode_escapes correctly translates the above to
322    "a%2B+b", i.e. returns the original string.
324    This function uses an algorithm proposed by Anon Sricharoenchai:
326    1. Encode all URL_UNSAFE and the "%" that are not followed by 2
329    2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
332    ...except that this code conflates the two steps, and decides
333    whether to encode, decode, or pass through each character in turn.
334    The function still uses two passes, but their logic is the same --
335    the first pass exists merely for the sake of allocation.  Another
336    small difference is that we include `+' to URL_RESERVED.
340    "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
342    "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
346    "foo bar"         -> "foo%20bar"
347    "foo%20bar"       -> "foo%20bar"
348    "foo %20bar"      -> "foo%20%20bar"
349    "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
350    "foo%25%20bar"    -> "foo%25%20bar"
351    "foo%2%20bar"     -> "foo%252%20bar"
352    "foo+bar"         -> "foo+bar"            (plus is reserved!)
353    "foo%2b+bar"      -> "foo%2b+bar"  */
356 reencode_escapes (const char *s)
362   int encode_count = 0;
363   int decode_count = 0;
365   /* First, pass through the string to see if there's anything to do,
366      and to calculate the new length.  */
367   for (p1 = s; *p1; p1++)
/* NOTE(review): the switch arms counting encodes/decodes are elided
   from this view.  */
369       switch (decide_copy_method (p1))
382   if (!encode_count && !decode_count)
383     /* The string is good as it is. */
384     return (char *)s;		/* C const model sucks. */
387   /* Each encoding adds two characters (hex digits), while each
388      decoding removes two characters.  */
389   newlen = oldlen + 2 * (encode_count - decode_count);
390   newstr = xmalloc (newlen + 1);
/* Second pass: perform the per-character decision for real.  */
397       switch (decide_copy_method (p1))
/* CM_ENCODE: expand the byte at *p1 into "%XX".  */
401 	    unsigned char c = *p1++;
403 	    *p2++ = XNUM_TO_DIGIT (c >> 4);
404 	    *p2++ = XNUM_TO_DIGIT (c & 0xf);
/* CM_DECODE: collapse "%XX" back into one byte.  */
408 	  *p2++ = X2DIGITS_TO_NUM (p1[1], p1[2]);
409 	  p1 += 3;		/* skip %xx */
416   assert (p2 - newstr == newlen);
420 /* Returns the scheme type if the scheme is supported, or
421    SCHEME_INVALID if not.  */
/* Case-insensitive prefix match against supported_schemes; a matched
   but disabled scheme is also reported as SCHEME_INVALID.  */
423 url_scheme (const char *url)
427   for (i = 0; supported_schemes[i].leading_string; i++)
428     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
429 			  strlen (supported_schemes[i].leading_string)))
431 	if (supported_schemes[i].enabled)
/* The table index doubles as the enum value.  */
432 	  return (enum url_scheme) i;
434 	  return SCHEME_INVALID;
437   return SCHEME_INVALID;
440 /* Return the number of characters needed to skip the scheme part of
441    the URL, e.g. `http://'.  If no scheme is found, returns 0.  */
443 url_skip_scheme (const char *url)
447   /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
449   while (ISALNUM (*p) || *p == '-' || *p == '+')
/* NOTE(review): the check for the ':' after the scheme name is elided
   from this view.  */
456   /* Skip "//" if found.  */
457   if (*p == '/' && *(p + 1) == '/')
463 /* Returns 1 if the URL begins with a scheme (supported or
464    unsupported), 0 otherwise.  */
466 url_has_scheme (const char *url)
/* Same scheme-name alphabet as url_skip_scheme; the trailing ':'
   test is elided from this view.  */
469   while (ISALNUM (*p) || *p == '-' || *p == '+')
/* Return the default port for SCHEME, per the supported_schemes
   table.  */
475 scheme_default_port (enum url_scheme scheme)
477   return supported_schemes[scheme].default_port;
/* Mark SCHEME as unsupported; url_scheme() will then report URLs
   using it as SCHEME_INVALID.  */
481 scheme_disable (enum url_scheme scheme)
483   supported_schemes[scheme].enabled = 0;
486 /* Skip the username and password, if present here.  The function
487    should be called *not* with the complete URL, but with the part
488    right after the scheme.
490    If no username and password are found, return 0.  */
492 url_skip_uname (const char *url)
496   /* Look for '@' that comes before '/' or '?'. */
/* strpbrk finds the first of '/', '?' or '@'; only a leading '@'
   (before any path/query) delimits user:pass.  Return-value logic is
   elided from this view.  */
497   p = (const char *)strpbrk (url, "/?@");
/* Split the LEN-byte region STR ("user" or "user:password") into
   freshly allocated, URL-unescaped *USER and *PASSWD.  Returns
   non-zero on success (return statements elided from this view).  */
505 parse_uname (const char *str, int len, char **user, char **passwd)
510   /* Empty user name not allowed.  */
513   colon = memchr (str, ':', len);
515       /* Empty user name again.  */
/* Copy everything after the colon as the password.  */
520 	  int pwlen = len - (colon + 1 - str);
521 	  *passwd = xmalloc (pwlen + 1);
522 	  memcpy (*passwd, colon + 1, pwlen);
523 	  (*passwd)[pwlen] = '\0';
/* Copy the user name (LEN has been adjusted upstream if a colon was
   found; adjustment elided from this view).  */
529   *user = xmalloc (len + 1);
530   memcpy (*user, str, len);
/* Both components arrive %-escaped; decode them in place.  */
534   url_unescape (*user);
536     url_unescape (*passwd);
541 /* Used by main.c: detect URLs written using the "shorthand" URL forms
542    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
544    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
545    www.foo.com[:port]            -> http://www.foo.com[:port]
547    FTP shorthands look like this:
549    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
550    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
552    If the URL needs not or cannot be rewritten, return NULL.  */
554 rewrite_shorthand_url (const char *url)
/* A URL that already has a scheme needs no rewriting.  */
558   if (url_has_scheme (url))
561   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
563   for (p = url; *p && *p != ':' && *p != '/'; p++)
573       /* If the characters after the colon and before the next slash
574 	 or end of string are all digits, it's HTTP.  */
576       for (pp = p + 1; ISDIGIT (*pp); pp++)
578       if (digits > 0 && (*pp == '/' || *pp == '\0'))
/* Non-numeric after ':': treat as NcFTP "host:path" shorthand.  */
581       /* Prepend "ftp://" to the entire URL... */
582       res = xmalloc (6 + strlen (url) + 1);
583       sprintf (res, "ftp://%s", url);
584       /* ...and replace ':' with '/'. */
585       res[6 + (p - url)] = '/';
592   /* Just prepend "http://" to what we have. */
593   res = xmalloc (7 + strlen (url) + 1);
594   sprintf (res, "http://%s", url);
599 static void parse_path PARAMS ((const char *, char **, char **));
601 /* Like strpbrk, with the exception that it returns the pointer to the
602    terminating zero (end-of-string aka "eos") if no matching character
605    Although I normally balk at Gcc-specific optimizations, it probably
606    makes sense here: glibc has optimizations that detect strpbrk being
607    called with literal string as ACCEPT and inline the search.  That
608    optimization is defeated if strpbrk is hidden within the call to
609    another function.  (And no, making strpbrk_or_eos inline doesn't
610    help because the check for literal accept is in the
/* GCC build: statement-expression macro so the strpbrk call stays
   visible to the compiler at each use site.  */
615 #define strpbrk_or_eos(s, accept) ({		\
616   char *SOE_p = strpbrk (s, accept);		\
618     SOE_p = (char *)s + strlen (s);		\
622 #else  /* not __GNUC__ */
/* Portable fallback: a plain function with identical semantics.  */
625 strpbrk_or_eos (const char *s, const char *accept)
627   char *p = strpbrk (s, accept);
629     p = (char *)s + strlen (s);
634 /* Turn STR into lowercase; return non-zero if a character was
/* In-place; the change-tracking flag and loop are elided from this
   view.  */
638 lowercase_str (char *str)
645       *str = TOLOWER (*str);
/* Human-readable messages for url_parse() failures.  The #define'd
   indices must stay in sync with the array order; url_error() maps an
   index back to its message.  */
650 static char *parse_errors[] = {
651 #define PE_NO_ERROR			0
653 #define PE_UNSUPPORTED_SCHEME		1
654   "Unsupported scheme",
655 #define PE_EMPTY_HOST			2
657 #define PE_BAD_PORT_NUMBER		3
659 #define PE_INVALID_USER_NAME		4
661 #define PE_UNTERMINATED_IPV6_ADDRESS	5
662   "Unterminated IPv6 numeric address",
663 #define PE_IPV6_NOT_SUPPORTED		6
664   "IPv6 addresses not supported",
665 #define PE_INVALID_IPV6_ADDRESS	7
666   "Invalid IPv6 numeric address"
/* SETERR(p, v): store error code V through P if P is non-NULL (body
   elided from this view).  */
669 #define SETERR(p, v) do {			\
675 /* The following two functions were adapted from glibc. */
/* Validate the dotted-quad IPv4 address in [STR, END).  Accumulates
   each octet in VAL; octet-range and count checks are elided from
   this view.  */
678 is_valid_ipv4_address (const char *str, const char *end)
680   int saw_digit, octets;
690       if (ch >= '0' && ch <= '9') {
691 	  val = val * 10 + (ch - '0');
/* First digit of a new octet.  */
695 	  if (saw_digit == 0) {
/* A dot is only legal directly after at least one digit.  */
700       } else if (ch == '.' && saw_digit == 1) {
/* Validate the IPv6 literal in [STR, END), including "::" compression
   and an optional trailing dotted-quad IPv4 part (adapted from
   glibc's inet_pton6).  TP counts bytes consumed toward the 16-byte
   address; COLONP records where "::" occurred.  Several checks are
   elided from this view.  */
715 is_valid_ipv6_address (const char *str, const char *end)
717   static const char xdigits[] = "0123456789abcdef";
730   /* Leading :: requires some special handling.  */
734       if (str == end || *str != ':')
746       /* if ch is a number, add it to val. */
747       pch = strchr(xdigits, ch);
750 	  val |= (pch - xdigits);
757       /* if ch is a colon ... */
/* A colon with no hex digits before it is only valid as part of
   "::" (handled via COLONP, logic elided).  */
760 	  if (saw_xdigit == 0) {
765 	  } else if (str == end) {
/* Each completed 16-bit group consumes NS_INT16SZ bytes.  */
768 	  if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
776       /* if ch is a dot ... */
777       if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ) &&
778 	  is_valid_ipv4_address(curtok, end) == 1) {
787   if (saw_xdigit == 1) {
788       if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
/* With "::" present the address must be shorter than 16 bytes;
   without it, exactly 16.  */
793   if (colonp != NULL) {
794       if (tp == NS_IN6ADDRSZ)
799   if (tp != NS_IN6ADDRSZ)
808    Return a new struct url if successful, NULL on error.  In case of
809    error, and if ERROR is not NULL, also set *ERROR to the appropriate
/* Main URL parser.  Splits URL into scheme, user:pass, host, port,
   path, params, query and fragment, and builds a struct url.  Many
   statements are elided from this view; code kept verbatim.  */
812 url_parse (const char *url, int *error)
816   int path_modified, host_modified;
818   enum url_scheme scheme;
820   const char *uname_b,    *uname_e;
821   const char *host_b,     *host_e;
822   const char *path_b,     *path_e;
823   const char *params_b,   *params_e;
824   const char *query_b,    *query_e;
825   const char *fragment_b, *fragment_e;
828   char *user = NULL, *passwd = NULL;
832   scheme = url_scheme (url);
833   if (scheme == SCHEME_INVALID)
835       SETERR (error, PE_UNSUPPORTED_SCHEME);
/* Normalize %-escapes once up front; see reencode_escapes.  */
839   url_encoded = reencode_escapes (url);
842   p += strlen (supported_schemes[scheme].leading_string);
844   p += url_skip_uname (p);
847   /* scheme://user:pass@host[:port]... */
850   /* We attempt to break down the URL into the components path,
851      params, query, and fragment.  They are ordered like this:
853        scheme://host[:port][/path][;params][?query][#fragment]  */
855   params_b   = params_e   = NULL;
856   query_b    = query_e    = NULL;
857   fragment_b = fragment_e = NULL;
863       /* Handle IPv6 address inside square brackets.  Ideally we'd
864 	 just look for the terminating ']', but rfc2732 mandates
865 	 rejecting invalid IPv6 addresses.  */
867       /* The address begins after '['. */
869       host_e = strchr (host_b, ']');
873 	  SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS);
878       /* Check if the IPv6 address is valid. */
879       if (!is_valid_ipv6_address(host_b, host_e))
881 	  SETERR (error, PE_INVALID_IPV6_ADDRESS);
885       /* Continue parsing after the closing ']'. */
/* Non-IPv6-enabled build rejects bracketed hosts outright.  */
888       SETERR (error, PE_IPV6_NOT_SUPPORTED);
/* Ordinary host: ends at the first of ':', '/', ';', '?', '#'.  */
894       p = strpbrk_or_eos (p, ":/;?#");
898   if (host_b == host_e)
900       SETERR (error, PE_EMPTY_HOST);
/* No explicit port: fall back to the scheme's default.  */
904   port = scheme_default_port (scheme);
907       const char *port_b, *port_e, *pp;
909       /* scheme://host:port/tralala */
913       p = strpbrk_or_eos (p, "/;?#");
916       if (port_b == port_e)
918 	  /* http://host:/whatever */
920 	  SETERR (error, PE_BAD_PORT_NUMBER);
/* Parse the port digits; any non-digit is an error.  */
924       for (port = 0, pp = port_b; pp < port_e; pp++)
928 	      /* http://host:12randomgarbage/blah */
930 	      SETERR (error, PE_BAD_PORT_NUMBER);
934 	  port = 10 * port + (*pp - '0');
/* Delimit path, then params, then query, then fragment, in order.  */
942       p = strpbrk_or_eos (p, ";?#");
947   /* Path is not allowed not to exist. */
955       p = strpbrk_or_eos (p, "?#");
962       p = strpbrk_or_eos (p, "#");
965   /* Hack that allows users to use '?' (a wildcard character) in
966      FTP URLs without it being interpreted as a query string
968   if (scheme == SCHEME_FTP)
970       query_b = query_e = NULL;
983   if (uname_b != uname_e)
985       /* http://user:pass@host */
987       /*      uname_b      uname_e */
988       if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
990 	  SETERR (error, PE_INVALID_USER_NAME);
/* All components accepted: build and fill the result struct.  */
995   u = (struct url *)xmalloc (sizeof (struct url));
996   memset (u, 0, sizeof (*u));
999   u->host = strdupdelim (host_b, host_e);
1004   u->path = strdupdelim (path_b, path_e);
1005   path_modified = path_simplify (u->path);
1006   parse_path (u->path, &u->dir, &u->file);
1008   host_modified = lowercase_str (u->host);
1011     u->params = strdupdelim (params_b, params_e);
1013     u->query = strdupdelim (query_b, query_e);
1015     u->fragment = strdupdelim (fragment_b, fragment_e);
1017   if (path_modified || u->fragment || host_modified || path_b == path_e)
1019       /* If we suspect that a transformation has rendered what
1020 	 url_string might return different from URL_ENCODED, rebuild
1021 	 u->url using url_string.  */
1022       u->url = url_string (u, 0);
1024       if (url_encoded != url)
1025 	xfree ((char *) url_encoded);
/* Otherwise reuse URL_ENCODED (or a copy of URL) as u->url.  */
1029       if (url_encoded == url)
1030 	u->url = xstrdup (url);
1032 	u->url = url_encoded;
/* Map a PE_* error code from url_parse() to its message string.  */
1040 url_error (int error_code)
1042   assert (error_code >= 0 && error_code < countof (parse_errors));
1043   return parse_errors[error_code];
1046 /* Parse PATH into dir and file.  PATH is extracted from the URL and
1047    is URL-escaped.  The function returns unescaped DIR and FILE.  */
1050 parse_path (const char *path, char **dir, char **file)
/* Split at the last '/': everything before it is DIR, after is FILE.
   No slash at all means an empty DIR.  */
1054   last_slash = strrchr (path, '/');
1057       *dir = xstrdup ("");
1058       *file = xstrdup (path);
1062       *dir = strdupdelim (path, last_slash);
1063       *file = xstrdup (last_slash + 1);
/* Both results are freshly allocated; decode %-escapes in place.  */
1065   url_unescape (*dir);
1066   url_unescape (*file);
1069 /* Note: URL's "full path" is the path with the query string and
1070    params appended.  The "fragment" (#foo) is intentionally ignored,
1071    but that might be changed.  For example, if the original URL was
1072    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
1073    the full path will be "/foo/bar/baz;bullshit?querystring".  */
1075 /* Return the length of the full path, without the terminating
1079 full_path_length (const struct url *url)
/* Each present component costs its length plus one separator char
   ('/', ';' or '?').  */
1083 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
1094 /* Write out the full path. */
/* Writes into WHERE, which full_path_length() has sized; does not
   NUL-terminate (caller's job, see url_full_path).  */
1097 full_path_write (const struct url *url, char *where)
1099 #define FROB(el, chr) do {			\
1100   char *f_el = url->el;				\
1102       int l = strlen (f_el);			\
1104       memcpy (where, f_el, l);			\
1116 /* Public function for getting the "full path".  E.g. if u->path is
1117    "foo/bar" and u->query is "param=value", full_path will be
1118    "/foo/bar?param=value".  */
1121 url_full_path (const struct url *url)
1123   int length = full_path_length (url);
/* Allocate, fill via full_path_write, then terminate.  Caller frees.  */
1124   char *full_path = (char *)xmalloc(length + 1);
1126   full_path_write (url, full_path);
1127   full_path[length] = '\0';
1132 /* Escape unsafe and reserved characters, except for the slash
1136 url_escape_dir (const char *dir)
/* First escape everything, then undo the escaping of '/' ("%2F") so
   directory separators survive.  */
1138   char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1143   /* Unescape slashes in NEWDIR. */
1145   h = newdir;			/* hare */
1146   t = newdir;			/* tortoise */
1148   for (; *h; h++, t++)
/* NOTE(review): matches uppercase "%2F" only; lowercase "%2f" from
   url_escape_1 output would be uppercase — confirm against
   XNUM_TO_DIGIT's case.  */
1150       if (*h == '%' && h[1] == '2' && h[2] == 'F')
1163 /* Sync u->path and u->url with u->dir and u->file.  Called after
1164    u->file or u->dir have been changed, typically by the FTP code.  */
1167 sync_path (struct url *u)
1169   char *newpath, *efile, *edir;
1173   /* u->dir and u->file are not escaped.  URL-escape them before
1174      reassembling them into u->path.  That way, if they contain
1175      separators like '?' or even if u->file contains slashes, the
1176      path will be correctly assembled.  (u->file can contain slashes
1177      if the URL specifies it with %2f, or if an FTP server returns
1179   edir = url_escape_dir (u->dir);
1180   efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
/* Empty dir: the path is just the escaped file name.  */
1183     newpath = xstrdup (efile);
1186       int dirlen = strlen (edir);
1187       int filelen = strlen (efile);
1189       /* Copy "DIR/FILE" to newpath. */
1190       char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1191       memcpy (p, edir, dirlen);
1194       memcpy (p, efile, filelen);
/* url_escape_1 with passthrough may have returned the input itself;
   only free if a new string was allocated.  */
1203   if (efile != u->file)
1206   /* Regenerate u->url as well.  */
1208   u->url = url_string (u, 0);
1211 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1212    This way we can sync u->path and u->url when they get changed.  */
/* Replace u->dir with a copy of NEWDIR (old value freed upstream,
   elided from this view) and resync path/url.  */
1215 url_set_dir (struct url *url, const char *newdir)
1218   url->dir = xstrdup (newdir);
/* Replace u->file with a copy of NEWFILE and resync path/url.  */
1223 url_set_file (struct url *url, const char *newfile)
1226   url->file = xstrdup (newfile);
/* Release a struct url and all owned components.  FREE_MAYBE skips
   NULL members; host/path/dir/file frees are elided from this view.  */
1231 url_free (struct url *url)
1237   FREE_MAYBE (url->params);
1238   FREE_MAYBE (url->query);
1239   FREE_MAYBE (url->fragment);
1240   FREE_MAYBE (url->user);
1241   FREE_MAYBE (url->passwd);
/* Read FILE, one URL per line, and return a linked list of urlpos
   entries.  Blank lines and surrounding whitespace are ignored; bad
   URLs are logged and skipped.  List-linking statements are elided
   from this view.  */
1250 get_urls_file (const char *file)
1252   struct file_memory *fm;
1253   struct urlpos *head, *tail;
1254   const char *text, *text_end;
1256   /* Load the file. */
1257   fm = read_file (file);
1260       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
1263   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
1267   text_end = fm->content + fm->length;
1268   while (text < text_end)
1270       const char *line_beg = text;
1271       const char *line_end = memchr (text, '\n', text_end - text);
/* Last line may lack a newline.  */
1273 	line_end = text_end;
1278       /* Strip whitespace from the beginning and end of line. */
1279       while (line_beg < line_end && ISSPACE (*line_beg))
1281       while (line_end > line_beg && ISSPACE (*(line_end - 1)))
1284       if (line_end > line_beg)
1286 	  /* URL is in the [line_beg, line_end) region. */
1290 	  struct urlpos *entry;
1293 	  /* We must copy the URL to a zero-terminated string, and we
1294 	     can't use alloca because we're in a loop.  *sigh*.  */
1295 	  url_text = strdupdelim (line_beg, line_end);
1299 	      /* Merge opt.base_href with URL. */
1300 	      char *merged = uri_merge (opt.base_href, url_text);
1305 	  url = url_parse (url_text, &up_error_code);
1308 	      logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1309 			 file, url_text, url_error (up_error_code));
1315 	  entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1316 	  memset (entry, 0, sizeof (*entry));
/* Done with the memory-mapped file.  */
1327   read_file_free (fm);
1331 /* Free the linked list of urlpos. */
1333 free_urlpos (struct urlpos *l)
/* Walk the list, saving NEXT before freeing each node.  */
1337       struct urlpos *next = l->next;
1340       FREE_MAYBE (l->local_name);
1346 /* Rotate FNAME opt.backups times */
1348 rotate_backups(const char *fname)
/* Room for "FNAME.<digits>\0".  */
1350   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1351   char *from = (char *)alloca (maxlen);
1352   char *to = (char *)alloca (maxlen);
/* Only rotate regular files.  */
1356   if (stat (fname, &sb) == 0)
1357     if (S_ISREG (sb.st_mode) == 0)
/* Shift FNAME.(i-1) -> FNAME.i from the oldest down (rename call
   elided from this view), then FNAME -> FNAME.1.  */
1360   for (i = opt.backups; i > 1; i--)
1362       sprintf (from, "%s.%d", fname, i - 1);
1363       sprintf (to, "%s.%d", fname, i);
1367   sprintf (to, "%s.%d", fname, 1);
1371 /* Create all the necessary directories for PATH (a file).  Calls
1372    mkdirhier() internally.  */
1374 mkalldirs (const char *path)
/* Find the last '/' to isolate the directory part of PATH.  */
1381   p = path + strlen (path);
1382   for (; *p != '/' && p != path; p--)
1385   /* Don't create if it's just a file. */
1386   if ((p == path) && (*p != '/'))
1388   t = strdupdelim (path, p);
1390   /* Check whether the directory exists. */
1391   if ((stat (t, &st) == 0))
1393       if (S_ISDIR (st.st_mode))
1400 	  /* If the dir exists as a file name, remove it first.  This
1401 	     is *only* for Wget to work with buggy old CERN http
1402 	     servers.  Here is the scenario: When Wget tries to
1403 	     retrieve a directory without a slash, e.g.
1404 	     http://foo/bar (bar being a directory), CERN server will
1405 	     not redirect it too http://foo/bar/ -- it will generate a
1406 	     directory listing containing links to bar/file1,
1407 	     bar/file2, etc.  Wget will lose because it saves this
1408 	     HTML listing to a file `bar', so it cannot create the
1409 	     directory.  To work around this, if the file of the same
1410 	     name exists, we just remove it and create the directory
1412 	  DEBUGP (("Removing %s because of directory danger!\n", t));
/* Create the directory chain; log failure but continue.  */
1416   res = make_directory (t);
1418     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1423 /* Functions for constructing the file name out of URL components.  */
1425 /* A growable string structure, used by url_file_name and friends.
1426    This should perhaps be moved to utils.c.
1428    The idea is to have a convenient and efficient way to construct a
1429    string by having various functions append data to it.  Instead of
1430    passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
1431    functions in questions, we pass the pointer to this struct.  */
1439 /* Ensure that the string can accept APPEND_COUNT more characters past
1440    the current TAIL position.  If necessary, this will grow the string
1441    and update its allocated size.  If the string is already large
1442    enough to take TAIL+APPEND_COUNT characters, this does nothing.  */
1443 #define GROW(g, append_size) do {					\
1444   struct growable *G_ = g;						\
1445   DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char);	\
1448 /* Return the tail position of the string. */
1449 #define TAIL(r) ((r)->base + (r)->tail)
1451 /* Move the tail position by APPEND_COUNT characters. */
1452 #define TAIL_INCR(r, append_count) ((r)->tail += append_count)
1454 /* Append the string STR to DEST.  NOTICE: the string in DEST is not
/* Grows DEST as needed (GROW call elided from this view), copies STR
   without its terminating NUL, and advances the tail.  */
1458 append_string (const char *str, struct growable *dest)
1460   int l = strlen (str);
1462   memcpy (TAIL (dest), str, l);
1463   TAIL_INCR (dest, l);
1466 /* Append CH to DEST.  For example, append_char (0, DEST)
1467    zero-terminates DEST.  */
1470 append_char (char ch, struct growable *dest)
/* Store of CH at TAIL is elided from this view.  */
1474   TAIL_INCR (dest, 1);
/* Bit masks describing why a character is unusable in a file name on
   a given platform; combined per opt.restrict_files_* settings.  */
1478   filechr_not_unix    = 1,	/* unusable on Unix, / and \0 */
1479   filechr_not_windows = 2,	/* unusable on Windows, one of \|/<>?:*" */
1480   filechr_control     = 4,	/* a control character, e.g. 0-31 */
1483 #define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))
1485 /* Shorthands for the table: */
1486 #define U filechr_not_unix
1487 #define W filechr_not_windows
1488 #define C filechr_control
1493 /* Table of characters unsafe under various conditions (see above).
1495    Arguably we could also claim `%' to be unsafe, since we use it as
1496    the escape character.  If we ever want to be able to reliably
1497    translate file name back to URL, this would become important
1498    crucial.  Right now, it's better to be minimal in escaping.  */
/* Per-character file-name safety table; UW/UWC combine the masks
   defined above.  Rows follow ASCII order per trailing comments.  */
1500 const static unsigned char filechr_table[256] =
1502 UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
1503   C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
1504   C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
1505   C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
1506   0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
1507   0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
1508   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
1509   0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
1510   0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
1511   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
1512   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
1513   0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
1514   0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
1515   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
1516   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
1517   0,  0,  0,  0,   0,  0,  0,  0,   /* x   y   z   {    |   }   ~   DEL */
1519 C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
1520 C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
1521 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
1522 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
1524 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
1525 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
1526 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
1527 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
1535 /* FN_PORT_SEP is the separator between host and port in file names
1536    for non-standard port numbers.  On Unix this is normally ':', as in
1537    "www.xemacs.org:4001/index.html".  Under Windows, we set it to +
1538    because Windows can't handle ':' in file names.  */
1539 #define FN_PORT_SEP  (opt.restrict_files_os != restrict_windows ? ':' : '+')
1541 /* FN_QUERY_SEP is the separator between the file name and the URL
1542    query, normally '?'.  Since Windows cannot handle '?' as part of
1543    file name, we use '@' instead there.  */
1544 #define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
1546 /* Quote path element, characters in [b, e), as file name, and append
1547    the quoted string to DEST.  Each character is quoted as per
1548    file_unsafe_char and the corresponding table.  */
1551 append_uri_pathel (const char *b, const char *e, struct growable *dest)
/* Build the quoting mask from the user's OS/control-char settings.  */
1560   if (opt.restrict_files_os == restrict_unix)
1561     mask = filechr_not_unix;
1563     mask = filechr_not_windows;
1564   if (opt.restrict_files_ctrl)
1565     mask |= filechr_control;
1567   /* Copy [b, e) to PATHEL and URL-unescape it. */
1568   BOUNDED_TO_ALLOCA (b, e, pathel);
1569   url_unescape (pathel);
1570   pathlen = strlen (pathel);
1572   /* Go through PATHEL and check how many characters we'll need to
1573      add for file quoting.  */
1575   for (p = pathel; *p; p++)
1576     if (FILE_CHAR_TEST (*p, mask))
1579   /* p - pathel is the string length.  Each quoted char means two
1580      additional characters in the string, hence 2*quoted.  */
1581   outlen = (p - pathel) + (2 * quoted);
1582   GROW (dest, outlen);
1586       /* If there's nothing to quote, we don't need to go through the
1587 	 string the second time.  */
1588       memcpy (TAIL (dest), pathel, outlen);
/* Otherwise: second pass writes each char, quoting as "%XY".  */
1592       char *q = TAIL (dest);
1593       for (p = pathel; *p; p++)
1595 	  if (!FILE_CHAR_TEST (*p, mask))
1599 	      unsigned char ch = *p;
1601 	      *q++ = XNUM_TO_DIGIT (ch >> 4);
1602 	      *q++ = XNUM_TO_DIGIT (ch & 0xf);
1605       assert (q - TAIL (dest) == outlen);
1607   TAIL_INCR (dest, outlen);
1610 /* Append to DEST the directory structure that corresponds the
1611 directory part of URL's path. For example, if the URL is
1612 http://server/dir1/dir2/file, this appends "/dir1/dir2".
1614 Each path element ("dir1" and "dir2" in the above example) is
1615 examined, url-unescaped, and re-escaped as file name element.
1617 Additionally, it cuts as many directories from the path as
1618 specified by opt.cut_dirs. For example, if opt.cut_dirs is 1, it
1619 will produce "bar" for the above example. For 2 or more, it will
1622 Each component of the path is quoted for use as file name. */
/* NOTE(review): interior lines are elided here, including the body of
   the loop that handles empty pathels and the cut_dirs decrement. */
1625 append_dir_structure (const struct url *u, struct growable *dest)
1627 char *pathel, *next;
1628 int cut = opt.cut_dirs;
1630 /* Go through the path components, de-URL-quote them, and quote them
1631 (if necessary) as file names. */
/* Iterate over '/'-delimited components; NEXT marks the end of the
   current component and the scan resumes just past it. */
1634 for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1639 /* Ignore empty pathels. path_simplify should remove
1640 occurrences of "//" from the path, but it has special cases
1641 for starting / which generates an empty pathel here. */
1645 append_char ('/', dest);
1646 append_uri_pathel (pathel, next, dest);
1650 /* Return a unique file name that matches the given URL as good as
1651 possible. Does not create directories on the file system. */
/* NOTE(review): fragmentary view — the growable initialization, the
   fname extraction from fnres, and the trailing returns are elided. */
1654 url_file_name (const struct url *u)
1656 struct growable fnres;
1658 char *u_file, *u_query;
1659 char *fname, *unique;
1665 /* Start with the directory prefix, if specified. */
/* DOTP(".") means "current directory": no prefix is emitted then. */
1666 if (!DOTP (opt.dir_prefix))
1667 append_string (opt.dir_prefix, &fnres);
1669 /* If "dirstruct" is turned on (typically the case with -r), add
1670 the host and port (unless those have been turned off) and
1671 directory structure. */
1674 if (opt.add_hostdir)
1677 append_char ('/', &fnres);
1678 append_string (u->host, &fnres);
/* Only append ":<port>" (or "+<port>" on Windows) for non-default ports. */
1679 if (u->port != scheme_default_port (u->scheme))
1682 number_to_string (portstr, u->port);
1683 append_char (FN_PORT_SEP, &fnres);
1684 append_string (portstr, &fnres);
1688 append_dir_structure (u, &fnres);
1691 /* Add the file name. */
1693 append_char ('/', &fnres);
/* An empty path component (URL ending in '/') maps to "index.html". */
1694 u_file = *u->file ? u->file : "index.html";
1695 append_uri_pathel (u_file, u_file + strlen (u_file), &fnres);
1697 /* Append "?query" to the file name. */
1698 u_query = u->query && *u->query ? u->query : NULL;
1701 append_char (FN_QUERY_SEP, &fnres);
1702 append_uri_pathel (u_query, u_query + strlen (u_query), &fnres);
1705 /* Zero-terminate the file name. */
1706 append_char ('\0', &fnres);
1710 /* Check the cases in which the unique extensions are not used:
1711 1) Clobbering is turned off (-nc).
1712 2) Retrieval with regetting.
1713 3) Timestamping is used.
1714 4) Hierarchy is built.
1716 The exception is the case when file does exist and is a
1717 directory (see `mkalldirs' for explanation). */
1719 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1720 && !(file_exists_p (fname) && !file_non_directory_p (fname)))
/* Otherwise derive a unique variant ("file.1", "file.2", ...); the
   second argument presumably allows passthrough — TODO confirm. */
1723 unique = unique_name (fname, 1);
1724 if (unique != fname)
1729 /* Return the length of URL's path. Path is considered to be
1730 terminated by one of '?', ';', '#', or by the end of the
/* Returns q - url (the return statement is elided in this view). */
1733 path_length (const char *url)
1735 const char *q = strpbrk_or_eos (url, "?;#");
1739 /* Find the last occurrence of character C in the range [b, e), or
1740 NULL, if none are present. This is equivalent to strrchr(b, c),
1741 except that it accepts an END argument instead of requiring the
1742 string to be zero-terminated. Why is there no memrchr()? */
/* NOTE(review): the entire body of this function is elided in this
   view; only the signature is visible. */
1744 find_last_char (const char *b, const char *e, char c)
1752 /* Resolve "." and ".." elements of PATH by destructively modifying
1753 PATH. "." is resolved by removing that path element, and ".." is
1754 resolved by removing the preceding path element. Leading and
1755 trailing slashes are preserved.
1757 Return non-zero if any changes have been made.
1759 For example, "a/b/c/./../d/.." will yield "a/b/". More exhaustive
1760 test examples are provided below. If you change anything in this
1761 function, run test_path_simplify to make sure you haven't broken a
1764 A previous version of this function was based on path_simplify()
1765 from GNU Bash, but it has been rewritten for Wget 1.8.1. */
/* NOTE(review): heavily elided fragment — the main loop header, the
   "change" flag updates, and several closing braces are not visible.
   The memmove-based rewriting is order-sensitive; do not restructure
   without the full source. */
1768 path_simplify (char *path)
/* An absolute path keeps its leading '/' out of the simplification. */
1774 ++path; /* preserve the leading '/'. */
1777 end = p + strlen (p) + 1; /* position past the terminating zero. */
1782 /* P should point to the beginning of a path element. */
1784 if (*p == '.' && (*(p + 1) == '/' || *(p + 1) == '\0'))
1786 /* Handle "./foo" by moving "foo" two characters to the
1788 if (*(p + 1) == '/')
/* end - p covers the tail including the terminating NUL. */
1791 memmove (p, p + 2, end - p);
1802 else if (*p == '.' && *(p + 1) == '.'
1803 && (*(p + 2) == '/' || *(p + 2) == '\0'))
1805 /* Handle "../foo" by moving "foo" one path element to the
1807 char *b = p; /* not p-1 because P can equal PATH */
1809 /* Backtrack by one path element, but not past the beginning
1812 /* foo/bar/../baz */
1818 /* Move backwards until B hits the beginning of the
1819 previous path element or the beginning of path. */
1820 for (--b; b > path && *(b - 1) != '/'; b--)
1825 if (*(p + 2) == '/')
1827 memmove (b, p + 3, end - (p + 3));
1841 /* Remove empty path elements. Not mandated by rfc1808 et
1842 al, but it seems like a good idea to get rid of them.
1843 Supporting them properly is hard (in which directory do
1844 you save http://x.com///y.html?) and they don't seem to
1855 memmove (p, q, end - q);
1860 /* Skip to the next path element. */
1861 while (*p && *p != '/')
1866 /* Make sure P points to the beginning of the next path element,
1867 which is location after the slash. */
1874 /* Resolve the result of "linking" a base URI (BASE) to a
1875 link-specified URI (LINK).
1877 Either of the URIs may be absolute or relative, complete with the
1878 host name, or path only. This tries to behave "reasonably" in all
1879 foreseeable cases. It employs little specific knowledge about
1880 schemes or URL-specific stuff -- it just works on strings.
1882 The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1883 See uri_merge for a gentler interface to this functionality.
1885 Perhaps this function should call path_simplify so that the callers
1886 don't have to call url_parse unconditionally. */
/* NOTE(review): fragment — the function's opening brace, the
   no_scheme/empty-link branching structure, and the final return of
   CONSTR are elided.  Each visible branch allocates CONSTR as
   base-prefix + LINK + NUL. */
1888 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
/* END marks the end of BASE's path (before any '?', ';' or '#'). */
1894 const char *end = base + path_length (base);
1898 /* Empty LINK points back to BASE, query string and all. */
1899 constr = xstrdup (base);
1901 else if (*link == '?')
1903 /* LINK points to the same location, but changes the query
1904 string. Examples: */
1905 /* uri_merge("path", "?new") -> "path?new" */
1906 /* uri_merge("path?foo", "?new") -> "path?new" */
1907 /* uri_merge("path?foo#bar", "?new") -> "path?new" */
1908 /* uri_merge("path#foo", "?new") -> "path?new" */
1909 int baselength = end - base;
1910 constr = xmalloc (baselength + linklength + 1);
1911 memcpy (constr, base, baselength);
1912 memcpy (constr + baselength, link, linklength);
1913 constr[baselength + linklength] = '\0';
1915 else if (*link == '#')
1917 /* uri_merge("path", "#new") -> "path#new" */
1918 /* uri_merge("path#foo", "#new") -> "path#new" */
1919 /* uri_merge("path?foo", "#new") -> "path?foo#new" */
1920 /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
/* Keep everything up to an existing fragment (or whole BASE if none),
   then append the new fragment. */
1922 const char *end1 = strchr (base, '#');
1924 end1 = base + strlen (base);
1925 baselength = end1 - base;
1926 constr = xmalloc (baselength + linklength + 1);
1927 memcpy (constr, base, baselength);
1928 memcpy (constr + baselength, link, linklength);
1929 constr[baselength + linklength] = '\0';
1931 else if (linklength > 1 && *link == '/' && *(link + 1) == '/')
1933 /* LINK begins with "//" and so is a net path: we need to
1934 replace everything after (and including) the double slash
1937 /* uri_merge("foo", "//new/bar") -> "//new/bar" */
1938 /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */
1939 /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
1943 const char *start_insert;
1945 /* Look for first slash. */
1946 slash = memchr (base, '/', end - base);
1947 /* If found slash and it is a double slash, then replace
1948 from this point, else default to replacing from the
1950 if (slash && *(slash + 1) == '/')
1951 start_insert = slash;
1953 start_insert = base;
1955 span = start_insert - base;
1956 constr = (char *)xmalloc (span + linklength + 1);
1958 memcpy (constr, base, span);
1959 memcpy (constr + span, link, linklength);
1960 constr[span + linklength] = '\0';
1962 else if (*link == '/')
1964 /* LINK is an absolute path: we need to replace everything
1965 after (and including) the FIRST slash with LINK.
1967 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1968 "/qux/xyzzy", our result should be
1969 "http://host/qux/xyzzy". */
1972 const char *start_insert = NULL; /* for gcc to shut up. */
1973 const char *pos = base;
1974 int seen_slash_slash = 0;
1975 /* We're looking for the first slash, but want to ignore
1978 slash = memchr (pos, '/', end - pos);
1979 if (slash && !seen_slash_slash)
1980 if (*(slash + 1) == '/')
/* The "//" of "scheme://" is skipped; looping logic is elided here. */
1983 seen_slash_slash = 1;
1987 /* At this point, SLASH is the location of the first / after
1988 "//", or the first slash altogether. START_INSERT is the
1989 pointer to the location where LINK will be inserted. When
1990 examining the last two examples, keep in mind that LINK
1993 if (!slash && !seen_slash_slash)
1994 /* example: "foo" */
1996 start_insert = base;
1997 else if (!slash && seen_slash_slash)
1998 /* example: "http://foo" */
2001 else if (slash && !seen_slash_slash)
2002 /* example: "foo/bar" */
2004 start_insert = base;
2005 else if (slash && seen_slash_slash)
2006 /* example: "http://something/" */
2008 start_insert = slash;
2010 span = start_insert - base;
2011 constr = (char *)xmalloc (span + linklength + 1);
2013 memcpy (constr, base, span);
2015 memcpy (constr + span, link, linklength);
2016 constr[span + linklength] = '\0';
2020 /* LINK is a relative URL: we need to replace everything
2021 after last slash (possibly empty) with LINK.
2023 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
2024 our result should be "whatever/foo/qux/xyzzy". */
2025 int need_explicit_slash = 0;
2027 const char *start_insert;
2028 const char *last_slash = find_last_char (base, end, '/');
2031 /* No slash found at all. Append LINK to what we have,
2032 but we'll need a slash as a separator.
2034 Example: if base == "foo" and link == "qux/xyzzy", then
2035 we cannot just append link to base, because we'd get
2036 "fooqux/xyzzy", whereas what we want is
2039 To make sure the / gets inserted, we set
2040 need_explicit_slash to 1. We also set start_insert
2041 to end + 1, so that the length calculations work out
2042 correctly for one more (slash) character. Accessing
2043 that character is fine, since it will be the
2044 delimiter, '\0' or '?'. */
2045 /* example: "foo?..." */
2046 /* ^ ('?' gets changed to '/') */
2047 start_insert = end + 1;
2048 need_explicit_slash = 1;
2050 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
2052 /* example: http://host" */
2054 start_insert = end + 1;
2055 need_explicit_slash = 1;
2059 /* example: "whatever/foo/bar" */
2061 start_insert = last_slash + 1;
2064 span = start_insert - base;
2065 constr = (char *)xmalloc (span + linklength + 1);
2067 memcpy (constr, base, span);
/* The extra slot reserved via end + 1 is overwritten with '/'. */
2068 if (need_explicit_slash)
2069 constr[span - 1] = '/';
2071 memcpy (constr + span, link, linklength);
2072 constr[span + linklength] = '\0';
2075 else /* !no_scheme */
/* LINK already carries a scheme: it is absolute and used as-is. */
2077 constr = strdupdelim (link, link + linklength);
2082 /* Merge BASE with LINK and return the resulting URI. This is an
2083 interface to uri_merge_1 that assumes that LINK is a
2084 zero-terminated string. */
/* Thin wrapper: computes LINK's length and whether it carries a scheme,
   then delegates to uri_merge_1. */
2086 uri_merge (const char *base, const char *link)
2088 return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
/* Append string S at pointer P.  NOTE(review): the macro body appears
   truncated in this view — the line advancing P by LEN and the closing
   "} while (0)" are not visible; confirm against the full source. */
2091 #define APPEND(p, s) do { \
2092 int len = strlen (s); \
2093 memcpy (p, s, len); \
2097 /* Use this instead of password when the actual password is supposed
2098 to be hidden. We intentionally use a generic string without giving
2099 away the number of characters in the password, like previous
2101 #define HIDDEN_PASSWORD "*password*"
2103 /* Recreate the URL string from the data in URL.
2105 If HIDE is non-zero (as it is when we're calling this on a URL we
2106 plan to print, but not when calling it to canonicalize a URL for
2107 use within the program), password will be hidden. Unsafe
2108 characters in the URL will be quoted. */
/* NOTE(review): fragment — parts of the size computation, the '['/']'
   emission around the host, and the final return are elided. */
2111 url_string (const struct url *url, int hide_password)
2115 char *quoted_user = NULL, *quoted_passwd = NULL;
2117 int scheme_port = supported_schemes[url->scheme].default_port;
2118 char *scheme_str = supported_schemes[url->scheme].leading_string;
2119 int fplen = full_path_length (url);
2121 int brackets_around_host = 0;
2123 assert (scheme_str != NULL);
2125 /* Make sure the user name and password are quoted. */
2128 quoted_user = url_escape_allow_passthrough (url->user);
/* When hiding, substitute the fixed placeholder so the output never
   reveals the password's length. */
2132 quoted_passwd = HIDDEN_PASSWORD;
2134 quoted_passwd = url_escape_allow_passthrough (url->passwd);
/* A ':' in the host indicates an IPv6 literal, which must be printed
   in brackets to disambiguate from the port separator. */
2138 if (strchr (url->host, ':'))
2139 brackets_around_host = 1;
/* Pre-compute the exact output size so a single xmalloc suffices. */
2141 size = (strlen (scheme_str)
2142 + strlen (url->host)
2143 + (brackets_around_host ? 2 : 0)
2146 if (url->port != scheme_port)
2147 size += 1 + numdigit (url->port);
2150 size += 1 + strlen (quoted_user);
2152 size += 1 + strlen (quoted_passwd);
2155 p = result = xmalloc (size);
2157 APPEND (p, scheme_str);
2160 APPEND (p, quoted_user);
2164 APPEND (p, quoted_passwd);
2169 if (brackets_around_host)
2171 APPEND (p, url->host);
2172 if (brackets_around_host)
2174 if (url->port != scheme_port)
2177 p = number_to_string (p, url->port);
2180 full_path_write (url, p);
/* Verify the write cursor landed exactly where the size computation
   predicted. */
2184 assert (p - result == size);
/* Free the escaped copies only when url_escape_allow_passthrough
   actually allocated (it may return its input unchanged), and never
   free the static HIDDEN_PASSWORD literal. */
2186 if (quoted_user && quoted_user != url->user)
2187 xfree (quoted_user);
2188 if (quoted_passwd && !hide_password
2189 && quoted_passwd != url->passwd)
2190 xfree (quoted_passwd);
2195 /* Return the URL of the proxy appropriate for url U. */
/* NOTE(review): fragment — the switch header on u->scheme and the
   final return of PROXY are elided. */
2197 getproxy (struct url *u)
2200 char *rewritten_url;
2201 static char rewritten_storage[1024];
/* Respect the no_proxy list: hosts matching it get no proxy at all. */
2205 if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
/* Command-line/config options take precedence over the conventional
   environment variables. */
2211 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
2215 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
2219 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
2221 case SCHEME_INVALID:
2224 if (!proxy || !*proxy)
2227 /* Handle shorthands. `rewritten_storage' is a kludge to allow
2228 getproxy() to return static storage. */
2229 rewritten_url = rewrite_shorthand_url (proxy);
/* strncpy may not NUL-terminate on truncation; the next line forces
   termination, so the copy is safe (though long URLs are truncated). */
2232 strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
2233 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
2234 proxy = rewritten_storage;
2240 /* Should a host be accessed through proxy, concerning no_proxy? */
/* Returns non-zero when HOST should use the proxy, i.e. when it does
   NOT suffix-match any entry of the no_proxy list. */
2242 no_proxy_match (const char *host, const char **no_proxy)
2247 return !sufmatch (no_proxy, host);
2250 /* Support for converting links for local viewing in downloaded HTML
2251 files. This should be moved to another file, because it has
2252 nothing to do with processing URLs. */
2254 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
2255 static const char *replace_attr PARAMS ((const char *, int, FILE *,
2257 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
2258 const char *, int));
2259 static char *local_quote_string PARAMS ((const char *));
2261 /* Change the links in one HTML file. LINKS is a list of links in the
2262 document, along with their positions and the desired direction of
/* NOTE(review): fragment — variable declarations (p, fp), several loop
   braces, early returns, and the counter increments for
   to_file_count/to_url_count are elided in this view. */
2265 convert_links (const char *file, struct urlpos *links)
2267 struct file_memory *fm;
2270 downloaded_file_t downloaded_file_return;
2272 struct urlpos *link;
2273 int to_url_count = 0, to_file_count = 0;
2275 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
2278 /* First we do a "dry run": go through the list L and see whether
2279 any URL needs to be converted in the first place. If not, just
2280 leave the file alone. */
2282 struct urlpos *dry = links;
2283 for (dry = links; dry; dry = dry->next)
2284 if (dry->convert != CO_NOCONVERT)
2288 logputs (LOG_VERBOSE, _("nothing to do.\n"));
/* Read the whole file into memory (possibly mmaped) for rewriting. */
2293 fm = read_file (file);
2296 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2297 file, strerror (errno));
2301 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
2302 if (opt.backup_converted && downloaded_file_return)
2303 write_backup_file (file, downloaded_file_return);
2305 /* Before opening the file for writing, unlink the file. This is
2306 important if the data in FM is mmaped. In such case, nulling the
2307 file, which is what fopen() below does, would make us read all
2308 zeroes from the mmaped region. */
2309 if (unlink (file) < 0 && errno != ENOENT)
2311 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
2312 file, strerror (errno));
2313 read_file_free (fm);
2316 /* Now open the file for writing. */
2317 fp = fopen (file, "wb");
2320 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2321 file, strerror (errno));
2322 read_file_free (fm);
2326 /* Here we loop through all the URLs in file, replacing those of
2327 them that are downloaded with relative references. */
2329 for (link = links; link; link = link->next)
2331 char *url_start = fm->content + link->pos;
/* Defensive check: a link position past the file contents indicates
   stale position data from the parse phase. */
2333 if (link->pos >= fm->length)
2335 DEBUGP (("Something strange is going on. Please investigate."));
2338 /* If the URL is not to be converted, skip it. */
2339 if (link->convert == CO_NOCONVERT)
2341 DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
2345 /* Echo the file contents, up to the offending URL's opening
2346 quote, to the outfile. */
2347 fwrite (p, 1, url_start - p, fp);
2350 switch (link->convert)
2352 case CO_CONVERT_TO_RELATIVE:
2353 /* Convert absolute URL to relative. */
2355 char *newname = construct_relative (file, link->local_name);
2356 char *quoted_newname = local_quote_string (newname);
/* <meta http-equiv=refresh> needs special treatment because its
   attribute embeds a timeout before the URL. */
2358 if (!link->link_refresh_p)
2359 p = replace_attr (p, link->size, fp, quoted_newname);
2361 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
2362 link->refresh_timeout);
2364 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
2365 link->url->url, newname, link->pos, file));
2367 xfree (quoted_newname);
2371 case CO_CONVERT_TO_COMPLETE:
2372 /* Convert the link to absolute URL. */
2374 char *newlink = link->url->url;
2375 char *quoted_newlink = html_quote_string (newlink);
2377 if (!link->link_refresh_p)
2378 p = replace_attr (p, link->size, fp, quoted_newlink);
2380 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
2381 link->refresh_timeout);
2383 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
2384 newlink, link->pos, file));
2385 xfree (quoted_newlink);
2389 case CO_NULLIFY_BASE:
2390 /* Change the base href to "". */
2391 p = replace_attr (p, link->size, fp, "");
2399 /* Output the rest of the file. */
2400 if (p - fm->content < fm->length)
2401 fwrite (p, 1, fm->length - (p - fm->content), fp);
2403 read_file_free (fm);
2405 logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
2408 /* Construct and return a malloced copy of the relative link from two
2409 pieces of information: local name S1 of the referring file and
2410 local name S2 of the referred file.
2412 So, if S1 is "jagor.srce.hr/index.html" and S2 is
2413 "jagor.srce.hr/images/news.gif", the function will return
2416 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
2417 "fly.cc.fer.hr/images/fly.gif", the function will return
2418 "../images/fly.gif".
2420 Caveats: S1 should not begin with `/', unless S2 also begins with
2421 '/'. S1 should not contain things like ".." and such --
2422 construct_relative ("fly/ioccc/../index.html",
2423 "fly/images/fly.gif") will fail. (A workaround is to call
2424 something like path_simplify() on S1). */
/* NOTE(review): fragment — declarations of RES, the absolute-S2 check,
   loop bodies, and the final return are elided. */
2426 construct_relative (const char *s1, const char *s2)
2428 int i, cnt, sepdirs1;
/* If S2 is absolute it is returned verbatim (condition elided above). */
2432 return xstrdup (s2);
2433 /* S1 should *not* be absolute, if S2 wasn't. */
2434 assert (*s1 != '/');
2436 /* Skip the directories common to both strings. */
/* Advance I while the prefixes agree; CNT remembers the position just
   past the last common '/'. */
2439 while (s1[i] && s2[i]
2444 if (s1[i] == '/' && s2[i] == '/')
/* Count the remaining directory separators in S1: each one needs a
   "../" in the result. */
2449 for (sepdirs1 = 0; s1[i]; i++)
2452 /* Now, construct the file as of:
2453 - ../ repeated sepdirs1 time
2454 - all the non-mutual directories of S2. */
2455 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
2456 for (i = 0; i < sepdirs1; i++)
2457 memcpy (res + 3 * i, "../", 3);
2458 strcpy (res + 3 * i, s2 + cnt);
/* Save the original FILE to FILE.orig (or FILE with "html" replaced by
   "orig" under -E) before convert_links overwrites it.  Keeps a static
   list of files already backed up so a second conversion pass does not
   clobber the true original.  NOTE(review): fragment — some braces and
   the loop structure over converted_files are elided. */
2463 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
2465 /* Rather than just writing over the original .html file with the
2466 converted version, save the former to *.orig. Note we only do
2467 this for files we've _successfully_ downloaded, so we don't
2468 clobber .orig files sitting around from previous invocations. */
2470 /* Construct the backup filename as the original name plus ".orig". */
2471 size_t filename_len = strlen(file);
2472 char* filename_plus_orig_suffix;
2473 boolean already_wrote_backup_file = FALSE;
2474 slist* converted_file_ptr;
2475 static slist* converted_files = NULL;
2477 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
2479 /* Just write "orig" over "html". We need to do it this way
2480 because when we're checking to see if we've downloaded the
2481 file before (to see if we can skip downloading it), we don't
2482 know if it's a text/html file. Therefore we don't know yet
2483 at that stage that -E is going to cause us to tack on
2484 ".html", so we need to compare vs. the original URL plus
2485 ".orig", not the original URL plus ".html.orig". */
/* Assumes FILE ends in ".html" here (guaranteed by the -E branch);
   the last four bytes "html" are overwritten with "orig". */
2486 filename_plus_orig_suffix = alloca (filename_len + 1);
2487 strcpy(filename_plus_orig_suffix, file);
2488 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
2490 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
2492 /* Append ".orig" to the name. */
2493 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
2494 strcpy(filename_plus_orig_suffix, file);
2495 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
2498 /* We can get called twice on the same URL thanks to the
2499 convert_all_links() call in main(). If we write the .orig file
2500 each time in such a case, it'll end up containing the first-pass
2501 conversion, not the original file. So, see if we've already been
2502 called on this file. */
2503 converted_file_ptr = converted_files;
2504 while (converted_file_ptr != NULL)
2505 if (strcmp(converted_file_ptr->string, file) == 0)
2507 already_wrote_backup_file = TRUE;
2511 converted_file_ptr = converted_file_ptr->next;
2513 if (!already_wrote_backup_file)
2515 /* Rename <file> to <file>.orig before former gets written over. */
2516 if (rename(file, filename_plus_orig_suffix) != 0)
2517 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
2518 file, filename_plus_orig_suffix, strerror (errno));
2520 /* Remember that we've already written a .orig backup for this file.
2521 Note that we never free this memory since we need it till the
2522 convert_all_links() call, which is one of the last things the
2523 program does before terminating. BTW, I'm not sure if it would be
2524 safe to just set 'converted_file_ptr->string' to 'file' below,
2525 rather than making a copy of the string... Another note is that I
2526 thought I could just add a field to the urlpos structure saying
2527 that we'd written a .orig file for this URL, but that didn't work,
2528 so I had to make this separate list.
2529 -- Dan Harkless <wget@harkless.org>
2531 This [adding a field to the urlpos structure] didn't work
2532 because convert_file() is called from convert_all_links at
2533 the end of the retrieval with a freshly built new urlpos
2535 -- Hrvoje Niksic <hniksic@arsdigita.com>
2537 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2538 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
2539 converted_file_ptr->next = converted_files;
2540 converted_files = converted_file_ptr;
2544 static int find_fragment PARAMS ((const char *, int, const char **,
2547 /* Replace an attribute's original text with NEW_TEXT. */
/* P points at the attribute value (possibly including quotes) in the
   input buffer; SIZE is its length.  The replacement (quote, NEW_TEXT,
   preserved #fragment, quote) is written to FP.  NOTE(review): the
   return value/advancement of P is elided in this view. */
2550 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
2553 char quote_char = '\"'; /* use "..." for quoting, unless the
2554 original value is quoted, in which
2555 case reuse its quoting char. */
2556 const char *frag_beg, *frag_end;
2558 /* Structure of our string is:
2559 "...old-contents..."
2560 <--- size ---> (with quotes)
2563 <--- size --> (no quotes) */
2565 if (*p == '\"' || *p == '\'')
2570 size -= 2; /* disregard opening and closing quote */
2572 putc (quote_char, fp);
2573 fputs (new_text, fp);
2575 /* Look for fragment identifier, if any. */
/* Preserve the original #fragment by copying it after NEW_TEXT. */
2576 if (find_fragment (p, size, &frag_beg, &frag_end))
2577 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
2581 putc (quote_char, fp);
2586 /* The same as REPLACE_ATTR, but used when replacing
2587 <meta http-equiv=refresh content="new_text"> because we need to
2588 append "timeout_value; URL=" before the next_text. */
2591 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
2592 const char *new_text, int timeout)
/* Build "<timeout>; URL=<new_text>" on the stack; the elided part of
   the alloca size expression presumably accounts for the "; URL="
   literal, new_text and the NUL — confirm against full source. */
2595 char *new_with_timeout = (char *)alloca (numdigit (timeout)
2599 sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
2601 return replace_attr (p, size, fp, new_with_timeout);
2604 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
2605 preceded by '&'. If the character is not found, return zero. If
2606 the character is found, return 1 and set BP and EP to point to the
2607 beginning and end of the region.
2609 This is used for finding the fragment indentifiers in URLs. */
/* NOTE(review): the loop body (the '#'/'&' tests and BP/EP stores) is
   elided in this view; only the scan skeleton is visible. */
2612 find_fragment (const char *beg, int size, const char **bp, const char **ep)
2614 const char *end = beg + size;
2616 for (; beg < end; beg++)
2638 /* Quote FILE for use as local reference to an HTML file.
2640 We quote ? as %3F to avoid passing part of the file name as the
2641 parameter when browsing the converted file through HTTP. However,
2642 it is safe to do this only when `--html-extension' is turned on.
2643 This is because converting "index.html?foo=bar" to
2644 "index.html%3Ffoo=bar" would break local browsing, as the latter
2645 isn't even recognized as an HTML file! However, converting
2646 "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
2647 safe for both local and HTTP-served browsing. */
/* NOTE(review): fragment — the loop body that writes "%3F" vs. the
   plain character is elided. */
2650 local_quote_string (const char *file)
2652 const char *file_sans_qmark;
/* Without -E, question marks are left alone; only HTML-quote. */
2655 if (!opt.html_extension)
2656 return html_quote_string (file);
2658 qm = count_char (file, '?');
2662 const char *from = file;
2665 /* qm * 2 because we replace each question mark with "%3F",
2666 i.e. replace one char with three, hence two more. */
2667 int fsqlen = strlen (file) + qm * 2;
2669 to = newname = (char *)alloca (fsqlen + 1);
2670 for (; *from; from++)
2681 assert (to - newname == fsqlen);
2684 file_sans_qmark = newname;
2687 file_sans_qmark = file;
2689 return html_quote_string (file_sans_qmark);
2692 /* We're storing "modes" of type downloaded_file_t in the hash table.
2693 However, our hash tables only accept pointers for keys and values.
2694 So when we need a pointer, we use the address of a
2695 downloaded_file_t variable of static storage. */
/* Map each enum value to a stable address via static variables; the
   switch arms returning &v1..&v4 are elided in this view. */
2697 static downloaded_file_t *
2698 downloaded_mode_to_ptr (downloaded_file_t mode)
2700 static downloaded_file_t
2701 v1 = FILE_NOT_ALREADY_DOWNLOADED,
2702 v2 = FILE_DOWNLOADED_NORMALLY,
2703 v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2704 v4 = CHECK_FOR_FILE;
2708 case FILE_NOT_ALREADY_DOWNLOADED:
2710 case FILE_DOWNLOADED_NORMALLY:
2712 case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2714 case CHECK_FOR_FILE:
2720 /* This should really be merged with dl_file_url_map and
2721 downloaded_html_files in recur.c. This was originally a list, but
2722 I changed it to a hash table beause it was actually taking a lot of
2723 time to find things in it. */
/* Maps local file name (xstrdup'ed key) -> downloaded_file_t pointer. */
2725 static struct hash_table *downloaded_files_hash;
2727 /* Remembers which files have been downloaded. In the standard case, should be
2728 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2729 download successfully (i.e. not for ones we have failures on or that we skip
2732 When we've downloaded a file and tacked on a ".html" extension due to -E,
2733 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2734 FILE_DOWNLOADED_NORMALLY.
2736 If you just want to check if a file has been previously added without adding
2737 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
2738 with local filenames, not remote URLs. */
/* NOTE(review): fragment — the returns of *ptr for the found cases are
   elided; visible returns cover only the not-found paths. */
2740 downloaded_file (downloaded_file_t mode, const char *file)
2742 downloaded_file_t *ptr;
2744 if (mode == CHECK_FOR_FILE)
/* Query-only path: a missing hash means nothing was ever recorded. */
2746 if (!downloaded_files_hash)
2747 return FILE_NOT_ALREADY_DOWNLOADED;
2748 ptr = hash_table_get (downloaded_files_hash, file);
2750 return FILE_NOT_ALREADY_DOWNLOADED;
/* Recording path: create the hash lazily on first use. */
2754 if (!downloaded_files_hash)
2755 downloaded_files_hash = make_string_hash_table (0);
2757 ptr = hash_table_get (downloaded_files_hash, file);
2761 ptr = downloaded_mode_to_ptr (mode);
2762 hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2764 return FILE_NOT_ALREADY_DOWNLOADED;
/* Hash-map callback that frees one key/value pair; its body is elided
   in this view. */
2768 df_free_mapper (void *key, void *value, void *ignored)
/* Release the downloaded-files hash and all its entries. */
2775 downloaded_files_free (void)
2777 if (downloaded_files_hash)
2779 hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2780 hash_table_destroy (downloaded_files_hash);
/* Reset so a later downloaded_file() call can lazily recreate it. */
2781 downloaded_files_hash = NULL;
2785 /* Return non-zero if scheme a is similar to scheme b.
2787 Schemes are similar if they are equal. If SSL is supported, schemes
2788 are also similar if one is http (SCHEME_HTTP) and the other is https
/* NOTE(review): the equality check and the returns are elided; this
   fragment shows only the http/https cross-match condition (presumably
   inside an SSL-only #ifdef — confirm against full source). */
2791 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2796 if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2797 || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2804 /* Debugging and testing support for path_simplify. */
2806 /* Debug: run path_simplify on PATH and return the result in a new
2807 string. Useful for calling from the debugger. */
/* NOTE(review): the function's name line and return are elided; only
   the copy-and-simplify body is visible. */
2811 char *copy = xstrdup (path);
2812 path_simplify (copy);
/* Run path_simplify on TEST (on a scratch copy) and report mismatches
   against EXPECTED_RESULT and EXPECTED_CHANGE on stdout. */
2817 run_test (char *test, char *expected_result, int expected_change)
2819 char *test_copy = xstrdup (test);
2820 int modified = path_simplify (test_copy);
2822 if (0 != strcmp (test_copy, expected_result))
2824 printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
2825 test, expected_result, test_copy);
2827 if (modified != expected_change)
2829 if (expected_change == 1)
2830 printf ("Expected no modification with path_simplify(\"%s\").\n",
2833 printf ("Expected modification with path_simplify(\"%s\").\n",
/* Table-driven self-test for path_simplify: each entry gives an input
   path, the expected simplified form, and whether a modification is
   expected.  Every case is run twice, once as-is and once with a
   leading '/' to verify the leading slash is preserved. */
2840 test_path_simplify (void)
2843 char *test, *result;
2849 { "foo", "foo", 0 },
2850 { "foo/bar", "foo/bar", 0 },
2851 { "foo///bar", "foo/bar", 1 },
2852 { "foo/.", "foo/", 1 },
2853 { "foo/./", "foo/", 1 },
2854 { "foo./", "foo./", 0 },
2855 { "foo/../bar", "bar", 1 },
2856 { "foo/../bar/", "bar/", 1 },
2857 { "foo/bar/..", "foo/", 1 },
2858 { "foo/bar/../x", "foo/x", 1 },
2859 { "foo/bar/../x/", "foo/x/", 1 },
2860 { "foo/..", "", 1 },
2861 { "foo/../..", "", 1 },
2862 { "a/b/../../c", "c", 1 },
2863 { "./a/../b", "b", 1 }
2867 for (i = 0; i < countof (tests); i++)
2869 char *test = tests[i].test;
2870 char *expected_result = tests[i].result;
2871 int expected_change = tests[i].should_modify;
2872 run_test (test, expected_result, expected_change);
2875 /* Now run all the tests with a leading slash before the test case,
2876 to prove that the slash is being preserved. */
2877 for (i = 0; i < countof (tests); i++)
2879 char *test, *expected_result;
2880 int expected_change = tests[i].should_modify;
/* Build "/<test>" and "/<result>" copies for the slash-prefixed run. */
2882 test = xmalloc (1 + strlen (tests[i].test) + 1);
2883 sprintf (test, "/%s", tests[i].test);
2885 expected_result = xmalloc (1 + strlen (tests[i].result) + 1);
2886 sprintf (expected_result, "/%s", tests[i].result);
2888 run_test (test, expected_result, expected_change);
2891 xfree (expected_result);