Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003
3 Free Software Foundation, Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or (at
10 your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 In addition, as a special exception, the Free Software Foundation
22 gives permission to link the code of its release of Wget with the
23 OpenSSL project's "OpenSSL" library (or with modified versions of it
24 that use the same license as the "OpenSSL" library), and distribute
25 the linked executables. You must obey the GNU General Public License
26 in all respects for all of the code used other than "OpenSSL". If you
27 modify this file, you may extend this exception to your version of the
28 file, but you are not obligated to do so. If you do not wish to do
29 so, delete this exception statement from your version. */
#include <sys/types.h>

/* Does X point to the path component "." (single dot)?  */
#define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
/* Does X point to the path component ".." (double dot)?  */
#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))

/* Byte sizes of an IPv4 address, an IPv6 address, and one 16-bit IPv6
   group -- glibc naming, used by the address validators below. */
static const int NS_INADDRSZ = 4;
static const int NS_IN6ADDRSZ = 16;
static const int NS_INT16SZ = 2;
/* Supported schemes: each entry holds the leading "scheme://" string,
   the scheme's default port, and an enabled flag (cleared by
   scheme_disable).  Indexed by enum url_scheme. */
static struct scheme_data supported_schemes[] =
{ "http://", DEFAULT_HTTP_PORT, 1 },
{ "https://", DEFAULT_HTTPS_PORT, 1 },
{ "ftp://", DEFAULT_FTP_PORT, 1 },

/* Forward declarations: */
static char *construct_relative PARAMS ((const char *, const char *));
static int path_simplify PARAMS ((char *));
/* Support for encoding and decoding of URL strings. We determine
   whether a character is unsafe through static table lookup. This
   code assumes ASCII character set and 8-bit chars. */

/* rfc1738 reserved chars, preserved from encoding. */

/* rfc1738 unsafe chars, plus some more. */

#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)

/* Shorthands for the table: */
#define R urlchr_reserved
#define U urlchr_unsafe
/* NOTE(review): table entries also use RU, presumably defined as R|U
   on an elided line -- chars that are both reserved and unsafe. */
const static unsigned char urlchr_table[256] =
U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
U, 0, U, RU, 0, U, R, 0, /* SP ! " # $ % & ' */
0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
0, 0, 0, RU, U, RU, U, 0, /* X Y Z [ \ ] ^ _ */
U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
/* Decodes the forms %xy in a URL to the character the hexadecimal
   code of which is xy. xy are hexadecimal digits from
   [0123456789ABCDEF] (case-insensitive). If x or y are not
   hex-digits or `%' precedes `\0', the sequence is inserted
url_unescape (char *s)
char *t = s; /* t - tortoise */
char *h = s; /* h - hare */
/* T trails H, so decoding shrinks the string in place; the buffer can
   never grow. */
/* Do nothing if '%' is not followed by two hex digits. */
if (!*(h + 1) || !*(h + 2)
|| !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
*t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
/* Like url_escape, but return S if there are no unsafe chars. */

url_escape_allow_passthrough (const char *s)
/* First pass: count how many characters will need %XX quoting. */
for (p1 = s; *p1; p1++)
if (URL_UNSAFE_CHAR (*p1))
addition += 2; /* Two more characters (hex digits) */

/* Allocate: original length plus two extra bytes per quoted char. */
newlen = (p1 - s) + addition;
newstr = (char *)xmalloc (newlen + 1);

/* Second pass: copy, expanding each unsafe char to %XY. */
if (URL_UNSAFE_CHAR (*p1))
unsigned char c = *p1++;
*p2++ = XDIGIT_TO_XCHAR (c >> 4);
*p2++ = XDIGIT_TO_XCHAR (c & 0xf);

assert (p2 - newstr == newlen);
/* Encode the unsafe characters (as determined by URL_UNSAFE_CHAR) in a
   given string, returning a malloc-ed %XX encoded string.  Unlike the
   passthrough variant, the caller always owns the result. */

url_escape (const char *s)
char *encoded = url_escape_allow_passthrough (s);

/* Encode unsafe characters in PTR to %xx. If such encoding is done,
   the old value of PTR is freed and PTR is made to point to the newly
   allocated storage. */

#define ENCODE(ptr) do { \
char *e_new = url_escape_allow_passthrough (ptr); \
enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };

/* Decide whether to encode, decode, or pass through the char at P.
   This used to be a macro, but it got a little too convoluted. */
static inline enum copy_method
decide_copy_method (const char *p)
if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
/* %xx sequence: decode it, unless it would decode to an
   unsafe or a reserved char; in that case, leave it as
/* PREEMPT is the byte the %xx sequence stands for. */
char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
XCHAR_TO_XDIGIT (*(p + 2));

if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
return CM_PASSTHROUGH;
/* Garbled %.. sequence: encode `%'. */
else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
/* Everything else is copied verbatim. */
return CM_PASSTHROUGH;
/* Translate a %-escaped (but possibly non-conformant) input string S
   into a %-escaped (and conformant) output string. If no characters
   are encoded or decoded, return the same string S; otherwise, return
   a freshly allocated string with the new contents.

   After a URL has been run through this function, the protocols that
   use `%' as the quote character can use the resulting string as-is,
   while those that don't call url_unescape() to get to the intended
   data. This function is also stable: after an input string is
   transformed the first time, all further transformations of the
   result yield the same result string.

   Let's discuss why this function is needed.

   Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
   space character would mess up the HTTP request, it needs to be

   GET /abc%20def HTTP/1.0

   It appears that the unsafe chars need to be quoted, for example
   with url_escape. But what if we're requested to download
   `abc%20def'? url_escape transforms "%" to "%25", which would leave
   us with `abc%2520def'. This is incorrect -- since %-escapes are
   part of URL syntax, "%20" is the correct way to denote a literal
   space on the Wget command line. This leaves us in the conclusion
   that in that case Wget should not call url_escape, but leave the

   And what if the requested URI is `abc%20 def'? If we call
   url_escape, we end up with `/abc%2520%20def', which is almost
   certainly not intended. If we don't call url_escape, we are left
   with the embedded space and cannot complete the request. What the
   user meant was for Wget to request `/abc%20%20def', and this is
   where reencode_escapes kicks in.

   Wget used to solve this by first decoding %-quotes, and then
   encoding all the "unsafe" characters found in the resulting string.
   This was wrong because it didn't preserve certain URL special
   (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
   == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
   whether we considered `+' reserved (it is). One of these results
   is inevitable because by the second step we would lose information
   on whether the `+' was originally encoded or not. Both results
   were wrong because in CGI parameters + means space, while %2B means
   literal plus. reencode_escapes correctly translates the above to
   "a%2B+b", i.e. returns the original string.

   This function uses an algorithm proposed by Anon Sricharoenchai:

   1. Encode all URL_UNSAFE and the "%" that are not followed by 2

   2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and

   ...except that this code conflates the two steps, and decides
   whether to encode, decode, or pass through each character in turn.
   The function still uses two passes, but their logic is the same --
   the first pass exists merely for the sake of allocation. Another
   small difference is that we include `+' to URL_RESERVED.

   "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"

   "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"

   "foo bar" -> "foo%20bar"
   "foo%20bar" -> "foo%20bar"
   "foo %20bar" -> "foo%20%20bar"
   "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
   "foo%25%20bar" -> "foo%25%20bar"
   "foo%2%20bar" -> "foo%252%20bar"
   "foo+bar" -> "foo+bar" (plus is reserved!)
   "foo%2b+bar" -> "foo%2b+bar" */

reencode_escapes (const char *s)
int encode_count = 0;
int decode_count = 0;

/* First, pass through the string to see if there's anything to do,
   and to calculate the new length. */
for (p1 = s; *p1; p1++)
switch (decide_copy_method (p1))
if (!encode_count && !decode_count)
/* The string is good as it is. */
return (char *)s; /* C const model sucks. */

/* Each encoding adds two characters (hex digits), while each
   decoding removes two characters. */
newlen = oldlen + 2 * (encode_count - decode_count);
newstr = xmalloc (newlen + 1);
/* Second pass: perform the encode/decode/copy decided per char. */
switch (decide_copy_method (p1))
unsigned char c = *p1++;
*p2++ = XDIGIT_TO_XCHAR (c >> 4);
*p2++ = XDIGIT_TO_XCHAR (c & 0xf);
*p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
+ (XCHAR_TO_XDIGIT (*(p1 + 2))));
p1 += 3; /* skip %xx */
assert (p2 - newstr == newlen);
/* Run PTR_VAR through reencode_escapes. If a new string is consed,
   free PTR_VAR and make it point to the new storage. Obviously,
   PTR_VAR needs to be an lvalue. */

#define REENCODE(ptr_var) do { \
char *rf_new = reencode_escapes (ptr_var); \
if (rf_new != ptr_var) \

/* Returns the scheme type if the scheme is supported, or
   SCHEME_INVALID if not. */

url_scheme (const char *url)
/* Match URL's prefix against each known "scheme://" string,
   case-insensitively. */
for (i = 0; supported_schemes[i].leading_string; i++)
if (0 == strncasecmp (url, supported_schemes[i].leading_string,
strlen (supported_schemes[i].leading_string)))
if (supported_schemes[i].enabled)
return (enum url_scheme) i;
/* A matching but disabled scheme is reported as invalid too. */
return SCHEME_INVALID;

return SCHEME_INVALID;
/* Return the number of characters needed to skip the scheme part of
   the URL, e.g. `http://'. If no scheme is found, returns 0. */

url_skip_scheme (const char *url)
/* Skip the scheme name. We allow `-' and `+' because of `whois++',
while (ISALNUM (*p) || *p == '-' || *p == '+')
/* Skip "//" if found. */
if (*p == '/' && *(p + 1) == '/')

/* Returns 1 if the URL begins with a scheme (supported or
   unsupported), 0 otherwise. */

url_has_scheme (const char *url)
while (ISALNUM (*p) || *p == '-' || *p == '+')

/* Return SCHEME's default port number. */
scheme_default_port (enum url_scheme scheme)
return supported_schemes[scheme].default_port;

/* Mark SCHEME as disabled so url_scheme rejects it from now on. */
scheme_disable (enum url_scheme scheme)
supported_schemes[scheme].enabled = 0;
/* Skip the username and password, if present here. The function
   should be called *not* with the complete URL, but with the part
   right after the scheme.

   If no username and password are found, return 0. */

url_skip_uname (const char *url)
/* Look for '@' that comes before '/' or '?'. */
p = (const char *)strpbrk (url, "/?@");

/* Split [STR, STR+LEN), of the form "user[:password]", into freshly
   allocated *USER and *PASSWD, both URL-unescaped. */
parse_uname (const char *str, int len, char **user, char **passwd)
/* Empty user name not allowed. */
colon = memchr (str, ':', len);
/* Empty user name again. */
/* Copy everything after the colon as the password. */
int pwlen = len - (colon + 1 - str);
*passwd = xmalloc (pwlen + 1);
memcpy (*passwd, colon + 1, pwlen);
(*passwd)[pwlen] = '\0';
/* Copy the user-name part (LEN adjusted above when a colon exists). */
*user = xmalloc (len + 1);
memcpy (*user, str, len);
/* Return both components in decoded (unescaped) form. */
url_unescape (*user);
url_unescape (*passwd);
/* Used by main.c: detect URLs written using the "shorthand" URL forms
   popularized by Netscape and NcFTP. HTTP shorthands look like this:
   www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
   www.foo.com[:port] -> http://www.foo.com[:port]
   FTP shorthands look like this:
   foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
   foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
   If the URL needs not or cannot be rewritten, return NULL. */

rewrite_shorthand_url (const char *url)
/* A URL that already carries a scheme needs no rewriting. */
if (url_has_scheme (url))
/* Look for a ':' or '/'. The former signifies NcFTP syntax, the
for (p = url; *p && *p != ':' && *p != '/'; p++)
/* If the characters after the colon and before the next slash
   or end of string are all digits, it's HTTP. */
for (pp = p + 1; ISDIGIT (*pp); pp++)
if (digits > 0 && (*pp == '/' || *pp == '\0'))
/* Prepend "ftp://" to the entire URL... */
res = xmalloc (6 + strlen (url) + 1);
sprintf (res, "ftp://%s", url);
/* ...and replace ':' with '/'. */
res[6 + (p - url)] = '/';
/* Just prepend "http://" to what we have. */
res = xmalloc (7 + strlen (url) + 1);
sprintf (res, "http://%s", url);
static void parse_path PARAMS ((const char *, char **, char **));

/* Like strpbrk, with the exception that it returns the pointer to the
   terminating zero (end-of-string aka "eos") if no matching character

   Although I normally balk at Gcc-specific optimizations, it probably
   makes sense here: glibc has optimizations that detect strpbrk being
   called with literal string as ACCEPT and inline the search. That
   optimization is defeated if strpbrk is hidden within the call to
   another function. (And no, making strpbrk_or_eos inline doesn't
   help because the check for literal accept is in the
#define strpbrk_or_eos(s, accept) ({ \
char *SOE_p = strpbrk (s, accept); \
SOE_p = (char *)s + strlen (s); \
#else /* not __GNUC__ */
strpbrk_or_eos (const char *s, const char *accept)
char *p = strpbrk (s, accept);
/* No match: return S's terminating '\0' rather than NULL. */
p = (char *)s + strlen (s);

/* Turn STR into lowercase; return non-zero if a character was

lowercase_str (char *str)
*str = TOLOWER (*str);
/* Human-readable messages for url_parse failures, indexed by the PE_*
   codes defined alongside each entry (see url_error). */
static char *parse_errors[] = {
#define PE_NO_ERROR 0
#define PE_UNSUPPORTED_SCHEME 1
"Unsupported scheme",
#define PE_EMPTY_HOST 2
#define PE_BAD_PORT_NUMBER 3
#define PE_INVALID_USER_NAME 4
#define PE_UNTERMINATED_IPV6_ADDRESS 5
"Unterminated IPv6 numeric address",
#define PE_IPV6_NOT_SUPPORTED 6
"IPv6 addresses not supported",
#define PE_INVALID_IPV6_ADDRESS 7
"Invalid IPv6 numeric address"

/* Store error code V through P when P is non-NULL; used by url_parse. */
#define SETERR(p, v) do { \
/* The following two functions were adapted from glibc. */

/* Validate the dotted-quad IPv4 address in [STR, END). */
is_valid_ipv4_address (const char *str, const char *end)
int saw_digit, octets;
if (ch >= '0' && ch <= '9') {
val = val * 10 + (ch - '0');
/* First digit of a new octet. */
if (saw_digit == 0) {
} else if (ch == '.' && saw_digit == 1) {

/* Validate the IPv6 address in [STR, END), including "::" compression
   and an optional embedded IPv4 trailer. */
is_valid_ipv6_address (const char *str, const char *end)
static const char xdigits[] = "0123456789abcdef";
/* Leading :: requires some special handling. */
if (str == end || *str != ':')
/* if ch is a number, add it to val. */
pch = strchr(xdigits, ch);
val |= (pch - xdigits);
/* if ch is a colon ... */
if (saw_xdigit == 0) {
} else if (str == end) {
if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
/* if ch is a dot ... */
if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ) &&
is_valid_ipv4_address(curtok, end) == 1) {
if (saw_xdigit == 1) {
if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
if (colonp != NULL) {
/* A "::" must stand for at least one group of zeros. */
if (tp == NS_IN6ADDRSZ)
if (tp != NS_IN6ADDRSZ)
   Return a new struct url if successful, NULL on error. In case of
   error, and if ERROR is not NULL, also set *ERROR to the appropriate

url_parse (const char *url, int *error)
int path_modified, host_modified;
enum url_scheme scheme;
/* _b/_e pairs delimit [begin, end) spans of each URL component
   within the re-encoded copy of URL. */
const char *uname_b, *uname_e;
const char *host_b, *host_e;
const char *path_b, *path_e;
const char *params_b, *params_e;
const char *query_b, *query_e;
const char *fragment_b, *fragment_e;
char *user = NULL, *passwd = NULL;
scheme = url_scheme (url);
if (scheme == SCHEME_INVALID)
SETERR (error, PE_UNSUPPORTED_SCHEME);
/* Normalize %-escapes before splitting the URL into components. */
url_encoded = reencode_escapes (url);
p += strlen (supported_schemes[scheme].leading_string);
p += url_skip_uname (p);
/* scheme://user:pass@host[:port]... */
/* We attempt to break down the URL into the components path,
   params, query, and fragment. They are ordered like this:
   scheme://host[:port][/path][;params][?query][#fragment] */
params_b = params_e = NULL;
query_b = query_e = NULL;
fragment_b = fragment_e = NULL;
/* Handle IPv6 address inside square brackets. Ideally we'd
   just look for the terminating ']', but rfc2732 mandates
   rejecting invalid IPv6 addresses. */
/* The address begins after '['. */
host_e = strchr (host_b, ']');
SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS);
/* Check if the IPv6 address is valid. */
if (!is_valid_ipv6_address(host_b, host_e))
SETERR (error, PE_INVALID_IPV6_ADDRESS);
/* Continue parsing after the closing ']'. */
SETERR (error, PE_IPV6_NOT_SUPPORTED);
p = strpbrk_or_eos (p, ":/;?#");
if (host_b == host_e)
SETERR (error, PE_EMPTY_HOST);
port = scheme_default_port (scheme);
const char *port_b, *port_e, *pp;
/* scheme://host:port/tralala */
p = strpbrk_or_eos (p, "/;?#");
if (port_b == port_e)
/* http://host:/whatever */
SETERR (error, PE_BAD_PORT_NUMBER);
/* Convert the digit run to a port number, rejecting non-digits. */
for (port = 0, pp = port_b; pp < port_e; pp++)
/* http://host:12randomgarbage/blah */
SETERR (error, PE_BAD_PORT_NUMBER);
port = 10 * port + (*pp - '0');
p = strpbrk_or_eos (p, ";?#");
/* Path is not allowed not to exist. */
p = strpbrk_or_eos (p, "?#");
p = strpbrk_or_eos (p, "#");
/* Hack that allows users to use '?' (a wildcard character) in
   FTP URLs without it being interpreted as a query string
if (scheme == SCHEME_FTP)
query_b = query_e = NULL;
if (uname_b != uname_e)
/* http://user:pass@host */
/* uname_b uname_e */
if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
SETERR (error, PE_INVALID_USER_NAME);
u = (struct url *)xmalloc (sizeof (struct url));
memset (u, 0, sizeof (*u));
u->host = strdupdelim (host_b, host_e);
u->path = strdupdelim (path_b, path_e);
path_modified = path_simplify (u->path);
parse_path (u->path, &u->dir, &u->file);
host_modified = lowercase_str (u->host);
u->params = strdupdelim (params_b, params_e);
u->query = strdupdelim (query_b, query_e);
u->fragment = strdupdelim (fragment_b, fragment_e);
if (path_modified || u->fragment || host_modified || path_b == path_e)
/* If we suspect that a transformation has rendered what
   url_string might return different from URL_ENCODED, rebuild
   u->url using url_string. */
u->url = url_string (u, 0);
if (url_encoded != url)
xfree ((char *) url_encoded);
if (url_encoded == url)
u->url = xstrdup (url);
u->url = url_encoded;
/* Return the static message string for ERROR_CODE (a PE_* value). */
url_error (int error_code)
assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
return parse_errors[error_code];

/* Parse PATH into dir and file. PATH is extracted from the URL and
   is URL-escaped. The function returns unescaped DIR and FILE. */

parse_path (const char *path, char **dir, char **file)
last_slash = strrchr (path, '/');
/* No slash: the whole path is the file; the dir is empty. */
*dir = xstrdup ("");
*file = xstrdup (path);
/* Split around the last slash. */
*dir = strdupdelim (path, last_slash);
*file = xstrdup (last_slash + 1);
/* Per the contract above, return both components unescaped. */
url_unescape (*dir);
url_unescape (*file);
/* Note: URL's "full path" is the path with the query string and
   params appended. The "fragment" (#foo) is intentionally ignored,
   but that might be changed. For example, if the original URL was
   "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
   the full path will be "/foo/bar/baz;bullshit?querystring". */

/* Return the length of the full path, without the terminating

full_path_length (const struct url *url)
/* Each present component costs its length plus one separator char. */
#define FROB(el) if (url->el) len += 1 + strlen (url->el)

/* Write out the full path. */

full_path_write (const struct url *url, char *where)
#define FROB(el, chr) do { \
char *f_el = url->el; \
int l = strlen (f_el); \
memcpy (where, f_el, l); \

/* Public function for getting the "full path". E.g. if u->path is
   "foo/bar" and u->query is "param=value", full_path will be
   "/foo/bar?param=value". */

url_full_path (const struct url *url)
int length = full_path_length (url);
char *full_path = (char *)xmalloc(length + 1);
full_path_write (url, full_path);
full_path[length] = '\0';
/* Sync u->path and u->url with u->dir and u->file. */

sync_path (struct url *url)
/* Empty dir: the path is just the file name. */
newpath = xstrdup (url->file);
/* Otherwise rebuild "dir/file" into a fresh allocation. */
int dirlen = strlen (url->dir);
int filelen = strlen (url->file);
newpath = xmalloc (dirlen + 1 + filelen + 1);
memcpy (newpath, url->dir, dirlen);
newpath[dirlen] = '/';
memcpy (newpath + dirlen + 1, url->file, filelen);
newpath[dirlen + 1 + filelen] = '\0';
url->path = newpath;

/* Synchronize u->url. */
url->url = url_string (url, 0);

/* Mutators. Code in ftp.c insists on changing u->dir and u->file.
   This way we can sync u->path and u->url when they get changed. */

url_set_dir (struct url *url, const char *newdir)
url->dir = xstrdup (newdir);

url_set_file (struct url *url, const char *newfile)
url->file = xstrdup (newfile);
/* Deallocate a struct url together with its component strings;
   optional components go through FREE_MAYBE. */
url_free (struct url *url)
FREE_MAYBE (url->params);
FREE_MAYBE (url->query);
FREE_MAYBE (url->fragment);
FREE_MAYBE (url->user);
FREE_MAYBE (url->passwd);
/* Read URLs, one per line, from FILE and return them as a linked list
   of urlpos entries.  Each line is whitespace-trimmed, optionally
   merged with opt.base_href, and run through url_parse. */
get_urls_file (const char *file)
struct file_memory *fm;
struct urlpos *head, *tail;
const char *text, *text_end;

/* Load the file. */
fm = read_file (file);
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
text_end = fm->content + fm->length;
while (text < text_end)
const char *line_beg = text;
const char *line_end = memchr (text, '\n', text_end - text);
/* The last line may lack a terminating newline. */
line_end = text_end;
/* Strip whitespace from the beginning and end of line. */
while (line_beg < line_end && ISSPACE (*line_beg))
while (line_end > line_beg && ISSPACE (*(line_end - 1)))
if (line_end > line_beg)
/* URL is in the [line_beg, line_end) region. */
struct urlpos *entry;
/* We must copy the URL to a zero-terminated string, and we
   can't use alloca because we're in a loop. *sigh*. */
url_text = strdupdelim (line_beg, line_end);
/* Merge opt.base_href with URL. */
char *merged = uri_merge (opt.base_href, url_text);
url = url_parse (url_text, &up_error_code);
logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
file, url_text, url_error (up_error_code));
entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
memset (entry, 0, sizeof (*entry));
read_file_free (fm);

/* Free the linked list of urlpos. */
free_urlpos (struct urlpos *l)
struct urlpos *next = l->next;
FREE_MAYBE (l->local_name);
/* Rotate FNAME opt.backups times */
rotate_backups(const char *fname)
/* Room for "<fname>.<number>" plus the terminating '\0'. */
int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
char *from = (char *)alloca (maxlen);
char *to = (char *)alloca (maxlen);
/* Only rotate regular files. */
if (stat (fname, &sb) == 0)
if (S_ISREG (sb.st_mode) == 0)
/* Shift <fname>.1 ... <fname>.N-1 up by one suffix number. */
for (i = opt.backups; i > 1; i--)
sprintf (from, "%s.%d", fname, i - 1);
sprintf (to, "%s.%d", fname, i);
/* FNAME itself becomes <fname>.1. */
sprintf (to, "%s.%d", fname, 1);
/* Create all the necessary directories for PATH (a file). Calls
   mkdirhier() internally. */
mkalldirs (const char *path)
/* Find the last '/' to isolate the directory part of PATH. */
p = path + strlen (path);
for (; *p != '/' && p != path; p--)
/* Don't create if it's just a file. */
if ((p == path) && (*p != '/'))
t = strdupdelim (path, p);
/* Check whether the directory exists. */
if ((stat (t, &st) == 0))
if (S_ISDIR (st.st_mode))
/* If the dir exists as a file name, remove it first. This
   is *only* for Wget to work with buggy old CERN http
   servers. Here is the scenario: When Wget tries to
   retrieve a directory without a slash, e.g.
   http://foo/bar (bar being a directory), CERN server will
   not redirect it to http://foo/bar/ -- it will generate a
   directory listing containing links to bar/file1,
   bar/file2, etc. Wget will lose because it saves this
   HTML listing to a file `bar', so it cannot create the
   directory. To work around this, if the file of the same
   name exists, we just remove it and create the directory
DEBUGP (("Removing %s because of directory danger!\n", t));
res = make_directory (t);
logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
/* Functions for constructing the file name out of URL components. */

/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have an easy way to construct a string by having
   various functions append data to it. Instead of passing the
   obligatory BASEVAR, SIZEVAR and TAILPOS to all the functions in
   questions, we pass the pointer to this struct. */

/* Ensure that the string can accept APPEND_COUNT more characters past
   the current TAIL position. If necessary, this will grow the string
   and update its allocated size. If the string is already large
   enough to take TAIL+APPEND_COUNT characters, this does nothing. */
#define GROW(g, append_size) do { \
struct growable *G_ = g; \
DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char); \

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += append_count)

/* Append the string STR to DEST. NOTICE: the string in DEST is not

append_string (const char *str, struct growable *dest)
int l = strlen (str);
/* Copy the bytes at the tail and advance it. */
memcpy (TAIL (dest), str, l);
TAIL_INCR (dest, l);

/* Append CH to DEST. For example, append_char (0, DEST)
   zero-terminates DEST. */

append_char (char ch, struct growable *dest)
TAIL_INCR (dest, 1);
1449 filechr_unsafe_always = 1, /* always unsafe, e.g. / or \0 */
1450 filechr_unsafe_shell = 2, /* unsafe for shell use, e.g. control chars */
1451 filechr_unsafe_windows = 2, /* disallowed on Windows file system */
1454 #define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))
1456 /* Shorthands for the table: */
1457 #define A filechr_unsafe_always
1458 #define S filechr_unsafe_shell
1459 #define W filechr_unsafe_windows
   Unix shell: 0-31, 128-159
   Windows: \, |, /, <, >, ?, :

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character. If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.
   Right now, it's better to be minimal in escaping. */

const static unsigned char filechr_table[256] =
A, S, S, S, S, S, S, S, /* NUL SOH STX ETX EOT ENQ ACK BEL */
S, S, S, S, S, S, S, S, /* BS HT LF VT FF CR SO SI */
S, S, S, S, S, S, S, S, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
S, S, S, S, S, S, S, S, /* CAN EM SUB ESC FS GS RS US */
0, 0, W, 0, 0, 0, 0, 0, /* SP ! " # $ % & ' */
0, 0, W, 0, 0, 0, 0, A, /* ( ) * + , - . / */
0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
0, 0, W, 0, W, 0, W, W, /* 8 9 : ; < = > ? */
0, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
0, 0, 0, 0, W, 0, 0, 0, /* X Y Z [ \ ] ^ _ */
0, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
0, 0, 0, 0, 0, 0, 0, 0, /* x y z { | } ~ DEL */
S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 128-143 */
S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 144-159 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1502 /* Return non-zero if character CH is unsafe for use in file or
1503 directory name. Called by append_uri_pathel. */
1506 file_unsafe_char (char ch, int restrict)
1508 int mask = filechr_unsafe_always;
1509 if (restrict == restrict_shell)
1510 mask |= filechr_unsafe_shell;
1511 else if (restrict == restrict_windows)
1512 mask |= (filechr_unsafe_shell | filechr_unsafe_windows);
1513 return FILE_CHAR_TEST (ch, mask);
/* Separators used when building local file names from URLs. */

/* FN_PORT_SEP is the separator between host and port in file names
   for non-standard port numbers. On Unix this is normally ':', as in
   "www.xemacs.org:4001/index.html". Under Windows, we set it to +
   because Windows can't handle ':' in file names. */
#define FN_PORT_SEP (opt.restrict_file_names != restrict_windows ? ':' : '+')

/* FN_QUERY_SEP is the separator between the file name and the URL
   query, normally '?'. Since Windows cannot handle '?' as part of
   file name, we use '@' instead there. */
#define FN_QUERY_SEP (opt.restrict_file_names != restrict_windows ? '?' : '@')
1527 /* Quote path element, characters in [b, e), as file name, and append
1528 the quoted string to DEST. Each character is quoted as per
1529 file_unsafe_char and the corresponding table. */
/* NOTE(review): this listing is elided -- the return-type line and
   several statements (e.g. the `quoted' counter increment and the '%'
   byte written before the two hex digits) are not visible here. */
1532 append_uri_pathel (const char *b, const char *e, struct growable *dest)
1540 /* Currently restrict_for_windows is determined at compile time
1541 only. But some users download files to Windows partitions; they
1542 should be able to say --windows-file-names so Wget escapes
1543 characters invalid on Windows. Similar run-time restrictions for
1544 other file systems can be implemented. */
1545 const int restrict = opt.restrict_file_names;
1547 /* Copy [b, e) to PATHEL and URL-unescape it. */
1548 BOUNDED_TO_ALLOCA (b, e, pathel);
1549 url_unescape (pathel);
1550 pathlen = strlen (pathel);
1552 /* Go through PATHEL and check how many characters we'll need to
1553 add for file quoting. */
1555 for (p = pathel; *p; p++)
1556 if (file_unsafe_char (*p, restrict))
1559 /* p - pathel is the string length. Each quoted char means two
1560 additional characters in the string, hence 2*quoted. */
1561 outlen = (p - pathel) + (2 * quoted);
1562 GROW (dest, outlen);
1566 /* If there's nothing to quote, we don't need to go through the
1567 string the second time. */
/* When nothing needed quoting, outlen == pathlen, so a single memcpy
   of the unescaped element suffices. */
1568 memcpy (TAIL (dest), pathel, outlen);
1572 char *q = TAIL (dest);
1573 for (p = pathel; *p; p++)
1575 if (!file_unsafe_char (*p, restrict))
1579 unsigned char ch = *p;
/* Emit the two hex digits of the escaped byte ("%XY"-style quoting;
   the '%' itself is written on an elided line). */
1581 *q++ = XDIGIT_TO_XCHAR (ch >> 4);
1582 *q++ = XDIGIT_TO_XCHAR (ch & 0xf);
/* Sanity check: we wrote exactly the number of bytes we reserved. */
1585 assert (q - TAIL (dest) == outlen);
1587 TAIL_INCR (dest, outlen);
1590 /* Append to DEST the directory structure that corresponds the
1591 directory part of URL's path. For example, if the URL is
1592 http://server/dir1/dir2/file, this appends "/dir1/dir2".
1594 Each path element ("dir1" and "dir2" in the above example) is
1595 examined, url-unescaped, and re-escaped as file name element.
1597 Additionally, it cuts as many directories from the path as
1598 specified by opt.cut_dirs. For example, if opt.cut_dirs is 1, it
1599 will produce "bar" for the above example. For 2 or more, it will
1602 Each component of the path is quoted for use as file name. */
/* NOTE(review): elided listing -- the --cut-dirs decrement and the
   empty-element skip are on lines not shown here. */
1605 append_dir_structure (const struct url *u, struct growable *dest)
1607 char *pathel, *next;
1608 int cut = opt.cut_dirs;
1610 /* Go through the path components, de-URL-quote them, and quote them
1611 (if necessary) as file names. */
/* Walk the path one '/'-delimited component at a time; the trailing
   file component (after the last '/') is deliberately not appended
   here -- url_file_name handles it. */
1614 for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1619 /* Ignore empty pathels. path_simplify should remove
1620 occurrences of "//" from the path, but it has special cases
1621 for starting / which generates an empty pathel here. */
1625 append_char ('/', dest);
1626 append_uri_pathel (pathel, next, dest);
1630 /* Return a unique file name that matches the given URL as good as
1631 possible. Does not create directories on the file system. */
/* NOTE(review): elided listing -- fnres initialization, the return
   statement, and several braces are on lines not shown here.
   Result layout: [dir_prefix][/host[SEP port]][/dirs]/file[SEP query]. */
1634 url_file_name (const struct url *u)
1636 struct growable fnres;
1638 char *u_file, *u_query;
1639 char *fname, *unique;
1645 /* Start with the directory prefix, if specified. */
/* DOTP tests for the literal prefix "." (the default), in which case
   no prefix is emitted. */
1646 if (!DOTP (opt.dir_prefix))
1647 append_string (opt.dir_prefix, &fnres);
1649 /* If "dirstruct" is turned on (typically the case with -r), add
1650 the host and port (unless those have been turned off) and
1651 directory structure. */
1654 if (opt.add_hostdir)
1657 append_char ('/', &fnres);
1658 append_string (u->host, &fnres);
/* Only non-default ports appear in the file name, separated by
   FN_PORT_SEP (':' on Unix, '+' under Windows restrictions). */
1659 if (u->port != scheme_default_port (u->scheme))
1662 number_to_string (portstr, u->port);
1663 append_char (FN_PORT_SEP, &fnres);
1664 append_string (portstr, &fnres);
1668 append_dir_structure (u, &fnres);
1671 /* Add the file name. */
1673 append_char ('/', &fnres);
/* A URL path ending in '/' has an empty file component; fall back to
   the conventional "index.html". */
1674 u_file = *u->file ? u->file : "index.html";
1675 append_uri_pathel (u_file, u_file + strlen (u_file), &fnres);
1677 /* Append "?query" to the file name. */
1678 u_query = u->query && *u->query ? u->query : NULL;
1681 append_char (FN_QUERY_SEP, &fnres);
1682 append_uri_pathel (u_query, u_query + strlen (u_query), &fnres);
1685 /* Zero-terminate the file name. */
1686 append_char ('\0', &fnres);
1690 /* Check the cases in which the unique extensions are not used:
1691 1) Clobbering is turned off (-nc).
1692 2) Retrieval with regetting.
1693 3) Timestamping is used.
1694 4) Hierarchy is built.
1696 The exception is the case when file does exist and is a
1697 directory (actually support for bad httpd-s). */
1699 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1700 && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1703 /* Find a unique name. */
1704 unique = unique_name (fname);
1709 /* Return the length of URL's path. Path is considered to be
1710 terminated by one of '?', ';', '#', or by the end of the
/* NOTE(review): the return type and `return q - url;'-style return
   line are elided from this listing. */
1713 path_length (const char *url)
/* strpbrk_or_eos points at the first terminator or, failing that, at
   the string's final '\0', so the difference is always well defined. */
1715 const char *q = strpbrk_or_eos (url, "?;#");
1719 /* Find the last occurrence of character C in the range [b, e), or
1720 NULL, if none are present. This is equivalent to strrchr(b, c),
1721 except that it accepts an END argument instead of requiring the
1722 string to be zero-terminated. Why is there no memrchr()? */
/* NOTE(review): the body of this helper is elided from the listing;
   only the signature is visible. */
1724 find_last_char (const char *b, const char *e, char c)
1732 /* Resolve "." and ".." elements of PATH by destructively modifying
1733 PATH. "." is resolved by removing that path element, and ".." is
1734 resolved by removing the preceding path element. Leading and
1735 trailing slashes are preserved.
1737 Return non-zero if any changes have been made.
1739 For example, "a/b/c/./../d/.." will yield "a/b/". More exhaustive
1740 test examples are provided below. If you change anything in this
1741 function, run test_path_simplify to make sure you haven't broken a
1744 A previous version of this function was based on path_simplify()
1745 from GNU Bash, but it has been rewritten for Wget 1.8.1. */
/* NOTE(review): heavily elided listing -- the main loop header, the
   `change' flag updates, and most braces are on lines not shown. */
1748 path_simplify (char *path)
1754 ++path; /* preserve the leading '/'. */
1757 end = p + strlen (p) + 1; /* position past the terminating zero. */
1762 /* P should point to the beginning of a path element. */
1764 if (*p == '.' && (*(p + 1) == '/' || *(p + 1) == '\0'))
1766 /* Handle "./foo" by moving "foo" two characters to the
/* Distinguish "./foo" (shift remainder left) from a trailing "."
   (truncate); the latter branch is elided here. */
1768 if (*(p + 1) == '/')
1771 memmove (p, p + 2, end - p);
1782 else if (*p == '.' && *(p + 1) == '.'
1783 && (*(p + 2) == '/' || *(p + 2) == '\0'))
1785 /* Handle "../foo" by moving "foo" one path element to the
1787 char *b = p; /* not p-1 because P can equal PATH */
1789 /* Backtrack by one path element, but not past the beginning
1792 /* foo/bar/../baz */
1798 /* Move backwards until B hits the beginning of the
1799 previous path element or the beginning of path. */
1800 for (--b; b > path && *(b - 1) != '/'; b--)
1805 if (*(p + 2) == '/')
1807 memmove (b, p + 3, end - (p + 3));
1821 /* Remove empty path elements. Not mandated by rfc1808 et
1822 al, but it seems like a good idea to get rid of them.
1823 Supporting them properly is hard (in which directory do
1824 you save http://x.com///y.html?) and they don't seem to
1835 memmove (p, q, end - q);
1840 /* Skip to the next path element. */
1841 while (*p && *p != '/')
1846 /* Make sure P points to the beginning of the next path element,
1847 which is location after the slash. */
1854 /* Resolve the result of "linking" a base URI (BASE) to a
1855 link-specified URI (LINK).
1857 Either of the URIs may be absolute or relative, complete with the
1858 host name, or path only. This tries to behave "reasonably" in all
1859 foreseeable cases. It employs little specific knowledge about
1860 schemes or URL-specific stuff -- it just works on strings.
1862 The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1863 See uri_merge for a gentler interface to this functionality.
1865 Perhaps this function should call path_simplify so that the callers
1866 don't have to call url_parse unconditionally. */
/* NOTE(review): elided listing -- the declaration of `constr', several
   braces, the no_scheme dispatch, and the final return are on lines
   not shown here.  Each branch below handles one shape of LINK:
   empty, "?query", "#frag", "//netpath", "/abspath", relative,
   and (when LINK already has a scheme) verbatim copy. */
1868 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
1874 const char *end = base + path_length (base);
1878 /* Empty LINK points back to BASE, query string and all. */
1879 constr = xstrdup (base);
1881 else if (*link == '?')
1883 /* LINK points to the same location, but changes the query
1884 string. Examples: */
1885 /* uri_merge("path", "?new") -> "path?new" */
1886 /* uri_merge("path?foo", "?new") -> "path?new" */
1887 /* uri_merge("path?foo#bar", "?new") -> "path?new" */
1888 /* uri_merge("path#foo", "?new") -> "path?new" */
1889 int baselength = end - base;
1890 constr = xmalloc (baselength + linklength + 1);
1891 memcpy (constr, base, baselength);
1892 memcpy (constr + baselength, link, linklength);
1893 constr[baselength + linklength] = '\0';
1895 else if (*link == '#')
1897 /* uri_merge("path", "#new") -> "path#new" */
1898 /* uri_merge("path#foo", "#new") -> "path#new" */
1899 /* uri_merge("path?foo", "#new") -> "path?foo#new" */
1900 /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
/* For fragments the query string is kept, so the cut point is the
   existing '#' (or end of string), not path_length's END. */
1902 const char *end1 = strchr (base, '#');
1904 end1 = base + strlen (base);
1905 baselength = end1 - base;
1906 constr = xmalloc (baselength + linklength + 1);
1907 memcpy (constr, base, baselength);
1908 memcpy (constr + baselength, link, linklength);
1909 constr[baselength + linklength] = '\0';
1911 else if (linklength > 1 && *link == '/' && *(link + 1) == '/')
1913 /* LINK begins with "//" and so is a net path: we need to
1914 replace everything after (and including) the double slash
1917 /* uri_merge("foo", "//new/bar") -> "//new/bar" */
1918 /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */
1919 /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
1923 const char *start_insert;
1925 /* Look for first slash. */
1926 slash = memchr (base, '/', end - base);
1927 /* If found slash and it is a double slash, then replace
1928 from this point, else default to replacing from the
1930 if (slash && *(slash + 1) == '/')
1931 start_insert = slash;
1933 start_insert = base;
1935 span = start_insert - base;
1936 constr = (char *)xmalloc (span + linklength + 1);
1938 memcpy (constr, base, span);
1939 memcpy (constr + span, link, linklength);
1940 constr[span + linklength] = '\0';
1942 else if (*link == '/')
1944 /* LINK is an absolute path: we need to replace everything
1945 after (and including) the FIRST slash with LINK.
1947 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1948 "/qux/xyzzy", our result should be
1949 "http://host/qux/xyzzy". */
1952 const char *start_insert = NULL; /* for gcc to shut up. */
1953 const char *pos = base;
1954 int seen_slash_slash = 0;
1955 /* We're looking for the first slash, but want to ignore
1958 slash = memchr (pos, '/', end - pos);
1959 if (slash && !seen_slash_slash)
1960 if (*(slash + 1) == '/')
1963 seen_slash_slash = 1;
1967 /* At this point, SLASH is the location of the first / after
1968 "//", or the first slash altogether. START_INSERT is the
1969 pointer to the location where LINK will be inserted. When
1970 examining the last two examples, keep in mind that LINK
1973 if (!slash && !seen_slash_slash)
1974 /* example: "foo" */
1976 start_insert = base;
1977 else if (!slash && seen_slash_slash)
1978 /* example: "http://foo" */
1981 else if (slash && !seen_slash_slash)
1982 /* example: "foo/bar" */
1984 start_insert = base;
1985 else if (slash && seen_slash_slash)
1986 /* example: "http://something/" */
1988 start_insert = slash;
1990 span = start_insert - base;
1991 constr = (char *)xmalloc (span + linklength + 1);
1993 memcpy (constr, base, span);
1995 memcpy (constr + span, link, linklength);
1996 constr[span + linklength] = '\0';
2000 /* LINK is a relative URL: we need to replace everything
2001 after last slash (possibly empty) with LINK.
2003 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
2004 our result should be "whatever/foo/qux/xyzzy". */
2005 int need_explicit_slash = 0;
2007 const char *start_insert;
2008 const char *last_slash = find_last_char (base, end, '/');
2011 /* No slash found at all. Append LINK to what we have,
2012 but we'll need a slash as a separator.
2014 Example: if base == "foo" and link == "qux/xyzzy", then
2015 we cannot just append link to base, because we'd get
2016 "fooqux/xyzzy", whereas what we want is
2019 To make sure the / gets inserted, we set
2020 need_explicit_slash to 1. We also set start_insert
2021 to end + 1, so that the length calculations work out
2022 correctly for one more (slash) character. Accessing
2023 that character is fine, since it will be the
2024 delimiter, '\0' or '?'. */
2025 /* example: "foo?..." */
2026 /* ^ ('?' gets changed to '/') */
2027 start_insert = end + 1;
2028 need_explicit_slash = 1;
2030 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
2032 /* example: "http://host" */
2034 start_insert = end + 1;
2035 need_explicit_slash = 1;
2039 /* example: "whatever/foo/bar" */
2041 start_insert = last_slash + 1;
2044 span = start_insert - base;
2045 constr = (char *)xmalloc (span + linklength + 1);
2047 memcpy (constr, base, span);
/* When start_insert was end + 1, position span - 1 holds the old
   delimiter; overwrite it with the separating slash. */
2048 if (need_explicit_slash)
2049 constr[span - 1] = '/';
2051 memcpy (constr + span, link, linklength);
2052 constr[span + linklength] = '\0';
2055 else /* !no_scheme */
2057 constr = strdupdelim (link, link + linklength);
2062 /* Merge BASE with LINK and return the resulting URI. This is an
2063 interface to uri_merge_1 that assumes that LINK is a
2064 zero-terminated string. */
/* Returns freshly allocated memory; the caller owns the result. */
2066 uri_merge (const char *base, const char *link)
2068 return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
/* Append string S at pointer P and advance P past it (the advance is
   on a line elided from this listing).  Used by url_string below. */
2071 #define APPEND(p, s) do { \
2072 int len = strlen (s); \
2073 memcpy (p, s, len); \
2076 
2077 /* Use this instead of password when the actual password is supposed
2078 to be hidden. We intentionally use a generic string without giving
2079 away the number of characters in the password, like previous
2081 #define HIDDEN_PASSWORD "*password*"
2083 /* Recreate the URL string from the data in URL.
2085 If HIDE is non-zero (as it is when we're calling this on a URL we
2086 plan to print, but not when calling it to canonicalize a URL for
2087 use within the program), password will be hidden. Unsafe
2088 characters in the URL will be quoted. */
/* NOTE(review): elided listing -- the size computation's trailing
   terms, the "user:pass@" separators, the '[' / ']' writes, and the
   return statement are on lines not shown here. */
2091 url_string (const struct url *url, int hide_password)
2095 char *quoted_user = NULL, *quoted_passwd = NULL;
2097 int scheme_port = supported_schemes[url->scheme].default_port;
2098 char *scheme_str = supported_schemes[url->scheme].leading_string;
2099 int fplen = full_path_length (url);
2101 int brackets_around_host = 0;
2103 assert (scheme_str != NULL);
2105 /* Make sure the user name and password are quoted. */
2108 quoted_user = url_escape_allow_passthrough (url->user);
/* When hiding, substitute the static placeholder instead of escaping
   the real password. */
2112 quoted_passwd = HIDDEN_PASSWORD;
2114 quoted_passwd = url_escape_allow_passthrough (url->passwd);
/* A ':' in the host means an IPv6 numeric address, which must be
   bracketed to disambiguate it from the port separator. */
2118 if (strchr (url->host, ':'))
2119 brackets_around_host = 1;
2121 size = (strlen (scheme_str)
2122 + strlen (url->host)
2123 + (brackets_around_host ? 2 : 0)
2126 if (url->port != scheme_port)
2127 size += 1 + numdigit (url->port);
2130 size += 1 + strlen (quoted_user);
2132 size += 1 + strlen (quoted_passwd);
2135 p = result = xmalloc (size);
2137 APPEND (p, scheme_str);
2140 APPEND (p, quoted_user);
2144 APPEND (p, quoted_passwd);
2149 if (brackets_around_host)
2151 APPEND (p, url->host);
2152 if (brackets_around_host)
2154 if (url->port != scheme_port)
2157 p = number_to_string (p, url->port);
2160 full_path_write (url, p);
/* Verify the precomputed size exactly matched what was written. */
2164 assert (p - result == size);
/* url_escape_allow_passthrough may return its argument unchanged, so
   only free when a new string was actually allocated; the hidden
   password is the static HIDDEN_PASSWORD literal and must never be
   freed, hence the !hide_password guard. */
2166 if (quoted_user && quoted_user != url->user)
2167 xfree (quoted_user);
2168 if (quoted_passwd && !hide_password
2169 && quoted_passwd != url->passwd)
2170 xfree (quoted_passwd);
2175 /* Return the URL of the proxy appropriate for url U. */
/* NOTE(review): elided listing -- the `proxy' declaration, the switch
   on u->scheme, and the return statement are on lines not shown.
   Returns NULL when no proxy applies; the returned pointer may be
   static storage (see below), so this is not reentrant. */
2177 getproxy (struct url *u)
2180 char *rewritten_url;
2181 static char rewritten_storage[1024];
/* Respect the no_proxy list: hosts matching it get no proxy at all. */
2185 if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
/* Command-line/wgetrc settings take precedence over environment
   variables for each scheme. */
2191 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
2195 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
2199 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
2201 case SCHEME_INVALID:
2204 if (!proxy || !*proxy)
2207 /* Handle shorthands. `rewritten_storage' is a kludge to allow
2208 getproxy() to return static storage. */
2209 rewritten_url = rewrite_shorthand_url (proxy);
/* strncpy alone does not guarantee termination; the explicit NUL on
   the next line does.  Overlong rewritten URLs are silently truncated
   to sizeof rewritten_storage - 1. */
2212 strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
2213 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
2214 proxy = rewritten_storage;
2220 /* Should a host be accessed through proxy, concerning no_proxy? */
/* Returns non-zero when HOST should go through the proxy, i.e. when
   no suffix in NO_PROXY matches it. */
2222 no_proxy_match (const char *host, const char **no_proxy)
2227 return !sufmatch (no_proxy, host);
2230 /* Support for converting links for local viewing in downloaded HTML
2231 files. This should be moved to another file, because it has
2232 nothing to do with processing URLs. */
2234 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
2235 static const char *replace_attr PARAMS ((const char *, int, FILE *,
2237 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
2238 const char *, int));
2239 static char *local_quote_string PARAMS ((const char *));
2241 /* Change the links in one HTML file. LINKS is a list of links in the
2242 document, along with their positions and the desired direction of
/* NOTE(review): elided listing -- declarations of `fp' and `p', the
   early returns, counter increments, fclose, and several braces are
   on lines not shown here. */
2245 convert_links (const char *file, struct urlpos *links)
2247 struct file_memory *fm;
2250 downloaded_file_t downloaded_file_return;
2252 struct urlpos *link;
2253 int to_url_count = 0, to_file_count = 0;
2255 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
2258 /* First we do a "dry run": go through the list L and see whether
2259 any URL needs to be converted in the first place. If not, just
2260 leave the file alone. */
2262 struct urlpos *dry = links;
2263 for (dry = links; dry; dry = dry->next)
2264 if (dry->convert != CO_NOCONVERT)
2268 logputs (LOG_VERBOSE, _("nothing to do.\n"));
/* Read the whole file into memory (possibly mmaped) so we can stream
   it back out with the links rewritten. */
2273 fm = read_file (file);
2276 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2277 file, strerror (errno));
2281 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
2282 if (opt.backup_converted && downloaded_file_return)
2283 write_backup_file (file, downloaded_file_return);
2285 /* Before opening the file for writing, unlink the file. This is
2286 important if the data in FM is mmaped. In such case, nulling the
2287 file, which is what fopen() below does, would make us read all
2288 zeroes from the mmaped region. */
2289 if (unlink (file) < 0 && errno != ENOENT)
2291 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
2292 file, strerror (errno));
2293 read_file_free (fm);
2296 /* Now open the file for writing. */
2297 fp = fopen (file, "wb");
2300 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2301 file, strerror (errno));
2302 read_file_free (fm);
2306 /* Here we loop through all the URLs in file, replacing those of
2307 them that are downloaded with relative references. */
2309 for (link = links; link; link = link->next)
2311 char *url_start = fm->content + link->pos;
/* Positions come from an earlier parse; a position past the current
   length means the file changed underneath us. */
2313 if (link->pos >= fm->length)
2315 DEBUGP (("Something strange is going on. Please investigate."));
2318 /* If the URL is not to be converted, skip it. */
2319 if (link->convert == CO_NOCONVERT)
2321 DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
2325 /* Echo the file contents, up to the offending URL's opening
2326 quote, to the outfile. */
2327 fwrite (p, 1, url_start - p, fp);
2330 switch (link->convert)
2332 case CO_CONVERT_TO_RELATIVE:
2333 /* Convert absolute URL to relative. */
2335 char *newname = construct_relative (file, link->local_name);
2336 char *quoted_newname = local_quote_string (newname);
/* <meta http-equiv=refresh> needs the "timeout; URL=" prefix, hence
   the separate hack variant. */
2338 if (!link->link_refresh_p)
2339 p = replace_attr (p, link->size, fp, quoted_newname);
2341 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
2342 link->refresh_timeout);
2344 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
2345 link->url->url, newname, link->pos, file));
2347 xfree (quoted_newname);
2351 case CO_CONVERT_TO_COMPLETE:
2352 /* Convert the link to absolute URL. */
2354 char *newlink = link->url->url;
2355 char *quoted_newlink = html_quote_string (newlink);
2357 if (!link->link_refresh_p)
2358 p = replace_attr (p, link->size, fp, quoted_newlink);
2360 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
2361 link->refresh_timeout);
2363 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
2364 newlink, link->pos, file));
2365 xfree (quoted_newlink);
2369 case CO_NULLIFY_BASE:
2370 /* Change the base href to "". */
2371 p = replace_attr (p, link->size, fp, "");
2379 /* Output the rest of the file. */
2380 if (p - fm->content < fm->length)
2381 fwrite (p, 1, fm->length - (p - fm->content), fp);
2383 read_file_free (fm);
2385 logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
2388 /* Construct and return a malloced copy of the relative link from two
2389 pieces of information: local name S1 of the referring file and
2390 local name S2 of the referred file.
2392 So, if S1 is "jagor.srce.hr/index.html" and S2 is
2393 "jagor.srce.hr/images/news.gif", the function will return
2396 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
2397 "fly.cc.fer.hr/images/fly.gif", the function will return
2398 "../images/fly.gif".
2400 Caveats: S1 should not begin with `/', unless S2 also begins with
2401 '/'. S1 should not contain things like ".." and such --
2402 construct_relative ("fly/ioccc/../index.html",
2403 "fly/images/fly.gif") will fail. (A workaround is to call
2404 something like path_simplify() on S1). */
/* NOTE(review): elided listing -- the declaration of `res', the
   absolute-S2 early check, loop-variable initialization, the `cnt'
   bookkeeping, and the return statement are on lines not shown. */
2406 construct_relative (const char *s1, const char *s2)
2408 int i, cnt, sepdirs1;
2412 return xstrdup (s2);
2413 /* S1 should *not* be absolute, if S2 wasn't. */
2414 assert (*s1 != '/');
2416 /* Skip the directories common to both strings. */
2419 while (s1[i] && s2[i]
2424 if (s1[i] == '/' && s2[i] == '/')
/* Count the directory separators remaining in S1 past the common
   prefix; each one costs one "../" in the result. */
2429 for (sepdirs1 = 0; s1[i]; i++)
2432 /* Now, construct the file as of:
2433 - ../ repeated sepdirs1 time
2434 - all the non-mutual directories of S2. */
2435 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
2436 for (i = 0; i < sepdirs1; i++)
2437 memcpy (res + 3 * i, "../", 3);
2438 strcpy (res + 3 * i, s2 + cnt);
/* Back up FILE as FILE.orig (or, with -E, overwrite the "html" suffix
   with "orig") before convert_links rewrites it.  A static list of
   already-backed-up names prevents a second pass from clobbering the
   true original.  NOTE(review): the return type line and several
   braces are elided from this listing. */
2443 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
2445 /* Rather than just writing over the original .html file with the
2446 converted version, save the former to *.orig. Note we only do
2447 this for files we've _successfully_ downloaded, so we don't
2448 clobber .orig files sitting around from previous invocations. */
2450 /* Construct the backup filename as the original name plus ".orig". */
2451 size_t filename_len = strlen(file);
2452 char* filename_plus_orig_suffix;
2453 boolean already_wrote_backup_file = FALSE;
2454 slist* converted_file_ptr;
2455 static slist* converted_files = NULL;
2457 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
2459 /* Just write "orig" over "html". We need to do it this way
2460 because when we're checking to see if we've downloaded the
2461 file before (to see if we can skip downloading it), we don't
2462 know if it's a text/html file. Therefore we don't know yet
2463 at that stage that -E is going to cause us to tack on
2464 ".html", so we need to compare vs. the original URL plus
2465 ".orig", not the original URL plus ".html.orig". */
/* NOTE(review): the "- 4" overwrite assumes FILE ends in "html"
   (guaranteed by the -E branch that added the extension) and that
   filename_len >= 4 -- confirm against the caller. */
2466 filename_plus_orig_suffix = alloca (filename_len + 1);
2467 strcpy(filename_plus_orig_suffix, file);
2468 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
2470 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
2472 /* Append ".orig" to the name. */
/* sizeof(".orig") includes the terminating NUL, so the buffer is
   exactly large enough. */
2473 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
2474 strcpy(filename_plus_orig_suffix, file);
2475 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
2478 /* We can get called twice on the same URL thanks to the
2479 convert_all_links() call in main(). If we write the .orig file
2480 each time in such a case, it'll end up containing the first-pass
2481 conversion, not the original file. So, see if we've already been
2482 called on this file. */
2483 converted_file_ptr = converted_files;
2484 while (converted_file_ptr != NULL)
2485 if (strcmp(converted_file_ptr->string, file) == 0)
2487 already_wrote_backup_file = TRUE;
2491 converted_file_ptr = converted_file_ptr->next;
2493 if (!already_wrote_backup_file)
2495 /* Rename <file> to <file>.orig before former gets written over. */
2496 if (rename(file, filename_plus_orig_suffix) != 0)
2497 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
2498 file, filename_plus_orig_suffix, strerror (errno));
2500 /* Remember that we've already written a .orig backup for this file.
2501 Note that we never free this memory since we need it till the
2502 convert_all_links() call, which is one of the last things the
2503 program does before terminating. BTW, I'm not sure if it would be
2504 safe to just set 'converted_file_ptr->string' to 'file' below,
2505 rather than making a copy of the string... Another note is that I
2506 thought I could just add a field to the urlpos structure saying
2507 that we'd written a .orig file for this URL, but that didn't work,
2508 so I had to make this separate list.
2509 -- Dan Harkless <wget@harkless.org>
2511 This [adding a field to the urlpos structure] didn't work
2512 because convert_file() is called from convert_all_links at
2513 the end of the retrieval with a freshly built new urlpos
2515 -- Hrvoje Niksic <hniksic@arsdigita.com>
2517 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2518 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
2519 converted_file_ptr->next = converted_files;
2520 converted_files = converted_file_ptr;
2524 static int find_fragment PARAMS ((const char *, int, const char **,
2527 /* Replace an attribute's original text with NEW_TEXT. */
/* P points at the attribute value (possibly quoted) of SIZE bytes in
   the in-memory copy of the document; the replacement is written to
   FP.  Returns the position in the source buffer just past the old
   value (advance logic on elided lines).  Any "#fragment" present in
   the old value is preserved after NEW_TEXT. */
2530 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
2533 char quote_char = '\"'; /* use "..." for quoting, unless the
2534 original value is quoted, in which
2535 case reuse its quoting char. */
2536 const char *frag_beg, *frag_end;
2538 /* Structure of our string is:
2539 "...old-contents..."
2540 <--- size ---> (with quotes)
2543 <--- size --> (no quotes) */
2545 if (*p == '\"' || *p == '\'')
2550 size -= 2; /* disregard opening and closing quote */
2552 putc (quote_char, fp);
2553 fputs (new_text, fp);
2555 /* Look for fragment identifier, if any. */
2556 if (find_fragment (p, size, &frag_beg, &frag_end))
2557 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
2561 putc (quote_char, fp);
2566 /* The same as REPLACE_ATTR, but used when replacing
2567 <meta http-equiv=refresh content="new_text"> because we need to
2568 append "timeout_value; URL=" before the new_text. */
/* NOTE(review): the remaining terms of the alloca size expression
   (the "; URL=" literal and strlen of NEW_TEXT) are on elided lines;
   the buffer must cover the full sprintf below. */
2571 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
2572 const char *new_text, int timeout)
2575 char *new_with_timeout = (char *)alloca (numdigit (timeout)
2579 sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
2581 return replace_attr (p, size, fp, new_with_timeout);
2584 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
2585 preceded by '&'. If the character is not found, return zero. If
2586 the character is found, return 1 and set BP and EP to point to the
2587 beginning and end of the region.
2589 This is used for finding the fragment identifiers in URLs. */
/* NOTE(review): the body handling '&' (entity) skipping and the
   *bp/*ep assignments is on lines elided from this listing. */
2592 find_fragment (const char *beg, int size, const char **bp, const char **ep)
2594 const char *end = beg + size;
2596 for (; beg < end; beg++)
2618 /* Quote FILE for use as local reference to an HTML file.
2620 We quote ? as %3F to avoid passing part of the file name as the
2621 parameter when browsing the converted file through HTTP. However,
2622 it is safe to do this only when `--html-extension' is turned on.
2623 This is because converting "index.html?foo=bar" to
2624 "index.html%3Ffoo=bar" would break local browsing, as the latter
2625 isn't even recognized as an HTML file! However, converting
2626 "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
2627 safe for both local and HTTP-served browsing. */
/* NOTE(review): elided listing -- declarations of `qm', `to' and
   `newname', the per-character copy/expansion, and the terminating
   NUL are on lines not shown here. */
2630 local_quote_string (const char *file)
2632 const char *file_sans_qmark;
/* Without -E the ?-to-%3F rewrite is unsafe (see header comment), so
   fall straight through to plain HTML quoting. */
2635 if (!opt.html_extension)
2636 return html_quote_string (file);
2638 qm = count_char (file, '?');
2642 const char *from = file;
2645 /* qm * 2 because we replace each question mark with "%3F",
2646 i.e. replace one char with three, hence two more. */
2647 int fsqlen = strlen (file) + qm * 2;
2649 to = newname = (char *)alloca (fsqlen + 1);
2650 for (; *from; from++)
2661 assert (to - newname == fsqlen);
2664 file_sans_qmark = newname;
2667 file_sans_qmark = file;
2669 return html_quote_string (file_sans_qmark);
2672 /* We're storing "modes" of type downloaded_file_t in the hash table.
2673 However, our hash tables only accept pointers for keys and values.
2674 So when we need a pointer, we use the address of a
2675 downloaded_file_t variable of static storage. */
/* Maps each mode to the address of a static variable holding that
   mode, so the pointer stays valid for the program's lifetime.  The
   return statements for each case are on elided lines. */
2677 static downloaded_file_t *
2678 downloaded_mode_to_ptr (downloaded_file_t mode)
2680 static downloaded_file_t
2681 v1 = FILE_NOT_ALREADY_DOWNLOADED,
2682 v2 = FILE_DOWNLOADED_NORMALLY,
2683 v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2684 v4 = CHECK_FOR_FILE;
2688 case FILE_NOT_ALREADY_DOWNLOADED:
2690 case FILE_DOWNLOADED_NORMALLY:
2692 case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2694 case CHECK_FOR_FILE:
2700 /* This should really be merged with dl_file_url_map and
2701 downloaded_html_files in recur.c. This was originally a list, but
2702 I changed it to a hash table because it was actually taking a lot of
2703 time to find things in it. */
2705 static struct hash_table *downloaded_files_hash;
2707 /* Remembers which files have been downloaded. In the standard case, should be
2708 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2709 download successfully (i.e. not for ones we have failures on or that we skip
2712 When we've downloaded a file and tacked on a ".html" extension due to -E,
2713 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2714 FILE_DOWNLOADED_NORMALLY.
2716 If you just want to check if a file has been previously added without adding
2717 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
2718 with local filenames, not remote URLs. */
/* NOTE(review): elided listing -- the return type, the returns of
   *ptr, and the assert/braces are on lines not shown here. */
2720 downloaded_file (downloaded_file_t mode, const char *file)
2722 downloaded_file_t *ptr;
2724 if (mode == CHECK_FOR_FILE)
/* Query mode: never creates the table or an entry. */
2726 if (!downloaded_files_hash)
2727 return FILE_NOT_ALREADY_DOWNLOADED;
2728 ptr = hash_table_get (downloaded_files_hash, file);
2730 return FILE_NOT_ALREADY_DOWNLOADED;
/* Record mode: lazily create the table, then insert if absent. */
2734 if (!downloaded_files_hash)
2735 downloaded_files_hash = make_string_hash_table (0);
2737 ptr = hash_table_get (downloaded_files_hash, file);
2741 ptr = downloaded_mode_to_ptr (mode);
/* NOTE(review): storing `&ptr' -- the address of a stack local --
   looks wrong; downloaded_mode_to_ptr already returns a stable static
   address, so this should presumably be `ptr'.  Confirm against the
   elided lines / upstream source before relying on lookups. */
2742 hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2744 return FILE_NOT_ALREADY_DOWNLOADED;
/* Hash-map callback: frees one key/value pair (body elided from this
   listing); return value and frees are on lines not shown. */
2748 df_free_mapper (void *key, void *value, void *ignored)
/* Release the downloaded-files hash and all its entries; safe to call
   when the table was never created. */
2755 downloaded_files_free (void)
2757 if (downloaded_files_hash)
2759 hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2760 hash_table_destroy (downloaded_files_hash);
/* Reset so a later downloaded_file() call can lazily re-create it. */
2761 downloaded_files_hash = NULL;
2765 /* Return non-zero if scheme a is similar to scheme b.
2767 Schemes are similar if they are equal. If SSL is supported, schemes
2768 are also similar if one is http (SCHEME_HTTP) and the other is https
/* NOTE(review): the equality check and return statements are on lines
   elided from this listing; the http/https case below is presumably
   compiled only when SSL support is enabled -- confirm. */
2771 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2776 if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2777 || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2784 /* Debugging and testing support for path_simplify. */
2786 /* Debug: run path_simplify on PATH and return the result in a new
2787 string. Useful for calling from the debugger. */
/* NOTE(review): the function's signature and return statement are on
   lines elided from this listing; only the body fragment is visible.
   The caller owns the returned copy. */
2791 char *copy = xstrdup (path);
2792 path_simplify (copy);
/* Helper for test_path_simplify: run path_simplify on TEST and check
   that it produces EXPECTED_RESULT and that its return value (whether
   the path was modified) equals EXPECTED_CHANGE.  Prints a diagnostic
   for each mismatch.  Used only by the test driver. */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *test_copy = xstrdup (test);
  int modified = path_simplify (test_copy);

  if (0 != strcmp (test_copy, expected_result))
    {
      printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
              test, expected_result, test_copy);
    }
  if (modified != expected_change)
    {
      /* Bug fix: the two messages were swapped in the original.  When
         expected_change == 1 and we got here, path_simplify did NOT
         modify the path although a modification was expected, so the
         "Expected modification" message is the correct one (and vice
         versa for expected_change == 0). */
      if (expected_change == 1)
        printf ("Expected modification with path_simplify(\"%s\").\n",
                test);
      else
        printf ("Expected no modification with path_simplify(\"%s\").\n",
                test);
    }
  xfree (test_copy);
}
/* Self-test driver for path_simplify: runs a table of (input, expected
   output, expected-modified-flag) cases, first verbatim and then with
   a leading '/' to prove the leading slash is preserved.
   NOTE(review): the return type, the tests[] struct declaration, and
   several braces are on lines elided from this listing. */
2820 test_path_simplify (void)
2823 char *test, *result;
2829 { "foo", "foo", 0 },
2830 { "foo/bar", "foo/bar", 0 },
2831 { "foo///bar", "foo/bar", 1 },
2832 { "foo/.", "foo/", 1 },
2833 { "foo/./", "foo/", 1 },
2834 { "foo./", "foo./", 0 },
2835 { "foo/../bar", "bar", 1 },
2836 { "foo/../bar/", "bar/", 1 },
2837 { "foo/bar/..", "foo/", 1 },
2838 { "foo/bar/../x", "foo/x", 1 },
2839 { "foo/bar/../x/", "foo/x/", 1 },
2840 { "foo/..", "", 1 },
2841 { "foo/../..", "", 1 },
2842 { "a/b/../../c", "c", 1 },
2843 { "./a/../b", "b", 1 }
2847 for (i = 0; i < ARRAY_SIZE (tests); i++)
2849 char *test = tests[i].test;
2850 char *expected_result = tests[i].result;
2851 int expected_change = tests[i].should_modify;
2852 run_test (test, expected_result, expected_change);
2855 /* Now run all the tests with a leading slash before the test case,
2856 to prove that the slash is being preserved. */
2857 for (i = 0; i < ARRAY_SIZE (tests); i++)
2859 char *test, *expected_result;
2860 int expected_change = tests[i].should_modify;
/* "1 +" accounts for the prepended '/'; "+ 1" for the NUL. */
2862 test = xmalloc (1 + strlen (tests[i].test) + 1);
2863 sprintf (test, "/%s", tests[i].test);
2865 expected_result = xmalloc (1 + strlen (tests[i].result) + 1);
2866 sprintf (expected_result, "/%s", tests[i].result);
2868 run_test (test, expected_result, expected_change);
2871 xfree (expected_result);