2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 In addition, as a special exception, the Free Software Foundation
21 gives permission to link the code of its release of Wget with the
22 OpenSSL project's "OpenSSL" library (or with modified versions of it
23 that use the same license as the "OpenSSL" library), and distribute
24 the linked executables. You must obey the GNU General Public License
25 in all respects for all of the code used other than "OpenSSL". If you
26 modify this file, you may extend this exception to your version of the
27 file, but you are not obligated to do so. If you do not wish to do
28 so, delete this exception statement from your version. */
39 #include <sys/types.h>
/* Nonzero iff the string X is exactly ".".  */
#define DOTP(x) ((x)[0] == '.' && (x)[1] == '\0')
/* Nonzero iff the string X is exactly "..".  */
#define DDOTP(x) ((x)[0] == '.' && (x)[1] == '.' && (x)[2] == '\0')
/* Sizes (in octets) of the binary forms of network addresses, as used
   by the IPv4/IPv6 validation routines below.  Named after the
   NS_* constants from <arpa/nameser.h>.  */
static const int NS_INADDRSZ = 4;    /* an IPv4 address */
static const int NS_IN6ADDRSZ = 16;  /* an IPv6 address */
static const int NS_INT16SZ = 2;     /* one 16-bit group of an IPv6 address */
/* Table of URL schemes Wget supports, indexed by enum url_scheme.
   Each visible entry carries the scheme's leading string, its default
   port, and an "enabled" flag that scheme_disable() can clear.
   NOTE(review): the struct scheme_data declaration, the initializer
   braces and the terminating sentinel entry are not visible in this
   excerpt.  */
74 static struct scheme_data supported_schemes[] =
76 { "http://", DEFAULT_HTTP_PORT, 1 },
78 { "https://", DEFAULT_HTTPS_PORT, 1 },
80 { "ftp://", DEFAULT_FTP_PORT, 1 },
/* Forward declarations:
   construct_relative -- build a relative path between two file names;
   path_simplify -- resolve "." and ".." elements in place, returning
   non-zero if the path was modified (defined near the bottom of this
   file).  */
88 static char *construct_relative PARAMS ((const char *, const char *));
89 static int path_simplify PARAMS ((char *));
/* Support for encoding and decoding of URL strings.  We determine
   whether a character is unsafe through static table lookup.  This
   code assumes ASCII character set and 8-bit chars. */
/* One-letter shorthands used to keep the urlchr_table initializer
   readable.  NOTE(review): the urlchr_reserved/urlchr_unsafe bit
   definitions and the combined "RU" shorthand used in the table are
   outside this excerpt.  */
102 #define R urlchr_reserved
103 #define U urlchr_unsafe
/* Nonzero iff the table entry for character C has the MASK bit set.
   The cast to unsigned char guards against negative plain-char
   indices.  */
106 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
/* rfc1738 reserved chars, preserved from encoding. */
110 #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
/* rfc1738 unsafe chars, plus some more. */
114 #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
/* Per-character classification for all 256 byte values: R = reserved,
   U = unsafe, RU = both, 0 = ordinary.  Indexed by unsigned char via
   urlchr_test above.
   NOTE(review): "const static" compiles but "static const" is the
   conventional specifier order; left untouched here.  The initializer
   braces are outside this excerpt.  */
116 const static unsigned char urlchr_table[256] =
118 U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
119 U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
120 U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
121 U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
122 U, 0, U, RU, 0, U, R, 0, /* SP ! " # $ % & ' */
123 0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
124 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
125 0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
126 RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
127 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
128 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
129 0, 0, 0, RU, U, RU, U, 0, /* X Y Z [ \ ] ^ _ */
130 U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
131 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
132 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
133 0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
/* The entire non-ASCII range 128-255 is treated as unsafe.  */
135 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
136 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
137 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
138 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
140 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
141 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
142 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
143 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
/* Decodes the forms %xy in a URL to the character the hexadecimal
   code of which is xy.  xy are hexadecimal digits from
   [0123456789ABCDEF] (case-insensitive).  If x or y are not
   hex-digits or `%' precedes `\0', the sequence is inserted
   literally.  Decoding is done in place (T never outruns H, so the
   result can only shrink).
   NOTE(review): the return type, the loop driving T and H, and the
   pointer advances are outside this excerpt.  */
153 decode_string (char *s)
155 char *t = s; /* t - tortoise */
156 char *h = s; /* h - hare */
/* Do nothing if '%' is not followed by two hex digits. */
168 if (!*(h + 1) || !*(h + 2)
169 || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
/* Fold the two hex digits into one byte, high nibble first. */
171 *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
/* Like encode_string, but return S if there are no unsafe chars.
   When encoding is needed, return a freshly allocated copy with each
   unsafe char expanded to a %XX escape; the caller owns the result
   only when it differs from S.
   NOTE(review): declarations of p1/p2/newstr/addition, the early
   return of S, the copy loop and the literal '%' written before the
   hex digits are outside this excerpt.  */
181 encode_string_maybe (const char *s)
/* First pass: measure how many extra bytes the escapes will need. */
188 for (p1 = s; *p1; p1++)
189 if (UNSAFE_CHAR (*p1))
190 addition += 2; /* Two more characters (hex digits) */
195 newlen = (p1 - s) + addition;
196 newstr = (char *)xmalloc (newlen + 1);
/* Second pass: copy, expanding each unsafe char to %XX. */
202 if (UNSAFE_CHAR (*p1))
204 unsigned char c = *p1++;
206 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
207 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
/* We must have written exactly the number of bytes we allocated. */
213 assert (p2 - newstr == newlen);
/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
   given string, returning a malloc-ed %XX encoded string.  Unlike
   encode_string_maybe, the result is always heap-owned by the caller.
   NOTE(review): the branch that duplicates S when no encoding was
   needed is outside this excerpt.  */
222 encode_string (const char *s)
224 char *encoded = encode_string_maybe (s);
/* Encode unsafe characters in PTR to %xx.  If such encoding is done,
   the old value of PTR is freed and PTR is made to point to the newly
   allocated storage.  PTR must therefore be an lvalue holding
   heap-owned storage.
   NOTE(review): the macro's remaining continuation lines (the
   free/reassign and the do-while terminator) are outside this
   excerpt; nothing may be inserted between continued lines.  */
235 #define ENCODE(ptr) do { \
236 char *e_new = encode_string_maybe (ptr); \
/* Action to take for one character while reencoding a URL string:
   decode a %XX sequence back to its byte, encode the byte as %XX, or
   copy it through unchanged.  */
enum copy_method
{
  CM_DECODE,
  CM_ENCODE,
  CM_PASSTHROUGH
};
/* Decide whether to encode, decode, or pass through the char at P.
   This used to be a macro, but it got a little too convoluted.
   NOTE(review): the leading test that *p is '%', the CM_DECODE and
   CM_ENCODE returns, and the closing braces are outside this
   excerpt.  */
248 static inline enum copy_method
249 decide_copy_method (const char *p)
253 if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
/* %xx sequence: decode it, unless it would decode to an unsafe
   or a reserved char; in that case, leave it as is. */
258 char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
259 XCHAR_TO_XDIGIT (*(p + 2));
261 if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
262 return CM_PASSTHROUGH;
/* Garbled %.. sequence: encode `%'. */
/* Ordinary char: encode iff it is unsafe and not reserved. */
270 else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
273 return CM_PASSTHROUGH;
276 /* Translate a %-quoting (but possibly non-conformant) input string S
277 into a %-quoting (and conformant) output string. If no characters
278 are encoded or decoded, return the same string S; otherwise, return
279 a freshly allocated string with the new contents.
281 After a URL has been run through this function, the protocols that
282 use `%' as the quote character can use the resulting string as-is,
283 while those that don't call decode_string() to get to the intended
284 data. This function is also stable: after an input string is
285 transformed the first time, all further transformations of the
286 result yield the same result string.
288 Let's discuss why this function is needed.
290 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
291 space character would mess up the HTTP request, it needs to be
294 GET /abc%20def HTTP/1.0
296 So it appears that the unsafe chars need to be quoted, as with
297 encode_string. But what if we're requested to download
298 `abc%20def'? Remember that %-encoding is valid URL syntax, so what
299 the user meant was a literal space, and he was kind enough to quote
300 it. In that case, Wget should obviously leave the `%20' as is, and
301 send the same request as above. So in this case we may not call
304 But what if the requested URI is `abc%20 def'? If we call
305 encode_string, we end up with `/abc%2520%20def', which is almost
306 certainly not intended. If we don't call encode_string, we are
307 left with the embedded space and cannot send the request. What the
308 user meant was for Wget to request `/abc%20%20def', and this is
309 where reencode_string kicks in.
311 Wget used to solve this by first decoding %-quotes, and then
312 encoding all the "unsafe" characters found in the resulting string.
313 This was wrong because it didn't preserve certain URL special
314 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
315 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
316 whether we considered `+' reserved (it is). One of these results
317 is inevitable because by the second step we would lose information
318 on whether the `+' was originally encoded or not. Both results
319 were wrong because in CGI parameters + means space, while %2B means
320 literal plus. reencode_string correctly translates the above to
321 "a%2B+b", i.e. returns the original string.
323 This function uses an algorithm proposed by Anon Sricharoenchai:
325 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
328 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
331 ...except that this code conflates the two steps, and decides
332 whether to encode, decode, or pass through each character in turn.
333 The function still uses two passes, but their logic is the same --
334 the first pass exists merely for the sake of allocation. Another
335 small difference is that we include `+' to URL_RESERVED.
339 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
341 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
345 "foo bar" -> "foo%20bar"
346 "foo%20bar" -> "foo%20bar"
347 "foo %20bar" -> "foo%20%20bar"
348 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
349 "foo%25%20bar" -> "foo%25%20bar"
350 "foo%2%20bar" -> "foo%252%20bar"
351 "foo+bar" -> "foo+bar" (plus is reserved!)
352 "foo%2b+bar" -> "foo%2b+bar" */
/* Normalize the %-quoting of S per the algorithm documented in the
   large comment above: each char is independently decoded, encoded,
   or passed through as chosen by decide_copy_method.  Returns S
   itself if nothing changed, else a freshly malloc-ed string.
   NOTE(review): the declarations of p1/p2/oldlen/newstr, the switch
   case labels and the final return are outside this excerpt.  */
355 reencode_string (const char *s)
361 int encode_count = 0;
362 int decode_count = 0;
/* First, pass through the string to see if there's anything to do,
   and to calculate the new length. */
366 for (p1 = s; *p1; p1++)
368 switch (decide_copy_method (p1))
381 if (!encode_count && !decode_count)
/* The string is good as it is. */
383 return (char *)s; /* C const model sucks. */
/* Each encoding adds two characters (hex digits), while each
   decoding removes two characters. */
388 newlen = oldlen + 2 * (encode_count - decode_count);
389 newstr = xmalloc (newlen + 1);
/* Second pass: apply the same per-character decision while copying. */
396 switch (decide_copy_method (p1))
400 unsigned char c = *p1++;
402 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
403 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
407 *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
408 + (XCHAR_TO_XDIGIT (*(p1 + 2))));
409 p1 += 3; /* skip %xx */
/* The two passes must agree on the output length. */
416 assert (p2 - newstr == newlen);
/* Run PTR_VAR through reencode_string.  If a new string is consed,
   free PTR_VAR and make it point to the new storage.  Obviously,
   PTR_VAR needs to be an lvalue holding heap-owned storage.
   NOTE(review): the macro's remaining continuation lines are outside
   this excerpt; nothing may be inserted between continued lines.  */
424 #define REENCODE(ptr_var) do { \
425 char *rf_new = reencode_string (ptr_var); \
426 if (rf_new != ptr_var) \
/* Returns the scheme type if the scheme is supported, or
   SCHEME_INVALID if not.  A scheme whose prefix matches but which has
   been turned off via scheme_disable() also yields SCHEME_INVALID.
   NOTE(review): the return type and loop braces are outside this
   excerpt.  */
436 url_scheme (const char *url)
/* Case-insensitively match URL against each table prefix. */
440 for (i = 0; supported_schemes[i].leading_string; i++)
441 if (0 == strncasecmp (url, supported_schemes[i].leading_string,
442 strlen (supported_schemes[i].leading_string)))
444 if (supported_schemes[i].enabled)
/* Table index doubles as the enum url_scheme value. */
445 return (enum url_scheme) i;
447 return SCHEME_INVALID;
450 return SCHEME_INVALID;
/* Return the number of characters needed to skip the scheme part of
   the URL, e.g. `http://'.  If no scheme is found, returns 0.
   NOTE(review): the colon check between the name scan and the "//"
   skip is outside this excerpt.  */
456 url_skip_scheme (const char *url)
/* Skip the scheme name.  We allow `-' and `+' because of `whois++',
   and similar schemes. */
462 while (ISALNUM (*p) || *p == '-' || *p == '+')
/* Skip "//" if found. */
470 if (*p == '/' && *(p + 1) == '/')
/* Returns 1 if the URL begins with a scheme (supported or
   unsupported), 0 otherwise.  The scheme name is a run of
   alphanumerics plus `-' and `+' (cf. url_skip_scheme).
   NOTE(review): the trailing check for the ':' terminator is outside
   this excerpt.  */
479 url_has_scheme (const char *url)
482 while (ISALNUM (*p) || *p == '-' || *p == '+')
/* Return the default port for SCHEME, as recorded in
   supported_schemes.  SCHEME must be a valid enum url_scheme index.  */
488 scheme_default_port (enum url_scheme scheme)
490 return supported_schemes[scheme].default_port;
/* Disable SCHEME so that url_scheme() stops recognizing it (used
   e.g. when SSL support is unavailable).  */
494 scheme_disable (enum url_scheme scheme)
496 supported_schemes[scheme].enabled = 0;
/* Skip the username and password, if present here.  The function
   should be called *not* with the complete URL, but with the part
   right after the scheme.

   If no username and password are found, return 0.
   NOTE(review): the test that the found char is '@' and the length
   computation are outside this excerpt.  */
505 url_skip_uname (const char *url)
/* Look for '@' that comes before '/' or '?'. */
510 p = (const char *)strpbrk (url, "/?@");
/* Split STR (LEN bytes, "user[:password]") into freshly allocated,
   %-decoded *USER and *PASSWD.  Presumably returns 0 on failure
   (empty user name) and non-zero on success -- the return statements
   are not visible in this excerpt.  */
518 parse_uname (const char *str, int len, char **user, char **passwd)
/* Empty user name not allowed. */
526 colon = memchr (str, ':', len);
/* Empty user name again. */
/* Copy the password part (everything after the colon). */
533 int pwlen = len - (colon + 1 - str);
534 *passwd = xmalloc (pwlen + 1);
535 memcpy (*passwd, colon + 1, pwlen);
536 (*passwd)[pwlen] = '\0';
/* Copy the user part.  NOTE(review): LEN is presumably shortened to
   the colon position in a line not visible here -- verify. */
542 *user = xmalloc (len + 1);
543 memcpy (*user, str, len);
/* Both components may be %-encoded in the URL; decode in place. */
547 decode_string (*user);
549 decode_string (*passwd);
/* Used by main.c: detect URLs written using the "shorthand" URL forms
   popularized by Netscape and NcFTP.  HTTP shorthands look like this:

   www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
   www.foo.com[:port]          -> http://www.foo.com[:port]

   FTP shorthands look like this:

   foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
   foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file

   If the URL needs not or cannot be rewritten, return NULL.
   NOTE(review): the return type, the NULL returns and several branch
   braces are outside this excerpt.  */
567 rewrite_shorthand_url (const char *url)
/* Already has an explicit scheme: nothing to rewrite. */
571 if (url_has_scheme (url))
/* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
   latter Netscape.  */
576 for (p = url; *p && *p != ':' && *p != '/'; p++)
/* If the characters after the colon and before the next slash
   or end of string are all digits, it's HTTP. */
589 for (pp = p + 1; ISDIGIT (*pp); pp++)
591 if (digits > 0 && (*pp == '/' || *pp == '\0'))
/* Prepend "ftp://" to the entire URL... */
595 res = xmalloc (6 + strlen (url) + 1);
596 sprintf (res, "ftp://%s", url);
/* ...and replace ':' with '/'. */
598 res[6 + (p - url)] = '/';
/* Just prepend "http://" to what we have. */
606 res = xmalloc (7 + strlen (url) + 1);
607 sprintf (res, "http://%s", url);
/* Forward declaration; parse_path is defined further down. */
612 static void parse_path PARAMS ((const char *, char **, char **));
/* Like strpbrk, but instead of returning NULL when no character of
   ACCEPT is found, return a pointer to S's terminating '\0' -- the
   result is therefore never NULL.  */
615 strpbrk_or_eos (const char *s, const char *accept)
617 char *p = strpbrk (s, accept);
/* No match: point at end of string instead of returning NULL. */
619 p = (char *)s + strlen (s);
/* Turn STR into lowercase in place; return non-zero if at least one
   character was actually changed.
   NOTE(review): the loop and change-tracking variable are outside
   this excerpt.  */
627 lowercase_str (char *str)
634 *str = TOLOWER (*str);
/* Human-readable messages for url_parse() failures, indexed by the
   PE_* codes defined alongside each entry (url_error() does the
   lookup).  NOTE(review): the message strings for PE_NO_ERROR,
   PE_EMPTY_HOST, PE_BAD_PORT_NUMBER and PE_INVALID_USER_NAME are not
   visible in this excerpt; keep #define and string pairs in sync.  */
639 static char *parse_errors[] = {
640 #define PE_NO_ERROR 0
642 #define PE_UNSUPPORTED_SCHEME 1
643 "Unsupported scheme",
644 #define PE_EMPTY_HOST 2
646 #define PE_BAD_PORT_NUMBER 3
648 #define PE_INVALID_USER_NAME 4
650 #define PE_UNTERMINATED_IPV6_ADDRESS 5
651 "Unterminated IPv6 numeric address",
652 #define PE_INVALID_IPV6_ADDRESS 6
653 "Invalid IPv6 numeric address"
/* Store error code V through pointer P, but only if P is non-NULL --
   callers of url_parse may pass NULL when they do not care about the
   error code.  NOTE(review): the macro's continuation lines are
   outside this excerpt; nothing may be inserted between them.  */
656 #define SETERR(p, v) do { \
/* The following two functions were adapted from glibc. */
/* Return whether [STR, END) spells a valid dotted-quad IPv4 address.
   NOTE(review): the main loop, the octet-range (<= 255) check and the
   final octet-count test are outside this excerpt.  */
664 is_valid_ipv4_address (const char *str, const char *end)
666 int saw_digit, octets;
/* Accumulate the current octet digit by digit. */
676 if (ch >= '0' && ch <= '9') {
677 val = val * 10 + (ch - '0');
681 if (saw_digit == 0) {
/* A dot is only legal directly after a digit. */
686 } else if (ch == '.' && saw_digit == 1) {
/* Return whether [STR, END) spells a valid IPv6 numeric address,
   including "::" compression and an optional trailing embedded IPv4
   part.  Adapted from glibc's inet_pton.  TP counts the bytes the
   address would occupy in binary form; COLONP records where "::"
   occurred.  NOTE(review): many interior lines (variable
   declarations, the main loop header, several braces and returns)
   are outside this excerpt.  */
703 static const char xdigits[] = "0123456789abcdef";
/* Leading :: requires some special handling. */
720 if (str == end || *str != ':')
/* if ch is a number, add it to val. */
733 pch = strchr(xdigits, ch);
736 val |= (pch - xdigits);
/* if ch is a colon ... */
746 if (saw_xdigit == 0) {
751 } else if (str == end) {
/* A completed 16-bit group must still fit in the address. */
754 if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
/* if ch is a dot ... */
/* Trailing dotted-quad: must leave room for 4 more bytes. */
763 if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ) &&
764 is_valid_ipv4_address(curtok, end) == 1) {
773 if (saw_xdigit == 1) {
774 if (tp > NS_IN6ADDRSZ - NS_INT16SZ)
/* With a "::" the address must be shorter than 16 bytes; without
   one it must be exactly 16. */
779 if (colonp != NULL) {
780 if (tp == NS_IN6ADDRSZ)
785 if (tp != NS_IN6ADDRSZ)
/* Parse URL into its components.
   Return a new struct url if successful, NULL on error.  In case of
   error, and if ERROR is not NULL, also set *ERROR to the appropriate
   PE_* code.
   NOTE(review): this excerpt omits many interior lines (braces, goto
   labels/error cleanup, the "//" skip, assignments of uname_b/host_b,
   path/params/query/fragment boundary assignments, and the final
   return).  Comments below describe only what is visible.  */
798 url_parse (const char *url, int *error)
802 int path_modified, host_modified;
804 enum url_scheme scheme;
/* Begin/end pointers into URL_ENCODED for each component. */
806 const char *uname_b, *uname_e;
807 const char *host_b, *host_e;
808 const char *path_b, *path_e;
809 const char *params_b, *params_e;
810 const char *query_b, *query_e;
811 const char *fragment_b, *fragment_e;
814 char *user = NULL, *passwd = NULL;
818 scheme = url_scheme (url);
819 if (scheme == SCHEME_INVALID)
821 SETERR (error, PE_UNSUPPORTED_SCHEME);
/* Work on a %-normalized copy (may be URL itself if unchanged). */
825 url_encoded = reencode_string (url);
828 p += strlen (supported_schemes[scheme].leading_string);
830 p += url_skip_uname (p);
/* scheme://user:pass@host[:port]... */
/* We attempt to break down the URL into the components path,
   params, query, and fragment.  They are ordered like this:

   scheme://host[:port][/path][;params][?query][#fragment] */
841 params_b = params_e = NULL;
842 query_b = query_e = NULL;
843 fragment_b = fragment_e = NULL;
/* Handle IPv6 address inside square brackets.  Ideally we'd
   just look for the terminating ']', but rfc2732 mandates
   rejecting invalid IPv6 addresses. */
/* The address begins after '['. */
855 host_e = strchr (host_b, ']');
859 SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS);
/* Check if the IPv6 address is valid. */
864 if (!is_valid_ipv6_address(host_b, host_e))
866 SETERR (error, PE_INVALID_IPV6_ADDRESS);
/* Continue parsing after the closing ']'. */
/* Non-bracketed host: ends at port/path/params/query/fragment. */
875 p = strpbrk_or_eos (p, ":/;?#");
879 if (host_b == host_e)
881 SETERR (error, PE_EMPTY_HOST);
/* No explicit port: fall back to the scheme's default. */
885 port = scheme_default_port (scheme);
888 const char *port_b, *port_e, *pp;
/* scheme://host:port/tralala */
894 p = strpbrk_or_eos (p, "/;?#");
897 if (port_b == port_e)
/* http://host:/whatever */
901 SETERR (error, PE_BAD_PORT_NUMBER);
/* Convert the digits; any non-digit is an error. */
905 for (port = 0, pp = port_b; pp < port_e; pp++)
/* http://host:12randomgarbage/blah */
911 SETERR (error, PE_BAD_PORT_NUMBER);
915 port = 10 * port + (*pp - '0');
/* Delimit path, then params, then query, then fragment. */
923 p = strpbrk_or_eos (p, ";?#");
/* Path is not allowed not to exist. */
936 p = strpbrk_or_eos (p, "?#");
943 p = strpbrk_or_eos (p, "#");
/* Hack that allows users to use '?' (a wildcard character) in
   FTP URLs without it being interpreted as a query string
   delimiter. */
949 if (scheme == SCHEME_FTP)
951 query_b = query_e = NULL;
952? /* (not visible) */
964 if (uname_b != uname_e)
/* http://user:pass@host */
/*        uname_b  uname_e */
969 if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
971 SETERR (error, PE_INVALID_USER_NAME);
/* All components validated: build and fill the struct url. */
976 u = (struct url *)xmalloc (sizeof (struct url));
977 memset (u, 0, sizeof (*u));
980 u->host = strdupdelim (host_b, host_e);
985 u->path = strdupdelim (path_b, path_e);
986 path_modified = path_simplify (u->path);
987 parse_path (u->path, &u->dir, &u->file);
989 host_modified = lowercase_str (u->host);
992 u->params = strdupdelim (params_b, params_e);
994 u->query = strdupdelim (query_b, query_e);
996 u->fragment = strdupdelim (fragment_b, fragment_e);
998 if (path_modified || u->fragment || host_modified || path_b == path_e)
/* If we suspect that a transformation has rendered what
   url_string might return different from URL_ENCODED, rebuild
   u->url using url_string. */
1003 u->url = url_string (u, 0);
1005 if (url_encoded != url)
1006 xfree ((char *) url_encoded);
/* Otherwise reuse URL_ENCODED itself (or a copy of URL). */
1010 if (url_encoded == url)
1011 u->url = xstrdup (url);
1013 u->url = url_encoded;
/* Map a PE_* code from url_parse into its human-readable message.
   The returned string is static; do not free it.  */
1021 url_error (int error_code)
1023 assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
1024 return parse_errors[error_code];
/* Split QUOTED_PATH into freshly allocated *DIR and *FILE around its
   last '/', %-decoding the path first.  A path with no slash yields
   an empty *DIR.  NOTE(review): the if/else braces are outside this
   excerpt.  */
1028 parse_path (const char *quoted_path, char **dir, char **file)
1030 char *path, *last_slash;
/* Work on a stack copy so the caller's string is untouched. */
1032 STRDUP_ALLOCA (path, quoted_path);
1033 decode_string (path);
1035 last_slash = strrchr (path, '/');
/* No slash: whole path is the file name. */
1038 *dir = xstrdup ("");
1039 *file = xstrdup (path);
/* Slash found: split around it (slash itself excluded from both). */
1043 *dir = strdupdelim (path, last_slash);
1044 *file = xstrdup (last_slash + 1);
/* Note: URL's "full path" is the path with the query string and
   params appended.  The "fragment" (#foo) is intentionally ignored,
   but that might be changed.  For example, if the original URL was
   "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
   the full path will be "/foo/bar/baz;bullshit?querystring". */

/* Return the length of the full path, without the terminating
   zero.  Each present component contributes its length plus one for
   its delimiter ('/', ';' or '?').  */
1058 full_path_length (const struct url *url)
1062 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
/* Write out the full path into WHERE, which the caller must have
   sized with full_path_length().  No terminating zero is written.
   NOTE(review): the FROB macro's remaining continuation lines (the
   delimiter write and the do-while terminator) are outside this
   excerpt; nothing may be inserted between continued lines.  */
1076 full_path_write (const struct url *url, char *where)
1078 #define FROB(el, chr) do { \
1079 char *f_el = url->el; \
1081 int l = strlen (f_el); \
1083 memcpy (where, f_el, l); \
/* Public function for getting the "full path".  E.g. if u->path is
   "foo/bar" and u->query is "param=value", full_path will be
   "/foo/bar?param=value".  Returns a freshly malloc-ed,
   zero-terminated string owned by the caller.  */
1100 url_full_path (const struct url *url)
1102 int length = full_path_length (url);
1103 char *full_path = (char *)xmalloc(length + 1);
1105 full_path_write (url, full_path);
1106 full_path[length] = '\0';
/* Sync u->path and u->url with u->dir and u->file, rebuilding both
   after ftp.c has mutated dir/file through the setters below.
   NOTE(review): the freeing of the old path/url strings and the
   empty-dir branch condition are outside this excerpt.  */
1114 sync_path (struct url *url)
/* Visible fast path: no directory, path is just the file name. */
1122 newpath = xstrdup (url->file);
/* Otherwise join dir and file with a single '/'. */
1127 int dirlen = strlen (url->dir);
1128 int filelen = strlen (url->file);
1130 newpath = xmalloc (dirlen + 1 + filelen + 1);
1131 memcpy (newpath, url->dir, dirlen);
1132 newpath[dirlen] = '/';
1133 memcpy (newpath + dirlen + 1, url->file, filelen);
1134 newpath[dirlen + 1 + filelen] = '\0';
1138 url->path = newpath;
/* Synchronize u->url. */
1142 url->url = url_string (url, 0);
/* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
   This way we can sync u->path and u->url when they get changed.
   NOTE(review): the xfree of the previous value and the sync_path
   calls are outside this excerpt.  */
1149 url_set_dir (struct url *url, const char *newdir)
1152 url->dir = xstrdup (newdir);
/* Replace u->file with a copy of NEWFILE (and re-sync, presumably). */
1157 url_set_file (struct url *url, const char *newfile)
1160 url->file = xstrdup (newfile);
/* Release a struct url and every component string it owns.
   NOTE(review): the frees of host, path, url, dir and file, and the
   final xfree of the struct itself, are outside this excerpt;
   FREE_MAYBE presumably frees only non-NULL pointers.  */
1165 url_free (struct url *url)
1171 FREE_MAYBE (url->params);
1172 FREE_MAYBE (url->query);
1173 FREE_MAYBE (url->fragment);
1174 FREE_MAYBE (url->user);
1175 FREE_MAYBE (url->passwd);
/* Read FILE and return a linked list of struct urlpos, one per
   non-blank line; each line is optionally merged with opt.base_href
   and parsed with url_parse.  Returns the list head (the braces,
   list-linking code and returns are outside this excerpt).  */
1184 get_urls_file (const char *file)
1186 struct file_memory *fm;
1187 struct urlpos *head, *tail;
1188 const char *text, *text_end;
/* Load the file. */
1191 fm = read_file (file);
1194 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
1197 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
1201 text_end = fm->content + fm->length;
/* Process the buffer one line at a time. */
1202 while (text < text_end)
1204 const char *line_beg = text;
1205 const char *line_end = memchr (text, '\n', text_end - text);
/* Last line may lack a newline. */
1207 line_end = text_end;
/* Strip whitespace from the beginning and end of line. */
1213 while (line_beg < line_end && ISSPACE (*line_beg))
1215 while (line_end > line_beg && ISSPACE (*(line_end - 1)))
/* Skip lines that became empty after trimming. */
1218 if (line_end > line_beg)
/* URL is in the [line_beg, line_end) region. */
1224 struct urlpos *entry;
/* We must copy the URL to a zero-terminated string, and we
   can't use alloca because we're in a loop.  *sigh*. */
1229 url_text = strdupdelim (line_beg, line_end);
/* Merge opt.base_href with URL. */
1234 char *merged = uri_merge (opt.base_href, url_text);
1239 url = url_parse (url_text, &up_error_code);
/* Report unparsable lines but keep going. */
1242 logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1243 file, url_text, url_error (up_error_code));
1249 entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1250 memset (entry, 0, sizeof (*entry));
/* Release the memory-mapped file before returning. */
1261 read_file_free (fm);
/* Free the linked list of urlpos, including each node's owned
   strings.  NOTE(review): the loop, the url_free call and the node
   free are outside this excerpt.  */
1267 free_urlpos (struct urlpos *l)
/* Save the link before the node is destroyed. */
1271 struct urlpos *next = l->next;
1274 FREE_MAYBE (l->local_name);
/* Rotate FNAME opt.backups times: FNAME.1 -> FNAME.2, ... so that
   FNAME itself can then be renamed to FNAME.1.
   NOTE(review): the struct stat declaration, the rename() calls and
   the early return for non-regular files are outside this excerpt.  */
1282 rotate_backups(const char *fname)
/* Room for "FNAME.<digits>\0". */
1284 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1285 char *from = (char *)alloca (maxlen);
1286 char *to = (char *)alloca (maxlen);
/* Only rotate regular files. */
1290 if (stat (fname, &sb) == 0)
1291 if (S_ISREG (sb.st_mode) == 0)
/* Shift the existing numbered backups up by one, highest first. */
1294 for (i = opt.backups; i > 1; i--)
1296 sprintf (from, "%s.%d", fname, i - 1);
1297 sprintf (to, "%s.%d", fname, i);
/* #### This will fail on machines without the rename() system
   call. */
1303 sprintf (to, "%s.%d", fname, 1);
/* Create all the necessary directories for PATH (a file).  Calls
   mkdirhier() internally.
   NOTE(review): several returns, the unlink of an in-the-way file and
   most braces are outside this excerpt.  */
1310 mkalldirs (const char *path)
/* Find the last '/' to isolate the directory part of PATH. */
1317 p = path + strlen (path);
1318 for (; *p != '/' && p != path; p--);
/* Don't create if it's just a file. */
1320 if ((p == path) && (*p != '/'))
1322 t = strdupdelim (path, p);
/* Check whether the directory exists. */
1324 if ((stat (t, &st) == 0))
1326 if (S_ISDIR (st.st_mode))
/* If the dir exists as a file name, remove it first.  This
   is *only* for Wget to work with buggy old CERN http
   servers.  Here is the scenario: When Wget tries to
   retrieve a directory without a slash, e.g.
   http://foo/bar (bar being a directory), CERN server will
   not redirect it too http://foo/bar/ -- it will generate a
   directory listing containing links to bar/file1,
   bar/file2, etc.  Wget will lose because it saves this
   HTML listing to a file `bar', so it cannot create the
   directory.  To work around this, if the file of the same
   name exists, we just remove it and create the directory
   anew. */
1345 DEBUGP (("Removing %s because of directory danger!\n", t));
1349 res = make_directory (t);
/* Report, but do not abort on, directory-creation failure. */
1351 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
/* Return the number of '/' characters in S.  NOTE(review): the body
   is not visible in this excerpt.  */
1357 count_slashes (const char *s)
/* Return the path name of the URL-equivalent file name, with a
   remote-like structure of directories: optionally opt.dir_prefix,
   then the host (and non-default port) when opt.add_hostdir, then the
   URL's directory with the first opt.cut_dirs components removed,
   then the file name ("index.html" when empty).  Result is malloc-ed.
   NOTE(review): many interior lines (braces, the ':' written before
   the port digits, the dir/file fallbacks) are outside this
   excerpt.  */
1369 mkstruct (const struct url *u)
1372 char *res, *dirpref;
/* Drop a leading '/' and then skip opt.cut_dirs path components. */
1377 char *ptr = u->dir + (*u->dir == '/');
1378 int slash_count = 1 + count_slashes (ptr);
1379 int cut = MINVAL (opt.cut_dirs, slash_count);
1380 for (; cut && *ptr; ptr++)
1383 STRDUP_ALLOCA (dir, ptr);
1386 dir = u->dir + (*u->dir == '/');
/* Check for the true name (or at least a consistent name for saving
   to directory) of HOST, reusing the hlist if possible. */
1390 if (opt.add_hostdir)
/* Add dir_prefix and hostname (if required) to the beginning of
   dir. */
1394 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1396 + 1 + numdigit (u->port)
1398 if (!DOTP (opt.dir_prefix))
1399 sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
1401 strcpy (dirpref, u->host);
/* Append ":port" only when the port is non-default. */
1403 if (u->port != scheme_default_port (u->scheme))
1405 int len = strlen (dirpref);
1407 number_to_string (dirpref + len + 1, u->port);
1410 else /* not add_hostdir */
1412 if (!DOTP (opt.dir_prefix))
1413 dirpref = opt.dir_prefix;
/* If there is a prefix, prepend it. */
1421 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1422 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
/* Strip a trailing slash from the directory part. */
1427 if (l && dir[l - 1] == '/')
1431 file = "index.html";
/* Finally, construct the full name. */
1436 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
1438 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
/* Compose a file name out of BASE, an unescaped file name, and QUERY,
   an escaped query string.  The trick is to make sure that unsafe
   characters in BASE are escaped, and that slashes in QUERY are also
   escaped, so the result stays a single file-system component.
   Returns a malloc-ed copy of a fixed-size stack buffer; over-long
   input is truncated.
   NOTE(review): RESULT's declaration, the '%' literal written before
   each pair of hex digits, and the '?' separator are outside this
   excerpt.  Also, `to - result < sizeof (result)` compares a
   ptrdiff_t with a size_t -- harmless here since TO never precedes
   RESULT, but worth confirming with -Wsign-compare.  */
1449 compose_file_name (char *base, char *query)
/* Copy BASE to RESULT and encode all unsafe characters. */
1457 while (*from && to - result < sizeof (result))
1459 if (UNSAFE_CHAR (*from))
1461 unsigned char c = *from++;
1463 *to++ = XDIGIT_TO_XCHAR (c >> 4);
1464 *to++ = XDIGIT_TO_XCHAR (c & 0xf);
1470 if (query && to - result < sizeof (result))
/* Copy QUERY to RESULT and encode all '/' characters. */
1476 while (*from && to - result < sizeof (result))
/* Terminate, or truncate if the buffer filled up. */
1490 if (to - result < sizeof (result))
/* Truncate input which is too long, presumably due to a huge
   query string. */
1495 result[sizeof (result) - 1] = '\0';
1497 return xstrdup (result);
/* Create a unique filename, corresponding to a given URL.  Calls
   mkstruct if necessary.  Does *not* actually create any directories.
   Returns a malloc-ed name owned by the caller.
   NOTE(review): the dirstruct branch condition, several frees/returns
   and the '%'->'@' replacement body of the WINDOWS loop are outside
   this excerpt.  */
1503 url_filename (const struct url *u)
/* Treat an empty query string the same as no query string. */
1507 char *query = u->query && *u->query ? u->query : NULL;
/* Directory-structure mode: derive the name from the URL layout. */
1511 char *base = mkstruct (u);
1512 file = compose_file_name (base, query);
/* Flat mode: just the file component, defaulting to index.html. */
1517 char *base = *u->file ? u->file : "index.html";
1518 file = compose_file_name (base, query);
/* Check whether the prefix directory is something other than "."
   before prepending it. */
1522 if (!DOTP (opt.dir_prefix))
/* #### should just realloc FILE and prepend dir_prefix. */
1525 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1526 + 1 + strlen (file) + 1);
1527 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
/* DOS-ish file systems don't like `%' signs in them; we change it
   to `@'. */
1538 for (p = file; *p; p++)
1542 #endif /* WINDOWS */
/* Check the cases in which the unique extensions are not used:
   1) Clobbering is turned off (-nc).
   2) Retrieval with regetting.
   3) Timestamping is used.
   4) Hierarchy is built.

   The exception is the case when file does exist and is a
   directory (actually support for bad httpd-s). */
1552 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1553 && !(file_exists_p (file) && !file_non_directory_p (file)))
/* Find a unique name. */
1557 name = unique_name (file);
/* Return the length of URL's path.  Path is considered to be
   terminated by one of '?', ';', '#', or by the end of the
   string.  NOTE(review): the `return q - url;` is outside this
   excerpt.  */
1566 path_length (const char *url)
1568 const char *q = strpbrk_or_eos (url, "?;#");
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is equivalent to strrchr(b, c),
   except that it accepts an END argument instead of requiring the
   string to be zero-terminated.  Why is there no memrchr()?
   NOTE(review): the scanning loop is outside this excerpt.  */
1577 find_last_char (const char *b, const char *e, char c)
/* Resolve "." and ".." elements of PATH by destructively modifying
   PATH.  "." is resolved by removing that path element, and ".." is
   resolved by removing the preceding path element.  Leading and
   trailing slashes are preserved.

   Return non-zero if any changes have been made.

   For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
   test examples are provided below.  If you change anything in this
   function, run test_path_simplify to make sure you haven't broken a
   test case.

   A previous version of this function was based on path_simplify()
   from GNU Bash, but it has been rewritten for Wget 1.8.1. */
1601 path_simplify (char *path)
1607 ++path; /* preserve the leading '/'. */
1610 end = p + strlen (p) + 1; /* position past the terminating zero. */
/* P should point to the beginning of a path element. */
1617 if (*p == '.' && (*(p + 1) == '/' || *(p + 1) == '\0'))
/* Handle "./foo" by moving "foo" two characters to the
   left. */
1621 if (*(p + 1) == '/')
/* NOTE(review): the length `end - p` copies from p+2, i.e. it
   appears to read two bytes past END; the ".." branch below uses
   `end - (p + 3)` -- verify whether this should be end - (p + 2). */
1624 memmove (p, p + 2, end - p);
1635 else if (*p == '.' && *(p + 1) == '.'
1636 && (*(p + 2) == '/' || *(p + 2) == '\0'))
/* Handle "../foo" by moving "foo" one path element to the
   left. */
1640 char *b = p; /* not p-1 because P can equal PATH */
/* Backtrack by one path element, but not past the beginning
   of PATH. */
/* foo/bar/../baz */
/* Move backwards until B hits the beginning of the
   previous path element or the beginning of path. */
1653 for (--b; b > path && *(b - 1) != '/'; b--)
1658 if (*(p + 2) == '/')
1660 memmove (b, p + 3, end - (p + 3));
/* Remove empty path elements.  Not mandated by rfc1808 et
   al, but empty path elements are not all that useful, and
   the rest of Wget might not deal with them well. */
1686 memmove (p, q, end - q);
/* Skip to the next path element. */
1692 while (*p && *p != '/')
/* Make sure P points to the beginning of the next path element,
   which is location after the slash. */
1705 /* Resolve the result of "linking" a base URI (BASE) to a
1706 link-specified URI (LINK).
1708 Either of the URIs may be absolute or relative, complete with the
1709 host name, or path only. This tries to behave "reasonably" in all
1710 foreseeable cases. It employs little specific knowledge about
1711 schemes or URL-specific stuff -- it just works on strings.
1713 The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1714 See uri_merge for a gentler interface to this functionality.
1716 Perhaps this function should call path_simplify so that the callers
1717 don't have to call url_parse unconditionally. */
1719 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
1725 const char *end = base + path_length (base);
1729 /* Empty LINK points back to BASE, query string and all. */
1730 constr = xstrdup (base);
1732 else if (*link == '?')
1734 /* LINK points to the same location, but changes the query
1735 string. Examples: */
1736 /* uri_merge("path", "?new") -> "path?new" */
1737 /* uri_merge("path?foo", "?new") -> "path?new" */
1738 /* uri_merge("path?foo#bar", "?new") -> "path?new" */
1739 /* uri_merge("path#foo", "?new") -> "path?new" */
1740 int baselength = end - base;
1741 constr = xmalloc (baselength + linklength + 1);
1742 memcpy (constr, base, baselength);
1743 memcpy (constr + baselength, link, linklength);
1744 constr[baselength + linklength] = '\0';
1746 else if (*link == '#')
1748 /* uri_merge("path", "#new") -> "path#new" */
1749 /* uri_merge("path#foo", "#new") -> "path#new" */
1750 /* uri_merge("path?foo", "#new") -> "path?foo#new" */
1751 /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
1753 const char *end1 = strchr (base, '#');
1755 end1 = base + strlen (base);
1756 baselength = end1 - base;
1757 constr = xmalloc (baselength + linklength + 1);
1758 memcpy (constr, base, baselength);
1759 memcpy (constr + baselength, link, linklength);
1760 constr[baselength + linklength] = '\0';
1762 else if (linklength > 1 && *link == '/' && *(link + 1) == '/')
1764 /* LINK begins with "//" and so is a net path: we need to
1765 replace everything after (and including) the double slash
1768 /* uri_merge("foo", "//new/bar") -> "//new/bar" */
1769 /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */
1770 /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
1774 const char *start_insert;
1776 /* Look for first slash. */
1777 slash = memchr (base, '/', end - base);
1778 /* If found slash and it is a double slash, then replace
1779 from this point, else default to replacing from the
1781 if (slash && *(slash + 1) == '/')
1782 start_insert = slash;
1784 start_insert = base;
1786 span = start_insert - base;
1787 constr = (char *)xmalloc (span + linklength + 1);
1789 memcpy (constr, base, span);
1790 memcpy (constr + span, link, linklength);
1791 constr[span + linklength] = '\0';
1793 else if (*link == '/')
1795 /* LINK is an absolute path: we need to replace everything
1796 after (and including) the FIRST slash with LINK.
1798 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1799 "/qux/xyzzy", our result should be
1800 "http://host/qux/xyzzy". */
1803 const char *start_insert = NULL; /* for gcc to shut up. */
1804 const char *pos = base;
1805 int seen_slash_slash = 0;
1806 /* We're looking for the first slash, but want to ignore
1809 slash = memchr (pos, '/', end - pos);
1810 if (slash && !seen_slash_slash)
1811 if (*(slash + 1) == '/')
1814 seen_slash_slash = 1;
1818 /* At this point, SLASH is the location of the first / after
1819 "//", or the first slash altogether. START_INSERT is the
1820 pointer to the location where LINK will be inserted. When
1821 examining the last two examples, keep in mind that LINK
1824 if (!slash && !seen_slash_slash)
1825 /* example: "foo" */
1827 start_insert = base;
1828 else if (!slash && seen_slash_slash)
1829 /* example: "http://foo" */
1832 else if (slash && !seen_slash_slash)
1833 /* example: "foo/bar" */
1835 start_insert = base;
1836 else if (slash && seen_slash_slash)
1837 /* example: "http://something/" */
1839 start_insert = slash;
1841 span = start_insert - base;
1842 constr = (char *)xmalloc (span + linklength + 1);
1844 memcpy (constr, base, span);
1846 memcpy (constr + span, link, linklength);
1847 constr[span + linklength] = '\0';
1851 /* LINK is a relative URL: we need to replace everything
1852 after last slash (possibly empty) with LINK.
1854 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1855 our result should be "whatever/foo/qux/xyzzy". */
1856 int need_explicit_slash = 0;
1858 const char *start_insert;
1859 const char *last_slash = find_last_char (base, end, '/');
1862 /* No slash found at all. Append LINK to what we have,
1863 but we'll need a slash as a separator.
1865 Example: if base == "foo" and link == "qux/xyzzy", then
1866 we cannot just append link to base, because we'd get
1867 "fooqux/xyzzy", whereas what we want is
1870 To make sure the / gets inserted, we set
1871 need_explicit_slash to 1. We also set start_insert
1872 to end + 1, so that the length calculations work out
1873 correctly for one more (slash) character. Accessing
1874 that character is fine, since it will be the
1875 delimiter, '\0' or '?'. */
1876 /* example: "foo?..." */
1877 /* ^ ('?' gets changed to '/') */
1878 start_insert = end + 1;
1879 need_explicit_slash = 1;
1881 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1883 /* example: http://host" */
1885 start_insert = end + 1;
1886 need_explicit_slash = 1;
1890 /* example: "whatever/foo/bar" */
1892 start_insert = last_slash + 1;
1895 span = start_insert - base;
1896 constr = (char *)xmalloc (span + linklength + 1);
1898 memcpy (constr, base, span);
1899 if (need_explicit_slash)
1900 constr[span - 1] = '/';
1902 memcpy (constr + span, link, linklength);
1903 constr[span + linklength] = '\0';
1906 else /* !no_scheme */
1908 constr = strdupdelim (link, link + linklength);
1913 /* Merge BASE with LINK and return the resulting URI. This is an
1914 interface to uri_merge_1 that assumes that LINK is a
1915 zero-terminated string. */
1917 uri_merge (const char *base, const char *link)
1919 return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
1922 #define APPEND(p, s) do { \
1923 int len = strlen (s); \
1924 memcpy (p, s, len); \
1928 /* Use this instead of password when the actual password is supposed
1929 to be hidden. We intentionally use a generic string without giving
1930 away the number of characters in the password, like previous
1932 #define HIDDEN_PASSWORD "*password*"
1934 /* Recreate the URL string from the data in URL.
1936 If HIDE is non-zero (as it is when we're calling this on a URL we
1937 plan to print, but not when calling it to canonicalize a URL for
1938 use within the program), password will be hidden. Unsafe
1939 characters in the URL will be quoted. */
1942 url_string (const struct url *url, int hide_password)
1946 char *quoted_user = NULL, *quoted_passwd = NULL;
1948 int scheme_port = supported_schemes[url->scheme].default_port;
1949 char *scheme_str = supported_schemes[url->scheme].leading_string;
1950 int fplen = full_path_length (url);
1952 int brackets_around_host = 0;
1954 assert (scheme_str != NULL);
1956 /* Make sure the user name and password are quoted. */
1959 quoted_user = encode_string_maybe (url->user);
1963 quoted_passwd = HIDDEN_PASSWORD;
1965 quoted_passwd = encode_string_maybe (url->passwd);
1969 if (strchr (url->host, ':'))
1970 brackets_around_host = 1;
1972 size = (strlen (scheme_str)
1973 + strlen (url->host)
1974 + (brackets_around_host ? 2 : 0)
1977 if (url->port != scheme_port)
1978 size += 1 + numdigit (url->port);
1981 size += 1 + strlen (quoted_user);
1983 size += 1 + strlen (quoted_passwd);
1986 p = result = xmalloc (size);
1988 APPEND (p, scheme_str);
1991 APPEND (p, quoted_user);
1995 APPEND (p, quoted_passwd);
2000 if (brackets_around_host)
2002 APPEND (p, url->host);
2003 if (brackets_around_host)
2005 if (url->port != scheme_port)
2008 p = number_to_string (p, url->port);
2011 full_path_write (url, p);
2015 assert (p - result == size);
2017 if (quoted_user && quoted_user != url->user)
2018 xfree (quoted_user);
2019 if (quoted_passwd && !hide_password
2020 && quoted_passwd != url->passwd)
2021 xfree (quoted_passwd);
2026 /* Return the URL of the proxy appropriate for url U. */
2028 getproxy (struct url *u)
2031 char *rewritten_url;
2032 static char rewritten_storage[1024];
2036 if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
2042 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
2046 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
2050 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
2052 case SCHEME_INVALID:
2055 if (!proxy || !*proxy)
2058 /* Handle shorthands. `rewritten_storage' is a kludge to allow
2059 getproxy() to return static storage. */
2060 rewritten_url = rewrite_shorthand_url (proxy);
2063 strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
2064 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
2065 proxy = rewritten_storage;
2071 /* Should a host be accessed through proxy, concerning no_proxy? */
2073 no_proxy_match (const char *host, const char **no_proxy)
2078 return !sufmatch (no_proxy, host);
2081 /* Support for converting links for local viewing in downloaded HTML
2082 files. This should be moved to another file, because it has
2083 nothing to do with processing URLs. */
2085 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
2086 static const char *replace_attr PARAMS ((const char *, int, FILE *,
2088 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
2089 const char *, int));
2090 static char *local_quote_string PARAMS ((const char *));
2092 /* Change the links in one HTML file. LINKS is a list of links in the
2093 document, along with their positions and the desired direction of
2096 convert_links (const char *file, struct urlpos *links)
2098 struct file_memory *fm;
2101 downloaded_file_t downloaded_file_return;
2103 struct urlpos *link;
2104 int to_url_count = 0, to_file_count = 0;
2106 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
2109 /* First we do a "dry run": go through the list L and see whether
2110 any URL needs to be converted in the first place. If not, just
2111 leave the file alone. */
2113 struct urlpos *dry = links;
2114 for (dry = links; dry; dry = dry->next)
2115 if (dry->convert != CO_NOCONVERT)
2119 logputs (LOG_VERBOSE, _("nothing to do.\n"));
2124 fm = read_file (file);
2127 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2128 file, strerror (errno));
2132 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
2133 if (opt.backup_converted && downloaded_file_return)
2134 write_backup_file (file, downloaded_file_return);
2136 /* Before opening the file for writing, unlink the file. This is
2137 important if the data in FM is mmaped. In such case, nulling the
2138 file, which is what fopen() below does, would make us read all
2139 zeroes from the mmaped region. */
2140 if (unlink (file) < 0 && errno != ENOENT)
2142 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
2143 file, strerror (errno));
2144 read_file_free (fm);
2147 /* Now open the file for writing. */
2148 fp = fopen (file, "wb");
2151 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2152 file, strerror (errno));
2153 read_file_free (fm);
2157 /* Here we loop through all the URLs in file, replacing those of
2158 them that are downloaded with relative references. */
2160 for (link = links; link; link = link->next)
2162 char *url_start = fm->content + link->pos;
2164 if (link->pos >= fm->length)
2166 DEBUGP (("Something strange is going on. Please investigate."));
2169 /* If the URL is not to be converted, skip it. */
2170 if (link->convert == CO_NOCONVERT)
2172 DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
2176 /* Echo the file contents, up to the offending URL's opening
2177 quote, to the outfile. */
2178 fwrite (p, 1, url_start - p, fp);
2181 switch (link->convert)
2183 case CO_CONVERT_TO_RELATIVE:
2184 /* Convert absolute URL to relative. */
2186 char *newname = construct_relative (file, link->local_name);
2187 char *quoted_newname = local_quote_string (newname);
2189 if (!link->link_refresh_p)
2190 p = replace_attr (p, link->size, fp, quoted_newname);
2192 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
2193 link->refresh_timeout);
2195 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
2196 link->url->url, newname, link->pos, file));
2198 xfree (quoted_newname);
2202 case CO_CONVERT_TO_COMPLETE:
2203 /* Convert the link to absolute URL. */
2205 char *newlink = link->url->url;
2206 char *quoted_newlink = html_quote_string (newlink);
2208 if (!link->link_refresh_p)
2209 p = replace_attr (p, link->size, fp, quoted_newlink);
2211 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
2212 link->refresh_timeout);
2214 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
2215 newlink, link->pos, file));
2216 xfree (quoted_newlink);
2220 case CO_NULLIFY_BASE:
2221 /* Change the base href to "". */
2222 p = replace_attr (p, link->size, fp, "");
2230 /* Output the rest of the file. */
2231 if (p - fm->content < fm->length)
2232 fwrite (p, 1, fm->length - (p - fm->content), fp);
2234 read_file_free (fm);
2236 logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
2239 /* Construct and return a malloced copy of the relative link from two
2240 pieces of information: local name S1 of the referring file and
2241 local name S2 of the referred file.
2243 So, if S1 is "jagor.srce.hr/index.html" and S2 is
2244 "jagor.srce.hr/images/news.gif", the function will return
2247 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
2248 "fly.cc.fer.hr/images/fly.gif", the function will return
2249 "../images/fly.gif".
2251 Caveats: S1 should not begin with `/', unless S2 also begins with
2252 '/'. S1 should not contain things like ".." and such --
2253 construct_relative ("fly/ioccc/../index.html",
2254 "fly/images/fly.gif") will fail. (A workaround is to call
2255 something like path_simplify() on S1). */
2257 construct_relative (const char *s1, const char *s2)
2259 int i, cnt, sepdirs1;
2263 return xstrdup (s2);
2264 /* S1 should *not* be absolute, if S2 wasn't. */
2265 assert (*s1 != '/');
2267 /* Skip the directories common to both strings. */
2270 while (s1[i] && s2[i]
2275 if (s1[i] == '/' && s2[i] == '/')
2280 for (sepdirs1 = 0; s1[i]; i++)
2283 /* Now, construct the file as of:
2284 - ../ repeated sepdirs1 time
2285 - all the non-mutual directories of S2. */
2286 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
2287 for (i = 0; i < sepdirs1; i++)
2288 memcpy (res + 3 * i, "../", 3);
2289 strcpy (res + 3 * i, s2 + cnt);
2294 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
2296 /* Rather than just writing over the original .html file with the
2297 converted version, save the former to *.orig. Note we only do
2298 this for files we've _successfully_ downloaded, so we don't
2299 clobber .orig files sitting around from previous invocations. */
2301 /* Construct the backup filename as the original name plus ".orig". */
2302 size_t filename_len = strlen(file);
2303 char* filename_plus_orig_suffix;
2304 boolean already_wrote_backup_file = FALSE;
2305 slist* converted_file_ptr;
2306 static slist* converted_files = NULL;
2308 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
2310 /* Just write "orig" over "html". We need to do it this way
2311 because when we're checking to see if we've downloaded the
2312 file before (to see if we can skip downloading it), we don't
2313 know if it's a text/html file. Therefore we don't know yet
2314 at that stage that -E is going to cause us to tack on
2315 ".html", so we need to compare vs. the original URL plus
2316 ".orig", not the original URL plus ".html.orig". */
2317 filename_plus_orig_suffix = alloca (filename_len + 1);
2318 strcpy(filename_plus_orig_suffix, file);
2319 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
2321 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
2323 /* Append ".orig" to the name. */
2324 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
2325 strcpy(filename_plus_orig_suffix, file);
2326 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
2329 /* We can get called twice on the same URL thanks to the
2330 convert_all_links() call in main(). If we write the .orig file
2331 each time in such a case, it'll end up containing the first-pass
2332 conversion, not the original file. So, see if we've already been
2333 called on this file. */
2334 converted_file_ptr = converted_files;
2335 while (converted_file_ptr != NULL)
2336 if (strcmp(converted_file_ptr->string, file) == 0)
2338 already_wrote_backup_file = TRUE;
2342 converted_file_ptr = converted_file_ptr->next;
2344 if (!already_wrote_backup_file)
2346 /* Rename <file> to <file>.orig before former gets written over. */
2347 if (rename(file, filename_plus_orig_suffix) != 0)
2348 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
2349 file, filename_plus_orig_suffix, strerror (errno));
2351 /* Remember that we've already written a .orig backup for this file.
2352 Note that we never free this memory since we need it till the
2353 convert_all_links() call, which is one of the last things the
2354 program does before terminating. BTW, I'm not sure if it would be
2355 safe to just set 'converted_file_ptr->string' to 'file' below,
2356 rather than making a copy of the string... Another note is that I
2357 thought I could just add a field to the urlpos structure saying
2358 that we'd written a .orig file for this URL, but that didn't work,
2359 so I had to make this separate list.
2360 -- Dan Harkless <wget@harkless.org>
2362 This [adding a field to the urlpos structure] didn't work
2363 because convert_file() is called from convert_all_links at
2364 the end of the retrieval with a freshly built new urlpos
2366 -- Hrvoje Niksic <hniksic@arsdigita.com>
2368 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2369 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
2370 converted_file_ptr->next = converted_files;
2371 converted_files = converted_file_ptr;
2375 static int find_fragment PARAMS ((const char *, int, const char **,
2378 /* Replace an attribute's original text with NEW_TEXT. */
2381 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
2384 char quote_char = '\"'; /* use "..." for quoting, unless the
2385 original value is quoted, in which
2386 case reuse its quoting char. */
2387 const char *frag_beg, *frag_end;
2389 /* Structure of our string is:
2390 "...old-contents..."
2391 <--- size ---> (with quotes)
2394 <--- size --> (no quotes) */
2396 if (*p == '\"' || *p == '\'')
2401 size -= 2; /* disregard opening and closing quote */
2403 putc (quote_char, fp);
2404 fputs (new_text, fp);
2406 /* Look for fragment identifier, if any. */
2407 if (find_fragment (p, size, &frag_beg, &frag_end))
2408 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
2412 putc (quote_char, fp);
2417 /* The same as REPLACE_ATTR, but used when replacing
2418 <meta http-equiv=refresh content="new_text"> because we need to
2419 append "timeout_value; URL=" before the next_text. */
2422 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
2423 const char *new_text, int timeout)
2426 char *new_with_timeout = (char *)alloca (numdigit (timeout)
2430 sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
2432 return replace_attr (p, size, fp, new_with_timeout);
2435 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
2436 preceded by '&'. If the character is not found, return zero. If
2437 the character is found, return 1 and set BP and EP to point to the
2438 beginning and end of the region.
2440 This is used for finding the fragment indentifiers in URLs. */
2443 find_fragment (const char *beg, int size, const char **bp, const char **ep)
2445 const char *end = beg + size;
2447 for (; beg < end; beg++)
2469 /* Quote FILE for use as local reference to an HTML file.
2471 We quote ? as %3F to avoid passing part of the file name as the
2472 parameter when browsing the converted file through HTTP. However,
2473 it is safe to do this only when `--html-extension' is turned on.
2474 This is because converting "index.html?foo=bar" to
2475 "index.html%3Ffoo=bar" would break local browsing, as the latter
2476 isn't even recognized as an HTML file! However, converting
2477 "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
2478 safe for both local and HTTP-served browsing. */
2481 local_quote_string (const char *file)
2483 const char *file_sans_qmark;
2486 if (!opt.html_extension)
2487 return html_quote_string (file);
2489 qm = count_char (file, '?');
2493 const char *from = file;
2496 /* qm * 2 because we replace each question mark with "%3F",
2497 i.e. replace one char with three, hence two more. */
2498 int fsqlen = strlen (file) + qm * 2;
2500 to = newname = (char *)alloca (fsqlen + 1);
2501 for (; *from; from++)
2512 assert (to - newname == fsqlen);
2515 file_sans_qmark = newname;
2518 file_sans_qmark = file;
2520 return html_quote_string (file_sans_qmark);
2523 /* We're storing "modes" of type downloaded_file_t in the hash table.
2524 However, our hash tables only accept pointers for keys and values.
2525 So when we need a pointer, we use the address of a
2526 downloaded_file_t variable of static storage. */
2528 static downloaded_file_t *
2529 downloaded_mode_to_ptr (downloaded_file_t mode)
2531 static downloaded_file_t
2532 v1 = FILE_NOT_ALREADY_DOWNLOADED,
2533 v2 = FILE_DOWNLOADED_NORMALLY,
2534 v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2535 v4 = CHECK_FOR_FILE;
2539 case FILE_NOT_ALREADY_DOWNLOADED:
2541 case FILE_DOWNLOADED_NORMALLY:
2543 case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2545 case CHECK_FOR_FILE:
2551 /* This should really be merged with dl_file_url_map and
2552 downloaded_html_files in recur.c. This was originally a list, but
2553 I changed it to a hash table beause it was actually taking a lot of
2554 time to find things in it. */
2556 static struct hash_table *downloaded_files_hash;
2558 /* Remembers which files have been downloaded. In the standard case, should be
2559 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2560 download successfully (i.e. not for ones we have failures on or that we skip
2563 When we've downloaded a file and tacked on a ".html" extension due to -E,
2564 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2565 FILE_DOWNLOADED_NORMALLY.
2567 If you just want to check if a file has been previously added without adding
2568 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
2569 with local filenames, not remote URLs. */
2571 downloaded_file (downloaded_file_t mode, const char *file)
2573 downloaded_file_t *ptr;
2575 if (mode == CHECK_FOR_FILE)
2577 if (!downloaded_files_hash)
2578 return FILE_NOT_ALREADY_DOWNLOADED;
2579 ptr = hash_table_get (downloaded_files_hash, file);
2581 return FILE_NOT_ALREADY_DOWNLOADED;
2585 if (!downloaded_files_hash)
2586 downloaded_files_hash = make_string_hash_table (0);
2588 ptr = hash_table_get (downloaded_files_hash, file);
2592 ptr = downloaded_mode_to_ptr (mode);
2593 hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2595 return FILE_NOT_ALREADY_DOWNLOADED;
2599 df_free_mapper (void *key, void *value, void *ignored)
2606 downloaded_files_free (void)
2608 if (downloaded_files_hash)
2610 hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2611 hash_table_destroy (downloaded_files_hash);
2612 downloaded_files_hash = NULL;
2616 /* Return non-zero if scheme a is similar to scheme b.
2618 Schemes are similar if they are equal. If SSL is supported, schemes
2619 are also similar if one is http (SCHEME_HTTP) and the other is https
2622 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2627 if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2628 || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2635 /* Debugging and testing support for path_simplify. */
2637 /* Debug: run path_simplify on PATH and return the result in a new
2638 string. Useful for calling from the debugger. */
2642 char *copy = xstrdup (path);
2643 path_simplify (copy);
2648 run_test (char *test, char *expected_result, int expected_change)
2650 char *test_copy = xstrdup (test);
2651 int modified = path_simplify (test_copy);
2653 if (0 != strcmp (test_copy, expected_result))
2655 printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
2656 test, expected_result, test_copy);
2658 if (modified != expected_change)
2660 if (expected_change == 1)
2661 printf ("Expected no modification with path_simplify(\"%s\").\n",
2664 printf ("Expected modification with path_simplify(\"%s\").\n",
2671 test_path_simplify (void)
2674 char *test, *result;
2680 { "foo", "foo", 0 },
2681 { "foo/bar", "foo/bar", 0 },
2682 { "foo///bar", "foo/bar", 1 },
2683 { "foo/.", "foo/", 1 },
2684 { "foo/./", "foo/", 1 },
2685 { "foo./", "foo./", 0 },
2686 { "foo/../bar", "bar", 1 },
2687 { "foo/../bar/", "bar/", 1 },
2688 { "foo/bar/..", "foo/", 1 },
2689 { "foo/bar/../x", "foo/x", 1 },
2690 { "foo/bar/../x/", "foo/x/", 1 },
2691 { "foo/..", "", 1 },
2692 { "foo/../..", "", 1 },
2693 { "a/b/../../c", "c", 1 },
2694 { "./a/../b", "b", 1 }
2698 for (i = 0; i < ARRAY_SIZE (tests); i++)
2700 char *test = tests[i].test;
2701 char *expected_result = tests[i].result;
2702 int expected_change = tests[i].should_modify;
2703 run_test (test, expected_result, expected_change);
2706 /* Now run all the tests with a leading slash before the test case,
2707 to prove that the slash is being preserved. */
2708 for (i = 0; i < ARRAY_SIZE (tests); i++)
2710 char *test, *expected_result;
2711 int expected_change = tests[i].should_modify;
2713 test = xmalloc (1 + strlen (tests[i].test) + 1);
2714 sprintf (test, "/%s", tests[i].test);
2716 expected_result = xmalloc (1 + strlen (tests[i].result) + 1);
2717 sprintf (expected_result, "/%s", tests[i].result);
2719 run_test (test, expected_result, expected_change);
2722 xfree (expected_result);