2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 In addition, as a special exception, the Free Software Foundation
21 gives permission to link the code of its release of Wget with the
22 OpenSSL project's "OpenSSL" library (or with modified versions of it
23 that use the same license as the "OpenSSL" library), and distribute
24 the linked executables. You must obey the GNU General Public License
25 in all respects for all of the code used other than "OpenSSL". If you
26 modify this file, you may extend this exception to your version of the
27 file, but you are not obligated to do so. If you do not wish to do
28 so, delete this exception statement from your version. */
39 #include <sys/types.h>
/* Predicates on path elements.  Defect fixed: the extracted listing
   carried stray leading line-number tokens ("57 ", "59 ") that made
   these defines invalid C; the macro bodies themselves are unchanged.
   Note: X is evaluated more than once -- pass only side-effect-free
   expressions.  */

/* Is the string X exactly ".", i.e. the current directory?  */
#define DOTP(x) ((*(x) == '.') && (!*(x + 1)))

/* Is the string X exactly "..", i.e. the parent directory?  */
#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
/* NOTE(review): this listing is garbled -- each line carries a stray
   leading number and several lines (braces, the terminating sentinel
   entry) were elided by the extraction.  Documented as-found.  */
/* Table of schemes Wget can fetch.  Each visible entry pairs a
   scheme's leading string with its default port and an "enabled" flag
   (cleared by scheme_disable below).  Indexed by enum url_scheme, so
   entry order must match that enum.  url_scheme() iterates until a
   NULL leading_string -- the sentinel entry is presumably in the
   elided lines; confirm against the original source.  */
68 /* Supported schemes: */
69 static struct scheme_data supported_schemes[] =
71   { "http://",  DEFAULT_HTTP_PORT, 1 },
73   { "https://", DEFAULT_HTTPS_PORT, 1 },
75   { "ftp://",   DEFAULT_FTP_PORT, 1 },
/* Forward declarations for helpers defined later in this file.
   PARAMS is the project's K&R-compatibility prototype macro.  */
81 /* Forward declarations: */
83 static char *construct_relative PARAMS ((const char *, const char *));
84 static int path_simplify PARAMS ((char *));
/* NOTE(review): listing is elided (stray leading numbers; the RU
   macro, the urlchr_reserved/urlchr_unsafe bit definitions, and the
   table's braces are missing from this extract).  */
88 /* Support for encoding and decoding of URL strings.  We determine
89    whether a character is unsafe through static table lookup.  This
90    code assumes ASCII character set and 8-bit chars.  */
/* Shorthand names used in the table initializer below.  R marks an
   rfc1738 reserved char, U an unsafe char; RU (defined in elided
   lines) presumably means both.  */
97 #define R  urlchr_reserved
98 #define U  urlchr_unsafe
/* Test character C against bit MASK in the 256-entry lookup table.
   The (unsigned char) cast keeps a negative plain char from indexing
   before the table.  */
101 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
103 /* rfc1738 reserved chars, preserved from encoding.  */
105 #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
107 /* rfc1738 unsafe chars, plus some more.  */
109 #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
/* NOTE(review): `const static' is the obsolescent ordering; modern
   style is `static const'.  Behavior is identical.  */
111 const static unsigned char urlchr_table[256] =
113   U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
114   U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
115   U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
116   U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
117   U,  0,  U, RU,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
118   0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
119   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
120   0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
121  RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
122   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
123   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
124   0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
125   U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
126   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
127   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
128   0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */
/* Everything with the high bit set (128-255) is treated as unsafe and
   will be %XX-escaped.  */
130   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
131   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
132   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
133   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
135   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
136   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
137   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
138   U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
/* NOTE(review): listing is elided -- stray leading numbers and missing
   lines (return types, braces, loop headers) are extraction artifacts.
   Documented as-found.  */
141 /* Decodes the forms %xy in a URL to the character the hexadecimal
142    code of which is xy.  xy are hexadecimal digits from
143    [0123456789ABCDEF] (case-insensitive).  If x or y are not
144    hex-digits or `%' precedes `\0', the sequence is inserted
/* In-place %XY decoder.  Writes the decoded string over S; the result
   is never longer than the input.  */
148 decode_string (char *s)
/* Two-pointer walk: T trails, writing decoded bytes; H scans ahead.  */
150   char *t = s;			/* t - tortoise */
151   char *h = s;			/* h - hare     */
162 	  /* Do nothing if '%' is not followed by two hex digits. */
163 	  if (!*(h + 1) || !*(h + 2)
164 	      || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
/* Valid %XY: fold the two hex digits into a single byte at *T.  */
166 	  *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
173 /* Like encode_string, but return S if there are no unsafe chars.  */
/* Returns S itself (cast away const) when nothing needs escaping;
   otherwise a freshly xmalloc'd copy with unsafe bytes as %XX.
   Callers must compare the result pointer to S to know whether to
   free it.  */
176 encode_string_maybe (const char *s)
/* First pass: count unsafe characters to size the new buffer.  */
183   for (p1 = s; *p1; p1++)
184     if (UNSAFE_CHAR (*p1))
185       addition += 2;		/* Two more characters (hex digits) */
190   newlen = (p1 - s) + addition;
191   newstr = (char *)xmalloc (newlen + 1);
/* Second pass (loop header elided): copy, expanding unsafe bytes.  */
197       if (UNSAFE_CHAR (*p1))
199 	  unsigned char c = *p1++;
201 	  *p2++ = XDIGIT_TO_XCHAR (c >> 4);
202 	  *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
/* Both passes must agree on the length.  */
208   assert (p2 - newstr == newlen);
213 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
214    given string, returning a malloc-ed %XX encoded string.  */
/* Unlike encode_string_maybe, always returns heap storage the caller
   owns (elided lines presumably xstrdup when no change was needed).  */
217 encode_string (const char *s)
219   char *encoded = encode_string_maybe (s);
226 /* Encode unsafe characters in PTR to %xx.  If such encoding is done,
227    the old value of PTR is freed and PTR is made to point to the newly
228    allocated storage.  */
230 #define ENCODE(ptr) do {			\
231   char *e_new = encode_string_maybe (ptr);	\
/* Per-character verdicts for reencode_string: decode a %XX triple,
   %-encode the char, or copy it through unchanged.  */
239 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
241 /* Decide whether to encode, decode, or pass through the char at P.
242    This used to be a macro, but it got a little too convoluted.  */
/* NOTE(review): listing is elided; the leading `if (*p == '%')' guard
   and the CM_DECODE/CM_ENCODE returns are among the missing lines.  */
243 static inline enum copy_method
244 decide_copy_method (const char *p)
248 	  if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
250 	      /* %xx sequence: decode it, unless it would decode to an
251 		 unsafe or a reserved char; in that case, leave it as
/* PREEMPT is the byte the %XX triple would decode to.  */
253 	      char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
254 		XCHAR_TO_XDIGIT (*(p + 2));
256 	      if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
257 		return CM_PASSTHROUGH;
262 	  /* Garbled %.. sequence: encode `%'.  */
/* Plain character: encode it only if unsafe and not reserved.  */
265   else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
268   return CM_PASSTHROUGH;
271 /* Translate a %-quoting (but possibly non-conformant) input string S
272 into a %-quoting (and conformant) output string. If no characters
273 are encoded or decoded, return the same string S; otherwise, return
274 a freshly allocated string with the new contents.
276 After a URL has been run through this function, the protocols that
277 use `%' as the quote character can use the resulting string as-is,
278 while those that don't call decode_string() to get to the intended
279 data. This function is also stable: after an input string is
280 transformed the first time, all further transformations of the
281 result yield the same result string.
283 Let's discuss why this function is needed.
285 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
286 space character would mess up the HTTP request, it needs to be
289 GET /abc%20def HTTP/1.0
291 So it appears that the unsafe chars need to be quoted, as with
292 encode_string. But what if we're requested to download
293 `abc%20def'? Remember that %-encoding is valid URL syntax, so what
294 the user meant was a literal space, and he was kind enough to quote
295 it. In that case, Wget should obviously leave the `%20' as is, and
296 send the same request as above. So in this case we may not call
299 But what if the requested URI is `abc%20 def'? If we call
300 encode_string, we end up with `/abc%2520%20def', which is almost
301 certainly not intended. If we don't call encode_string, we are
302 left with the embedded space and cannot send the request. What the
303 user meant was for Wget to request `/abc%20%20def', and this is
304 where reencode_string kicks in.
306 Wget used to solve this by first decoding %-quotes, and then
307 encoding all the "unsafe" characters found in the resulting string.
308 This was wrong because it didn't preserve certain URL special
309 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
310 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
311 whether we considered `+' reserved (it is). One of these results
312 is inevitable because by the second step we would lose information
313 on whether the `+' was originally encoded or not. Both results
314 were wrong because in CGI parameters + means space, while %2B means
315 literal plus. reencode_string correctly translates the above to
316 "a%2B+b", i.e. returns the original string.
318 This function uses an algorithm proposed by Anon Sricharoenchai:
320 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
323 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
326 ...except that this code conflates the two steps, and decides
327 whether to encode, decode, or pass through each character in turn.
328 The function still uses two passes, but their logic is the same --
329 the first pass exists merely for the sake of allocation. Another
330 small difference is that we include `+' to URL_RESERVED.
334 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
336 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
340 "foo bar" -> "foo%20bar"
341 "foo%20bar" -> "foo%20bar"
342 "foo %20bar" -> "foo%20%20bar"
343 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
344 "foo%25%20bar" -> "foo%25%20bar"
345 "foo%2%20bar" -> "foo%252%20bar"
346 "foo+bar" -> "foo+bar" (plus is reserved!)
347 "foo%2b+bar" -> "foo%2b+bar" */
/* Normalize the %-quoting of S (see the long comment above).  Returns
   S itself when no change is needed, else a fresh xmalloc'd string.
   NOTE(review): listing is elided -- switch-case labels, loop headers
   and the final return are among the missing lines.  */
350 reencode_string (const char *s)
356   int encode_count = 0;
357   int decode_count = 0;
359   /* First, pass through the string to see if there's anything to do,
360      and to calculate the new length.  */
361   for (p1 = s; *p1; p1++)
363     switch (decide_copy_method (p1))
376   if (!encode_count && !decode_count)
377     /* The string is good as it is. */
378     return (char *)s;		/* C const model sucks. */
381   /* Each encoding adds two characters (hex digits), while each
382      decoding removes two characters.  */
383   newlen = oldlen + 2 * (encode_count - decode_count);
384   newstr = xmalloc (newlen + 1);
/* Second pass: apply the same per-character verdicts while copying
   into NEWSTR via P2.  */
391       switch (decide_copy_method (p1))
/* CM_ENCODE: emit '%' plus two hex digits for the byte at P1.  */
395 	    unsigned char c = *p1++;
397 	    *p2++ = XDIGIT_TO_XCHAR (c >> 4);
398 	    *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
/* CM_DECODE: collapse the %XY triple at P1 into one byte.  */
402 	    *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
403 		     + (XCHAR_TO_XDIGIT (*(p1 + 2))));
404 	    p1 += 3;		/* skip %xx */
411   assert (p2 - newstr == newlen);
415 /* Run PTR_VAR through reencode_string.  If a new string is consed,
416    free PTR_VAR and make it point to the new storage.  Obviously,
417    PTR_VAR needs to be an lvalue.  */
419 #define REENCODE(ptr_var) do {			\
420   char *rf_new = reencode_string (ptr_var);	\
421   if (rf_new != ptr_var)			\
/* NOTE(review): listing is elided (stray leading numbers, missing
   braces/return types).  Documented as-found.  */
428 /* Returns the scheme type if the scheme is supported, or
429    SCHEME_INVALID if not.  */
/* Case-insensitive prefix match of URL against each table entry;
   relies on a NULL-leading_string sentinel terminating the table
   (presumably in lines elided from the table above).  A matched but
   disabled scheme is reported as invalid, same as no match.  */
431 url_scheme (const char *url)
435   for (i = 0; supported_schemes[i].leading_string; i++)
436     if (0 == strncasecmp (url, supported_schemes[i].leading_string,
437 			  strlen (supported_schemes[i].leading_string)))
439 	if (supported_schemes[i].enabled)
440 	  return (enum url_scheme) i;
442 	  return SCHEME_INVALID;
445   return SCHEME_INVALID;
448 /* Return the number of characters needed to skip the scheme part of
449    the URL, e.g. `http://'.  If no scheme is found, returns 0.  */
451 url_skip_scheme (const char *url)
455   /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
457   while (ISALNUM (*p) || *p == '-' || *p == '+')
/* The elided lines presumably require a ':' after the scheme name and
   return 0 otherwise -- confirm against the original.  */
464   /* Skip "//" if found. */
465   if (*p == '/' && *(p + 1) == '/')
471 /* Returns 1 if the URL begins with a scheme (supported or
472    unsupported), 0 otherwise.  */
474 url_has_scheme (const char *url)
477   while (ISALNUM (*p) || *p == '-' || *p == '+')
/* Look up a scheme's default port.  SCHEME must be a valid enum
   value; there is no bounds check here.  */
483 scheme_default_port (enum url_scheme scheme)
485   return supported_schemes[scheme].default_port;
/* Disable SCHEME so url_scheme() rejects it (e.g. https without SSL
   support).  Mutates the shared table: not thread-safe.  */
489 scheme_disable (enum url_scheme scheme)
491   supported_schemes[scheme].enabled = 0;
494 /* Skip the username and password, if present here.  The function
495    should be called *not* with the complete URL, but with the part
496    right after the scheme.
498    If no username and password are found, return 0.  */
500 url_skip_uname (const char *url)
504   /* Look for '@' that comes before '/' or '?'. */
/* strpbrk finds the first of '/', '?' or '@'; only an '@' hit means a
   userinfo part precedes the host (handling elided).  */
505   p = (const char *)strpbrk (url, "/?@");
/* Split the LEN-byte userinfo STR ("user" or "user:pass") into
   freshly allocated, %-decoded *USER and *PASSWD.  Returns zero on
   malformed input (empty user name), nonzero on success -- the
   return statements themselves are in elided lines.
   NOTE(review): on the colon branch, LEN is presumably trimmed to the
   user-name portion before the *USER copy below; confirm against the
   original.  */
513 parse_uname (const char *str, int len, char **user, char **passwd)
518   /* Empty user name not allowed.  */
/* Password is everything after the first ':' within the LEN bytes.  */
521   colon = memchr (str, ':', len);
523       /* Empty user name again.  */
528       int pwlen = len - (colon + 1 - str);
529       *passwd = xmalloc (pwlen + 1);
530       memcpy (*passwd, colon + 1, pwlen);
531       (*passwd)[pwlen] = '\0';
537   *user = xmalloc (len + 1);
538   memcpy (*user, str, len);
/* Both components may contain %XX escapes; decode in place.  */
542   decode_string (*user);
544     decode_string (*passwd);
549 /* Used by main.c: detect URLs written using the "shorthand" URL forms
550    popularized by Netscape and NcFTP.  HTTP shorthands look like this:
552    www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
553    www.foo.com[:port]            -> http://www.foo.com[:port]
555    FTP shorthands look like this:
557    foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
558    foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
560    If the URL needs not or cannot be rewritten, return NULL.  */
/* Returns NULL or a freshly xmalloc'd rewritten URL the caller owns.
   NOTE(review): listing is elided; the branch structure between the
   HTTP and FTP cases is partly missing.  */
562 rewrite_shorthand_url (const char *url)
/* Already has an explicit scheme: nothing to rewrite.  */
566   if (url_has_scheme (url))
569   /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
571   for (p = url; *p && *p != ':' && *p != '/'; p++)
581       /* If the characters after the colon and before the next slash
582 	 or end of string are all digits, it's HTTP.  */
584       for (pp = p + 1; ISDIGIT (*pp); pp++)
586       if (digits > 0 && (*pp == '/' || *pp == '\0'))
589       /* Prepend "ftp://" to the entire URL... */
/* 6 == strlen ("ftp://").  */
590       res = xmalloc (6 + strlen (url) + 1);
591       sprintf (res, "ftp://%s", url);
592       /* ...and replace ':' with '/'. */
593       res[6 + (p - url)] = '/';
600       /* Just prepend "http://" to what we have. */
/* 7 == strlen ("http://").  */
601       res = xmalloc (7 + strlen (url) + 1);
602       sprintf (res, "http://%s", url);
607 static void parse_path PARAMS ((const char *, char **, char **));
/* Like strpbrk, but returns a pointer to S's terminating NUL instead
   of NULL when no character from ACCEPT occurs -- callers can then
   treat the result uniformly as an end-of-component pointer.  */
610 strpbrk_or_eos (const char *s, const char *accept)
612   char *p = strpbrk (s, accept);
614     p = (char *)s + strlen (s);
618 /* Turn STR into lowercase; return non-zero if a character was
/* In-place TOLOWER over STR; the changed-flag bookkeeping and return
   are in elided lines.  */
622 lowercase_str (char *str)
629       *str = TOLOWER (*str);
/* Messages indexed by the PE_* codes interleaved below; url_error()
   maps a code to its string.  NOTE(review): several message strings
   (e.g. for PE_NO_ERROR, PE_EMPTY_HOST) are in elided lines.  The
   order of strings must track the numeric PE_* values exactly.  */
634 static char *parse_errors[] = {
635 #define PE_NO_ERROR 0
637 #define PE_UNSUPPORTED_SCHEME 1
638   "Unsupported scheme",
639 #define PE_EMPTY_HOST 2
641 #define PE_BAD_PORT_NUMBER 3
643 #define PE_INVALID_USER_NAME 4
645 #define PE_UNTERMINATED_IPV6_ADDRESS 5
646   "Unterminated IPv6 numeric address",
647 #define PE_INVALID_IPV6_ADDRESS 6
648   "Invalid char in IPv6 numeric address"
/* Store error code V through pointer P, but only if P is non-NULL
   (callers may pass NULL when they don't care).  Body elided.  */
651 #define SETERR(p, v) do {			\
658    Return a new struct url if successful, NULL on error.  In case of
659    error, and if ERROR is not NULL, also set *ERROR to the appropriate
/* Parse URL into a heap-allocated struct url (caller frees with
   url_free).  NOTE(review): this listing is heavily elided -- stray
   leading numbers and many missing lines (braces, `return NULL'
   statements after each SETERR, the IPv6 scanning loop, marker
   assignments) are extraction artifacts.  Documented as-found.  */
662 url_parse (const char *url, int *error)
666   int path_modified, host_modified;
668   enum url_scheme scheme;
/* _b/_e pairs delimit each component as [begin, end) ranges inside
   the (re)encoded URL string; NULL means "component absent".  */
670   const char *uname_b,    *uname_e;
671   const char *host_b,     *host_e;
672   const char *path_b,     *path_e;
673   const char *params_b,   *params_e;
674   const char *query_b,    *query_e;
675   const char *fragment_b, *fragment_e;
678   char *user = NULL, *passwd = NULL;
682   scheme = url_scheme (url);
683   if (scheme == SCHEME_INVALID)
685       SETERR (error, PE_UNSUPPORTED_SCHEME);
/* Normalize %-quoting once up front; URL_ENCODED may be URL itself
   when nothing changed (ownership handled at the bottom).  */
689   url_encoded = reencode_string (url);
692   p += strlen (supported_schemes[scheme].leading_string);
694   p += url_skip_uname (p);
697   /* scheme://user:pass@host[:port]... */
700   /* We attempt to break down the URL into the components path,
701      params, query, and fragment.  They are ordered like this:
703        scheme://host[:port][/path][;params][?query][#fragment]  */
705   params_b   = params_e   = NULL;
706   query_b    = query_e    = NULL;
707   fragment_b = fragment_e = NULL;
713       /* Support http://[::1]/ used by IPv6. */
724 	      SETERR (error, PE_UNTERMINATED_IPV6_ADDRESS);
737 		  SETERR (error, PE_INVALID_IPV6_ADDRESS);
740       /* Don't include brackets in [host_b, host_p). */
/* Host ends at the first of ':' (port), '/', ';', '?' or '#'.  */
746       p = strpbrk_or_eos (p, ":/;?#");
750   if (host_b == host_e)
752       SETERR (error, PE_EMPTY_HOST);
756   port = scheme_default_port (scheme);
759       const char *port_b, *port_e, *pp;
761       /* scheme://host:port/tralala */
765       p = strpbrk_or_eos (p, "/;?#");
768       if (port_b == port_e)
770 	  /* http://host:/whatever */
772 	  SETERR (error, PE_BAD_PORT_NUMBER);
/* NOTE(review): the accumulation below has no overflow or 65535
   bound check -- a very long digit run silently wraps `port'.  */
776       for (port = 0, pp = port_b; pp < port_e; pp++)
780 	      /* http://host:12randomgarbage/blah */
782 	      SETERR (error, PE_BAD_PORT_NUMBER);
785 	  port = 10 * port + (*pp - '0');
793       p = strpbrk_or_eos (p, ";?#");
798   /* Path is not allowed not to exist. */
806       p = strpbrk_or_eos (p, "?#");
813       p = strpbrk_or_eos (p, "#");
816   /* Hack that allows users to use '?' (a wildcard character) in
817      FTP URLs without it being interpreted as a query string
819   if (scheme == SCHEME_FTP)
821       query_b = query_e = NULL;
834   if (uname_b != uname_e)
836       /* http://user:pass@host */
838       /*        uname_b     uname_e */
/* -1 excludes the trailing '@' from the userinfo span.  */
839       if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
841 	  SETERR (error, PE_INVALID_USER_NAME);
/* All components validated -- build the result object.  */
846   u = (struct url *)xmalloc (sizeof (struct url));
847   memset (u, 0, sizeof (*u));
850   u->host = strdupdelim (host_b, host_e);
855   u->path = strdupdelim (path_b, path_e);
856   path_modified = path_simplify (u->path);
857   parse_path (u->path, &u->dir, &u->file);
859   host_modified = lowercase_str (u->host);
862     u->params = strdupdelim (params_b, params_e);
864     u->query = strdupdelim (query_b, query_e);
866     u->fragment = strdupdelim (fragment_b, fragment_e);
868   if (path_modified || u->fragment || host_modified || path_b == path_e)
870       /* If we suspect that a transformation has rendered what
871 	 url_string might return different from URL_ENCODED, rebuild
872 	 u->url using url_string.  */
873       u->url = url_string (u, 0);
875       if (url_encoded != url)
876 	xfree ((char *) url_encoded);
/* Otherwise reuse URL_ENCODED: duplicate it if it aliases the
   caller's URL, take ownership if reencode_string allocated it.  */
880       if (url_encoded == url)
881 	u->url = xstrdup (url);
883 	u->url = url_encoded;
/* Map a PE_* code from url_parse to its human-readable message.
   The returned string is static; do not free it.  */
891 url_error (int error_code)
893   assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
894   return parse_errors[error_code];
/* Split QUOTED_PATH into *DIR and *FILE around its last '/', after
   %-decoding a temporary copy.  Both outputs are freshly allocated;
   with no slash, *DIR is presumably set to "" in elided lines.  */
898 parse_path (const char *quoted_path, char **dir, char **file)
900   char *path, *last_slash;
/* Work on an alloca'd copy so the caller's string is untouched.  */
902   STRDUP_ALLOCA (path, quoted_path);
903   decode_string (path);
905   last_slash = strrchr (path, '/');
909       *file = xstrdup (path);
913       *dir = strdupdelim (path, last_slash);
914       *file = xstrdup (last_slash + 1);
918 /* Note: URL's "full path" is the path with the query string and
919    params appended.  The "fragment" (#foo) is intentionally ignored,
920    but that might be changed.  For example, if the original URL was
921    "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
922    the full path will be "/foo/bar/baz;bullshit?querystring".  */
924 /* Return the length of the full path, without the terminating
/* Sums lengths of path, params and query, each with one extra byte
   for its separator ('/', ';', '?').  Must stay in lock-step with
   full_path_write below.  */
928 full_path_length (const struct url *url)
932 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
943 /* Write out the full path. */
/* Writes exactly full_path_length(url) bytes at WHERE; no NUL is
   appended here (url_full_path adds it).  */
946 full_path_write (const struct url *url, char *where)
948 #define FROB(el, chr) do {			\
949   char *f_el = url->el;				\
951       int l = strlen (f_el);			\
953       memcpy (where, f_el, l);			\
965 /* Public function for getting the "full path".  E.g. if u->path is
966    "foo/bar" and u->query is "param=value", full_path will be
967    "/foo/bar?param=value".  */
/* Caller owns and frees the returned string.  */
970 url_full_path (const struct url *url)
972   int length = full_path_length (url);
973   char *full_path = (char *)xmalloc(length + 1);
975   full_path_write (url, full_path);
976   full_path[length] = '\0';
981 /* Sync u->path and u->url with u->dir and u->file. */
/* Rebuilds url->path as "dir/file" (or just "file" when dir is
   empty); the old path/url strings are presumably freed in elided
   lines before reassignment.  */
984 sync_path (struct url *url)
992     newpath = xstrdup (url->file);
997       int dirlen = strlen (url->dir);
998       int filelen = strlen (url->file);
/* dir + '/' + file + NUL.  */
1000       newpath = xmalloc (dirlen + 1 + filelen + 1);
1001       memcpy (newpath, url->dir, dirlen);
1002       newpath[dirlen] = '/';
1003       memcpy (newpath + dirlen + 1, url->file, filelen);
1004       newpath[dirlen + 1 + filelen] = '\0';
1008   url->path = newpath;
1010   /* Synchronize u->url.  */
1012   url->url = url_string (url, 0);
1015 /* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1016    This way we can sync u->path and u->url when they get changed.  */
/* Replace url->dir with a copy of NEWDIR (old value presumably freed
   in an elided line), then resync path/url via sync_path.  */
1019 url_set_dir (struct url *url, const char *newdir)
1022   url->dir = xstrdup (newdir);
/* Same for url->file.  */
1027 url_set_file (struct url *url, const char *newfile)
1030   url->file = xstrdup (newfile);
/* Release every string owned by URL and then URL itself (the
   unconditional frees -- host, path, url, dir, file -- are in elided
   lines).  FREE_MAYBE skips NULL members.  */
1035 url_free (struct url *url)
1041   FREE_MAYBE (url->params);
1042   FREE_MAYBE (url->query);
1043   FREE_MAYBE (url->fragment);
1044   FREE_MAYBE (url->user);
1045   FREE_MAYBE (url->passwd);
/* Read FILE (one URL per line, blank lines and surrounding whitespace
   ignored) and return a linked list of struct urlpos entries; returns
   NULL on read failure.  NOTE(review): listing is elided -- list
   linking, the NULL check on fm, and loop increments are missing.  */
1054 get_urls_file (const char *file)
1056   struct file_memory *fm;
1057   struct urlpos *head, *tail;
1058   const char *text, *text_end;
1060   /* Load the file.  */
1061   fm = read_file (file);
1064       logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
1067   DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
1071   text_end = fm->content + fm->length;
/* Walk the buffer line by line; the buffer is not NUL-terminated, so
   everything works on [text, text_end) ranges.  */
1072   while (text < text_end)
1074       const char *line_beg = text;
1075       const char *line_end = memchr (text, '\n', text_end - text);
/* Last line may lack a trailing newline.  */
1077 	line_end = text_end;
1082       /* Strip whitespace from the beginning and end of line. */
1083       while (line_beg < line_end && ISSPACE (*line_beg))
1085       while (line_end > line_beg && ISSPACE (*(line_end - 1)))
1088       if (line_end > line_beg)
1090 	  /* URL is in the [line_beg, line_end) region. */
1094 	  struct urlpos *entry;
1097 	  /* We must copy the URL to a zero-terminated string, and we
1098 	     can't use alloca because we're in a loop.  *sigh*.  */
1099 	  url_text = strdupdelim (line_beg, line_end);
1103 	      /* Merge opt.base_href with URL. */
1104 	      char *merged = uri_merge (opt.base_href, url_text);
1109 	  url = url_parse (url_text, &up_error_code);
/* Bad URLs are reported and skipped, not fatal.  */
1112 	      logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1113 			 file, url_text, url_error (up_error_code));
1119 	  entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1120 	  memset (entry, 0, sizeof (*entry));
1131   read_file_free (fm);
1135 /* Free the linked list of urlpos. */
/* Walk the list, releasing each node's owned strings and the node;
   saves `next' before freeing the node.  */
1137 free_urlpos (struct urlpos *l)
1141       struct urlpos *next = l->next;
1144       FREE_MAYBE (l->local_name);
1150 /* Rotate FNAME opt.backups times */
/* Shift FNAME.1 -> FNAME.2 -> ... and finally FNAME -> FNAME.1.
   Errors from rename are apparently ignored (handling elided).  */
1152 rotate_backups(const char *fname)
/* fname + '.' + widest backup number + NUL.  */
1154   int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1155   char *from = (char *)alloca (maxlen);
1156   char *to = (char *)alloca (maxlen);
/* Only rotate regular files; anything else is left alone.  */
1160   if (stat (fname, &sb) == 0)
1161     if (S_ISREG (sb.st_mode) == 0)
/* Rotate highest-numbered backups first so nothing is clobbered.  */
1164   for (i = opt.backups; i > 1; i--)
1166       sprintf (from, "%s.%d", fname, i - 1);
1167       sprintf (to, "%s.%d", fname, i);
1168       /* #### This will fail on machines without the rename() system
1173   sprintf (to, "%s.%d", fname, 1);
1177 /* Create all the necessary directories for PATH (a file).  Calls
1178    mkdirhier() internally.  */
/* Returns the result of make_directory (elided lines presumably
   return 0 early when nothing needs creating).  */
1180 mkalldirs (const char *path)
/* Scan backwards from the end to the last '/'; everything before it
   is the directory part.  */
1187   p = path + strlen (path);
1188   for (; *p != '/' && p != path; p--);
1189   /* Don't create if it's just a file. */
1190   if ((p == path) && (*p != '/'))
1192   t = strdupdelim (path, p);
1193   /* Check whether the directory exists.  */
1194   if ((stat (t, &st) == 0))
1196       if (S_ISDIR (st.st_mode))
1203 	  /* If the dir exists as a file name, remove it first.  This
1204 	     is *only* for Wget to work with buggy old CERN http
1205 	     servers.  Here is the scenario: When Wget tries to
1206 	     retrieve a directory without a slash, e.g.
1207 	     http://foo/bar (bar being a directory), CERN server will
1208 	     not redirect it to http://foo/bar/ -- it will generate a
1209 	     directory listing containing links to bar/file1,
1210 	     bar/file2, etc.  Wget will lose because it saves this
1211 	     HTML listing to a file `bar', so it cannot create the
1212 	     directory.  To work around this, if the file of the same
1213 	     name exists, we just remove it and create the directory
1215 	  DEBUGP (("Removing %s because of directory danger!\n", t));
1219   res = make_directory (t);
1221     logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
/* Count '/' characters in S (body elided from this listing).  */
1227 count_slashes (const char *s)
1236 /* Return the path name of the URL-equivalent file name, with a
1237    remote-like structure of directories.  */
/* Builds "[dir_prefix/][host[.port]/]dir/file" for saving U locally,
   honoring opt.cut_dirs and opt.add_hostdir.  Returns xmalloc'd
   storage.  NOTE(review): listing is elided; documented as-found.  */
1239 mkstruct (const struct url *u)
1242   char *res, *dirpref;
/* --cut-dirs: drop the first `cut' path elements of u->dir.  The
   +(*u->dir == '/') skips a leading slash.  */
1247       char *ptr = u->dir + (*u->dir == '/');
1248       int slash_count = 1 + count_slashes (ptr);
1249       int cut = MINVAL (opt.cut_dirs, slash_count);
/* Loop body (decrementing `cut' at each '/') is elided.  */
1250       for (; cut && *ptr; ptr++)
1253       STRDUP_ALLOCA (dir, ptr);
1256     dir = u->dir + (*u->dir == '/');
1258   /* Check for the true name (or at least a consistent name for saving
1259      to directory) of HOST, reusing the hlist if possible.  */
1260   if (opt.add_hostdir)
1262       /* Add dir_prefix and hostname (if required) to the beginning of
/* alloca'd buffer sized for prefix + '/' + host + '.' + port digits;
   part of the size expression is elided.  */
1264       dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1266 				+ 1 + numdigit (u->port)
/* A dir_prefix of "." is treated as "no prefix".  */
1268       if (!DOTP (opt.dir_prefix))
1269 	sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
1271 	strcpy (dirpref, u->host);
/* Append ".PORT" only for non-default ports.  */
1273       if (u->port != scheme_default_port (u->scheme))
1275 	  int len = strlen (dirpref);
1277 	  number_to_string (dirpref + len + 1, u->port);
1280   else				/* not add_hostdir */
1282       if (!DOTP (opt.dir_prefix))
1283 	dirpref = opt.dir_prefix;
1288   /* If there is a prefix, prepend it. */
1291       char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1292       sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
/* Strip a trailing slash from DIR (assignment elided).  */
1297   if (l && dir[l - 1] == '/')
/* Directory URLs get a default file name.  */
1301     file = "index.html";
1305   /* Finally, construct the full name. */
1306   res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
1308   sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1313 /* Compose a file name out of BASE, an unescaped file name, and QUERY,
1314    an escaped query string.  The trick is to make sure that unsafe
1315    characters in BASE are escaped, and that slashes in QUERY are also
/* Builds the name in a fixed-size local RESULT buffer (size elided
   from this listing) and returns an xstrdup'd copy; over-long input
   is truncated rather than overflowing.
   NOTE(review): `to - result < sizeof (result)' compares a signed
   ptrdiff_t against an unsigned size_t -- harmless here since TO
   never precedes RESULT, but worth a cast for -Wsign-compare.  */
1319 compose_file_name (char *base, char *query)
1325   /* Copy BASE to RESULT and encode all unsafe characters.  */
1327   while (*from && to - result < sizeof (result))
1329       if (UNSAFE_CHAR (*from))
1331 	  unsigned char c = *from++;
/* Emit '%' plus two hex digits ('%' emission itself is elided).  */
1333 	  *to++ = XDIGIT_TO_XCHAR (c >> 4);
1334 	  *to++ = XDIGIT_TO_XCHAR (c & 0xf);
/* Append '?' + QUERY, escaping '/' so the query can't create
   directories (separator emission elided).  */
1340   if (query && to - result < sizeof (result))
1344       /* Copy QUERY to RESULT and encode all '/' characters. */
1346       while (*from && to - result < sizeof (result))
/* Either NUL-terminate normally, or truncate at the buffer edge.  */
1360   if (to - result < sizeof (result))
1363       /* Truncate input which is too long, presumably due to a huge
1365       result[sizeof (result) - 1] = '\0';
1367   return xstrdup (result);
1370 /* Create a unique filename, corresponding to a given URL.  Calls
1371    mkstruct if necessary.  Does *not* actually create any directories.  */
/* Returns an xmalloc'd local file name for U, honoring --directory-
   prefix, -nc/-c/-N and the unique-name machinery.  NOTE(review):
   listing is elided (the opt.dirstruct branch condition and several
   returns are missing); documented as-found.  */
1373 url_filename (const struct url *u)
/* An empty query string counts as "no query".  */
1377   char *query = u->query && *u->query ? u->query : NULL;
/* Directory-structure mode: derive the whole path from mkstruct.  */
1381       char *base = mkstruct (u);
1382       file = compose_file_name (base, query);
/* Flat mode: just the file component, defaulting to index.html.  */
1387       char *base = *u->file ? u->file : "index.html";
1388       file = compose_file_name (base, query);
1390       /* Check whether the prefix directory is something other than "."
1391 	 before prepending it.  */
1392       if (!DOTP (opt.dir_prefix))
1394 	  /* #### should just realloc FILE and prepend dir_prefix. */
1395 	  char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1396 					 + 1 + strlen (file) + 1)
1397 	  sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1403   /* DOS-ish file systems don't like `%' signs in them; we change it
/* Windows-only loop rewriting '%' (replacement char elided).  */
1408   for (p = file; *p; p++)
1412 #endif /* WINDOWS */
1414   /* Check the cases in which the unique extensions are not used:
1415      1) Clobbering is turned off (-nc).
1416      2) Retrieval with regetting.
1417      3) Timestamping is used.
1418      4) Hierarchy is built.
1420      The exception is the case when file does exist and is a
1421      directory (actually support for bad httpd-s).  */
1422   if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1423       && !(file_exists_p (file) && !file_non_directory_p (file)))
1426   /* Find a unique name. */
1427   name = unique_name (file);
1432 /* Return the length of URL's path.  Path is considered to be
1433    terminated by one of '?', ';', '#', or by the end of the
/* Thin wrapper over strpbrk_or_eos; the subtraction/return is in an
   elided line.  */
1436 path_length (const char *url)
1438   const char *q = strpbrk_or_eos (url, "?;#");
1442 /* Find the last occurrence of character C in the range [b, e), or
1443    NULL, if none are present.  This is equivalent to strrchr(b, c),
1444    except that it accepts an END argument instead of requiring the
1445    string to be zero-terminated.  Why is there no memrchr()?  */
/* Body elided from this listing.  */
1447 find_last_char (const char *b, const char *e, char c)
1455 /* Resolve "." and ".." elements of PATH by destructively modifying
1456    PATH.  "." is resolved by removing that path element, and ".." is
1457    resolved by removing the preceding path element.  Leading and
1458    trailing slashes are preserved.
1460    Return non-zero if any changes have been made.
1462    For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
1463    test examples are provided below.  If you change anything in this
1464    function, run test_path_simplify to make sure you haven't broken a
1467    A previous version of this function was based on path_simplify()
1468    from GNU Bash, but it has been rewritten for Wget 1.8.1.  */
/* NOTE(review): listing is heavily elided -- the outer loop, the
   `change' flag bookkeeping and `end' adjustments after each memmove
   are among the missing lines.  Documented as-found.  */
1471 path_simplify (char *path)
1477     ++path;			/* preserve the leading '/'. */
/* END tracks one past the NUL so memmove sizes can include it.  */
1480   end = p + strlen (p) + 1;	/* position past the terminating zero.  */
1485       /* P should point to the beginning of a path element. */
/* Case 1: element is "." -- drop it.  */
1487       if (*p == '.' && (*(p + 1) == '/' || *(p + 1) == '\0'))
1489 	  /* Handle "./foo" by moving "foo" two characters to the
1491 	  if (*(p + 1) == '/')
/* NOTE(review): source region is [p + 2, end), so a length of
   `end - p' looks two bytes too long (reads past END); verify against
   the original -- possibly `end - (p + 2)'.  */
1494 	      memmove (p, p + 2, end - p);
/* Case 2: element is ".." -- drop it and the preceding element.  */
1505       else if (*p == '.' && *(p + 1) == '.'
1506 	       && (*(p + 2) == '/' || *(p + 2) == '\0'))
1508 	  /* Handle "../foo" by moving "foo" one path element to the
1510 	  char *b = p;		/* not p-1 because P can equal PATH */
1512 	  /* Backtrack by one path element, but not past the beginning
1515 	  /* foo/bar/../baz */
1521 	      /* Move backwards until B hits the beginning of the
1522 		 previous path element or the beginning of path. */
1523 	      for (--b; b > path && *(b - 1) != '/'; b--)
1528 	  if (*(p + 2) == '/')
1530 	      memmove (b, p + 3, end - (p + 3));
1544 	  /* Remove empty path elements.  Not mandated by rfc1808 et
1545 	     al, but empty path elements are not all that useful, and
1546 	     the rest of Wget might not deal with them well. */
1556 	      memmove (p, q, end - q);
1561       /* Skip to the next path element. */
1562       while (*p && *p != '/')
1567       /* Make sure P points to the beginning of the next path element,
1568 	 which is location after the slash. */
1575 /* Resolve the result of "linking" a base URI (BASE) to a
1576 link-specified URI (LINK).
1578 Either of the URIs may be absolute or relative, complete with the
1579 host name, or path only. This tries to behave "reasonably" in all
1580 foreseeable cases. It employs little specific knowledge about
1581 schemes or URL-specific stuff -- it just works on strings.
1583 The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1584 See uri_merge for a gentler interface to this functionality.
1586 Perhaps this function should call path_simplify so that the callers
1587 don't have to call url_parse unconditionally. */
static char *
uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
{
  char *constr;

  if (no_scheme)
    {
      const char *end = base + path_length (base);

      if (!linklength)
	{
	  /* Empty LINK points back to BASE, query string and all. */
	  constr = xstrdup (base);
	}
      else if (*link == '?')
	{
	  /* LINK points to the same location, but changes the query
	     string.  Examples: */
	  /* uri_merge("path",         "?new") -> "path?new"     */
	  /* uri_merge("path?foo",     "?new") -> "path?new"     */
	  /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
	  /* uri_merge("path#foo",     "?new") -> "path?new"     */
	  int baselength = end - base;
	  constr = xmalloc (baselength + linklength + 1);
	  memcpy (constr, base, baselength);
	  memcpy (constr + baselength, link, linklength);
	  constr[baselength + linklength] = '\0';
	}
      else if (*link == '#')
	{
	  /* uri_merge("path",         "#new") -> "path#new"     */
	  /* uri_merge("path#foo",     "#new") -> "path#new"     */
	  /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
	  /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
	  int baselength;
	  const char *end1 = strchr (base, '#');
	  if (!end1)
	    end1 = base + strlen (base);
	  baselength = end1 - base;
	  constr = xmalloc (baselength + linklength + 1);
	  memcpy (constr, base, baselength);
	  memcpy (constr + baselength, link, linklength);
	  constr[baselength + linklength] = '\0';
	}
      else if (linklength > 1 && *link == '/' && *(link + 1) == '/')
	{
	  /* LINK begins with "//" and so is a net path: we need to
	     replace everything after (and including) the double slash
	     with LINK. */

	  /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
	  /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
	  /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

	  int span;
	  const char *slash;
	  const char *start_insert;

	  /* Look for first slash. */
	  slash = memchr (base, '/', end - base);
	  /* If found slash and it is a double slash, then replace
	     from this point, else default to replacing from the
	     beginning. */
	  if (slash && *(slash + 1) == '/')
	    start_insert = slash;
	  else
	    start_insert = base;

	  span = start_insert - base;
	  constr = (char *)xmalloc (span + linklength + 1);
	  if (span)
	    memcpy (constr, base, span);
	  memcpy (constr + span, link, linklength);
	  constr[span + linklength] = '\0';
	}
      else if (*link == '/')
	{
	  /* LINK is an absolute path: we need to replace everything
	     after (and including) the FIRST slash with LINK.

	     So, if BASE is "http://host/whatever/foo/bar", and LINK is
	     "/qux/xyzzy", our result should be
	     "http://host/qux/xyzzy". */

	  int span;
	  const char *slash;
	  const char *start_insert = NULL; /* for gcc to shut up. */
	  const char *pos = base;
	  int seen_slash_slash = 0;
	  /* We're looking for the first slash, but want to ignore
	     the double slash of a scheme prefix. */
	again:
	  slash = memchr (pos, '/', end - pos);
	  if (slash && !seen_slash_slash)
	    if (*(slash + 1) == '/')
	      {
		pos = slash + 2;
		seen_slash_slash = 1;
		goto again;
	      }

	  /* At this point, SLASH is the location of the first / after
	     "//", or the first slash altogether.  START_INSERT is the
	     pointer to the location where LINK will be inserted.  When
	     examining the last two examples, keep in mind that LINK
	     begins with '/'. */

	  if (!slash && !seen_slash_slash)
	    /* example: "foo" */
	    /*           ^    */
	    start_insert = base;
	  else if (!slash && seen_slash_slash)
	    /* example: "http://foo" */
	    /*                     ^ */
	    start_insert = end;
	  else if (slash && !seen_slash_slash)
	    /* example: "foo/bar" */
	    /*           ^        */
	    start_insert = base;
	  else if (slash && seen_slash_slash)
	    /* example: "http://something/" */
	    /*                           ^  */
	    start_insert = slash;

	  span = start_insert - base;
	  constr = (char *)xmalloc (span + linklength + 1);
	  if (span)
	    memcpy (constr, base, span);
	  memcpy (constr + span, link, linklength);
	  constr[span + linklength] = '\0';
	}
      else
	{
	  /* LINK is a relative URL: we need to replace everything
	     after last slash (possibly empty) with LINK.

	     So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
	     our result should be "whatever/foo/qux/xyzzy". */
	  int need_explicit_slash = 0;
	  int span;
	  const char *start_insert;
	  const char *last_slash = find_last_char (base, end, '/');
	  if (!last_slash)
	    {
	      /* No slash found at all.  Append LINK to what we have,
		 but we'll need a slash as a separator.

		 Example: if base == "foo" and link == "qux/xyzzy", then
		 we cannot just append link to base, because we'd get
		 "fooqux/xyzzy", whereas what we want is
		 "foo/qux/xyzzy".

		 To make sure the / gets inserted, we set
		 need_explicit_slash to 1.  We also set start_insert
		 to end + 1, so that the length calculations work out
		 correctly for one more (slash) character.  Accessing
		 that character is fine, since it will be the
		 delimiter, '\0' or '?'. */
	      /* example: "foo?..." */
	      /*               ^    ('?' gets changed to '/') */
	      start_insert = end + 1;
	      need_explicit_slash = 1;
	    }
	  else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
	    {
	      /* example: http://host"  */
	      /*                      ^ */
	      start_insert = end + 1;
	      need_explicit_slash = 1;
	    }
	  else
	    {
	      /* example: "whatever/foo/bar" */
	      /*                        ^    */
	      start_insert = last_slash + 1;
	    }

	  span = start_insert - base;
	  constr = (char *)xmalloc (span + linklength + 1);
	  if (span)
	    memcpy (constr, base, span);
	  if (need_explicit_slash)
	    constr[span - 1] = '/';
	  memcpy (constr + span, link, linklength);
	  constr[span + linklength] = '\0';
	}
    }
  else /* !no_scheme */
    {
      constr = strdupdelim (link, link + linklength);
    }
  return constr;
}
/* Merge BASE with LINK and return the resulting URI.  This is an
   interface to uri_merge_1 that assumes that LINK is a
   zero-terminated string.  */
char *
uri_merge (const char *base, const char *link)
{
  return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
}
1792 #define APPEND(p, s) do { \
1793 int len = strlen (s); \
1794 memcpy (p, s, len); \
1798 /* Use this instead of password when the actual password is supposed
1799 to be hidden. We intentionally use a generic string without giving
1800 away the number of characters in the password, like previous
1802 #define HIDDEN_PASSWORD "*password*"
1804 /* Recreate the URL string from the data in URL.
1806 If HIDE is non-zero (as it is when we're calling this on a URL we
1807 plan to print, but not when calling it to canonicalize a URL for
1808 use within the program), password will be hidden. Unsafe
1809 characters in the URL will be quoted. */
1812 url_string (const struct url *url, int hide_password)
1816 char *quoted_user = NULL, *quoted_passwd = NULL;
1818 int scheme_port = supported_schemes[url->scheme].default_port;
1819 char *scheme_str = supported_schemes[url->scheme].leading_string;
1820 int fplen = full_path_length (url);
1822 int brackets_around_host = 0;
1824 assert (scheme_str != NULL);
1826 /* Make sure the user name and password are quoted. */
1829 quoted_user = encode_string_maybe (url->user);
1833 quoted_passwd = HIDDEN_PASSWORD;
1835 quoted_passwd = encode_string_maybe (url->passwd);
1839 if (strchr (url->host, ':'))
1840 brackets_around_host = 1;
1842 size = (strlen (scheme_str)
1843 + strlen (url->host)
1844 + (brackets_around_host ? 2 : 0)
1847 if (url->port != scheme_port)
1848 size += 1 + numdigit (url->port);
1851 size += 1 + strlen (quoted_user);
1853 size += 1 + strlen (quoted_passwd);
1856 p = result = xmalloc (size);
1858 APPEND (p, scheme_str);
1861 APPEND (p, quoted_user);
1865 APPEND (p, quoted_passwd);
1870 if (brackets_around_host)
1872 APPEND (p, url->host);
1873 if (brackets_around_host)
1875 if (url->port != scheme_port)
1878 p = number_to_string (p, url->port);
1881 full_path_write (url, p);
1885 assert (p - result == size);
1887 if (quoted_user && quoted_user != url->user)
1888 xfree (quoted_user);
1889 if (quoted_passwd && !hide_password
1890 && quoted_passwd != url->passwd)
1891 xfree (quoted_passwd);
1896 /* Return the URL of the proxy appropriate for url U. */
1898 getproxy (struct url *u)
1901 char *rewritten_url;
1902 static char rewritten_storage[1024];
1906 if (!no_proxy_match (u->host, (const char **)opt.no_proxy))
1912 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1916 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1920 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1922 case SCHEME_INVALID:
1925 if (!proxy || !*proxy)
1928 /* Handle shorthands. `rewritten_storage' is a kludge to allow
1929 getproxy() to return static storage. */
1930 rewritten_url = rewrite_shorthand_url (proxy);
1933 strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
1934 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1935 proxy = rewritten_storage;
/* Should a host be accessed through proxy, concerning no_proxy?
   Returns non-zero when the proxy should be used. */
int
no_proxy_match (const char *host, const char **no_proxy)
{
  if (!no_proxy)
    return 1;
  else
    return !sufmatch (no_proxy, host);
}
1951 /* Support for converting links for local viewing in downloaded HTML
1952 files. This should be moved to another file, because it has
1953 nothing to do with processing URLs. */
1955 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1956 static const char *replace_attr PARAMS ((const char *, int, FILE *,
1958 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
1959 const char *, int));
1960 static char *local_quote_string PARAMS ((const char *));
1962 /* Change the links in one HTML file. LINKS is a list of links in the
1963 document, along with their positions and the desired direction of
1966 convert_links (const char *file, struct urlpos *links)
1968 struct file_memory *fm;
1971 downloaded_file_t downloaded_file_return;
1973 struct urlpos *link;
1974 int to_url_count = 0, to_file_count = 0;
1976 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1979 /* First we do a "dry run": go through the list L and see whether
1980 any URL needs to be converted in the first place. If not, just
1981 leave the file alone. */
1983 struct urlpos *dry = links;
1984 for (dry = links; dry; dry = dry->next)
1985 if (dry->convert != CO_NOCONVERT)
1989 logputs (LOG_VERBOSE, _("nothing to do.\n"));
1994 fm = read_file (file);
1997 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1998 file, strerror (errno));
2002 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
2003 if (opt.backup_converted && downloaded_file_return)
2004 write_backup_file (file, downloaded_file_return);
2006 /* Before opening the file for writing, unlink the file. This is
2007 important if the data in FM is mmaped. In such case, nulling the
2008 file, which is what fopen() below does, would make us read all
2009 zeroes from the mmaped region. */
2010 if (unlink (file) < 0 && errno != ENOENT)
2012 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
2013 file, strerror (errno));
2014 read_file_free (fm);
2017 /* Now open the file for writing. */
2018 fp = fopen (file, "wb");
2021 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
2022 file, strerror (errno));
2023 read_file_free (fm);
2027 /* Here we loop through all the URLs in file, replacing those of
2028 them that are downloaded with relative references. */
2030 for (link = links; link; link = link->next)
2032 char *url_start = fm->content + link->pos;
2034 if (link->pos >= fm->length)
2036 DEBUGP (("Something strange is going on. Please investigate."));
2039 /* If the URL is not to be converted, skip it. */
2040 if (link->convert == CO_NOCONVERT)
2042 DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
2046 /* Echo the file contents, up to the offending URL's opening
2047 quote, to the outfile. */
2048 fwrite (p, 1, url_start - p, fp);
2051 switch (link->convert)
2053 case CO_CONVERT_TO_RELATIVE:
2054 /* Convert absolute URL to relative. */
2056 char *newname = construct_relative (file, link->local_name);
2057 char *quoted_newname = local_quote_string (newname);
2059 if (!link->link_refresh_p)
2060 p = replace_attr (p, link->size, fp, quoted_newname);
2062 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
2063 link->refresh_timeout);
2065 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
2066 link->url->url, newname, link->pos, file));
2068 xfree (quoted_newname);
2072 case CO_CONVERT_TO_COMPLETE:
2073 /* Convert the link to absolute URL. */
2075 char *newlink = link->url->url;
2076 char *quoted_newlink = html_quote_string (newlink);
2078 if (!link->link_refresh_p)
2079 p = replace_attr (p, link->size, fp, quoted_newlink);
2081 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
2082 link->refresh_timeout);
2084 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
2085 newlink, link->pos, file));
2086 xfree (quoted_newlink);
2090 case CO_NULLIFY_BASE:
2091 /* Change the base href to "". */
2092 p = replace_attr (p, link->size, fp, "");
2100 /* Output the rest of the file. */
2101 if (p - fm->content < fm->length)
2102 fwrite (p, 1, fm->length - (p - fm->content), fp);
2104 read_file_free (fm);
2106 logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
/* Construct and return a malloced copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.

   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   "images/news.gif".

   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */
static char *
construct_relative (const char *s1, const char *s2)
{
  int i, cnt, sepdirs1;
  char *res;

  if (*s2 == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');
  i = cnt = 0;
  /* Skip the directories common to both strings.  */
  while (1)
    {
      while (s1[i] && s2[i]
	     && (s1[i] == s2[i])
	     && (s1[i] != '/')
	     && (s2[i] != '/'))
	++i;
      if (s1[i] == '/' && s2[i] == '/')
	cnt = ++i;
      else
	break;
    }
  /* Count the directory separators remaining in S1; each one becomes
     a "../" in the result. */
  for (sepdirs1 = 0; s1[i]; i++)
    if (s1[i] == '/')
      ++sepdirs1;
  /* Now, construct the file as of:
     - ../ repeated sepdirs1 time
     - all the non-mutual directories of S2.  */
  res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
  for (i = 0; i < sepdirs1; i++)
    memcpy (res + 3 * i, "../", 3);
  strcpy (res + 3 * i, s2 + cnt);
  return res;
}
2164 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
2166 /* Rather than just writing over the original .html file with the
2167 converted version, save the former to *.orig. Note we only do
2168 this for files we've _successfully_ downloaded, so we don't
2169 clobber .orig files sitting around from previous invocations. */
2171 /* Construct the backup filename as the original name plus ".orig". */
2172 size_t filename_len = strlen(file);
2173 char* filename_plus_orig_suffix;
2174 boolean already_wrote_backup_file = FALSE;
2175 slist* converted_file_ptr;
2176 static slist* converted_files = NULL;
2178 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
2180 /* Just write "orig" over "html". We need to do it this way
2181 because when we're checking to see if we've downloaded the
2182 file before (to see if we can skip downloading it), we don't
2183 know if it's a text/html file. Therefore we don't know yet
2184 at that stage that -E is going to cause us to tack on
2185 ".html", so we need to compare vs. the original URL plus
2186 ".orig", not the original URL plus ".html.orig". */
2187 filename_plus_orig_suffix = alloca (filename_len + 1);
2188 strcpy(filename_plus_orig_suffix, file);
2189 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
2191 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
2193 /* Append ".orig" to the name. */
2194 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
2195 strcpy(filename_plus_orig_suffix, file);
2196 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
2199 /* We can get called twice on the same URL thanks to the
2200 convert_all_links() call in main(). If we write the .orig file
2201 each time in such a case, it'll end up containing the first-pass
2202 conversion, not the original file. So, see if we've already been
2203 called on this file. */
2204 converted_file_ptr = converted_files;
2205 while (converted_file_ptr != NULL)
2206 if (strcmp(converted_file_ptr->string, file) == 0)
2208 already_wrote_backup_file = TRUE;
2212 converted_file_ptr = converted_file_ptr->next;
2214 if (!already_wrote_backup_file)
2216 /* Rename <file> to <file>.orig before former gets written over. */
2217 if (rename(file, filename_plus_orig_suffix) != 0)
2218 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
2219 file, filename_plus_orig_suffix, strerror (errno));
2221 /* Remember that we've already written a .orig backup for this file.
2222 Note that we never free this memory since we need it till the
2223 convert_all_links() call, which is one of the last things the
2224 program does before terminating. BTW, I'm not sure if it would be
2225 safe to just set 'converted_file_ptr->string' to 'file' below,
2226 rather than making a copy of the string... Another note is that I
2227 thought I could just add a field to the urlpos structure saying
2228 that we'd written a .orig file for this URL, but that didn't work,
2229 so I had to make this separate list.
2230 -- Dan Harkless <wget@harkless.org>
2232 This [adding a field to the urlpos structure] didn't work
2233 because convert_file() is called from convert_all_links at
2234 the end of the retrieval with a freshly built new urlpos
2236 -- Hrvoje Niksic <hniksic@arsdigita.com>
2238 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2239 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
2240 converted_file_ptr->next = converted_files;
2241 converted_files = converted_file_ptr;
2245 static int find_fragment PARAMS ((const char *, int, const char **,
2248 /* Replace an attribute's original text with NEW_TEXT. */
2251 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
2254 char quote_char = '\"'; /* use "..." for quoting, unless the
2255 original value is quoted, in which
2256 case reuse its quoting char. */
2257 const char *frag_beg, *frag_end;
2259 /* Structure of our string is:
2260 "...old-contents..."
2261 <--- size ---> (with quotes)
2264 <--- size --> (no quotes) */
2266 if (*p == '\"' || *p == '\'')
2271 size -= 2; /* disregard opening and closing quote */
2273 putc (quote_char, fp);
2274 fputs (new_text, fp);
2276 /* Look for fragment identifier, if any. */
2277 if (find_fragment (p, size, &frag_beg, &frag_end))
2278 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
2282 putc (quote_char, fp);
/* The same as replace_attr(), but used when replacing
   <meta http-equiv=refresh content="new_text"> because we need to
   append "timeout_value; URL=" before the new_text.  */
static const char *
replace_attr_refresh_hack (const char *p, int size, FILE *fp,
			   const char *new_text, int timeout)
{
  /* Worst case: all digits of TIMEOUT + "; URL=" + NEW_TEXT + NUL. */
  char *new_with_timeout = (char *)alloca (numdigit (timeout)
					   + 6 /* "; URL=" */
					   + strlen (new_text)
					   + 1 /* \0 */);
  sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);

  return replace_attr (p, size, fp, new_with_timeout);
}
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
   preceded by '&'.  If the character is not found, return zero.  If
   the character is found, return 1 and set BP and EP to point to the
   beginning and end of the region.

   This is used for finding the fragment identifiers in URLs.  (The
   '&' guard keeps SGML entities such as "&#201;" from being mistaken
   for fragments.)  */
static int
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *end = beg + size;
  int saw_amp = 0;
  for (; beg < end; beg++)
    {
      switch (*beg)
	{
	case '&':
	  saw_amp = 1;
	  break;
	case '#':
	  if (!saw_amp)
	    {
	      *bp = beg;
	      *ep = end;
	      return 1;
	    }
	  /* fallthrough */
	default:
	  saw_amp = 0;
	}
    }
  return 0;
}
2339 /* Quote FILE for use as local reference to an HTML file.
2341 We quote ? as %3F to avoid passing part of the file name as the
2342 parameter when browsing the converted file through HTTP. However,
2343 it is safe to do this only when `--html-extension' is turned on.
2344 This is because converting "index.html?foo=bar" to
2345 "index.html%3Ffoo=bar" would break local browsing, as the latter
2346 isn't even recognized as an HTML file! However, converting
2347 "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
2348 safe for both local and HTTP-served browsing. */
2351 local_quote_string (const char *file)
2353 const char *file_sans_qmark;
2356 if (!opt.html_extension)
2357 return html_quote_string (file);
2359 qm = count_char (file, '?');
2363 const char *from = file;
2366 /* qm * 2 because we replace each question mark with "%3F",
2367 i.e. replace one char with three, hence two more. */
2368 int fsqlen = strlen (file) + qm * 2;
2370 to = newname = (char *)alloca (fsqlen + 1);
2371 for (; *from; from++)
2382 assert (to - newname == fsqlen);
2385 file_sans_qmark = newname;
2388 file_sans_qmark = file;
2390 return html_quote_string (file_sans_qmark);
2393 /* We're storing "modes" of type downloaded_file_t in the hash table.
2394 However, our hash tables only accept pointers for keys and values.
2395 So when we need a pointer, we use the address of a
2396 downloaded_file_t variable of static storage. */
2398 static downloaded_file_t *
2399 downloaded_mode_to_ptr (downloaded_file_t mode)
2401 static downloaded_file_t
2402 v1 = FILE_NOT_ALREADY_DOWNLOADED,
2403 v2 = FILE_DOWNLOADED_NORMALLY,
2404 v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2405 v4 = CHECK_FOR_FILE;
2409 case FILE_NOT_ALREADY_DOWNLOADED:
2411 case FILE_DOWNLOADED_NORMALLY:
2413 case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2415 case CHECK_FOR_FILE:
2421 /* This should really be merged with dl_file_url_map and
2422 downloaded_html_files in recur.c. This was originally a list, but
2423 I changed it to a hash table beause it was actually taking a lot of
2424 time to find things in it. */
2426 static struct hash_table *downloaded_files_hash;
2428 /* Remembers which files have been downloaded. In the standard case, should be
2429 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2430 download successfully (i.e. not for ones we have failures on or that we skip
2433 When we've downloaded a file and tacked on a ".html" extension due to -E,
2434 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2435 FILE_DOWNLOADED_NORMALLY.
2437 If you just want to check if a file has been previously added without adding
2438 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
2439 with local filenames, not remote URLs. */
2441 downloaded_file (downloaded_file_t mode, const char *file)
2443 downloaded_file_t *ptr;
2445 if (mode == CHECK_FOR_FILE)
2447 if (!downloaded_files_hash)
2448 return FILE_NOT_ALREADY_DOWNLOADED;
2449 ptr = hash_table_get (downloaded_files_hash, file);
2451 return FILE_NOT_ALREADY_DOWNLOADED;
2455 if (!downloaded_files_hash)
2456 downloaded_files_hash = make_string_hash_table (0);
2458 ptr = hash_table_get (downloaded_files_hash, file);
2462 ptr = downloaded_mode_to_ptr (mode);
2463 hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2465 return FILE_NOT_ALREADY_DOWNLOADED;
2469 df_free_mapper (void *key, void *value, void *ignored)
2476 downloaded_files_free (void)
2478 if (downloaded_files_hash)
2480 hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2481 hash_table_destroy (downloaded_files_hash);
2482 downloaded_files_hash = NULL;
2486 /* Return non-zero if scheme a is similar to scheme b.
2488 Schemes are similar if they are equal. If SSL is supported, schemes
2489 are also similar if one is http (SCHEME_HTTP) and the other is https
2492 schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2497 if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2498 || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
/* Debugging and testing support for path_simplify. */

/* Debug: run path_simplify on PATH and return the result in a new
   string.  Useful for calling from the debugger.  Caller frees. */
static char *
ps (char *path)
{
  char *copy = xstrdup (path);
  path_simplify (copy);
  return copy;
}
/* Run path_simplify on a copy of TEST and report any mismatch against
   EXPECTED_RESULT / EXPECTED_CHANGE.  BUGFIX: the two diagnostic
   messages were swapped — when EXPECTED_CHANGE is 1 we expected a
   modification (and did not see one), and vice versa. */
static void
run_test (char *test, char *expected_result, int expected_change)
{
  char *test_copy = xstrdup (test);
  int modified = path_simplify (test_copy);

  if (0 != strcmp (test_copy, expected_result))
    {
      printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
	      test, expected_result, test_copy);
    }
  if (modified != expected_change)
    {
      if (expected_change == 1)
	printf ("Expected modification with path_simplify(\"%s\").\n",
		test);
      else
	printf ("Expected no modification with path_simplify(\"%s\").\n",
		test);
    }
  xfree (test_copy);
}
2541 test_path_simplify (void)
2544 char *test, *result;
2550 { "foo", "foo", 0 },
2551 { "foo/bar", "foo/bar", 0 },
2552 { "foo///bar", "foo/bar", 1 },
2553 { "foo/.", "foo/", 1 },
2554 { "foo/./", "foo/", 1 },
2555 { "foo./", "foo./", 0 },
2556 { "foo/../bar", "bar", 1 },
2557 { "foo/../bar/", "bar/", 1 },
2558 { "foo/bar/..", "foo/", 1 },
2559 { "foo/bar/../x", "foo/x", 1 },
2560 { "foo/bar/../x/", "foo/x/", 1 },
2561 { "foo/..", "", 1 },
2562 { "foo/../..", "", 1 },
2563 { "a/b/../../c", "c", 1 },
2564 { "./a/../b", "b", 1 }
2568 for (i = 0; i < ARRAY_SIZE (tests); i++)
2570 char *test = tests[i].test;
2571 char *expected_result = tests[i].result;
2572 int expected_change = tests[i].should_modify;
2573 run_test (test, expected_result, expected_change);
2576 /* Now run all the tests with a leading slash before the test case,
2577 to prove that the slash is being preserved. */
2578 for (i = 0; i < ARRAY_SIZE (tests); i++)
2580 char *test, *expected_result;
2581 int expected_change = tests[i].should_modify;
2583 test = xmalloc (1 + strlen (tests[i].test) + 1);
2584 sprintf (test, "/%s", tests[i].test);
2586 expected_result = xmalloc (1 + strlen (tests[i].result) + 1);
2587 sprintf (expected_result, "/%s", tests[i].result);
2589 run_test (test, expected_result, expected_change);
2592 xfree (expected_result);