2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
29 #include <sys/types.h>
/* Is the string X exactly "." ?  */
#define DOTP(x) ((x)[0] == '.' && (x)[1] == '\0')
/* Is the string X exactly ".." ?  */
#define DDOTP(x) ((x)[0] == '.' && (x)[1] == '.' && (x)[2] == '\0')
50 static int urlpath_length PARAMS ((const char *));
/* Supported schemes: table mapping each enum url_scheme value to its
   leading string ("http://", ...) and default port.  Order must match
   the enum, since url_scheme() returns the index as the enum value.
   NOTE(review): this extract is missing the array's opening brace,
   any terminating sentinel entry, and the closing "};" -- confirm
   against the full file.  */
static struct scheme_data supported_schemes[] =
  { "http://", DEFAULT_HTTP_PORT },
  { "https://", DEFAULT_HTTPS_PORT },
  { "ftp://", DEFAULT_FTP_PORT },
71 static char *construct_relative PARAMS ((const char *, const char *));
/* Support for encoding and decoding of URL strings.  We determine
   whether a character is unsafe through static table lookup.  This
   code assumes ASCII character set and 8-bit chars.  */

/* Shorthands used only to initialize urlchr_table below.  */
#define R urlchr_reserved
#define U urlchr_unsafe
/* NOTE(review): the table below also uses RU; its definition
   (presumably R|U) is not visible in this extract -- confirm.  */

/* Test character C against bitmask MASK.  The cast to unsigned char
   guards against negative plain-char values being used as an index.  */
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))

/* rfc1738 reserved chars, preserved from encoding. */
#define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)

/* rfc1738 unsafe chars, plus some more. */
#define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)

/* NOTE(review): `const static' would conventionally be written
   `static const'; also the initializer's opening "{" and closing "};"
   are not visible in this extract.  */
const static unsigned char urlchr_table[256] =
  U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
  U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
  U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
  U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
  U, 0, U, RU, 0, U, R, 0, /* SP ! " # $ % & ' */
  0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
  0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
  0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
  RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
  0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
  0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
  0, 0, 0, U, U, U, U, 0, /* X Y Z [ \ ] ^ _ */
  U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
  0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
  0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
  0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
  /* All high-bit (non-ASCII) bytes are unsafe.  */
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
  U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
/* Decodes the forms %xy in a URL to the character the hexadecimal
   code of which is xy.  xy are hexadecimal digits from
   [0123456789ABCDEF] (case-insensitive).  If x or y are not
   hex-digits or `%' precedes `\0', the sequence is left in the string
   unchanged.  Decoding is done in place: the result is never longer
   than the input, so a lagging write pointer suffices.

   NOTE(review): this extract is missing the return type, braces and
   the main loop -- only the fragment below survives.  */
decode_string (char *s)
  char *t = s;			/* t - tortoise */
  char *h = s;			/* h - hare */
      /* Do nothing if '%' is not followed by two hex digits. */
      if (!*(h + 1) || !*(h + 2)
	  || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
      /* Store the decoded byte at the lagging write position.  */
      *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
/* Like encode_string, but return S itself (no allocation) if there
   are no unsafe chars.  Callers must therefore compare the result
   against S before freeing it.

   NOTE(review): declaration line, braces and parts of the copy loop
   (e.g. the '%' write) are missing from this extract.  */
encode_string_maybe (const char *s)
  /* First pass: count the extra bytes needed for %XX escapes.  */
  for (p1 = s; *p1; p1++)
    if (UNSAFE_CHAR (*p1))
      addition += 2;		/* Two more characters (hex digits) */
  newlen = (p1 - s) + addition;
  newstr = (char *)xmalloc (newlen + 1);
  /* Second pass: copy, expanding each unsafe byte to %XX.  */
    if (UNSAFE_CHAR (*p1))
      {
	const unsigned char c = *p1++;
	*p2++ = XDIGIT_TO_XCHAR (c >> 4);
	*p2++ = XDIGIT_TO_XCHAR (c & 0xf);
  /* Sanity check: the buffer must be filled exactly.  */
  assert (p2 - newstr == newlen);
/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
   given string, returning a malloc-ed %XX encoded string.  Unlike
   encode_string_maybe, the result is presumably always freshly
   allocated -- the duplication logic is not visible here; confirm.  */
encode_string (const char *s)
  char *encoded = encode_string_maybe (s);

/* Encode unsafe characters in PTR to %xx.  If such encoding is done,
   the old value of PTR is freed and PTR is made to point to the newly
   allocated storage.  PTR must be an lvalue.
   NOTE(review): the macro's continuation lines are missing from this
   extract.  */
#define ENCODE(ptr) do {			\
  char *e_new = encode_string_maybe (ptr);	\
/* What to do with a character during reencode_string's scan.  */
enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };

/* Decide whether to encode, decode, or pass through the char at P.
   This used to be a macro, but it got a little too convoluted.
   NOTE(review): braces and several branches (including the CM_DECODE
   and CM_ENCODE returns) are missing from this extract.  */
static inline enum copy_method
decide_copy_method (const char *p)
  if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
      /* %xx sequence: decode it, unless it would decode to an
	 unsafe or a reserved char; in that case, leave it as is.  */
      char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
	XCHAR_TO_XDIGIT (*(p + 2));
      if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
	return CM_PASSTHROUGH;
  /* Garbled %.. sequence: encode `%'. */
  else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
  return CM_PASSTHROUGH;
257 /* Translate a %-quoting (but possibly non-conformant) input string S
258 into a %-quoting (and conformant) output string. If no characters
259 are encoded or decoded, return the same string S; otherwise, return
260 a freshly allocated string with the new contents.
262 After a URL has been run through this function, the protocols that
263 use `%' as the quote character can use the resulting string as-is,
264 while those that don't call decode_string() to get to the intended
265 data. This function is also stable: after an input string is
266 transformed the first time, all further transformations of the
267 result yield the same result string.
269 Let's discuss why this function is needed.
271 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
272 space character would mess up the HTTP request, it needs to be
275 GET /abc%20def HTTP/1.0
277 So it appears that the unsafe chars need to be quoted, as with
278 encode_string. But what if we're requested to download
279 `abc%20def'? Remember that %-encoding is valid URL syntax, so what
280 the user meant was a literal space, and he was kind enough to quote
281 it. In that case, Wget should obviously leave the `%20' as is, and
282 send the same request as above. So in this case we may not call
285 But what if the requested URI is `abc%20 def'? If we call
286 encode_string, we end up with `/abc%2520%20def', which is almost
287 certainly not intended. If we don't call encode_string, we are
288 left with the embedded space and cannot send the request. What the
289 user meant was for Wget to request `/abc%20%20def', and this is
290 where reencode_string kicks in.
292 Wget used to solve this by first decoding %-quotes, and then
293 encoding all the "unsafe" characters found in the resulting string.
294 This was wrong because it didn't preserve certain URL special
295 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
296 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
297 whether we considered `+' reserved (it is). One of these results
298 is inevitable because by the second step we would lose information
299 on whether the `+' was originally encoded or not. Both results
300 were wrong because in CGI parameters + means space, while %2B means
301 literal plus. reencode_string correctly translates the above to
302 "a%2B+b", i.e. returns the original string.
304 This function uses an algorithm proposed by Anon Sricharoenchai:
306 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
309 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
312 ...except that this code conflates the two steps, and decides
313 whether to encode, decode, or pass through each character in turn.
314 The function still uses two passes, but their logic is the same --
315 the first pass exists merely for the sake of allocation. Another
small difference is that we include `+' in URL_RESERVED.
320 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
322 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
326 "foo bar" -> "foo%20bar"
327 "foo%20bar" -> "foo%20bar"
328 "foo %20bar" -> "foo%20%20bar"
329 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
330 "foo%25%20bar" -> "foo%25%20bar"
331 "foo%2%20bar" -> "foo%252%20bar"
332 "foo+bar" -> "foo+bar" (plus is reserved!)
333 "foo%2b+bar" -> "foo%2b+bar" */
/* Translate a possibly non-conformant %-quoted string S into a
   conformant one (see the long comment above).  Returns S itself when
   nothing needs changing, otherwise a freshly allocated string.
   NOTE(review): declarations, braces, the switch-case labels and the
   second loop header are missing from this extract.  */
reencode_string (const char *s)
  int encode_count = 0;
  int decode_count = 0;

  /* First, pass through the string to see if there's anything to do,
     and to calculate the new length.  */
  for (p1 = s; *p1; p1++)
    switch (decide_copy_method (p1))
  if (!encode_count && !decode_count)
    /* The string is good as it is. */
    return (char *)s;		/* C const model sucks. */

  /* Each encoding adds two characters (hex digits), while each
     decoding removes two characters.  */
  newlen = oldlen + 2 * (encode_count - decode_count);
  newstr = xmalloc (newlen + 1);
  /* Second pass: copy while encoding/decoding per the same logic.  */
    switch (decide_copy_method (p1))
	  *p2++ = XDIGIT_TO_XCHAR (c >> 4);
	  *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
	  /* Decode: collapse %xx to a single byte.  */
	  *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
		   + (XCHAR_TO_XDIGIT (*(p1 + 2))));
	  p1 += 3;		/* skip %xx */
  /* Sanity check: both passes must agree on the length.  */
  assert (p2 - newstr == newlen);
/* Run PTR_VAR through reencode_string.  If a new string is consed,
   free PTR_VAR and make it point to the new storage.  Obviously,
   PTR_VAR needs to be an lvalue.
   NOTE(review): the macro's trailing lines (free + assignment +
   "} while (0)") are missing from this extract.  */
#define REENCODE(ptr_var) do {			\
  char *rf_new = reencode_string (ptr_var);	\
  if (rf_new != ptr_var)			\
414 /* Returns the scheme type if the scheme is supported, or
415 SCHEME_INVALID if not. */
417 url_scheme (const char *url)
421 for (i = 0; supported_schemes[i].leading_string; i++)
422 if (!strncasecmp (url, supported_schemes[i].leading_string,
423 strlen (supported_schemes[i].leading_string)))
424 return (enum url_scheme)i;
425 return SCHEME_INVALID;
/* Return the number of characters needed to skip the scheme part of
   the URL, e.g. `http://'.  If no scheme is found, returns 0.
   NOTE(review): the lines between the two fragments below -- which
   presumably verify and skip the ':' after the scheme name, returning
   0 otherwise -- are missing from this extract; confirm.  */
url_skip_scheme (const char *url)
  /* Skip the scheme name.  We allow `-' and `+' because of `whois++',
     etc.  */
  while (ISALNUM (*p) || *p == '-' || *p == '+')
  /* Skip "//" if found. */
  if (*p == '/' && *(p + 1) == '/')
/* Returns 1 if the URL begins with a scheme (supported or
   unsupported), 0 otherwise.  A "scheme" here is a leading run of
   alphanumerics (plus `-' and `+', cf. `whois++') followed by ':'.
   NOTE(review): the trailing return was reconstructed from the
   contract comment; the original line is not visible.  */
int
url_has_scheme (const char *url)
{
  const char *p = url;

  while (ISALNUM (*p) || *p == '-' || *p == '+')
    ++p;
  return *p == ':';
}
463 scheme_default_port (enum url_scheme scheme)
465 return supported_schemes[scheme].default_port;
/* Skip the username and password, if present here.  The function
   should be called *not* with the complete URL, but with the part
   right after the scheme.

   If no username and password are found, return 0; otherwise return
   the number of characters to skip, including the '@' separator.  */
static int
url_skip_uname (const char *url)
{
  /* Look for '@' that comes before '/' or '?'.  If '/' or '?' is hit
     first (or nothing is found), there is no user/password part.  */
  const char *p = (const char *) strpbrk (url, "/?@");

  if (p == NULL || *p != '@')
    return 0;
  return p - url + 1;
}
/* Parse the first LEN bytes of STR as "user[:password]" and store
   freshly allocated copies into *USER and *PASSWD.  An empty user
   name is rejected.
   NOTE(review): return type, braces, the early-return paths and the
   final return are missing from this extract -- presumably returns
   nonzero on success (see the caller in url_parse); confirm.  */
parse_uname (const char *str, int len, char **user, char **passwd)
  /* Empty user name not allowed. */
  colon = memchr (str, ':', len);
    /* Empty user name again. */
      /* Copy everything after the colon as the password.  */
      int pwlen = len - (colon + 1 - str);
      *passwd = xmalloc (pwlen + 1);
      memcpy (*passwd, colon + 1, pwlen);
      (*passwd)[pwlen] = '\0';
  /* Copy the (possibly truncated-at-colon) user name.  */
  *user = xmalloc (len + 1);
  memcpy (*user, str, len);
/* Used by main.c: detect URLs written using the "shorthand" URL forms
   popularized by Netscape and NcFTP.  HTTP shorthands look like this:

   www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
   www.foo.com[:port]          -> http://www.foo.com[:port]

   FTP shorthands look like this:

   foo.bar.com:dir/file        -> ftp://foo.bar.com/dir/file
   foo.bar.com:/absdir/file    -> ftp://foo.bar.com//absdir/file

   If the URL needs not or cannot be rewritten, return NULL.
   NOTE(review): return type, braces and several early returns are
   missing from this extract.  */
rewrite_shorthand_url (const char *url)
  if (url_has_scheme (url))
  /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
     latter HTTP.  */
  for (p = url; *p && *p != ':' && *p != '/'; p++)
      const char *pp, *path;
      /* If the characters after the colon and before the next slash
	 or end of string are all digits, it's HTTP.  */
      for (pp = p + 1; ISDIGIT (*pp); pp++)
	  && (*pp == '/' || *pp == '\0'))
      /* Prepend "ftp://" to the entire URL... */
      res = xmalloc (6 + strlen (url) + 1);
      sprintf (res, "ftp://%s", url);
      /* ...and replace ':' with '/'. */
      res[6 + (p - url)] = '/';
  /* Just prepend "http://" to what we have. */
  res = xmalloc (7 + strlen (url) + 1);
  sprintf (res, "http://%s", url);
578 static void parse_path PARAMS ((const char *, char **, char **));
/* Like strpbrk, but when none of the ACCEPT characters occur in S,
   return a pointer to S's terminating '\0' instead of NULL.  */
static char *
strpbrk_or_eos (const char *s, const char *accept)
{
  char *hit = strpbrk (s, accept);
  /* Cast away const to match strpbrk's (pre-C23) return convention.  */
  return hit ? hit : (char *) s + strlen (s);
}
/* Human-readable messages for the PE_* parse error codes; indexed by
   code, so entries must stay in the PE_* order.
   NOTE(review): the message strings for PE_NO_ERROR, PE_EMPTY_HOST,
   PE_BAD_PORT_NUMBER and PE_INVALID_USER_NAME, and the closing "};",
   are missing from this extract.  */
static char *parse_errors[] = {
#define PE_NO_ERROR 0
#define PE_UNRECOGNIZED_SCHEME 1
  "Unrecognized scheme",
#define PE_EMPTY_HOST 2
#define PE_BAD_PORT_NUMBER 3
#define PE_INVALID_USER_NAME 4

/* Record error code V through pointer P -- presumably only when P is
   non-NULL (continuation lines not visible; confirm).  */
#define SETERR(p, v) do {			\
/* Parse URL into its components.
   Return a new struct url if successful, NULL on error.  In case of
   error, and if ERROR is not NULL, also set *ERROR to the appropriate
   PE_* error code.
   NOTE(review): this extract is heavily truncated -- braces, several
   local declarations (p, u, port, oldlen...), the error-return paths
   and the final return are missing.  The surviving lines are kept
   verbatim below.  */
url_parse (const char *url, int *error)
  enum url_scheme scheme;

  /* Begin/end pointers for each URL component.  */
  const char *uname_b, *uname_e;
  const char *host_b, *host_e;
  const char *path_b, *path_e;
  const char *params_b, *params_e;
  const char *query_b, *query_e;
  const char *fragment_b, *fragment_e;

  char *user = NULL, *passwd = NULL;

  const char *url_orig = url;

  /* Normalize %-quoting first; may return URL itself or new storage.  */
  p = url = reencode_string (url);

  scheme = url_scheme (url);
  if (scheme == SCHEME_INVALID)
      SETERR (error, PE_UNRECOGNIZED_SCHEME);

  p += strlen (supported_schemes[scheme].leading_string);
  p += url_skip_uname (p);
  /* scheme://user:pass@host[:port]... */

  /* We attempt to break down the URL into the components path,
     params, query, and fragment.  They are ordered like this:

       scheme://host[:port][/path][;params][?query][#fragment]  */

  params_b = params_e = NULL;
  query_b = query_e = NULL;
  fragment_b = fragment_e = NULL;

  p = strpbrk_or_eos (p, ":/;?#");
  if (host_b == host_e)
      SETERR (error, PE_EMPTY_HOST);

  port = scheme_default_port (scheme);
      const char *port_b, *port_e, *pp;

      /* scheme://host:port/tralala */
      p = strpbrk_or_eos (p, "/;?#");
      if (port_b == port_e)
	  /* http://host:/whatever */
	  SETERR (error, PE_BAD_PORT_NUMBER);
      /* Convert the digit run to a number; reject non-digits.  */
      for (port = 0, pp = port_b; pp < port_e; pp++)
	      /* http://host:12randomgarbage/blah */
	      SETERR (error, PE_BAD_PORT_NUMBER);
	  port = 10 * port + (*pp - '0');
      p = strpbrk_or_eos (p, ";?#");
  /* Path is not allowed not to exist. */
      p = strpbrk_or_eos (p, "?#");
      p = strpbrk_or_eos (p, "#");
  if (uname_b != uname_e)
      /* http://user:pass@host */
      /*      uname_b  uname_e */
      /* LEN excludes the '@' (uname_e points just past it).  */
      if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
	  SETERR (error, PE_INVALID_USER_NAME);

  /* Allocate and populate the result.  */
  u = (struct url *)xmalloc (sizeof (struct url));
  memset (u, 0, sizeof (*u));
  u->url = xstrdup (url);
  u->url = (char *)url;
  u->host = strdupdelim (host_b, host_e);
  u->path = strdupdelim (path_b, path_e);
  path_simplify (u->path);
  u->params = strdupdelim (params_b, params_e);
  u->query = strdupdelim (query_b, query_e);
  u->fragment = strdupdelim (fragment_b, fragment_e);
  parse_path (u->path, &u->dir, &u->file);
779 url_error (int error_code)
781 assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
782 return parse_errors[error_code];
/* Split QUOTED_PATH (a %-encoded URL path) into directory and file
   components, storing freshly allocated strings in *DIR and *FILE.
   The path is %-decoded first; the split point is the last '/'.
   NOTE(review): return type, braces and the no-slash branch's *dir
   assignment are missing from this extract.  */
parse_path (const char *quoted_path, char **dir, char **file)
  char *path, *last_slash;

  /* Work on a stack copy so the decode does not touch the input.  */
  STRDUP_ALLOCA (path, quoted_path);
  decode_string (path);
  last_slash = strrchr (path, '/');
      /* No slash: the whole path is the file name.  */
      *file = xstrdup (path);
      /* Otherwise split around the last slash.  */
      *dir = strdupdelim (path, last_slash);
      *file = xstrdup (last_slash + 1);
/* Note: URL's "full path" is the path with the query string and
   params appended.  The "fragment" (#foo) is intentionally ignored,
   but that might be changed.  For example, if the original URL was
   "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
   the full path will be "/foo/bar/baz;bullshit?querystring". */

/* Return the length of the full path, without the terminating
   zero.  Each present component contributes its length plus one byte
   for its separator ('/', ';' or '?').
   NOTE(review): return type, braces, the FROB invocations and the
   #undef are missing from this extract.  */
full_path_length (const struct url *url)
#define FROB(el) if (url->el) len += 1 + strlen (url->el)
/* Write out the full path (path[;params][?query]) to WHERE, which
   must have room for full_path_length(url) bytes; no terminating
   '\0' is written here (the caller adds it).
   NOTE(review): the macro's separator-write line, its invocations and
   the function's braces are missing from this extract.  */
full_path_write (const struct url *url, char *where)
#define FROB(el, chr) do {			\
  char *f_el = url->el;				\
    int l = strlen (f_el);			\
    memcpy (where, f_el, l);			\
/* Public function for getting the "full path" (path plus params and
   query, see the note above full_path_length).  The result is
   freshly allocated; the caller owns and must free it.  */
char *
url_full_path (const struct url *url)
{
  int length = full_path_length (url);
  char *full_path = (char *) xmalloc (length + 1);

  full_path_write (url, full_path);
  full_path[length] = '\0';
  return full_path;
}
/* Sync u->path and u->url with u->dir and u->file, rebuilding the
   path as "dir/file" (or just "file" when dir is empty) and then
   regenerating the canonical URL string.
   NOTE(review): return type, braces, the branch conditions and the
   frees/assignments of the old strings are missing from this
   extract.  */
sync_path (struct url *url)
      newpath = xstrdup (url->file);
      int dirlen = strlen (url->dir);
      int filelen = strlen (url->file);

      /* Copy "dir/file" into freshly allocated NEWPATH.  */
      newpath = xmalloc (dirlen + 1 + filelen + 1);
      memcpy (newpath, url->dir, dirlen);
      newpath[dirlen] = '/';
      memcpy (newpath + dirlen + 1, url->file, filelen);
      newpath[dirlen + 1 + filelen] = '\0';

  /* Synchronize u->url. */
  url->url = url_string (url, 0);
/* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
   This way we can sync u->path and u->url when they get changed.
   NOTE(review): the frees of the old values and the sync_path calls
   are presumably on the missing lines -- confirm.  */
url_set_dir (struct url *url, const char *newdir)
  url->dir = xstrdup (newdir);

url_set_file (struct url *url, const char *newfile)
  url->file = xstrdup (newfile);
/* Free a struct url allocated by url_parse, including all owned
   string members.  NOTE(review): the frees of url->url, url->host,
   url->path and of URL itself are on lines missing from this
   extract.  */
url_free (struct url *url)
  FREE_MAYBE (url->params);
  FREE_MAYBE (url->query);
  FREE_MAYBE (url->fragment);
  FREE_MAYBE (url->user);
  FREE_MAYBE (url->passwd);
  FREE_MAYBE (url->dir);
  FREE_MAYBE (url->file);
/* Read FILE and build a linked list of urlpos entries, one per
   non-blank line, with leading/trailing whitespace trimmed.
   NOTE(review): return type, braces, the list-linking code and the
   final return are missing from this extract.  */
get_urls_file (const char *file)
  struct file_memory *fm;
  const char *text, *text_end;

  /* Load the file into memory (possibly mmaped).  */
  fm = read_file (file);
      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
  DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
  text_end = fm->content + fm->length;
  while (text < text_end)
      const char *line_beg = text;
      const char *line_end = memchr (text, '\n', text_end - text);
      /* Strip leading and trailing whitespace from the line.  */
      while (line_beg < line_end
	     && ISSPACE (*line_beg))
      while (line_end > line_beg + 1
	     && ISSPACE (*(line_end - 1)))
      if (line_end > line_beg)
	  urlpos *entry = (urlpos *)xmalloc (sizeof (urlpos));
	  memset (entry, 0, sizeof (*entry));
	  entry->url = strdupdelim (line_beg, line_end);
/* Free the linked list of urlpos, including each node's owned
   strings.  NOTE(review): the loop header, the frees of l->url and
   the node itself, and the advance to NEXT are missing from this
   extract.  */
free_urlpos (urlpos *l)
      urlpos *next = l->next;
      FREE_MAYBE (l->local_name);
/* Rotate FNAME opt.backups times: fname.(N-1) -> fname.N for
   decreasing N, then fname -> fname.1.
   NOTE(review): return type, braces, struct stat declaration and the
   rename() calls are missing from this extract.  */
rotate_backups(const char *fname)
  /* Room for "fname" + "." + digits of opt.backups + '\0'.  */
  int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
  char *from = (char *)alloca (maxlen);
  char *to = (char *)alloca (maxlen);

  /* Only rotate regular files.  */
  if (stat (fname, &sb) == 0)
    if (S_ISREG (sb.st_mode) == 0)

  for (i = opt.backups; i > 1; i--)
      sprintf (from, "%s.%d", fname, i - 1);
      sprintf (to, "%s.%d", fname, i);
      /* #### This will fail on machines without the rename() system
	 call.  */
  sprintf (to, "%s.%d", fname, 1);
/* Create all the necessary directories for PATH (a file).  Calls
   mkdirhier() internally.
   NOTE(review): return type, braces, several early returns and the
   unlink call in the CERN workaround are missing from this
   extract.  */
mkalldirs (const char *path)
  /* Find the last '/' to isolate the directory part of PATH.  */
  p = path + strlen (path);
  for (; *p != '/' && p != path; p--);
  /* Don't create if it's just a file. */
  if ((p == path) && (*p != '/'))
  t = strdupdelim (path, p);

  /* Check whether the directory exists. */
  if ((stat (t, &st) == 0))
      if (S_ISDIR (st.st_mode))
	  /* If the dir exists as a file name, remove it first.  This
	     is *only* for Wget to work with buggy old CERN http
	     servers.  Here is the scenario: When Wget tries to
	     retrieve a directory without a slash, e.g.
	     http://foo/bar (bar being a directory), CERN server will
	     not redirect it too http://foo/bar/ -- it will generate a
	     directory listing containing links to bar/file1,
	     bar/file2, etc.  Wget will lose because it saves this
	     HTML listing to a file `bar', so it cannot create the
	     directory.  To work around this, if the file of the same
	     name exists, we just remove it and create the directory
	     anyway.  */
	  DEBUGP (("Removing %s because of directory danger!\n", t));
  res = make_directory (t);
    logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1077 count_slashes (const char *s)
/* Return the path name of the URL-equivalent file name, with a
   remote-like structure of directories: [dir_prefix/][host/]dir/file.
   NOTE(review): return type, braces, the cut_dirs loop body, frees
   and the final return are missing from this extract.  */
mkstruct (const struct url *u)
  char *host, *dir, *file, *res, *dirpref;

      /* Honor --cut-dirs by dropping the first CUT path components.  */
      char *ptr = u->dir + (*u->dir == '/');
      int slash_count = 1 + count_slashes (ptr);
      int cut = MINVAL (opt.cut_dirs, slash_count);
      for (; cut && *ptr; ptr++)
      STRDUP_ALLOCA (dir, ptr);
    /* Skip a leading '/' so the result is a relative path.  */
    dir = u->dir + (*u->dir == '/');

  host = xstrdup (u->host);
  /* Check for the true name (or at least a consistent name for saving
     to directory) of HOST, reusing the hlist if possible. */
  if (opt.add_hostdir && !opt.simple_check)
      char *nhost = realhost (host);

  /* Add dir_prefix and hostname (if required) to the beginning of
     dir.  */
  if (opt.add_hostdir)
      if (!DOTP (opt.dir_prefix))
	  dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
				    + strlen (host) + 1);
	  sprintf (dirpref, "%s/%s", opt.dir_prefix, host);
	STRDUP_ALLOCA (dirpref, host);
  else /* not add_hostdir */
      if (!DOTP (opt.dir_prefix))
	dirpref = opt.dir_prefix;

  /* If there is a prefix, prepend it. */
      char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
      sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
  /* %-encode unsafe characters in the directory part.  */
  dir = encode_string (dir);
  /* Drop a trailing '/' from the directory, if any.  */
  if (l && dir[l - 1] == '/')
    /* Directory URLs get a default file name.  */
    file = "index.html";

  /* Finally, construct the full name. */
  res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) + 1);
  sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
/* Compose a file name out of BASE, an unescaped file name, and QUERY,
   an escaped query string.  The trick is to make sure that unsafe
   characters in BASE are escaped, and that slashes in QUERY are also
   escaped (so the query cannot create path components).
   NOTE(review): return type, the RESULT buffer declaration, braces
   and parts of the copy loops are missing from this extract.  */
compose_file_name (char *base, char *query)
  /* Copy BASE to RESULT and encode all unsafe characters.  */
  while (*from && to - result < sizeof (result))
      if (UNSAFE_CHAR (*from))
	  const unsigned char c = *from++;
	  *to++ = XDIGIT_TO_XCHAR (c >> 4);
	  *to++ = XDIGIT_TO_XCHAR (c & 0xf);
  if (query && to - result < sizeof (result))
      /* Copy QUERY to RESULT and encode all '/' characters. */
      while (*from && to - result < sizeof (result))
  if (to - result < sizeof (result))
      /* Truncate input which is too long, presumably due to a huge
	 query string.  */
      result[sizeof (result) - 1] = '\0';
  return xstrdup (result);
/* Create a unique filename, corresponding to a given URL.  Calls
   mkstruct if necessary.  Does *not* actually create any directories.
   NOTE(review): return type, braces, the opt.dirstruct branch
   condition, the Windows '%'-replacement body and the returns are
   missing from this extract.  */
url_filename (const struct url *u)
  int have_prefix = 0;		/* whether we must prepend opt.dir_prefix */

    /* Directory structure requested: build the remote-like path.  */
    file = mkstruct (u);
      char *base = *u->file ? u->file : "index.html";
      char *query = u->query && *u->query ? u->query : NULL;
      file = compose_file_name (base, query);

  /* Check whether the prefix directory is something other than "."
     before prepending it.  */
  if (!DOTP (opt.dir_prefix))
      char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
				     + 1 + strlen (file) + 1);
      sprintf (nfile, "%s/%s", opt.dir_prefix, file);

  /* DOS-ish file systems don't like `%' signs in them; we change it
     to `@'.  */
  for (p = file; *p; p++)
#endif /* WINDOWS */

  /* Check the cases in which the unique extensions are not used:
     1) Clobbering is turned off (-nc).
     2) Retrieval with regetting.
     3) Timestamping is used.
     4) Hierarchy is built.

     The exception is the case when file does exist and is a
     directory (actually support for bad httpd-s).  */
  if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
      && !(file_exists_p (file) && !file_non_directory_p (file)))

  /* Find a unique name. */
  name = unique_name (file);
/* Like strlen(), but treat '?' (the query-string separator) as the
   end of the path, so a URL path may validly end at '?'.  */
static int
urlpath_length (const char *url)
{
  const char *q = strchr (url, '?');
  return q ? (int) (q - url) : (int) strlen (url);
}
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is almost completely equivalent to
   { *e = '\0'; return strrchr(b); }, except that it doesn't change
   the contents of the string (and never reads *e).  */
static const char *
find_last_char (const char *b, const char *e, char c)
{
  const char *p = e;

  while (p > b)
    {
      --p;
      if (*p == c)
	return p;
    }
  return NULL;
}
/* Resolve the result of "linking" a base URI (BASE) to a
   link-specified URI (LINK).

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to behave "reasonably" in all
   foreseeable cases.  It employs little specific knowledge about
   schemes or URL-specific stuff -- it just works on strings.

   The parameter LINKLENGTH is useful if LINK is not zero-terminated.
   See uri_merge for a gentler interface to this functionality.

   #### This function should handle `./' and `../' so that the evil
   path_simplify can go.

   NOTE(review): this extract is heavily truncated -- the return type,
   the no_scheme/link-kind branching, braces, the loop around memchr
   and the final return are missing.  Surviving lines kept verbatim.  */
uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
      const char *end = base + urlpath_length (base);

	  /* LINK is a relative URL: we need to replace everything
	     after last slash (possibly empty) with LINK.

	     So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
	     our result should be "whatever/foo/qux/xyzzy".  */
	  int need_explicit_slash = 0;
	  const char *start_insert;
	  const char *last_slash = find_last_char (base, end, '/');
	      /* No slash found at all.  Append LINK to what we have,
		 but we'll need a slash as a separator.

		 Example: if base == "foo" and link == "qux/xyzzy", then
		 we cannot just append link to base, because we'd get
		 "fooqux/xyzzy", whereas what we want is
		 "foo/qux/xyzzy".

		 To make sure the / gets inserted, we set
		 need_explicit_slash to 1.  We also set start_insert
		 to end + 1, so that the length calculations work out
		 correctly for one more (slash) character.  Accessing
		 that character is fine, since it will be the
		 delimiter, '\0' or '?'.  */
	      /* example: "foo?..." */
	      /*               ^    ('?' gets changed to '/') */
	      start_insert = end + 1;
	      need_explicit_slash = 1;
	  else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
	      /* example: http://host"  */
	      start_insert = end + 1;
	      need_explicit_slash = 1;
	      /* example: "whatever/foo/bar" */
	      start_insert = last_slash + 1;

	  span = start_insert - base;
	  constr = (char *)xmalloc (span + linklength + 1);
	  memcpy (constr, base, span);
	  if (need_explicit_slash)
	    constr[span - 1] = '/';
	  memcpy (constr + span, link, linklength);
	  constr[span + linklength] = '\0';
      else /* *link == `/' */
	  /* LINK is an absolute path: we need to replace everything
	     after (and including) the FIRST slash with LINK.

	     So, if BASE is "http://host/whatever/foo/bar", and LINK is
	     "/qux/xyzzy", our result should be
	     "http://host/qux/xyzzy".  */
	  const char *start_insert = NULL; /* for gcc to shut up. */
	  const char *pos = base;
	  int seen_slash_slash = 0;
	  /* We're looking for the first slash, but want to ignore
	     the double slash after the scheme.  */
	  slash = memchr (pos, '/', end - pos);
	  if (slash && !seen_slash_slash)
	    if (*(slash + 1) == '/')
		seen_slash_slash = 1;

	  /* At this point, SLASH is the location of the first / after
	     "//", or the first slash altogether.  START_INSERT is the
	     pointer to the location where LINK will be inserted.  When
	     examining the last two examples, keep in mind that LINK
	     begins with '/'.  */
	  if (!slash && !seen_slash_slash)
	    /* example: "foo" */
	    start_insert = base;
	  else if (!slash && seen_slash_slash)
	    /* example: "http://foo" */
	  else if (slash && !seen_slash_slash)
	    /* example: "foo/bar" */
	    start_insert = base;
	  else if (slash && seen_slash_slash)
	    /* example: "http://something/" */
	    start_insert = slash;

	  span = start_insert - base;
	  constr = (char *)xmalloc (span + linklength + 1);
	  memcpy (constr, base, span);
	  memcpy (constr + span, link, linklength);
	  constr[span + linklength] = '\0';
  else /* !no_scheme */
      /* LINK is itself an absolute URI: BASE is irrelevant.  */
      constr = strdupdelim (link, link + linklength);
/* Merge BASE with LINK and return the resulting URI.  This is an
   interface to uri_merge_1 that assumes that LINK is a
   zero-terminated string.  */
char *
uri_merge (const char *base, const char *link)
{
  int linklength = strlen (link);
  int relative = !url_has_scheme (link);

  return uri_merge_1 (base, link, linklength, relative);
}
/* Append string S at write pointer P and advance P past it.
   NOTE(review): the pointer-advance line and "} while (0)" are
   missing from this extract.  */
#define APPEND(p, s) do {			\
  int len = strlen (s);				\
  memcpy (p, s, len);				\

/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  */
#define HIDDEN_PASSWORD "*password*"
/* Recreate the URL string from the data in URL.

   If HIDE is non-zero (as it is when we're calling this on a URL we
   plan to print, but not when calling it to canonicalize a URL for
   use within the program), password will be hidden.  Unsafe
   characters in the URL will be quoted.

   NOTE(review): return type, braces, the size computation's tail, the
   separator APPENDs (':', '@') and the final return are missing from
   this extract.  */
url_string (const struct url *url, int hide_password)
  char *quoted_user = NULL, *quoted_passwd = NULL;

  int scheme_port = supported_schemes[url->scheme].default_port;
  char *scheme_str = supported_schemes[url->scheme].leading_string;
  int fplen = full_path_length (url);

  assert (scheme_str != NULL);

  /* Make sure the user name and password are quoted. */
      quoted_user = encode_string_maybe (url->user);
	  /* Substitute a fixed placeholder so the output does not
	     leak the password or even its length.  */
	  quoted_passwd = HIDDEN_PASSWORD;
	  quoted_passwd = encode_string_maybe (url->passwd);

  /* Compute the exact output size up front.  */
  size = (strlen (scheme_str)
	  + strlen (url->host)
  /* Only print the port when it differs from the scheme default.  */
  if (url->port != scheme_port)
    size += 1 + numdigit (url->port);
      size += 1 + strlen (quoted_user);
	size += 1 + strlen (quoted_passwd);

  p = result = xmalloc (size);
  APPEND (p, scheme_str);
      APPEND (p, quoted_user);
	  APPEND (p, quoted_passwd);
  APPEND (p, url->host);
  if (url->port != scheme_port)
      long_to_string (p, url->port);
  full_path_write (url, p);
  /* The size computation above must match what was written.  */
  assert (p - result == size);

  /* Free the quoted copies only when encode_string_maybe actually
     allocated them (it returns its argument when nothing is unsafe;
     quoted_passwd may also be the static HIDDEN_PASSWORD).  */
  if (quoted_user && quoted_user != url->user)
    xfree (quoted_user);
  if (quoted_passwd && !hide_password
      && quoted_passwd != url->passwd)
    xfree (quoted_passwd);
/* Returns proxy host address, in accordance with SCHEME: the
   command-line option if set, otherwise the matching environment
   variable.  Returns NULL (presumably -- the return lines are not
   visible) when no proxy is configured.
   NOTE(review): the switch statement, braces and returns are missing
   from this extract.  Also note the static REWRITTEN_STORAGE makes
   this function non-reentrant.  */
getproxy (enum url_scheme scheme)
  char *rewritten_url;
  static char rewritten_storage[1024];

      proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
      proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
      proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
    case SCHEME_INVALID:
  if (!proxy || !*proxy)

  /* Handle shorthands. */
  rewritten_url = rewrite_shorthand_url (proxy);
      /* Keep the rewritten URL in static storage; the explicit
	 termination guards against strncpy truncation.  */
      strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
      rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
      proxy = rewritten_storage;
/* Should a host be accessed through proxy, concerning no_proxy?
   Returns nonzero when HOST does not match any suffix in the NO_PROXY
   list.  NOTE(review): return type, braces and the NULL-list early
   return are missing from this extract.  */
no_proxy_match (const char *host, const char **no_proxy)
    return !sufmatch (no_proxy, host);
1599 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1600 static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
/* NOTE(review): several interior lines (braces, early returns, the
   declaration/initialization of the read cursor P) are elided from
   this excerpt; comments below describe only what is visible.  */
1602 /* Change the links in an HTML document.  Accepts a structure that
1603 defines the positions of all the links. */
1605 convert_links (const char *file, urlpos *l)
1607 struct file_memory *fm;
1610 downloaded_file_t downloaded_file_return;
1612 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1615 /* First we do a "dry run": go through the list L and see whether
1616 any URL needs to be converted in the first place.  If not, just
1617 leave the file alone. */
1620 for (dry = l; dry; dry = dry->next)
1621 if (dry->convert != CO_NOCONVERT)
1625 logputs (LOG_VERBOSE, _("nothing to do.\n"));
/* Slurp the whole document into memory (read_file may mmap it — see
   the unlink comment below).  */
1630 fm = read_file (file);
1633 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1634 file, strerror (errno));
/* Optionally save the pristine file as *.orig before we overwrite it;
   write_backup_file() also suppresses duplicate backups.  */
1638 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1639 if (opt.backup_converted && downloaded_file_return)
1640 write_backup_file (file, downloaded_file_return);
1642 /* Before opening the file for writing, unlink the file.  This is
1643 important if the data in FM is mmaped.  In such case, nulling the
1644 file, which is what fopen() below does, would make us read all
1645 zeroes from the mmaped region. */
1646 if (unlink (file) < 0 && errno != ENOENT)
1648 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1649 file, strerror (errno));
/* Error path: release the in-memory copy before bailing out.  */
1650 read_file_free (fm);
1653 /* Now open the file for writing. */
1654 fp = fopen (file, "wb");
1657 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1658 file, strerror (errno));
1659 read_file_free (fm);
1662 /* Here we loop through all the URLs in file, replacing those of
1663 them that are downloaded with relative references. */
1665 for (; l; l = l->next)
/* L->pos is a byte offset into the in-memory document.  */
1667 char *url_start = fm->content + l->pos;
/* Defensive check: an offset past EOF means the position data is
   stale or corrupt; don't write past the buffer.  */
1669 if (l->pos >= fm->length)
1671 DEBUGP (("Something strange is going on.  Please investigate."));
1674 /* If the URL is not to be converted, skip it. */
1675 if (l->convert == CO_NOCONVERT)
1677 DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos));
1681 /* Echo the file contents, up to the offending URL's opening
1682 quote, to the outfile. */
1683 fwrite (p, 1, url_start - p, fp);
1685 if (l->convert == CO_CONVERT_TO_RELATIVE)
1687 /* Convert absolute URL to relative. */
1688 char *newname = construct_relative (file, l->local_name);
/* HTML-escape the replacement before splicing it into an attribute
   value; replace_attr() writes it and advances P past the old URL.  */
1689 char *quoted_newname = html_quote_string (newname);
1690 replace_attr (&p, l->size, fp, quoted_newname);
1691 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1692 l->url, newname, l->pos, file));
1694 xfree (quoted_newname);
1696 else if (l->convert == CO_CONVERT_TO_COMPLETE)
1698 /* Convert the link to absolute URL. */
1699 char *newlink = l->url;
1700 char *quoted_newlink = html_quote_string (newlink);
1701 replace_attr (&p, l->size, fp, quoted_newlink);
1702 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1703 newlink, l->pos, file));
1704 xfree (quoted_newlink);
1707 /* Output the rest of the file. */
1708 if (p - fm->content < fm->length)
1709 fwrite (p, 1, fm->length - (p - fm->content), fp);
1711 read_file_free (fm);
1712 logputs (LOG_VERBOSE, _("done.\n"));
1715 /* Construct and return a malloced copy of the relative link from two
1716 pieces of information: local name S1 of the referring file and
1717 local name S2 of the referred file.
1719 So, if S1 is "jagor.srce.hr/index.html" and S2 is
1720 "jagor.srce.hr/images/news.gif", the function will return
1723 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1724 "fly.cc.fer.hr/images/fly.gif", the function will return
1725 "../images/fly.gif".
1727 Caveats: S1 should not begin with `/', unless S2 also begins with
1728 '/'.  S1 should not contain things like ".." and such --
1729 construct_relative ("fly/ioccc/../index.html",
1730 "fly/images/fly.gif") will fail.  (A workaround is to call
1731 something like path_simplify() on S1).  */
1733 construct_relative (const char *s1, const char *s2)
/* I scans the common prefix; CNT presumably ends up as the index just
   past the last '/' common to both names; SEPDIRS1 counts the
   directory separators remaining in S1 after the shared prefix.
   NOTE(review): the assignments establishing CNT, and the early
   absolute-S2 check, are elided from this excerpt.  */
1735 int i, cnt, sepdirs1;
/* If S2 is already usable as-is (presumably an absolute path), just
   hand back a copy — TODO(review): confirm the elided condition.  */
1739 return xstrdup (s2);
1740 /* S1 should *not* be absolute, if S2 wasn't. */
1741 assert (*s1 != '/');
1743 /* Skip the directories common to both strings. */
1746 while (s1[i] && s2[i]
/* Remember positions where both names sit on a directory boundary.  */
1751 if (s1[i] == '/' && s2[i] == '/')
/* Count how many directories of S1 must be climbed out of ("../").  */
1756 for (sepdirs1 = 0; s1[i]; i++)
1759 /* Now, construct the file as of:
1760 - ../ repeated sepdirs1 time
1761 - all the non-mutual directories of S2. */
1762 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1763 for (i = 0; i < sepdirs1; i++)
1764 memcpy (res + 3 * i, "../", 3);
/* strcpy also writes the terminating NUL accounted for by the +1.  */
1765 strcpy (res + 3 * i, s2 + cnt);
/* add_url(): prepend a new node for URL/FILE to list L and (per the
   comment below) return the new head.  The node stores private copies
   of both strings, so the caller retains ownership of its arguments.
   NOTE(review): the `t->next = l; return t;' tail is elided from this
   excerpt.  */
1769 /* Add URL to the head of the list L. */
1771 add_url (urlpos *l, const char *url, const char *file)
1775 t = (urlpos *)xmalloc (sizeof (urlpos));
/* Zero the whole node so every field not set below starts out NULL/0. */
1776 memset (t, 0, sizeof (*t));
1777 t->url = xstrdup (url);
1778 t->local_name = xstrdup (file);
/* write_backup_file(): rename FILE to a ".orig" backup (once per
   file per run) before convert_links() rewrites it in place.  */
1784 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1786 /* Rather than just writing over the original .html file with the
1787 converted version, save the former to *.orig.  Note we only do
1788 this for files we've _successfully_ downloaded, so we don't
1789 clobber .orig files sitting around from previous invocations. */
1791 /* Construct the backup filename as the original name plus ".orig". */
1792 size_t filename_len = strlen(file);
/* alloca'ed below: automatic lifetime, freed when this function
   returns — do not let the pointer escape.  */
1793 char* filename_plus_orig_suffix;
1794 boolean already_wrote_backup_file = FALSE;
1795 slist* converted_file_ptr;
/* Process-lifetime registry of files already backed up; deliberately
   never freed (see the long note below).  */
1796 static slist* converted_files = NULL;
1798 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1800 /* Just write "orig" over "html".  We need to do it this way
1801 because when we're checking to see if we've downloaded the
1802 file before (to see if we can skip downloading it), we don't
1803 know if it's a text/html file.  Therefore we don't know yet
1804 at that stage that -E is going to cause us to tack on
1805 ".html", so we need to compare vs. the original URL plus
1806 ".orig", not the original URL plus ".html.orig". */
1807 filename_plus_orig_suffix = alloca (filename_len + 1);
1808 strcpy(filename_plus_orig_suffix, file);
/* Overwrite the trailing "html" (4 chars) with "orig" — this branch
   assumes FILE ends in ".html", which -E guarantees here.  */
1809 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1811 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1813 /* Append ".orig" to the name. */
/* sizeof(".orig") includes the terminating NUL, so no extra +1.  */
1814 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1815 strcpy(filename_plus_orig_suffix, file);
1816 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1819 /* We can get called twice on the same URL thanks to the
1820 convert_all_links() call in main().  If we write the .orig file
1821 each time in such a case, it'll end up containing the first-pass
1822 conversion, not the original file.  So, see if we've already been
1823 called on this file. */
1824 converted_file_ptr = converted_files;
/* Linear scan of the registry; fine for the typical number of
   converted files per run.  */
1825 while (converted_file_ptr != NULL)
1826 if (strcmp(converted_file_ptr->string, file) == 0)
1828 already_wrote_backup_file = TRUE;
1832 converted_file_ptr = converted_file_ptr->next;
1834 if (!already_wrote_backup_file)
1836 /* Rename <file> to <file>.orig before former gets written over. */
/* A failed rename is only logged; conversion proceeds without a
   backup rather than aborting.  */
1837 if (rename(file, filename_plus_orig_suffix) != 0)
1838 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1839 file, filename_plus_orig_suffix, strerror (errno));
1841 /* Remember that we've already written a .orig backup for this file.
1842 Note that we never free this memory since we need it till the
1843 convert_all_links() call, which is one of the last things the
1844 program does before terminating.  BTW, I'm not sure if it would be
1845 safe to just set 'converted_file_ptr->string' to 'file' below,
1846 rather than making a copy of the string...  Another note is that I
1847 thought I could just add a field to the urlpos structure saying
1848 that we'd written a .orig file for this URL, but that didn't work,
1849 so I had to make this separate list.
1850 -- Dan Harkless <wget@harkless.org>
1852 This [adding a field to the urlpos structure] didn't work
1853 because convert_file() is called twice: once after all its
1854 sublinks have been retrieved in recursive_retrieve(), and
1855 once at the end of the day in convert_all_links().  The
1856 original linked list collected in recursive_retrieve() is
1857 lost after the first invocation of convert_links(), and
1858 convert_all_links() makes a new one (it calls get_urls_html()
1859 for each file it covers.)  That's why your first approach didn't
1860 work.  The way to make it work is perhaps to make this flag a
1861 field in the `urls_html' list.
1862 -- Hrvoje Niksic <hniksic@arsdigita.com>
1864 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
1865 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
1866 converted_file_ptr->next = converted_files;
1867 converted_files = converted_file_ptr;
1871 static int find_fragment PARAMS ((const char *, int, const char **,
/* replace_attr(): write NEW_STR to FP in place of the RAW_SIZE-byte
   old attribute value at *PP, preserving the original quoting style
   and carrying over any #fragment from the old value.  *PP is
   advanced past the consumed input.  NOTE(review): the quote-flag
   bookkeeping and the final advance of *PP are elided from this
   excerpt.  */
1875 replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
1877 const char *p = *pp;
1879 int size = raw_size;
/* Default to double quotes when the old value was unquoted.  */
1880 char quote_char = '\"';
1881 const char *frag_beg, *frag_end;
1883 /* Structure of our string is:
1884 "...old-contents..."
1885 <--- l->size ---> (with quotes)
1888 <--- l->size --> (no quotes) */
/* Reuse whichever quote character the document used.  */
1890 if (*p == '\"' || *p == '\'')
1895 size -= 2; /* disregard opening and closing quote */
1897 putc (quote_char, fp);
1898 fputs (new_str, fp);
1900 /* Look for fragment identifier, if any. */
/* If the old URL carried "#frag", append it verbatim after NEW_STR so
   in-page anchors keep working.  */
1901 if (find_fragment (p, size, &frag_beg, &frag_end))
1902 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
1906 putc (quote_char, fp);
1910 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
1911 preceded by '&'.  If the character is not found, return zero.  If
1912 the character is found, return 1 and set BP and EP to point to the
1913 beginning and end of the region.
1915 This is used for finding the fragment identifiers in URLs.  The
1916 "not preceded by '&'" rule keeps HTML character references such as
1917 "&#38;" from being mistaken for a fragment.  */
1918 find_fragment (const char *beg, int size, const char **bp, const char **ep)
1920 const char *end = beg + size;
/* NOTE(review): the loop body (the '&'-tracking switch and the
   assignments to *BP/*EP) is elided from this excerpt.  */
1922 for (; beg < end; beg++)
/* Singly-linked registry node recording one downloaded local file and
   how it was downloaded.  NOTE(review): the `char *file;' member
   (referenced by downloaded_file() below) is elided from this
   excerpt.  NOTE(review): tags beginning with '_' + lowercase are not
   reserved, but leading-underscore names are conventionally avoided
   at file scope.  */
1944 typedef struct _downloaded_file_list {
1946 downloaded_file_t download_type;
1947 struct _downloaded_file_list* next;
1948 } downloaded_file_list;
/* Head of the registry; file-scope, lives for the whole run until
   downloaded_files_free() is called.  */
1950 static downloaded_file_list *downloaded_files;
1952 /* Remembers which files have been downloaded.  In the standard case, should be
1953 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
1954 download successfully (i.e. not for ones we have failures on or that we skip
1957 When we've downloaded a file and tacked on a ".html" extension due to -E,
1958 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
1959 FILE_DOWNLOADED_NORMALLY.
1961 If you just want to check if a file has been previously added without adding
1962 it, call with mode == CHECK_FOR_FILE.  Please be sure to call this function
1963 with local filenames, not remote URLs. */
1965 downloaded_file (downloaded_file_t mode, const char* file)
1967 boolean found_file = FALSE;
1968 downloaded_file_list* rover = downloaded_files;
/* Linear scan of the registry — O(n) per call, so O(n^2) over a run;
   acceptable here, but a hash table would scale better.  */
1970 while (rover != NULL)
1971 if (strcmp(rover->file, file) == 0)
1977 rover = rover->next;
/* Known file: report how it was originally downloaded.  */
1980 return rover->download_type; /* file had already been downloaded */
/* Unknown file: record it (unless this is a pure query) and report
   that it had not been seen before.  */
1983 if (mode != CHECK_FOR_FILE)
1985 rover = xmalloc(sizeof(*rover));
1986 rover->file = xstrdup(file); /* use xstrdup() so die on out-of-mem. */
1987 rover->download_type = mode;
/* Prepend to the list head.  */
1988 rover->next = downloaded_files;
1989 downloaded_files = rover;
1992 return FILE_NOT_ALREADY_DOWNLOADED;
/* downloaded_files_free(): release every node of the downloaded-files
   registry.  NOTE(review): this excerpt ends mid-function; the loop
   that frees each node and resets the list head continues past it.  */
1997 downloaded_files_free (void)
1999 downloaded_file_list* rover = downloaded_files;
/* Save the successor before freeing the current node.  */
2002 downloaded_file_list *next = rover->next;
2003 xfree (rover->file);