2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
29 #include <sys/types.h>
/* True iff the string X is exactly "." -- a current-directory path
   element.  NOTE(review): this listing is an elided, line-numbered
   excerpt; code lines are kept verbatim and comments describe intent. */
47 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
/* True iff the string X is exactly ".." -- a parent-directory path
   element. */
49 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
/* Length of the path part of a URL, sans query/fragment; see the
   definition further below. */
51 static int urlpath_length PARAMS ((const char *));
59 /* Supported schemes: */
/* Table of recognized schemes, indexed by the url_scheme enum:
   url_scheme() returns the index of the entry whose leading string
   matches, and scheme_default_port() reads default_port from the same
   slot.  NOTE(review): the initializer braces and the terminating
   sentinel entry are not visible in this excerpt -- confirm a
   NULL-terminated last entry exists, since url_scheme() iterates
   until leading_string is NULL. */
60 static struct scheme_data supported_schemes[] =
62 { "http://", DEFAULT_HTTP_PORT },
64 { "https://", DEFAULT_HTTPS_PORT },
66 { "ftp://", DEFAULT_FTP_PORT },
/* Build a relative link pointing from one file's location to
   another's; used when converting links.  Defined elsewhere. */
72 static char *construct_relative PARAMS ((const char *, const char *));
75 /* Support for encoding and decoding of URL strings. We determine
76 whether a character is unsafe through static table lookup. This
77 code assumes ASCII character set and 8-bit chars. */
/* Short aliases used only to keep the 256-entry table below readable;
   each table slot is a bitmask of these flags. */
84 #define R urlchr_reserved
85 #define U urlchr_unsafe
/* Test whether character C has any of the bits in MASK set in the
   lookup table.  The (unsigned char) cast guards against negative
   plain-char values indexing out of bounds. */
88 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
90 /* rfc1738 reserved chars, preserved from encoding. */
92 #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
94 /* rfc1738 unsafe chars, plus some more. */
96 #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
/* Per-character classification table for all 256 byte values.  R marks
   RFC 1738 reserved characters (kept as-is when encoding), U marks
   unsafe characters (to be %XX-quoted), RU marks both.
   NOTE(review): `const static` is legal but `static const` is the
   conventional qualifier order.  The RU combined flag and the table's
   braces are not visible in this excerpt. */
98 const static unsigned char urlchr_table[256] =
100 U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
101 U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
102 U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
103 U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
104 U, 0, U, RU, 0, U, R, 0, /* SP ! " # $ % & ' */
105 0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
106 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
107 0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
108 RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
109 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
110 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
111 0, 0, 0, U, U, U, U, 0, /* X Y Z [ \ ] ^ _ */
112 U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
113 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
114 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
115 0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
/* Bytes 128-255: all non-ASCII bytes are treated as unsafe. */
117 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
118 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
119 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
120 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
122 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
123 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
124 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
125 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
128 /* Decodes the forms %xy in a URL to the character the hexadecimal
129 code of which is xy. xy are hexadecimal digits from
130 [0123456789ABCDEF] (case-insensitive). If x or y are not
131 hex-digits or `%' precedes `\0', the sequence is inserted
/* Decode %XX escapes in S in place.  Uses a two-pointer walk: H (hare)
   reads ahead, T (tortoise) writes the possibly shorter result back
   into the same buffer, so no allocation is needed.
   NOTE(review): the loop structure and terminating write are elided
   from this excerpt. */
135 decode_string (char *s)
137 char *t = s; /* t - tortoise */
138 char *h = s; /* h - hare */
149 /* Do nothing if '%' is not followed by two hex digits. */
150 if (!*(h + 1) || !*(h + 2)
151 || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
/* Valid %XY: combine the two hex nibbles into one output byte. */
153 *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
160 /* Like encode_string, but return S if there are no unsafe chars. */
/* Two passes: first count unsafe characters to size the result, then
   copy, expanding each unsafe byte to %XY.  Returns S itself (no
   allocation) when nothing needs quoting -- callers must compare the
   result against S to know whether to free it. */
163 encode_string_maybe (const char *s)
170 for (p1 = s; *p1; p1++)
171 if (UNSAFE_CHAR (*p1))
172 addition += 2; /* Two more characters (hex digits) */
/* One extra byte for the terminating '\0'. */
177 newlen = (p1 - s) + addition;
178 newstr = (char *)xmalloc (newlen + 1);
184 if (UNSAFE_CHAR (*p1))
186 unsigned char c = *p1++;
/* Emit '%' plus high and low hex nibbles of C.
   NOTE(review): the line writing '%' itself is elided here. */
188 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
189 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
195 assert (p2 - newstr == newlen);
200 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
201 given string, returning a malloc-ed %XX encoded string. */
/* Unlike encode_string_maybe, this always returns freshly allocated
   storage the caller owns, even when no quoting was necessary. */
204 encode_string (const char *s)
206 char *encoded = encode_string_maybe (s);
213 /* Encode unsafe characters in PTR to %xx. If such encoding is done,
214 the old value of PTR is freed and PTR is made to point to the newly
215 allocated storage. */
/* PTR must be an lvalue holding malloc-ed storage. */
217 #define ENCODE(ptr) do { \
218 char *e_new = encode_string_maybe (ptr); \
/* Per-character decision used by reencode_string: decode a valid %XX
   back to the raw byte, encode an unsafe raw byte, or copy through. */
226 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
228 /* Decide whether to encode, decode, or pass through the char at P.
229 This used to be a macro, but it got a little too convoluted. */
230 static inline enum copy_method
231 decide_copy_method (const char *p)
/* P points at '%' here: look at the two following characters.
   NOTE(review): the enclosing `if (*p == '%')` test is elided from
   this excerpt. */
235 if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
237 /* %xx sequence: decode it, unless it would decode to an
238 unsafe or a reserved char; in that case, leave it as
240 char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
241 XCHAR_TO_XDIGIT (*(p + 2));
243 if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
244 return CM_PASSTHROUGH;
249 /* Garbled %.. sequence: encode `%'. */
/* Unsafe and not reserved: must be %XX-quoted. */
252 else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
/* Safe or reserved character: copy unchanged. */
255 return CM_PASSTHROUGH;
258 /* Translate a %-quoting (but possibly non-conformant) input string S
259 into a %-quoting (and conformant) output string. If no characters
260 are encoded or decoded, return the same string S; otherwise, return
261 a freshly allocated string with the new contents.
263 After a URL has been run through this function, the protocols that
264 use `%' as the quote character can use the resulting string as-is,
265 while those that don't call decode_string() to get to the intended
266 data. This function is also stable: after an input string is
267 transformed the first time, all further transformations of the
268 result yield the same result string.
270 Let's discuss why this function is needed.
272 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
273 space character would mess up the HTTP request, it needs to be
276 GET /abc%20def HTTP/1.0
278 So it appears that the unsafe chars need to be quoted, as with
279 encode_string. But what if we're requested to download
280 `abc%20def'? Remember that %-encoding is valid URL syntax, so what
281 the user meant was a literal space, and he was kind enough to quote
282 it. In that case, Wget should obviously leave the `%20' as is, and
283 send the same request as above. So in this case we may not call
286 But what if the requested URI is `abc%20 def'? If we call
287 encode_string, we end up with `/abc%2520%20def', which is almost
288 certainly not intended. If we don't call encode_string, we are
289 left with the embedded space and cannot send the request. What the
290 user meant was for Wget to request `/abc%20%20def', and this is
291 where reencode_string kicks in.
293 Wget used to solve this by first decoding %-quotes, and then
294 encoding all the "unsafe" characters found in the resulting string.
295 This was wrong because it didn't preserve certain URL special
296 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
297 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
298 whether we considered `+' reserved (it is). One of these results
299 is inevitable because by the second step we would lose information
300 on whether the `+' was originally encoded or not. Both results
301 were wrong because in CGI parameters + means space, while %2B means
302 literal plus. reencode_string correctly translates the above to
303 "a%2B+b", i.e. returns the original string.
305 This function uses an algorithm proposed by Anon Sricharoenchai:
307 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
310 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
313 ...except that this code conflates the two steps, and decides
314 whether to encode, decode, or pass through each character in turn.
315 The function still uses two passes, but their logic is the same --
316 the first pass exists merely for the sake of allocation. Another
317 small difference is that we include `+' to URL_RESERVED.
321 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
323 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
327 "foo bar" -> "foo%20bar"
328 "foo%20bar" -> "foo%20bar"
329 "foo %20bar" -> "foo%20%20bar"
330 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
331 "foo%25%20bar" -> "foo%25%20bar"
332 "foo%2%20bar" -> "foo%252%20bar"
333 "foo+bar" -> "foo+bar" (plus is reserved!)
334 "foo%2b+bar" -> "foo%2b+bar" */
/* Normalize the %-quoting of S per the algorithm documented in the
   long comment above: one logical pass decides per character whether
   to encode, decode, or pass through; a first physical pass only
   counts, to size the allocation.  Returns S itself when unchanged
   (callers compare pointers to know whether to free). */
337 reencode_string (const char *s)
343 int encode_count = 0;
344 int decode_count = 0;
346 /* First, pass through the string to see if there's anything to do,
347 and to calculate the new length. */
348 for (p1 = s; *p1; p1++)
350 switch (decide_copy_method (p1))
/* NOTE(review): the case labels incrementing encode_count /
   decode_count are elided from this excerpt. */
363 if (!encode_count && !decode_count)
364 /* The string is good as it is. */
365 return (char *)s; /* C const model sucks. */
368 /* Each encoding adds two characters (hex digits), while each
369 decoding removes two characters. */
370 newlen = oldlen + 2 * (encode_count - decode_count);
371 newstr = xmalloc (newlen + 1);
/* Second pass: same decisions, this time emitting into NEWSTR. */
378 switch (decide_copy_method (p1))
382 unsigned char c = *p1++;
384 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
385 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
/* CM_DECODE: collapse the three-byte %XY into one raw byte. */
389 *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
390 + (XCHAR_TO_XDIGIT (*(p1 + 2))));
391 p1 += 3; /* skip %xx */
398 assert (p2 - newstr == newlen);
402 /* Run PTR_VAR through reencode_string. If a new string is consed,
403 free PTR_VAR and make it point to the new storage. Obviously,
404 PTR_VAR needs to be an lvalue. */
406 #define REENCODE(ptr_var) do { \
407 char *rf_new = reencode_string (ptr_var); \
408 if (rf_new != ptr_var) \
415 /* Returns the scheme type if the scheme is supported, or
416 SCHEME_INVALID if not. */
/* Case-insensitive prefix match against supported_schemes; the table
   index doubles as the enum value, so table order and enum order must
   stay in sync. */
418 url_scheme (const char *url)
422 for (i = 0; supported_schemes[i].leading_string; i++)
423 if (!strncasecmp (url, supported_schemes[i].leading_string,
424 strlen (supported_schemes[i].leading_string)))
425 return (enum url_scheme)i;
426 return SCHEME_INVALID;
429 /* Return the number of characters needed to skip the scheme part of
430 the URL, e.g. `http://'. If no scheme is found, returns 0. */
432 url_skip_scheme (const char *url)
436 /* Skip the scheme name. We allow `-' and `+' because of `whois++',
438 while (ISALNUM (*p) || *p == '-' || *p == '+')
/* NOTE(review): the check for the ':' that must follow the scheme
   name is elided from this excerpt. */
445 /* Skip "//" if found. */
446 if (*p == '/' && *(p + 1) == '/')
452 /* Returns 1 if the URL begins with a scheme (supported or
453 unsupported), 0 otherwise. */
455 url_has_scheme (const char *url)
/* Scan the leading scheme-name characters; a following ':' makes it a
   scheme.  NOTE(review): the return expression is elided here. */
458 while (ISALNUM (*p) || *p == '-' || *p == '+')
/* Return the default port for SCHEME, e.g. 80 for SCHEME_HTTP.
   SCHEME must be a valid index into supported_schemes (i.e. not
   SCHEME_INVALID). */
464 scheme_default_port (enum url_scheme scheme)
466 return supported_schemes[scheme].default_port;
469 /* Skip the username and password, if present here. The function
470 should be called *not* with the complete URL, but with the part
471 right after the scheme.
473 If no username and password are found, return 0. */
475 url_skip_uname (const char *url)
479 /* Look for '@' that comes before '/' or '?'. */
/* strpbrk finds whichever of '/', '?', '@' occurs first; only a
   leading '@' hit means a user[:password] part is present. */
480 p = (const char *)strpbrk (url, "/?@");
/* Split the LEN-byte region STR ("user" or "user:password") into
   freshly allocated *USER and *PASSWD strings.  Returns 0 on invalid
   (empty) user names, nonzero on success.
   NOTE(review): the success/failure return statements are elided from
   this excerpt. */
488 parse_uname (const char *str, int len, char **user, char **passwd)
493 /* Empty user name not allowed. */
496 colon = memchr (str, ':', len);
498 /* Empty user name again. */
/* Everything after the colon is the password. */
503 int pwlen = len - (colon + 1 - str);
504 *passwd = xmalloc (pwlen + 1);
505 memcpy (*passwd, colon + 1, pwlen);
506 (*passwd)[pwlen] = '\0';
/* Copy the user part; LEN has been trimmed to exclude the colon. */
512 *user = xmalloc (len + 1);
513 memcpy (*user, str, len);
519 /* Used by main.c: detect URLs written using the "shorthand" URL forms
520 popularized by Netscape and NcFTP. HTTP shorthands look like this:
522 www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
523 www.foo.com[:port] -> http://www.foo.com[:port]
525 FTP shorthands look like this:
527 foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
528 foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
530 If the URL needs not or cannot be rewritten, return NULL. */
/* Rewrite a Netscape/NcFTP-style shorthand URL (see the comment
   above) into a full URL with an explicit scheme.  Returns a
   malloc-ed string, or NULL when URL already has a scheme or cannot
   be rewritten. */
532 rewrite_shorthand_url (const char *url)
536 if (url_has_scheme (url))
539 /* Look for a ':' or '/'. The former signifies NcFTP syntax, the
541 for (p = url; *p && *p != ':' && *p != '/'; p++)
/* Found a ':' -- decide between host:port (HTTP) and host:dir (FTP). */
549 const char *pp, *path;
551 /* If the characters after the colon and before the next slash
552 or end of string are all digits, it's HTTP. */
554 for (pp = p + 1; ISDIGIT (*pp); pp++)
557 && (*pp == '/' || *pp == '\0'))
560 /* Prepend "ftp://" to the entire URL... */
562 res = xmalloc (6 + strlen (url) + 1);
563 sprintf (res, "ftp://%s", url);
564 /* ...and replace ':' with '/'. */
565 res[6 + (p - url)] = '/';
572 /* Just prepend "http://" to what we have. */
573 res = xmalloc (7 + strlen (url) + 1);
574 sprintf (res, "http://%s", url);
/* Forward declaration: split a quoted path into dir and file parts. */
579 static void parse_path PARAMS ((const char *, char **, char **));
/* Like strpbrk(), but never returns NULL: when S contains no
   character from ACCEPT, return a pointer to S's terminating '\0'
   instead.  This lets callers uniformly treat "separator or end of
   string" as one case (see url_parse).  The cast discards const
   because the C library's strpbrk interface does the same. */
static char *
strpbrk_or_eos (const char *s, const char *accept)
{
  char *p = strpbrk (s, accept);
  if (!p)
    p = (char *) s + strlen (s);
  return p;
}
590 /* Turn STR into lowercase; return non-zero if a character was
/* Used by url_parse to canonicalize host names; the return value
   tells the caller whether u->url must be rebuilt.
   NOTE(review): the loop and change-tracking lines are elided from
   this excerpt. */
594 lowercase_str (char *str)
601 *str = TOLOWER (*str);
/* Human-readable messages for url_parse failures, indexed by the
   PE_* codes defined alongside each entry (url_error() does the
   lookup).  The #define values must track the array positions. */
606 static char *parse_errors[] = {
607 #define PE_NO_ERROR 0
609 #define PE_UNRECOGNIZED_SCHEME 1
610 "Unrecognized scheme",
611 #define PE_EMPTY_HOST 2
613 #define PE_BAD_PORT_NUMBER 3
615 #define PE_INVALID_USER_NAME 4
/* Store error code V through pointer P, but only when P is non-NULL
   (callers of url_parse may pass NULL when they don't care). */
619 #define SETERR(p, v) do { \
626 Return a new struct url if successful, NULL on error. In case of
627 error, and if ERROR is not NULL, also set *ERROR to the appropriate
/* Parse URL into a freshly allocated struct url, splitting it into
   scheme, user:password, host, port, path, params, query and
   fragment.  On failure returns NULL and, if ERROR is non-NULL,
   stores a PE_* code through it.
   NOTE(review): many brace/assignment lines are elided from this
   excerpt; comments below annotate the visible skeleton only. */
630 url_parse (const char *url, int *error)
634 int path_modified, host_modified;
636 enum url_scheme scheme;
638 const char *uname_b, *uname_e;
639 const char *host_b, *host_e;
640 const char *path_b, *path_e;
641 const char *params_b, *params_e;
642 const char *query_b, *query_e;
643 const char *fragment_b, *fragment_e;
646 char *user = NULL, *passwd = NULL;
650 scheme = url_scheme (url);
651 if (scheme == SCHEME_INVALID)
653 SETERR (error, PE_UNRECOGNIZED_SCHEME);
/* Canonicalize %-quoting once, up front; URL_ENCODED may alias URL
   when nothing changed (see reencode_string). */
657 url_encoded = reencode_string (url);
660 p += strlen (supported_schemes[scheme].leading_string);
662 p += url_skip_uname (p);
665 /* scheme://user:pass@host[:port]... */
668 /* We attempt to break down the URL into the components path,
669 params, query, and fragment. They are ordered like this:
671 scheme://host[:port][/path][;params][?query][#fragment] */
673 params_b = params_e = NULL;
674 query_b = query_e = NULL;
675 fragment_b = fragment_e = NULL;
/* Host ends at the first of ':', '/', ';', '?', '#', or EOS. */
678 p = strpbrk_or_eos (p, ":/;?#");
681 if (host_b == host_e)
683 SETERR (error, PE_EMPTY_HOST);
687 port = scheme_default_port (scheme);
690 const char *port_b, *port_e, *pp;
692 /* scheme://host:port/tralala */
696 p = strpbrk_or_eos (p, "/;?#");
699 if (port_b == port_e)
701 /* http://host:/whatever */
703 SETERR (error, PE_BAD_PORT_NUMBER);
/* Manual decimal conversion so any non-digit is rejected. */
707 for (port = 0, pp = port_b; pp < port_e; pp++)
711 /* http://host:12randomgarbage/blah */
713 SETERR (error, PE_BAD_PORT_NUMBER);
716 port = 10 * port + (*pp - '0');
/* Path ends at ';', '?' or '#'; params at '?' or '#'; query at '#'. */
724 p = strpbrk_or_eos (p, ";?#");
729 /* Path is not allowed not to exist. */
737 p = strpbrk_or_eos (p, "?#");
744 p = strpbrk_or_eos (p, "#");
756 if (uname_b != uname_e)
758 /* http://user:pass@host */
760 /* uname_b uname_e */
/* The -1 excludes the trailing '@' from the user:pass region. */
761 if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
763 SETERR (error, PE_INVALID_USER_NAME);
768 u = (struct url *)xmalloc (sizeof (struct url));
769 memset (u, 0, sizeof (*u));
772 u->host = strdupdelim (host_b, host_e);
777 u->path = strdupdelim (path_b, path_e);
778 path_modified = path_simplify (u->path);
779 parse_path (u->path, &u->dir, &u->file);
781 host_modified = lowercase_str (u->host);
784 u->params = strdupdelim (params_b, params_e);
786 u->query = strdupdelim (query_b, query_e);
788 u->fragment = strdupdelim (fragment_b, fragment_e);
790 if (path_modified || u->fragment || host_modified || path_b == path_e)
792 /* If we suspect that a transformation has rendered what
793 url_string might return different from URL_ENCODED, rebuild
794 u->url using url_string. */
795 u->url = url_string (u, 0);
797 if (url_encoded != url)
798 xfree ((char *) url_encoded);
/* Otherwise keep URL_ENCODED as u->url, duplicating it only when it
   still aliases the caller's string. */
802 if (url_encoded == url)
803 u->url = xstrdup (url);
805 u->url = url_encoded;
/* Map a PE_* code from url_parse to its static message string; the
   caller must not free the result. */
813 url_error (int error_code)
815 assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
816 return parse_errors[error_code];
/* Split QUOTED_PATH into *DIR and *FILE at its last '/', after
   %-decoding a stack copy.  Both outputs are freshly allocated; when
   there is no slash, *FILE gets the whole path.
   NOTE(review): the *dir assignment for the no-slash case is elided
   from this excerpt. */
820 parse_path (const char *quoted_path, char **dir, char **file)
822 char *path, *last_slash;
824 STRDUP_ALLOCA (path, quoted_path);
825 decode_string (path);
827 last_slash = strrchr (path, '/');
831 *file = xstrdup (path);
835 *dir = strdupdelim (path, last_slash);
836 *file = xstrdup (last_slash + 1);
840 /* Note: URL's "full path" is the path with the query string and
841 params appended. The "fragment" (#foo) is intentionally ignored,
842 but that might be changed. For example, if the original URL was
843 "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
844 the full path will be "/foo/bar/baz;bullshit?querystring". */
846 /* Return the length of the full path, without the terminating
/* Compute the length of URL's full path ("/path[;params][?query]"),
   excluding the terminating '\0'; each present component costs its
   own length plus one separator character. */
850 full_path_length (const struct url *url)
854 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
865 /* Write out the full path. */
/* Write URL's full path into WHERE, which the caller must have sized
   with full_path_length(); does not write a terminating '\0'. */
868 full_path_write (const struct url *url, char *where)
870 #define FROB(el, chr) do { \
871 char *f_el = url->el; \
873 int l = strlen (f_el); \
875 memcpy (where, f_el, l); \
887 /* Public function for getting the "full path". E.g. if u->path is
888 "foo/bar" and u->query is "param=value", full_path will be
889 "/foo/bar?param=value". */
/* Return a freshly allocated string holding URL's full path; thin
   wrapper tying full_path_length() and full_path_write() together. */
892 url_full_path (const struct url *url)
894 int length = full_path_length (url);
895 char *full_path = (char *)xmalloc(length + 1);
897 full_path_write (url, full_path);
898 full_path[length] = '\0';
903 /* Sync u->path and u->url with u->dir and u->file. */
/* Rebuild url->path (and then url->url) from url->dir and url->file
   after one of them has been mutated via url_set_dir/url_set_file. */
906 sync_path (struct url *url)
/* Empty dir: the path is just the file name. */
914 newpath = xstrdup (url->file);
919 int dirlen = strlen (url->dir);
920 int filelen = strlen (url->file);
/* dir + '/' + file + '\0', assembled by hand to avoid sprintf. */
922 newpath = xmalloc (dirlen + 1 + filelen + 1);
923 memcpy (newpath, url->dir, dirlen);
924 newpath[dirlen] = '/';
925 memcpy (newpath + dirlen + 1, url->file, filelen);
926 newpath[dirlen + 1 + filelen] = '\0';
932 /* Synchronize u->url. */
934 url->url = url_string (url, 0);
937 /* Mutators. Code in ftp.c insists on changing u->dir and u->file.
938 This way we can sync u->path and u->url when they get changed. */
/* Replace url->dir with a copy of NEWDIR and re-sync path/url.
   NOTE(review): the xfree of the old value and the sync_path call are
   elided from this excerpt. */
941 url_set_dir (struct url *url, const char *newdir)
944 url->dir = xstrdup (newdir);
/* Replace url->file with a copy of NEWFILE and re-sync path/url;
   counterpart of url_set_dir for the file component. */
949 url_set_file (struct url *url, const char *newfile)
952 url->file = xstrdup (newfile);
/* Release a struct url allocated by url_parse, including all owned
   component strings.  FREE_MAYBE tolerates NULL members (params,
   query, fragment, user and passwd are optional). */
957 url_free (struct url *url)
963 FREE_MAYBE (url->params);
964 FREE_MAYBE (url->query);
965 FREE_MAYBE (url->fragment);
966 FREE_MAYBE (url->user);
967 FREE_MAYBE (url->passwd);
/* Read FILE (one URL per line), parse each line and return the head
   of a linked list of struct urlpos entries.  Blank lines and
   surrounding whitespace are skipped; invalid URLs are logged and
   ignored rather than aborting the whole read. */
976 get_urls_file (const char *file)
978 struct file_memory *fm;
979 struct urlpos *head, *tail;
980 const char *text, *text_end;
983 fm = read_file (file);
986 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
989 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
992 text_end = fm->content + fm->length;
993 while (text < text_end)
995 const char *line_beg = text;
996 const char *line_end = memchr (text, '\n', text_end - text);
/* Trim leading and trailing whitespace from the line. */
1002 while (line_beg < line_end
1003 && ISSPACE (*line_beg))
1005 while (line_end > line_beg + 1
1006 && ISSPACE (*(line_end - 1)))
1008 if (line_end > line_beg)
1010 /* URL is in the [line_beg, line_end) region. */
1014 struct urlpos *entry;
1017 /* We must copy the URL to a zero-terminated string, and we
1018 can't use alloca because we're in a loop. *sigh*. */
1019 url_text = strdupdelim (line_beg, line_end);
1023 /* Merge opt.base_href with URL. */
1024 char *merged = uri_merge (opt.base_href, url_text);
1029 url = url_parse (url_text, &up_error_code);
1032 logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1033 file, url_text, url_error (up_error_code));
/* Append a zero-initialized entry to the list.
   NOTE(review): the head/tail linking lines are elided here. */
1039 entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1040 memset (entry, 0, sizeof (*entry));
1051 read_file_free (fm);
1055 /* Free the linked list of urlpos. */
/* Walk the urlpos list from L, freeing each node and its owned
   strings; NEXT is saved before the node is released. */
1057 free_urlpos (struct urlpos *l)
1061 struct urlpos *next = l->next;
1064 FREE_MAYBE (l->local_name);
1070 /* Rotate FNAME opt.backups times */
/* Shift existing backups of FNAME: fname.(i-1) -> fname.i for i from
   opt.backups down to 2, then fname -> fname.1, making room for a new
   download of FNAME. */
1072 rotate_backups(const char *fname)
1074 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1075 char *from = (char *)alloca (maxlen);
1076 char *to = (char *)alloca (maxlen);
/* Only rotate regular files; skip directories and specials. */
1080 if (stat (fname, &sb) == 0)
1081 if (S_ISREG (sb.st_mode) == 0)
1084 for (i = opt.backups; i > 1; i--)
1086 sprintf (from, "%s.%d", fname, i - 1);
1087 sprintf (to, "%s.%d", fname, i);
1088 /* #### This will fail on machines without the rename() system
1093 sprintf (to, "%s.%d", fname, 1);
1097 /* Create all the necessary directories for PATH (a file). Calls
1098 mkdirhier() internally. */
/* Create every directory component of PATH (a file name).  Returns
   the make_directory result, or skips creation entirely when the
   directory part already exists as a directory.
   NOTE(review): several early-return lines are elided from this
   excerpt. */
1100 mkalldirs (const char *path)
/* Find the last '/' to isolate the directory part. */
1107 p = path + strlen (path);
1108 for (; *p != '/' && p != path; p--);
1109 /* Don't create if it's just a file. */
1110 if ((p == path) && (*p != '/'))
1112 t = strdupdelim (path, p);
1113 /* Check whether the directory exists. */
1114 if ((stat (t, &st) == 0))
1116 if (S_ISDIR (st.st_mode))
1123 /* If the dir exists as a file name, remove it first. This
1124 is *only* for Wget to work with buggy old CERN http
1125 servers. Here is the scenario: When Wget tries to
1126 retrieve a directory without a slash, e.g.
1127 http://foo/bar (bar being a directory), CERN server will
1128 not redirect it too http://foo/bar/ -- it will generate a
1129 directory listing containing links to bar/file1,
1130 bar/file2, etc. Wget will lose because it saves this
1131 HTML listing to a file `bar', so it cannot create the
1132 directory. To work around this, if the file of the same
1133 name exists, we just remove it and create the directory
1135 DEBUGP (("Removing %s because of directory danger!\n", t));
1139 res = make_directory (t);
1141 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
/* Return the number of '/' characters in the string S.  Used by
   mkstruct to decide how many leading directory components --cut-dirs
   may remove. */
static int
count_slashes (const char *s)
{
  int count = 0;

  for (; *s; s++)
    if (*s == '/')
      ++count;
  return count;
}
1156 /* Return the path name of the URL-equivalent file name, with a
1157 remote-like structure of directories. */
/* Build the local file name mirroring U's remote directory structure,
   honoring opt.cut_dirs, opt.add_hostdir and opt.dir_prefix; appends
   the query string when present.  Returns malloc-ed storage.
   NOTE(review): numerous brace and assignment lines are elided from
   this excerpt. */
1159 mkstruct (const struct url *u)
1161 char *dir, *dir_preencoding;
1162 char *file, *res, *dirpref;
1163 char *query = u->query && *u->query ? u->query : NULL;
/* opt.cut_dirs: skip the requested number of leading dir components. */
1168 char *ptr = u->dir + (*u->dir == '/');
1169 int slash_count = 1 + count_slashes (ptr);
1170 int cut = MINVAL (opt.cut_dirs, slash_count);
1171 for (; cut && *ptr; ptr++)
1174 STRDUP_ALLOCA (dir, ptr);
1177 dir = u->dir + (*u->dir == '/');
1179 /* Check for the true name (or at least a consistent name for saving
1180 to directory) of HOST, reusing the hlist if possible. */
1181 if (opt.add_hostdir)
1183 /* Add dir_prefix and hostname (if required) to the beginning of
1185 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1187 + 1 + numdigit (u->port)
1189 if (!DOTP (opt.dir_prefix))
1190 sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
1192 strcpy (dirpref, u->host);
/* Non-default port becomes part of the host directory name. */
1194 if (u->port != scheme_default_port (u->scheme))
1196 int len = strlen (dirpref);
1198 long_to_string (dirpref + len + 1, u->port);
1201 else /* not add_hostdir */
1203 if (!DOTP (opt.dir_prefix))
1204 dirpref = opt.dir_prefix;
1209 /* If there is a prefix, prepend it. */
1212 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1213 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
/* Re-quote the assembled directory; reencode_string may return the
   same pointer, which the final comparison below relies on. */
1217 dir_preencoding = dir;
1218 dir = reencode_string (dir_preencoding);
1221 if (l && dir[l - 1] == '/')
/* Directory URLs with no file component default to index.html. */
1225 file = "index.html";
1229 /* Finally, construct the full name. */
1230 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
1231 + (query ? (1 + strlen (query)) : 0)
1233 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1237 strcat (res, query);
1239 if (dir != dir_preencoding)
1244 /* Compose a file name out of BASE, an unescaped file name, and QUERY,
1245 an escaped query string. The trick is to make sure that unsafe
1246 characters in BASE are escaped, and that slashes in QUERY are also
/* Build a file name from BASE (unescaped) and QUERY (escaped) in a
   fixed-size local buffer, %XX-quoting unsafe characters from BASE
   and escaping '/' in QUERY; overly long input is truncated.  Returns
   malloc-ed storage.
   NOTE(review): `to - result < sizeof (result)` mixes a signed
   ptrdiff_t with unsigned size_t, and the %XX branch writes up to 3
   bytes while the guard only ensures 1 byte of room -- verify the
   elided lines bound this, or the buffer can overflow by 2 bytes. */
1250 compose_file_name (char *base, char *query)
1256 /* Copy BASE to RESULT and encode all unsafe characters. */
1258 while (*from && to - result < sizeof (result))
1260 if (UNSAFE_CHAR (*from))
1262 unsigned char c = *from++;
1264 *to++ = XDIGIT_TO_XCHAR (c >> 4);
1265 *to++ = XDIGIT_TO_XCHAR (c & 0xf);
1271 if (query && to - result < sizeof (result))
1275 /* Copy QUERY to RESULT and encode all '/' characters. */
1277 while (*from && to - result < sizeof (result))
1291 if (to - result < sizeof (result))
1294 /* Truncate input which is too long, presumably due to a huge
1296 result[sizeof (result) - 1] = '\0';
1298 return xstrdup (result);
1301 /* Create a unique filename, corresponding to a given URL. Calls
1302 mkstruct if necessary. Does *not* actually create any directories. */
/* Choose the local file name for U: either the mirrored directory
   structure (mkstruct) or a flat name from the URL's file and query
   parts, optionally under opt.dir_prefix, made unique unless
   clobbering rules say otherwise.  Returns malloc-ed storage. */
1304 url_filename (const struct url *u)
1307 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
1311 file = mkstruct (u);
/* Flat mode: file name from the URL's last component plus query. */
1316 char *base = *u->file ? u->file : "index.html";
1317 char *query = u->query && *u->query ? u->query : NULL;
1318 file = compose_file_name (base, query);
1323 /* Check whether the prefix directory is something other than "."
1324 before prepending it. */
1325 if (!DOTP (opt.dir_prefix))
1327 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1328 + 1 + strlen (file) + 1)
1329 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1334 /* DOS-ish file systems don't like `%' signs in them; we change it
1339 for (p = file; *p; p++)
1343 #endif /* WINDOWS */
1345 /* Check the cases in which the unique extensions are not used:
1346 1) Clobbering is turned off (-nc).
1347 2) Retrieval with regetting.
1348 3) Timestamping is used.
1349 4) Hierarchy is built.
1351 The exception is the case when file does exist and is a
1352 directory (actually support for bad httpd-s). */
1353 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1354 && !(file_exists_p (file) && !file_non_directory_p (file)))
1357 /* Find a unique name. */
1358 name = unique_name (file);
1363 /* Like strlen(), but allow the URL to be ended with '?'. */
/* Length of URL's path component only: characters up to (not
   including) the first '?', ';' or '#', or the whole string when none
   is present.  NOTE(review): the `return q - url;` line is elided
   from this excerpt. */
1365 urlpath_length (const char *url)
1367 const char *q = strpbrk_or_eos (url, "?;#");
/* Find the last occurrence of character C in the half-open range
   [B, E), or NULL if none is present.  This is almost completely
   equivalent to { *e = '\0'; return strrchr (b, c); }, except that it
   does not modify the string and never dereferences E itself, so E
   may point one past the last valid byte. */
static const char *
find_last_char (const char *b, const char *e, char c)
{
  while (e > b)
    {
      --e;
      if (*e == c)
        return e;
    }
  return NULL;
}
1384 /* Resolve the result of "linking" a base URI (BASE) to a
1385 link-specified URI (LINK).
1387 Either of the URIs may be absolute or relative, complete with the
1388 host name, or path only. This tries to behave "reasonably" in all
1389 foreseeable cases. It employs little specific knowledge about
1390 schemes or URL-specific stuff -- it just works on strings.
1392 The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1393 See uri_merge for a gentler interface to this functionality.
1395 Perhaps this function should handle `./' and `../' so that the evil
1396 path_simplify can go. */
/* Resolve LINK (LINKLENGTH bytes, possibly not NUL-terminated)
   against BASE per the rules documented above; NO_SCHEME tells us
   LINK carries no scheme of its own.  Returns a malloc-ed merged URI.
   NOTE(review): brace lines and some assignments are elided from this
   excerpt; comments annotate the visible skeleton only. */
1398 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
/* END delimits BASE's path, excluding any query/params/fragment. */
1404 const char *end = base + urlpath_length (base);
1408 /* Empty LINK points back to BASE, query string and all. */
1409 constr = xstrdup (base);
1411 else if (*link == '?')
1413 /* LINK points to the same location, but changes the query
1414 string. Examples: */
1415 /* uri_merge("path", "?new") -> "path?new" */
1416 /* uri_merge("path?foo", "?new") -> "path?new" */
1417 /* uri_merge("path?foo#bar", "?new") -> "path?new" */
1418 /* uri_merge("path#foo", "?new") -> "path?new" */
1419 int baselength = end - base;
1420 constr = xmalloc (baselength + linklength + 1);
1421 memcpy (constr, base, baselength);
1422 memcpy (constr + baselength, link, linklength);
1423 constr[baselength + linklength] = '\0';
1425 else if (*link == '#')
1427 /* uri_merge("path", "#new") -> "path#new" */
1428 /* uri_merge("path#foo", "#new") -> "path#new" */
1429 /* uri_merge("path?foo", "#new") -> "path?foo#new" */
1430 /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
/* Keep everything before the existing fragment, if any. */
1432 const char *end1 = strchr (base, '#');
1434 end1 = base + strlen (base);
1435 baselength = end1 - base;
1436 constr = xmalloc (baselength + linklength + 1);
1437 memcpy (constr, base, baselength);
1438 memcpy (constr + baselength, link, linklength);
1439 constr[baselength + linklength] = '\0';
1441 else if (*link == '/')
1443 /* LINK is an absolute path: we need to replace everything
1444 after (and including) the FIRST slash with LINK.
1446 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1447 "/qux/xyzzy", our result should be
1448 "http://host/qux/xyzzy". */
1451 const char *start_insert = NULL; /* for gcc to shut up. */
1452 const char *pos = base;
1453 int seen_slash_slash = 0;
1454 /* We're looking for the first slash, but want to ignore
1457 slash = memchr (pos, '/', end - pos);
1458 if (slash && !seen_slash_slash)
1459 if (*(slash + 1) == '/')
1462 seen_slash_slash = 1;
1466 /* At this point, SLASH is the location of the first / after
1467 "//", or the first slash altogether. START_INSERT is the
1468 pointer to the location where LINK will be inserted. When
1469 examining the last two examples, keep in mind that LINK
1472 if (!slash && !seen_slash_slash)
1473 /* example: "foo" */
1475 start_insert = base;
1476 else if (!slash && seen_slash_slash)
1477 /* example: "http://foo" */
1480 else if (slash && !seen_slash_slash)
1481 /* example: "foo/bar" */
1483 start_insert = base;
1484 else if (slash && seen_slash_slash)
1485 /* example: "http://something/" */
1487 start_insert = slash;
1489 span = start_insert - base;
1490 constr = (char *)xmalloc (span + linklength + 1);
1492 memcpy (constr, base, span);
1494 memcpy (constr + span, link, linklength);
1495 constr[span + linklength] = '\0';
1499 /* LINK is a relative URL: we need to replace everything
1500 after last slash (possibly empty) with LINK.
1502 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1503 our result should be "whatever/foo/qux/xyzzy". */
1504 int need_explicit_slash = 0;
1506 const char *start_insert;
1507 const char *last_slash = find_last_char (base, end, '/');
1510 /* No slash found at all. Append LINK to what we have,
1511 but we'll need a slash as a separator.
1513 Example: if base == "foo" and link == "qux/xyzzy", then
1514 we cannot just append link to base, because we'd get
1515 "fooqux/xyzzy", whereas what we want is
1518 To make sure the / gets inserted, we set
1519 need_explicit_slash to 1. We also set start_insert
1520 to end + 1, so that the length calculations work out
1521 correctly for one more (slash) character. Accessing
1522 that character is fine, since it will be the
1523 delimiter, '\0' or '?'. */
1524 /* example: "foo?..." */
1525 /* ^ ('?' gets changed to '/') */
1526 start_insert = end + 1;
1527 need_explicit_slash = 1;
1529 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1531 /* example: http://host" */
1533 start_insert = end + 1;
1534 need_explicit_slash = 1;
1538 /* example: "whatever/foo/bar" */
1540 start_insert = last_slash + 1;
1543 span = start_insert - base;
1544 constr = (char *)xmalloc (span + linklength + 1);
1546 memcpy (constr, base, span);
/* Overwrite the delimiter position with the separating '/'. */
1547 if (need_explicit_slash)
1548 constr[span - 1] = '/';
1550 memcpy (constr + span, link, linklength);
1551 constr[span + linklength] = '\0';
/* LINK has its own scheme: use it verbatim, ignoring BASE. */
1554 else /* !no_scheme */
1556 constr = strdupdelim (link, link + linklength);
1561 /* Merge BASE with LINK and return the resulting URI. This is an
1562 interface to uri_merge_1 that assumes that LINK is a
1563 zero-terminated string. */
/* Resolve the NUL-terminated LINK against BASE; convenience wrapper
   around uri_merge_1.  Returns malloc-ed storage. */
1565 uri_merge (const char *base, const char *link)
1567 return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
/* Copy string S to cursor P and advance P past it; used by
   url_string() below to assemble the result without repeated
   strcat scans.  NOTE(review): the `p += len;` continuation line is
   elided from this excerpt. */
1570 #define APPEND(p, s) do { \
1571 int len = strlen (s); \
1572 memcpy (p, s, len); \
1576 /* Use this instead of password when the actual password is supposed
1577 to be hidden. We intentionally use a generic string without giving
1578 away the number of characters in the password, like previous
1580 #define HIDDEN_PASSWORD "*password*"
/* Recreate the URL string from the data in URL.

   If HIDE is non-zero (as it is when we're calling this on a URL we
   plan to print, but not when calling it to canonicalize a URL for
   use within the program), password will be hidden.  Unsafe
   characters in the URL will be quoted.  */
url_string (const struct url *url, int hide_password)
  /* %-quoted copies of user name and password; NULL until needed.  */
  char *quoted_user = NULL, *quoted_passwd = NULL;

  /* Scheme data from the supported_schemes table; the port is only
     printed when it differs from the scheme's default.  */
  int scheme_port  = supported_schemes[url->scheme].default_port;
  char *scheme_str = supported_schemes[url->scheme].leading_string;
  int fplen = full_path_length (url);

  assert (scheme_str != NULL);

  /* Make sure the user name and password are quoted. */
      quoted_user = encode_string_maybe (url->user);
            /* When hiding, substitute a fixed placeholder so the
               password length is not revealed.  */
            quoted_passwd = HIDDEN_PASSWORD;
            quoted_passwd = encode_string_maybe (url->passwd);

  /* Compute the exact byte count of the result so it can be
     allocated in one shot and verified by the assert below.  */
  size = (strlen (scheme_str)
          + strlen (url->host)
  if (url->port != scheme_port)
    size += 1 + numdigit (url->port);   /* ':' plus the port digits */
      size += 1 + strlen (quoted_user); /* separator plus user */
        size += 1 + strlen (quoted_passwd);

  p = result = xmalloc (size);

  /* Assemble: scheme, credentials, host, optional port, full path.  */
  APPEND (p, scheme_str);
      APPEND (p, quoted_user);
          APPEND (p, quoted_passwd);

  APPEND (p, url->host);
  if (url->port != scheme_port)
      long_to_string (p, url->port);

  full_path_write (url, p);

  /* Everything written must match the size computed above.  */
  assert (p - result == size);

  /* Free the quoted strings unless they alias the originals (or the
     static HIDDEN_PASSWORD literal, excluded via !hide_password).  */
  if (quoted_user && quoted_user != url->user)
    xfree (quoted_user);
  if (quoted_passwd && !hide_password
      && quoted_passwd != url->passwd)
    xfree (quoted_passwd);
/* Returns proxy host address, in accordance with SCHEME.  */
getproxy (enum url_scheme scheme)
  char *rewritten_url;
  /* Static buffer for the rewritten proxy URL; this makes the
     function non-reentrant and limits the result to 1023 chars.  */
  static char rewritten_storage[1024];

      /* Explicit option settings win over the environment.  */
      proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
      proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
      proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
    case SCHEME_INVALID:

  /* An unset or empty setting means "no proxy".  */
  if (!proxy || !*proxy)

  /* Handle shorthands. */
  rewritten_url = rewrite_shorthand_url (proxy);
      /* strncpy does not guarantee NUL-termination, so terminate
         explicitly; longer rewritten URLs are silently truncated.  */
      strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
      rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
      proxy = rewritten_storage;
/* Should a host be accessed through proxy, concerning no_proxy?
   Returns non-zero when HOST matches none of the NO_PROXY suffixes,
   i.e. when the proxy should be used.  */
no_proxy_match (const char *host, const char **no_proxy)
  /* sufmatch presumably does suffix matching of HOST against the
     NO_PROXY list -- TODO confirm its semantics at its definition.  */
  return !sufmatch (no_proxy, host);
1714 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1715 static const char *replace_attr PARAMS ((const char *, int, FILE *,
1717 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
1718 const char *, int));
1719 static char *local_quote_string PARAMS ((const char *));
/* Change the links in one HTML file.  LINKS is a list of links in the
   document, along with their positions and the desired direction of
   the conversion.  */
convert_links (const char *file, struct urlpos *links)
  struct file_memory *fm;       /* FILE read into memory */

  downloaded_file_t downloaded_file_return;

  struct urlpos *link;
  int to_url_count = 0, to_file_count = 0;  /* conversion counters */

  logprintf (LOG_VERBOSE, _("Converting %s... "), file);

  /* First we do a "dry run": go through the list L and see whether
     any URL needs to be converted in the first place.  If not, just
     leave the file alone.  */
    struct urlpos *dry = links;
    for (dry = links; dry; dry = dry->next)
      if (dry->convert != CO_NOCONVERT)

        logputs (LOG_VERBOSE, _("nothing to do.\n"));

  /* Slurp the file into memory so unchanged spans can be echoed and
     links rewritten in a single pass.  */
  fm = read_file (file);
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));

  /* Optionally save a pristine *.orig copy before rewriting.  */
  downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
  if (opt.backup_converted && downloaded_file_return)
    write_backup_file (file, downloaded_file_return);

  /* Before opening the file for writing, unlink the file.  This is
     important if the data in FM is mmaped.  In such case, nulling the
     file, which is what fopen() below does, would make us read all
     zeroes from the mmaped region.  */
  if (unlink (file) < 0 && errno != ENOENT)
      logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
                 file, strerror (errno));
      read_file_free (fm);

  /* Now open the file for writing. */
  fp = fopen (file, "wb");
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));
      read_file_free (fm);

  /* Here we loop through all the URLs in file, replacing those of
     them that are downloaded with relative references.  */
  for (link = links; link; link = link->next)
      char *url_start = fm->content + link->pos;

      /* A position beyond the buffer indicates corrupt link data.  */
      if (link->pos >= fm->length)
          DEBUGP (("Something strange is going on.  Please investigate."));

      /* If the URL is not to be converted, skip it. */
      if (link->convert == CO_NOCONVERT)
          DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));

      /* Echo the file contents, up to the offending URL's opening
         quote, to the outfile. */
      fwrite (p, 1, url_start - p, fp);

      switch (link->convert)
        case CO_CONVERT_TO_RELATIVE:
          /* Convert absolute URL to relative. */
            char *newname = construct_relative (file, link->local_name);
            char *quoted_newname = local_quote_string (newname);

            /* <meta http-equiv=refresh> needs its timeout preserved,
               hence the special-case helper.  */
            if (!link->link_refresh_p)
              p = replace_attr (p, link->size, fp, quoted_newname);
              p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
                                             link->refresh_timeout);

            DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
                     link->url->url, newname, link->pos, file));

            xfree (quoted_newname);

        case CO_CONVERT_TO_COMPLETE:
          /* Convert the link to absolute URL. */
            char *newlink = link->url->url;
            char *quoted_newlink = html_quote_string (newlink);

            if (!link->link_refresh_p)
              p = replace_attr (p, link->size, fp, quoted_newlink);
              p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
                                             link->refresh_timeout);

            DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
                     newlink, link->pos, file));
            xfree (quoted_newlink);

        case CO_NULLIFY_BASE:
          /* Change the base href to "". */
          p = replace_attr (p, link->size, fp, "");

  /* Output the rest of the file. */
  if (p - fm->content < fm->length)
    fwrite (p, 1, fm->length - (p - fm->content), fp);

  read_file_free (fm);

  /* Summary line: counts are presumably updated in the switch arms
     not visible in this chunk -- TODO confirm.  */
  logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
/* Construct and return a malloced copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.

   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   "images/news.gif".

   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */
static char *
construct_relative (const char *s1, const char *s2)
{
  int pos;                      /* scan position in both names */
  int common;                   /* length of the shared dir prefix */
  int updirs;                   /* number of "../" needed */
  char *result;

  if (*s2 == '/')
    return xstrdup (s2);        /* S2 is absolute: use it verbatim */

  /* S1 should *not* be absolute, if S2 wasn't. */
  assert (*s1 != '/');

  /* Skip the directories common to both strings. */
  pos = common = 0;
  while (1)
    {
      while (s1[pos] && s2[pos]
             && (s1[pos] == s2[pos])
             && (s1[pos] != '/')
             && (s2[pos] != '/'))
        ++pos;
      if (s1[pos] == '/' && s2[pos] == '/')
        common = ++pos;
      else
        break;
    }

  /* Each remaining '/' in S1 is a directory we must climb out of.  */
  for (updirs = 0; s1[pos]; pos++)
    if (s1[pos] == '/')
      ++updirs;

  /* Now, construct the file as of:
     - ../ repeated updirs times
     - all the non-mutual directories of S2.  */
  result = (char *)xmalloc (3 * updirs + strlen (s2 + common) + 1);
  for (pos = 0; pos < updirs; pos++)
    memcpy (result + 3 * pos, "../", 3);
  strcpy (result + 3 * pos, s2 + common);
  return result;
}
write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
  /* Rather than just writing over the original .html file with the
     converted version, save the former to *.orig.  Note we only do
     this for files we've _successfully_ downloaded, so we don't
     clobber .orig files sitting around from previous invocations. */

  /* Construct the backup filename as the original name plus ".orig". */
  size_t filename_len = strlen(file);
  char* filename_plus_orig_suffix;
  boolean already_wrote_backup_file = FALSE;
  slist* converted_file_ptr;
  /* Files already backed up during this run; lives for the whole
     process (see the long note below).  */
  static slist* converted_files = NULL;

  if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
      /* Just write "orig" over "html".  We need to do it this way
         because when we're checking to see if we've downloaded the
         file before (to see if we can skip downloading it), we don't
         know if it's a text/html file.  Therefore we don't know yet
         at that stage that -E is going to cause us to tack on
         ".html", so we need to compare vs. the original URL plus
         ".orig", not the original URL plus ".html.orig". */
      filename_plus_orig_suffix = alloca (filename_len + 1);
      strcpy(filename_plus_orig_suffix, file);
      /* Overwrite the trailing "html" (4 chars) with "orig".  */
      strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
  else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
      /* Append ".orig" to the name. */
      filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
      strcpy(filename_plus_orig_suffix, file);
      strcpy(filename_plus_orig_suffix + filename_len, ".orig");

  /* We can get called twice on the same URL thanks to the
     convert_all_links() call in main().  If we write the .orig file
     each time in such a case, it'll end up containing the first-pass
     conversion, not the original file.  So, see if we've already been
     called on this file. */
  converted_file_ptr = converted_files;
  while (converted_file_ptr != NULL)
    if (strcmp(converted_file_ptr->string, file) == 0)
        already_wrote_backup_file = TRUE;
      converted_file_ptr = converted_file_ptr->next;

  if (!already_wrote_backup_file)
      /* Rename <file> to <file>.orig before former gets written over. */
      if (rename(file, filename_plus_orig_suffix) != 0)
        logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
                   file, filename_plus_orig_suffix, strerror (errno));

      /* Remember that we've already written a .orig backup for this file.
         Note that we never free this memory since we need it till the
         convert_all_links() call, which is one of the last things the
         program does before terminating.  BTW, I'm not sure if it would be
         safe to just set 'converted_file_ptr->string' to 'file' below,
         rather than making a copy of the string...  Another note is that I
         thought I could just add a field to the urlpos structure saying
         that we'd written a .orig file for this URL, but that didn't work,
         so I had to make this separate list.
         -- Dan Harkless <wget@harkless.org>

         This [adding a field to the urlpos structure] didn't work
         because convert_file() is called from convert_all_links at
         the end of the retrieval with a freshly built new urlpos
         list.
         -- Hrvoje Niksic <hniksic@arsdigita.com>  */
      converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
      converted_file_ptr->string = xstrdup(file);  /* die on out-of-mem. */
      converted_file_ptr->next = converted_files;
      converted_files = converted_file_ptr;
2004 static int find_fragment PARAMS ((const char *, int, const char **,
/* Replace an attribute's original text with NEW_TEXT.  The original
   attribute value occupies SIZE bytes starting at P in the source
   buffer; NEW_TEXT is emitted to FP in its place, reusing the
   original quoting character and preserving any #fragment found in
   the old value.  */
replace_attr (const char *p, int size, FILE *fp, const char *new_text)
  char quote_char = '\"';       /* use "..." for quoting, unless the
                                   original value is quoted, in which
                                   case reuse its quoting char. */
  const char *frag_beg, *frag_end;

  /* Structure of our string is:
       "...old-contents..."
       <---    size    --->  (with quotes)
     or:
       ...old-contents...
       <---  size  -->       (no quotes) */

  if (*p == '\"' || *p == '\'')
      size -= 2;                /* disregard opening and closing quote */
  putc (quote_char, fp);
  fputs (new_text, fp);

  /* Look for fragment identifier, if any. */
  if (find_fragment (p, size, &frag_beg, &frag_end))
    fwrite (frag_beg, 1, frag_end - frag_beg, fp);
  putc (quote_char, fp);
/* The same as replace_attr, but used when replacing
   <meta http-equiv=refresh content="..."> values, because the
   refresh timeout must be prepended as "TIMEOUT; URL=" before
   NEW_TEXT.  */
static const char *
replace_attr_refresh_hack (const char *p, int size, FILE *fp,
                           const char *new_text, int timeout)
{
  /* Build "TIMEOUT; URL=NEW_TEXT" in a stack buffer sized for the
     timeout digits, the literal "; URL=", NEW_TEXT and the NUL.  */
  char *with_timeout = (char *)alloca (numdigit (timeout)
                                       + 6 /* "; URL=" */
                                       + strlen (new_text)
                                       + 1);
  sprintf (with_timeout, "%d; URL=%s", timeout, new_text);

  return replace_attr (p, size, fp, with_timeout);
}
2064 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
2065 preceded by '&'. If the character is not found, return zero. If
2066 the character is found, return 1 and set BP and EP to point to the
2067 beginning and end of the region.
   This is used for finding the fragment identifiers in URLs.  */
static int
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *end = beg + size;
  int saw_amp = 0;              /* was the previous char '&'? */

  for (; beg < end; beg++)
    {
      switch (*beg)
        {
        case '&':
          saw_amp = 1;
          break;
        case '#':
          /* A '#' right after '&' is an SGML numeric entity such as
             "&#38;", not a fragment marker -- skip it.  */
          if (!saw_amp)
            {
              *bp = beg;
              *ep = end;
              return 1;
            }
          /* fall through */
        default:
          saw_amp = 0;
        }
    }
  return 0;
}
/* Quote FILE for use as local reference to an HTML file.

   We quote ? as %3F to avoid passing part of the file name as the
   parameter when browsing the converted file through HTTP.  However,
   it is safe to do this only when `--html-extension' is turned on.
   This is because converting "index.html?foo=bar" to
   "index.html%3Ffoo=bar" would break local browsing, as the latter
   isn't even recognized as an HTML file!  However, converting
   "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
   safe for both local and HTTP-served browsing.  */
local_quote_string (const char *file)
  const char *file_sans_qmark;
  int qm;

  /* Without -E it is not safe to rewrite '?', so fall back to plain
     HTML quoting.  */
  if (!opt.html_extension)
    return html_quote_string (file);

  qm = count_char (file, '?');
      const char *from = file;

      /* qm * 2 because we replace each question mark with "%3F",
         i.e. replace one char with three, hence two more. */
      int fsqlen = strlen (file) + qm * 2;

      to = newname = (char *)alloca (fsqlen + 1);
      for (; *from; from++)

      /* The copy must have produced exactly fsqlen characters.  */
      assert (to - newname == fsqlen);

      file_sans_qmark = newname;
      file_sans_qmark = file;

  return html_quote_string (file_sans_qmark);
2152 /* We're storing "modes" of type downloaded_file_t in the hash table.
2153 However, our hash tables only accept pointers for keys and values.
2154 So when we need a pointer, we use the address of a
2155 downloaded_file_t variable of static storage. */
2157 static downloaded_file_t *
2158 downloaded_mode_to_ptr (downloaded_file_t mode)
2160 static downloaded_file_t
2161 v1 = FILE_NOT_ALREADY_DOWNLOADED,
2162 v2 = FILE_DOWNLOADED_NORMALLY,
2163 v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2164 v4 = CHECK_FOR_FILE;
2168 case FILE_NOT_ALREADY_DOWNLOADED:
2170 case FILE_DOWNLOADED_NORMALLY:
2172 case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2174 case CHECK_FOR_FILE:
2180 /* This should really be merged with dl_file_url_map and
2181 downloaded_html_files in recur.c. This was originally a list, but
   I changed it to a hash table because it was actually taking a lot of
2183 time to find things in it. */
2185 static struct hash_table *downloaded_files_hash;
2187 /* Remembers which files have been downloaded. In the standard case, should be
2188 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2189 download successfully (i.e. not for ones we have failures on or that we skip
2192 When we've downloaded a file and tacked on a ".html" extension due to -E,
2193 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2194 FILE_DOWNLOADED_NORMALLY.
2196 If you just want to check if a file has been previously added without adding
2197 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
2198 with local filenames, not remote URLs. */
2200 downloaded_file (downloaded_file_t mode, const char *file)
2202 downloaded_file_t *ptr;
2204 if (mode == CHECK_FOR_FILE)
2206 if (!downloaded_files_hash)
2207 return FILE_NOT_ALREADY_DOWNLOADED;
2208 ptr = hash_table_get (downloaded_files_hash, file);
2210 return FILE_NOT_ALREADY_DOWNLOADED;
2214 if (!downloaded_files_hash)
2215 downloaded_files_hash = make_string_hash_table (0);
2217 ptr = hash_table_get (downloaded_files_hash, file);
2221 ptr = downloaded_mode_to_ptr (mode);
2222 hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2224 return FILE_NOT_ALREADY_DOWNLOADED;
/* Callback passed to hash_table_map by downloaded_files_free below,
   used to dispose of downloaded_files_hash's xstrdup'ed filename
   keys.  (Body not visible in this chunk -- presumably frees KEY and
   ignores VALUE/IGNORED; TODO confirm.)  */
df_free_mapper (void *key, void *value, void *ignored)
2235 downloaded_files_free (void)
2237 if (downloaded_files_hash)
2239 hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2240 hash_table_destroy (downloaded_files_hash);
2241 downloaded_files_hash = NULL;