2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
29 #include <sys/types.h>
/* True iff the string X is exactly "." (a current-directory path
   component). */
#define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
/* True iff the string X is exactly ".." (a parent-directory path
   component). */
#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
/* Forward declaration.  PARAMS is presumably a K&R-compatibility
   prototype macro defined in a header -- TODO confirm. */
51 static int urlpath_length PARAMS ((const char *));
59 /* Supported schemes: */
/* Table indexed by enum url_scheme; entry order must match that enum
   (see url_scheme() below, which returns the index).  Each entry pairs
   a leading string with the scheme's default port.  NOTE(review): the
   loop in url_scheme() stops on a NULL leading_string, so a sentinel
   entry presumably terminates this array (not visible in this chunk). */
60 static struct scheme_data supported_schemes[] =
62 { "http://", DEFAULT_HTTP_PORT },
64 { "https://", DEFAULT_HTTPS_PORT },
66 { "ftp://", DEFAULT_FTP_PORT },
/* Forward declaration; definition not visible in this chunk. */
72 static char *construct_relative PARAMS ((const char *, const char *));
75 /* Support for encoding and decoding of URL strings. We determine
76 whether a character is unsafe through static table lookup. This
77 code assumes ASCII character set and 8-bit chars. */
/* Shorthands used only to keep the urlchr_table initializer below
   readable.  R and U are presumably bit flags (urlchr_reserved,
   urlchr_unsafe) declared elsewhere -- TODO confirm. */
84 #define R urlchr_reserved
85 #define U urlchr_unsafe
/* Test character C against MASK.  The (unsigned char) cast is
   essential: plain char may be signed, and a negative index into
   urlchr_table would be undefined behavior. */
88 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
90 /* rfc1738 reserved chars, preserved from encoding. */
92 #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
94 /* rfc1738 unsafe chars, plus some more. */
96 #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
/* Classification of all 256 byte values: 0 = safe (copied verbatim),
   R = reserved (never %-encoded), U = unsafe (always %-encoded),
   RU = both.  NOTE(review): `const static' is legal C but the
   conventional order is `static const'. */
98 const static unsigned char urlchr_table[256] =
/* Control characters 0x00-0x1F are all unsafe. */
100 U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
101 U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
102 U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
103 U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
104 U, 0, U, RU, 0, U, R, 0, /* SP ! " # $ % & ' */
105 0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
106 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
107 0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
108 RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
109 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
110 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
111 0, 0, 0, U, U, U, U, 0, /* X Y Z [ \ ] ^ _ */
112 U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
113 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
114 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
115 0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
/* All bytes >= 0x80 (non-ASCII) are treated as unsafe. */
117 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
118 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
119 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
120 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
122 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
123 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
124 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
125 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
128 /* Decodes the forms %xy in a URL to the character the hexadecimal
129 code of which is xy. xy are hexadecimal digits from
130 [0123456789ABCDEF] (case-insensitive). If x or y are not
131 hex-digits or `%' precedes `\0', the sequence is inserted
135 decode_string (char *s)
/* In-place decode: T (write position) trails H (read position), so
   the string can only shrink.  The loop driving H is not visible in
   this chunk. */
137 char *t = s; /* t - tortoise */
138 char *h = s; /* h - hare */
149 /* Do nothing if '%' is not followed by two hex digits. */
150 if (!*(h + 1) || !*(h + 2)
151 || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
/* Valid %xy: combine the two hex digits into one byte. */
153 *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
160 /* Like encode_string, but return S if there are no unsafe chars. */
163 encode_string_maybe (const char *s)
/* First pass: count unsafe characters to size the result exactly.
   Each unsafe char grows from 1 byte to 3 ("%XX"). */
170 for (p1 = s; *p1; p1++)
171 if (UNSAFE_CHAR (*p1))
172 addition += 2; /* Two more characters (hex digits) */
177 newlen = (p1 - s) + addition;
178 newstr = (char *)xmalloc (newlen + 1);
/* Second pass (loop header not visible here): copy, expanding each
   unsafe char into %XX. */
184 if (UNSAFE_CHAR (*p1))
186 unsigned char c = *p1++;
188 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
189 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
/* Sanity check: both passes must agree on the length. */
195 assert (p2 - newstr == newlen);
200 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
201 given string, returning a malloc-ed %XX encoded string. */
/* Unlike encode_string_maybe, the caller always receives heap
   storage it owns (a copy is presumably made when nothing needed
   encoding -- the tail of this function is not visible here). */
204 encode_string (const char *s)
206 char *encoded = encode_string_maybe (s);
213 /* Encode unsafe characters in PTR to %xx. If such encoding is done,
214 the old value of PTR is freed and PTR is made to point to the newly
215 allocated storage. */
/* PTR must be an lvalue holding heap storage owned by the caller.
   The free/reassign part of the macro is not visible in this chunk. */
217 #define ENCODE(ptr) do { \
218 char *e_new = encode_string_maybe (ptr); \
/* Verdict for a single input character during URL re-encoding:
   decode a %XX sequence, %XX-encode an unsafe character, or copy it
   through unchanged.  See decide_copy_method() and reencode_string(). */
enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
228 /* Decide whether to encode, decode, or pass through the char at P.
229 This used to be a macro, but it got a little too convoluted. */
230 static inline enum copy_method
231 decide_copy_method (const char *p)
/* The '%' branch (test of *p not visible here): a '%' followed by two
   hex digits is a candidate for decoding. */
235 if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
237 /* %xx sequence: decode it, unless it would decode to an
238 unsafe or a reserved char; in that case, leave it as
/* NOTE(review): preempt is declared plain char; urlchr_test casts to
   unsigned char before indexing, so this is still well-defined. */
240 char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
241 XCHAR_TO_XDIGIT (*(p + 2));
243 if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
244 return CM_PASSTHROUGH;
249 /* Garbled %.. sequence: encode `%'. */
/* Ordinary character: encode only if unsafe and not reserved. */
252 else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
255 return CM_PASSTHROUGH;
258 /* Translate a %-quoting (but possibly non-conformant) input string S
259 into a %-quoting (and conformant) output string. If no characters
260 are encoded or decoded, return the same string S; otherwise, return
261 a freshly allocated string with the new contents.
263 After a URL has been run through this function, the protocols that
264 use `%' as the quote character can use the resulting string as-is,
265 while those that don't call decode_string() to get to the intended
266 data. This function is also stable: after an input string is
267 transformed the first time, all further transformations of the
268 result yield the same result string.
270 Let's discuss why this function is needed.
272 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
273 space character would mess up the HTTP request, it needs to be
276 GET /abc%20def HTTP/1.0
278 So it appears that the unsafe chars need to be quoted, as with
279 encode_string. But what if we're requested to download
280 `abc%20def'? Remember that %-encoding is valid URL syntax, so what
281 the user meant was a literal space, and he was kind enough to quote
282 it. In that case, Wget should obviously leave the `%20' as is, and
283 send the same request as above. So in this case we may not call
286 But what if the requested URI is `abc%20 def'? If we call
287 encode_string, we end up with `/abc%2520%20def', which is almost
288 certainly not intended. If we don't call encode_string, we are
289 left with the embedded space and cannot send the request. What the
290 user meant was for Wget to request `/abc%20%20def', and this is
291 where reencode_string kicks in.
293 Wget used to solve this by first decoding %-quotes, and then
294 encoding all the "unsafe" characters found in the resulting string.
295 This was wrong because it didn't preserve certain URL special
296 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
297 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
298 whether we considered `+' reserved (it is). One of these results
299 is inevitable because by the second step we would lose information
300 on whether the `+' was originally encoded or not. Both results
301 were wrong because in CGI parameters + means space, while %2B means
302 literal plus. reencode_string correctly translates the above to
303 "a%2B+b", i.e. returns the original string.
305 This function uses an algorithm proposed by Anon Sricharoenchai:
307 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
310 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
313 ...except that this code conflates the two steps, and decides
314 whether to encode, decode, or pass through each character in turn.
315 The function still uses two passes, but their logic is the same --
316 the first pass exists merely for the sake of allocation. Another
317 small difference is that we include `+' to URL_RESERVED.
321 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
323 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
327 "foo bar" -> "foo%20bar"
328 "foo%20bar" -> "foo%20bar"
329 "foo %20bar" -> "foo%20%20bar"
330 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
331 "foo%25%20bar" -> "foo%25%20bar"
332 "foo%2%20bar" -> "foo%252%20bar"
333 "foo+bar" -> "foo+bar" (plus is reserved!)
334 "foo%2b+bar" -> "foo%2b+bar" */
337 reencode_string (const char *s)
343 int encode_count = 0;
344 int decode_count = 0;
346 /* First, pass through the string to see if there's anything to do,
347 and to calculate the new length. */
348 for (p1 = s; *p1; p1++)
350 switch (decide_copy_method (p1))
/* Fast path: nothing to change, hand back the caller's string. */
363 if (!encode_count && !decode_count)
364 /* The string is good as it is. */
365 return (char *)s; /* C const model sucks. */
368 /* Each encoding adds two characters (hex digits), while each
369 decoding removes two characters. */
370 newlen = oldlen + 2 * (encode_count - decode_count);
371 newstr = xmalloc (newlen + 1);
/* Second pass (loop header not visible here): apply the same
   per-character verdicts while copying into newstr. */
378 switch (decide_copy_method (p1))
/* CM_ENCODE: expand one unsafe byte into "%XX". */
382 unsigned char c = *p1++;
384 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
385 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
/* CM_DECODE: collapse "%XX" into a single byte. */
389 *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
390 + (XCHAR_TO_XDIGIT (*(p1 + 2))));
391 p1 += 3; /* skip %xx */
/* Both passes must agree on the final length. */
398 assert (p2 - newstr == newlen);
402 /* Run PTR_VAR through reencode_string. If a new string is consed,
403 free PTR_VAR and make it point to the new storage. Obviously,
404 PTR_VAR needs to be an lvalue. */
406 #define REENCODE(ptr_var) do { \
407 char *rf_new = reencode_string (ptr_var); \
/* Only swap/free when reencode_string actually allocated (it returns
   its argument unchanged when there was nothing to do). */
408 if (rf_new != ptr_var) \
415 /* Returns the scheme type if the scheme is supported, or
416 SCHEME_INVALID if not. */
418 url_scheme (const char *url)
/* Case-insensitive prefix match against each supported scheme; the
   array index doubles as the enum url_scheme value, so the table and
   enum must stay in the same order. */
422 for (i = 0; supported_schemes[i].leading_string; i++)
423 if (!strncasecmp (url, supported_schemes[i].leading_string,
424 strlen (supported_schemes[i].leading_string)))
425 return (enum url_scheme)i;
426 return SCHEME_INVALID;
429 /* Return the number of characters needed to skip the scheme part of
430 the URL, e.g. `http://'. If no scheme is found, returns 0. */
432 url_skip_scheme (const char *url)
436 /* Skip the scheme name. We allow `-' and `+' because of `whois++',
438 while (ISALNUM (*p) || *p == '-' || *p == '+')
/* The check for the ':' after the scheme name is not visible in this
   chunk. */
445 /* Skip "//" if found. */
446 if (*p == '/' && *(p + 1) == '/')
452 /* Returns 1 if the URL begins with a scheme (supported or
453 unsupported), 0 otherwise. */
455 url_has_scheme (const char *url)
/* Same scheme-name alphabet as url_skip_scheme; presumably followed
   by a check for ':' (not visible here). */
458 while (ISALNUM (*p) || *p == '-' || *p == '+')
/* Return the default port for SCHEME, as recorded in
   supported_schemes.  SCHEME must be a valid (non-SCHEME_INVALID)
   index into that table. */
464 scheme_default_port (enum url_scheme scheme)
466 return supported_schemes[scheme].default_port;
469 /* Skip the username and password, if present here. The function
470 should be called *not* with the complete URL, but with the part
471 right after the scheme.
473 If no username and password are found, return 0. */
475 url_skip_uname (const char *url)
479 /* Look for '@' that comes before '/' or '?'. */
/* strpbrk finds the first of '/', '?' or '@'; only a leading '@'
   match means a user:pass part is actually present. */
480 p = (const char *)strpbrk (url, "/?@");
/* Split STR[0..LEN) of the form "user[:password]" into freshly
   allocated *USER and *PASSWD.  Returns nonzero on success (exact
   return statements are not visible in this chunk). */
488 parse_uname (const char *str, int len, char **user, char **passwd)
493 /* Empty user name not allowed. */
/* colon == NULL means no password part at all. */
496 colon = memchr (str, ':', len);
498 /* Empty user name again. */
/* Copy out the password (everything after the colon). */
503 int pwlen = len - (colon + 1 - str);
504 *passwd = xmalloc (pwlen + 1);
505 memcpy (*passwd, colon + 1, pwlen);
506 (*passwd)[pwlen] = '\0';
/* Copy out the user name.  NOTE(review): by this point LEN has
   presumably been shortened to the user-name length when a colon was
   found -- the adjusting line is not visible here. */
512 *user = xmalloc (len + 1);
513 memcpy (*user, str, len);
519 /* Used by main.c: detect URLs written using the "shorthand" URL forms
520 popularized by Netscape and NcFTP. HTTP shorthands look like this:
522 www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
523 www.foo.com[:port] -> http://www.foo.com[:port]
525 FTP shorthands look like this:
527 foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
528 foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
530 If the URL needs not or cannot be rewritten, return NULL. */
532 rewrite_shorthand_url (const char *url)
/* A URL that already carries a scheme is left alone. */
536 if (url_has_scheme (url))
539 /* Look for a ':' or '/'. The former signifies NcFTP syntax, the
541 for (p = url; *p && *p != ':' && *p != '/'; p++)
549 const char *pp, *path;
551 /* If the characters after the colon and before the next slash
552 or end of string are all digits, it's HTTP. */
554 for (pp = p + 1; ISDIGIT (*pp); pp++)
557 && (*pp == '/' || *pp == '\0'))
560 /* Prepend "ftp://" to the entire URL... */
562 res = xmalloc (6 + strlen (url) + 1);
563 sprintf (res, "ftp://%s", url);
564 /* ...and replace ':' with '/'. */
/* 6 == strlen ("ftp://"), so this indexes the copied ':'. */
565 res[6 + (p - url)] = '/';
572 /* Just prepend "http://" to what we have. */
573 res = xmalloc (7 + strlen (url) + 1);
574 sprintf (res, "http://%s", url);
/* Forward declaration; definition appears later in the file. */
579 static void parse_path PARAMS ((const char *, char **, char **));
/* Like strpbrk, but instead of NULL return a pointer to S's
   terminating '\0' when none of ACCEPT occurs -- so callers always
   get a valid position inside S. */
582 strpbrk_or_eos (const char *s, const char *accept)
584 char *p = strpbrk (s, accept);
586 p = (char *)s + strlen (s);
590 /* Turn STR into lowercase; return non-zero if a character was
/* In-place, one pass; the loop and changed-flag bookkeeping are only
   partially visible in this chunk. */
594 lowercase_str (char *str)
601 *str = TOLOWER (*str);
/* Human-readable messages for url_parse failures.  The PE_* macros
   are the indices into this array; several message strings fall in
   lines not visible in this chunk. */
606 static char *parse_errors[] = {
607 #define PE_NO_ERROR 0
609 #define PE_UNRECOGNIZED_SCHEME 1
610 "Unrecognized scheme",
611 #define PE_EMPTY_HOST 2
613 #define PE_BAD_PORT_NUMBER 3
615 #define PE_INVALID_USER_NAME 4
/* Store error code V through pointer P if P is non-NULL (tail of the
   macro is not visible here). */
619 #define SETERR(p, v) do { \
626 Return a new struct url if successful, NULL on error. In case of
627 error, and if ERROR is not NULL, also set *ERROR to the appropriate
630 url_parse (const char *url, int *error)
634 int path_modified, host_modified;
636 enum url_scheme scheme;
/* Begin/end pointer pairs delimiting each URL component inside the
   (re-encoded) URL string. */
638 const char *uname_b, *uname_e;
639 const char *host_b, *host_e;
640 const char *path_b, *path_e;
641 const char *params_b, *params_e;
642 const char *query_b, *query_e;
643 const char *fragment_b, *fragment_e;
646 char *user = NULL, *passwd = NULL;
650 scheme = url_scheme (url);
651 if (scheme == SCHEME_INVALID)
653 SETERR (error, PE_UNRECOGNIZED_SCHEME);
/* Canonicalize the %-quoting first; may return URL itself when
   nothing changed (freed conditionally at the end). */
657 url_encoded = reencode_string (url);
660 p += strlen (supported_schemes[scheme].leading_string);
662 p += url_skip_uname (p);
665 /* scheme://user:pass@host[:port]... */
668 /* We attempt to break down the URL into the components path,
669 params, query, and fragment. They are ordered like this:
671 scheme://host[:port][/path][;params][?query][#fragment] */
673 params_b = params_e = NULL;
674 query_b = query_e = NULL;
675 fragment_b = fragment_e = NULL;
/* Host ends at the first ':', '/', ';', '?' or '#'. */
678 p = strpbrk_or_eos (p, ":/;?#");
681 if (host_b == host_e)
683 SETERR (error, PE_EMPTY_HOST);
687 port = scheme_default_port (scheme);
690 const char *port_b, *port_e, *pp;
692 /* scheme://host:port/tralala */
696 p = strpbrk_or_eos (p, "/;?#");
699 if (port_b == port_e)
701 /* http://host:/whatever */
703 SETERR (error, PE_BAD_PORT_NUMBER);
/* Parse the port digit-by-digit; any non-digit is an error. */
707 for (port = 0, pp = port_b; pp < port_e; pp++)
711 /* http://host:12randomgarbage/blah */
713 SETERR (error, PE_BAD_PORT_NUMBER);
716 port = 10 * port + (*pp - '0');
/* Path component. */
724 p = strpbrk_or_eos (p, ";?#");
729 /* Path is not allowed not to exist. */
/* ;params component. */
737 p = strpbrk_or_eos (p, "?#");
/* ?query component. */
744 p = strpbrk_or_eos (p, "#");
756 if (uname_b != uname_e)
758 /* http://user:pass@host */
760 /* uname_b uname_e */
/* The "- 1" excludes the trailing '@' from the user:pass span. */
761 if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
763 SETERR (error, PE_INVALID_USER_NAME);
768 u = (struct url *)xmalloc (sizeof (struct url));
769 memset (u, 0, sizeof (*u));
772 u->host = strdupdelim (host_b, host_e);
777 u->path = strdupdelim (path_b, path_e);
778 path_modified = path_simplify (u->path);
779 parse_path (u->path, &u->dir, &u->file);
781 host_modified = lowercase_str (u->host);
784 u->params = strdupdelim (params_b, params_e);
786 u->query = strdupdelim (query_b, query_e);
788 u->fragment = strdupdelim (fragment_b, fragment_e);
791 if (path_modified || u->fragment || host_modified)
793 /* If path_simplify modified the path, or if a fragment is
794 present, or if the original host name had caps in it, make
795 sure that u->url is equivalent to what would be printed by
797 u->url = url_string (u, 0);
/* reencode_string allocated a fresh copy; dispose of it. */
799 if (url_encoded != url)
800 xfree ((char *) url_encoded);
/* Otherwise reuse the canonical string as u->url (copying when it
   still aliases the caller's URL). */
804 if (url_encoded == url)
805 u->url = xstrdup (url);
807 u->url = url_encoded;
/* Map a PE_* error code from url_parse to its message string. */
815 url_error (int error_code)
817 assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
818 return parse_errors[error_code];
/* Split QUOTED_PATH into freshly allocated *DIR and *FILE at the last
   '/'.  The path is %-decoded first, so DIR and FILE hold the raw
   (unquoted) names. */
822 parse_path (const char *quoted_path, char **dir, char **file)
824 char *path, *last_slash;
/* Work on a stack copy so the caller's string is untouched. */
826 STRDUP_ALLOCA (path, quoted_path);
827 decode_string (path);
829 last_slash = strrchr (path, '/');
/* No slash: the whole thing is the file name (DIR presumably set to
   "" in a line not visible here). */
833 *file = xstrdup (path);
837 *dir = strdupdelim (path, last_slash);
838 *file = xstrdup (last_slash + 1);
842 /* Note: URL's "full path" is the path with the query string and
843 params appended. The "fragment" (#foo) is intentionally ignored,
844 but that might be changed. For example, if the original URL was
845 "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
846 the full path will be "/foo/bar/baz;bullshit?querystring". */
848 /* Return the length of the full path, without the terminating
852 full_path_length (const struct url *url)
/* Each present component contributes its length plus one separator
   character ('/', ';' or '?'). */
856 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
867 /* Write out the full path. */
/* WHERE must have at least full_path_length(url) bytes; no '\0' is
   appended (the caller terminates). */
870 full_path_write (const struct url *url, char *where)
872 #define FROB(el, chr) do { \
873 char *f_el = url->el; \
875 int l = strlen (f_el); \
877 memcpy (where, f_el, l); \
889 /* Public function for getting the "full path". E.g. if u->path is
890 "foo/bar" and u->query is "param=value", full_path will be
891 "/foo/bar?param=value". */
/* Returns a freshly malloc-ed, NUL-terminated string; caller frees. */
894 url_full_path (const struct url *url)
896 int length = full_path_length (url);
897 char *full_path = (char *)xmalloc(length + 1);
899 full_path_write (url, full_path);
900 full_path[length] = '\0';
905 /* Sync u->path and u->url with u->dir and u->file. */
908 sync_path (struct url *url)
/* Empty dir: the path is just the file name. */
916 newpath = xstrdup (url->file);
/* Otherwise rebuild "dir/file" by hand. */
921 int dirlen = strlen (url->dir);
922 int filelen = strlen (url->file);
924 newpath = xmalloc (dirlen + 1 + filelen + 1);
925 memcpy (newpath, url->dir, dirlen);
926 newpath[dirlen] = '/';
927 memcpy (newpath + dirlen + 1, url->file, filelen);
928 newpath[dirlen + 1 + filelen] = '\0';
934 /* Synchronize u->url. */
936 url->url = url_string (url, 0);
939 /* Mutators. Code in ftp.c insists on changing u->dir and u->file.
940 This way we can sync u->path and u->url when they get changed. */
/* Replace u->dir with a copy of NEWDIR (old value presumably freed
   and sync_path called in lines not visible here). */
943 url_set_dir (struct url *url, const char *newdir)
946 url->dir = xstrdup (newdir);
/* Same pattern for u->file. */
951 url_set_file (struct url *url, const char *newfile)
954 url->file = xstrdup (newfile);
/* Release a struct url and all its owned strings.  FREE_MAYBE
   presumably frees only non-NULL pointers; host/path/url fields are
   freed in lines not visible in this chunk. */
959 url_free (struct url *url)
965 FREE_MAYBE (url->params);
966 FREE_MAYBE (url->query);
967 FREE_MAYBE (url->fragment);
968 FREE_MAYBE (url->user);
969 FREE_MAYBE (url->passwd);
/* Read FILE and return a linked list of urlpos entries, one per
   non-blank line.  Lines are trimmed of surrounding whitespace;
   invalid URLs are reported and skipped. */
978 get_urls_file (const char *file)
980 struct file_memory *fm;
981 struct urlpos *head, *tail;
982 const char *text, *text_end;
985 fm = read_file (file);
/* read_file failed -- report via errno and bail (return not visible
   here). */
988 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
991 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
994 text_end = fm->content + fm->length;
995 while (text < text_end)
997 const char *line_beg = text;
998 const char *line_end = memchr (text, '\n', text_end - text);
/* Last line may lack a trailing newline. */
1000 line_end = text_end;
/* Trim leading and trailing whitespace. */
1004 while (line_beg < line_end
1005 && ISSPACE (*line_beg))
1007 while (line_end > line_beg + 1
1008 && ISSPACE (*(line_end - 1)))
1010 if (line_end > line_beg)
1012 /* URL is in the [line_beg, line_end) region. */
1016 struct urlpos *entry;
1019 /* We must copy the URL to a zero-terminated string, and we
1020 can't use alloca because we're in a loop. *sigh*. */
1021 url_text = strdupdelim (line_beg, line_end);
1025 /* Merge opt.base_href with URL. */
1026 char *merged = uri_merge (opt.base_href, url_text);
1031 url = url_parse (url_text, &up_error_code);
/* Parse failure: report and skip this line. */
1034 logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1035 file, url_text, url_error (up_error_code));
1041 entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1042 memset (entry, 0, sizeof (*entry));
1053 read_file_free (fm);
1057 /* Free the linked list of urlpos. */
1059 free_urlpos (struct urlpos *l)
/* Save the next pointer before freeing the current node. */
1063 struct urlpos *next = l->next;
1066 FREE_MAYBE (l->local_name);
1072 /* Rotate FNAME opt.backups times */
1074 rotate_backups(const char *fname)
/* Room for "name.<digits>\0"; numdigit presumably counts decimal
   digits of opt.backups. */
1076 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1077 char *from = (char *)alloca (maxlen);
1078 char *to = (char *)alloca (maxlen);
/* Only rotate regular files. */
1082 if (stat (fname, &sb) == 0)
1083 if (S_ISREG (sb.st_mode) == 0)
/* Shift name.N-1 -> name.N, from the oldest down. */
1086 for (i = opt.backups; i > 1; i--)
1088 sprintf (from, "%s.%d", fname, i - 1);
1089 sprintf (to, "%s.%d", fname, i);
1090 /* #### This will fail on machines without the rename() system
/* Finally the live file becomes name.1. */
1095 sprintf (to, "%s.%d", fname, 1);
1099 /* Create all the necessary directories for PATH (a file). Calls
1100 mkdirhier() internally. */
1102 mkalldirs (const char *path)
/* Scan backwards for the last '/' to isolate the directory part. */
1109 p = path + strlen (path);
1110 for (; *p != '/' && p != path; p--);
1111 /* Don't create if it's just a file. */
1112 if ((p == path) && (*p != '/'))
1114 t = strdupdelim (path, p);
1115 /* Check whether the directory exists. */
1116 if ((stat (t, &st) == 0))
1118 if (S_ISDIR (st.st_mode))
1125 /* If the dir exists as a file name, remove it first. This
1126 is *only* for Wget to work with buggy old CERN http
1127 servers. Here is the scenario: When Wget tries to
1128 retrieve a directory without a slash, e.g.
1129 http://foo/bar (bar being a directory), CERN server will
1130 not redirect it to http://foo/bar/ -- it will generate a
1131 directory listing containing links to bar/file1,
1132 bar/file2, etc. Wget will lose because it saves this
1133 HTML listing to a file `bar', so it cannot create the
1134 directory. To work around this, if the file of the same
1135 name exists, we just remove it and create the directory
1137 DEBUGP (("Removing %s because of directory danger!\n", t));
1141 res = make_directory (t);
/* make_directory failed; report but keep going. */
1143 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
/* Count the '/' characters in S (body not visible in this chunk). */
1149 count_slashes (const char *s)
1158 /* Return the path name of the URL-equivalent file name, with a
1159 remote-like structure of directories. */
1161 mkstruct (const struct url *u)
1163 char *dir, *dir_preencoding;
1164 char *file, *res, *dirpref;
/* Only keep the query if present and non-empty. */
1165 char *query = u->query && *u->query ? u->query : NULL;
/* --cut-dirs handling: drop the first opt.cut_dirs path components. */
1170 char *ptr = u->dir + (*u->dir == '/');
1171 int slash_count = 1 + count_slashes (ptr);
1172 int cut = MINVAL (opt.cut_dirs, slash_count);
1173 for (; cut && *ptr; ptr++)
1176 STRDUP_ALLOCA (dir, ptr);
/* No cutting: use the dir as-is, minus a leading '/'. */
1179 dir = u->dir + (*u->dir == '/');
1181 /* Check for the true name (or at least a consistent name for saving
1182 to directory) of HOST, reusing the hlist if possible. */
1183 if (opt.add_hostdir)
1185 /* Add dir_prefix and hostname (if required) to the beginning of
1187 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1189 + 1 + numdigit (u->port)
/* Skip the prefix when it is just ".". */
1191 if (!DOTP (opt.dir_prefix))
1192 sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
1194 strcpy (dirpref, u->host);
/* Non-default port becomes part of the directory name. */
1196 if (u->port != scheme_default_port (u->scheme))
1198 int len = strlen (dirpref);
1200 long_to_string (dirpref + len + 1, u->port);
1203 else /* not add_hostdir */
1205 if (!DOTP (opt.dir_prefix))
1206 dirpref = opt.dir_prefix;
1211 /* If there is a prefix, prepend it. */
1214 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1215 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
/* Re-quote unsafe characters for use as an on-disk name; keep the
   pre-encoding pointer to detect whether an allocation happened. */
1219 dir_preencoding = dir;
1220 dir = reencode_string (dir_preencoding);
/* Strip a trailing '/' (length l computed in a line not visible
   here). */
1223 if (l && dir[l - 1] == '/')
/* Empty file component defaults to index.html. */
1227 file = "index.html";
1231 /* Finally, construct the full name. */
1232 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
1233 + (query ? (1 + strlen (query)) : 0)
1235 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
1239 strcat (res, query);
/* Free the reencoded dir only if reencode_string allocated one. */
1241 if (dir != dir_preencoding)
1246 /* Compose a file name out of BASE, an unescaped file name, and QUERY,
1247 an escaped query string. The trick is to make sure that unsafe
1248 characters in BASE are escaped, and that slashes in QUERY are also
1252 compose_file_name (char *base, char *query)
1258 /* Copy BASE to RESULT and encode all unsafe characters. */
/* result is presumably a fixed-size local buffer; all loops below
   bound writes by its size.  NOTE(review): to - result is ptrdiff_t
   compared against the unsigned sizeof -- verify the buffer size
   fits in the signed range (it will for any sane size). */
1260 while (*from && to - result < sizeof (result))
1262 if (UNSAFE_CHAR (*from))
/* Expand one unsafe byte to "%XX". */
1264 unsigned char c = *from++;
1266 *to++ = XDIGIT_TO_XCHAR (c >> 4);
1267 *to++ = XDIGIT_TO_XCHAR (c & 0xf);
1273 if (query && to - result < sizeof (result))
1277 /* Copy QUERY to RESULT and encode all '/' characters. */
1279 while (*from && to - result < sizeof (result))
/* NUL-terminate if room remains... */
1293 if (to - result < sizeof (result))
1296 /* Truncate input which is too long, presumably due to a huge
/* ...otherwise force termination at the last byte. */
1298 result[sizeof (result) - 1] = '\0';
1300 return xstrdup (result);
1303 /* Create a unique filename, corresponding to a given URL. Calls
1304 mkstruct if necessary. Does *not* actually create any directories. */
1306 url_filename (const struct url *u)
1309 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
/* With -x (directory structure requested) delegate to mkstruct. */
1313 file = mkstruct (u);
/* Otherwise build "file[?query]"; empty file falls back to
   index.html. */
1318 char *base = *u->file ? u->file : "index.html";
1319 char *query = u->query && *u->query ? u->query : NULL;
1320 file = compose_file_name (base, query);
1325 /* Check whether the prefix directory is something other than "."
1326 before prepending it. */
1327 if (!DOTP (opt.dir_prefix))
1329 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1330 + 1 + strlen (file) + 1)
1331 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
1336 /* DOS-ish file systems don't like `%' signs in them; we change it
/* Windows-only rewrite pass over the file name. */
1341 for (p = file; *p; p++)
1345 #endif /* WINDOWS */
1347 /* Check the cases in which the unique extensions are not used:
1348 1) Clobbering is turned off (-nc).
1349 2) Retrieval with regetting.
1350 3) Timestamping is used.
1351 4) Hierarchy is built.
1353 The exception is the case when file does exist and is a
1354 directory (actually support for bad httpd-s). */
1355 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1356 && !(file_exists_p (file) && !file_non_directory_p (file)))
1359 /* Find a unique name. */
1360 name = unique_name (file);
1365 /* Like strlen(), but the path is taken to end at the first '?', ';'
   or '#' (or at the terminating '\0'), so query, params and fragment
   are excluded from the count. */
1367 urlpath_length (const char *url)
1369 const char *q = strpbrk_or_eos (url, "?;#");
1373 /* Find the last occurrence of character C in the range [b, e), or
1374 NULL, if none are present. This is almost completely equivalent to
1375 { *e = '\0'; return strrchr(b, c); }, except that it doesn't change
1376 the contents of the string. */
1378 find_last_char (const char *b, const char *e, char c)
1386 /* Resolve the result of "linking" a base URI (BASE) to a
1387 link-specified URI (LINK).
1389 Either of the URIs may be absolute or relative, complete with the
1390 host name, or path only. This tries to behave "reasonably" in all
1391 foreseeable cases. It employs little specific knowledge about
1392 schemes or URL-specific stuff -- it just works on strings.
1394 The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1395 See uri_merge for a gentler interface to this functionality.
1397 Perhaps this function should handle `./' and `../' so that the evil
1398 path_simplify can go. */
1400 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
/* END marks where BASE's path stops (before query/params/fragment). */
1406 const char *end = base + urlpath_length (base);
1410 /* Empty LINK points back to BASE, query string and all. */
1411 constr = xstrdup (base);
1413 else if (*link == '?')
1415 /* LINK points to the same location, but changes the query
1416 string. Examples: */
1417 /* uri_merge("path", "?new") -> "path?new" */
1418 /* uri_merge("path?foo", "?new") -> "path?new" */
1419 /* uri_merge("path?foo#bar", "?new") -> "path?new" */
1420 /* uri_merge("path#foo", "?new") -> "path?new" */
1421 int baselength = end - base;
1422 constr = xmalloc (baselength + linklength + 1);
1423 memcpy (constr, base, baselength);
1424 memcpy (constr + baselength, link, linklength);
1425 constr[baselength + linklength] = '\0';
1427 else if (*link == '#')
1429 /* uri_merge("path", "#new") -> "path#new" */
1430 /* uri_merge("path#foo", "#new") -> "path#new" */
1431 /* uri_merge("path?foo", "#new") -> "path?foo#new" */
1432 /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
/* Keep everything up to any existing fragment, then append LINK. */
1434 const char *end1 = strchr (base, '#');
1436 end1 = base + strlen (base);
1437 baselength = end1 - base;
1438 constr = xmalloc (baselength + linklength + 1);
1439 memcpy (constr, base, baselength);
1440 memcpy (constr + baselength, link, linklength);
1441 constr[baselength + linklength] = '\0';
1443 else if (*link == '/')
1445 /* LINK is an absolute path: we need to replace everything
1446 after (and including) the FIRST slash with LINK.
1448 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1449 "/qux/xyzzy", our result should be
1450 "http://host/qux/xyzzy". */
1453 const char *start_insert = NULL; /* for gcc to shut up. */
1454 const char *pos = base;
1455 int seen_slash_slash = 0;
1456 /* We're looking for the first slash, but want to ignore
/* Find the first '/', skipping over the "//" of the scheme part. */
1459 slash = memchr (pos, '/', end - pos);
1460 if (slash && !seen_slash_slash)
1461 if (*(slash + 1) == '/')
1464 seen_slash_slash = 1;
1468 /* At this point, SLASH is the location of the first / after
1469 "//", or the first slash altogether. START_INSERT is the
1470 pointer to the location where LINK will be inserted. When
1471 examining the last two examples, keep in mind that LINK
1474 if (!slash && !seen_slash_slash)
1475 /* example: "foo" */
1477 start_insert = base;
1478 else if (!slash && seen_slash_slash)
1479 /* example: "http://foo" */
1482 else if (slash && !seen_slash_slash)
1483 /* example: "foo/bar" */
1485 start_insert = base;
1486 else if (slash && seen_slash_slash)
1487 /* example: "http://something/" */
1489 start_insert = slash;
1491 span = start_insert - base;
1492 constr = (char *)xmalloc (span + linklength + 1);
1494 memcpy (constr, base, span);
1496 memcpy (constr + span, link, linklength);
1497 constr[span + linklength] = '\0';
1501 /* LINK is a relative URL: we need to replace everything
1502 after last slash (possibly empty) with LINK.
1504 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1505 our result should be "whatever/foo/qux/xyzzy". */
1506 int need_explicit_slash = 0;
1508 const char *start_insert;
1509 const char *last_slash = find_last_char (base, end, '/');
1512 /* No slash found at all. Append LINK to what we have,
1513 but we'll need a slash as a separator.
1515 Example: if base == "foo" and link == "qux/xyzzy", then
1516 we cannot just append link to base, because we'd get
1517 "fooqux/xyzzy", whereas what we want is
1520 To make sure the / gets inserted, we set
1521 need_explicit_slash to 1. We also set start_insert
1522 to end + 1, so that the length calculations work out
1523 correctly for one more (slash) character. Accessing
1524 that character is fine, since it will be the
1525 delimiter, '\0' or '?'. */
1526 /* example: "foo?..." */
1527 /* ^ ('?' gets changed to '/') */
1528 start_insert = end + 1;
1529 need_explicit_slash = 1;
1531 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
1533 /* example: http://host" */
1535 start_insert = end + 1;
1536 need_explicit_slash = 1;
1540 /* example: "whatever/foo/bar" */
1542 start_insert = last_slash + 1;
1545 span = start_insert - base;
1546 constr = (char *)xmalloc (span + linklength + 1);
1548 memcpy (constr, base, span);
/* Overwrite the delimiter position with the separating '/'. */
1549 if (need_explicit_slash)
1550 constr[span - 1] = '/';
1552 memcpy (constr + span, link, linklength);
1553 constr[span + linklength] = '\0';
1556 else /* !no_scheme */
/* LINK already has a scheme: it is absolute, so BASE is ignored. */
1558 constr = strdupdelim (link, link + linklength);
1563 /* Merge BASE with LINK and return the resulting URI. This is an
1564 interface to uri_merge_1 that assumes that LINK is a
1565 zero-terminated string. */
1567 uri_merge (const char *base, const char *link)
1569 return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
/* Copy string S at pointer P (advancing P happens in a macro line
   not visible in this chunk). */
1572 #define APPEND(p, s) do { \
1573 int len = strlen (s); \
1574 memcpy (p, s, len); \
1578 /* Use this instead of password when the actual password is supposed
1579 to be hidden. We intentionally use a generic string without giving
1580 away the number of characters in the password, like previous
1582 #define HIDDEN_PASSWORD "*password*"
1584 /* Recreate the URL string from the data in URL.
1586 If HIDE is non-zero (as it is when we're calling this on a URL we
1587 plan to print, but not when calling it to canonicalize a URL for
1588 use within the program), password will be hidden. Unsafe
1589 characters in the URL will be quoted. */
/* Returns a malloc'd string; the caller is responsible for freeing
   it.  */
1592 url_string (const struct url *url, int hide_password)
1596 char *quoted_user = NULL, *quoted_passwd = NULL;
1598 int scheme_port = supported_schemes[url->scheme].default_port;
1599 char *scheme_str = supported_schemes[url->scheme].leading_string;
1600 int fplen = full_path_length (url);
/* SCHEME_INVALID has a NULL leading_string; must not get here.  */
1602 assert (scheme_str != NULL);
1604 /* Make sure the user name and password are quoted. */
1607 quoted_user = encode_string_maybe (url->user);
/* When hiding, substitute a fixed literal so the printed URL never
   reveals the password length; otherwise quote the real password.  */
1611 quoted_passwd = HIDDEN_PASSWORD;
1613 quoted_passwd = encode_string_maybe (url->passwd);
/* Pre-compute the exact output size so a single xmalloc suffices;
   the assert after writing cross-checks this arithmetic.  */
1617 size = (strlen (scheme_str)
1618 + strlen (url->host)
/* ':' plus the port digits, only when the port is non-default.  */
1621 if (url->port != scheme_port)
1622 size += 1 + numdigit (url->port)
1625 size += 1 + strlen (quoted_user);
1627 size += 1 + strlen (quoted_passwd);
1630 p = result = xmalloc (size);
1632 APPEND (p, scheme_str);
1635 APPEND (p, quoted_user);
1639 APPEND (p, quoted_passwd);
1644 APPEND (p, url->host);
1645 if (url->port != scheme_port)
1648 long_to_string (p, url->port);
1652 full_path_write (url, p);
/* If this fires, the size computation above disagrees with what was
   actually written.  */
1656 assert (p - result == size);
/* encode_string_maybe may return its argument unchanged; free the
   quoted copies only when they are separate allocations.  The
   HIDDEN_PASSWORD literal (hide_password case) is never freed.  */
1658 if (quoted_user && quoted_user != url->user)
1659 xfree (quoted_user);
1660 if (quoted_passwd && !hide_password
1661 && quoted_passwd != url->passwd)
1662 xfree (quoted_passwd);
1667 /* Returns proxy host address, in accordance with SCHEME. */
/* The result is either an option/environment string owned elsewhere,
   a pointer into static storage, or NULL when no proxy applies.  The
   static buffer makes this function non-reentrant.  */
1669 getproxy (enum url_scheme scheme)
1672 char *rewritten_url;
1673 static char rewritten_storage[1024];
/* Command-line options take precedence over environment variables.  */
1678 proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1682 proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1686 proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1688 case SCHEME_INVALID:
1691 if (!proxy || !*proxy)
1694 /* Handle shorthands. */
1695 rewritten_url = rewrite_shorthand_url (proxy);
/* Bounded copy into static storage; the explicit NUL store covers
   the case where strncpy truncates without terminating.  */
1698 strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
1699 rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1700 proxy = rewritten_storage;
1706 /* Should a host be accessed through proxy, concerning no_proxy? */
/* Returns non-zero when HOST should go through the proxy, zero when
   HOST matches one of the NO_PROXY domain suffixes (sufmatch).  */
1708 no_proxy_match (const char *host, const char **no_proxy)
1713 return !sufmatch (no_proxy, host);
/* Forward declarations for the link-conversion helpers defined
   later in this file.  */
1716 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
1717 static const char *replace_attr PARAMS ((const char *, int, FILE *,
1719 static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
1720 const char *, int));
1721 static char *local_quote_string PARAMS ((const char *));
1723 /* Change the links in one HTML file. LINKS is a list of links in the
1724 document, along with their positions and the desired direction of
/* the conversion.  The file is rewritten in place: read into memory,
   unlinked, reopened for writing, and re-emitted with each
   convertible URL replaced.  */
1727 convert_links (const char *file, struct urlpos *links)
1729 struct file_memory *fm;
1732 downloaded_file_t downloaded_file_return;
1734 struct urlpos *link;
1735 int to_url_count = 0, to_file_count = 0;
1737 logprintf (LOG_VERBOSE, _("Converting %s... "), file);
1740 /* First we do a "dry run": go through the list L and see whether
1741 any URL needs to be converted in the first place. If not, just
1742 leave the file alone. */
1744 struct urlpos *dry = links;
1745 for (dry = links; dry; dry = dry->next)
1746 if (dry->convert != CO_NOCONVERT)
1750 logputs (LOG_VERBOSE, _("nothing to do.\n"));
/* Slurp the whole file (possibly mmaped) into FM.  */
1755 fm = read_file (file);
1758 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1759 file, strerror (errno));
/* Optionally save the pristine file to *.orig before rewriting.  */
1763 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
1764 if (opt.backup_converted && downloaded_file_return)
1765 write_backup_file (file, downloaded_file_return);
1767 /* Before opening the file for writing, unlink the file. This is
1768 important if the data in FM is mmaped. In such case, nulling the
1769 file, which is what fopen() below does, would make us read all
1770 zeroes from the mmaped region. */
1771 if (unlink (file) < 0 && errno != ENOENT)
1773 logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
1774 file, strerror (errno));
1775 read_file_free (fm);
1778 /* Now open the file for writing. */
1779 fp = fopen (file, "wb");
1782 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
1783 file, strerror (errno));
1784 read_file_free (fm);
1788 /* Here we loop through all the URLs in file, replacing those of
1789 them that are downloaded with relative references. */
1791 for (link = links; link; link = link->next)
1793 char *url_start = fm->content + link->pos;
/* Sanity check: a position beyond the in-memory copy indicates a
   stale or corrupt link list.  */
1795 if (link->pos >= fm->length)
/* NOTE(review): this debug message lacks a trailing newline.  */
1797 DEBUGP (("Something strange is going on. Please investigate."));
1800 /* If the URL is not to be converted, skip it. */
1801 if (link->convert == CO_NOCONVERT)
1803 DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));
1807 /* Echo the file contents, up to the offending URL's opening
1808 quote, to the outfile. */
1809 fwrite (p, 1, url_start - p, fp);
1812 switch (link->convert)
1814 case CO_CONVERT_TO_RELATIVE:
1815 /* Convert absolute URL to relative. */
1817 char *newname = construct_relative (file, link->local_name);
1818 char *quoted_newname = local_quote_string (newname);
/* <meta http-equiv=refresh> needs the "N; URL=..." form, handled by
   the refresh hack; other attributes are replaced verbatim.  */
1820 if (!link->link_refresh_p)
1821 p = replace_attr (p, link->size, fp, quoted_newname);
1823 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
1824 link->refresh_timeout);
1826 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
1827 link->url->url, newname, link->pos, file));
1829 xfree (quoted_newname);
1833 case CO_CONVERT_TO_COMPLETE:
1834 /* Convert the link to absolute URL. */
1836 char *newlink = link->url->url;
1837 char *quoted_newlink = html_quote_string (newlink);
1839 if (!link->link_refresh_p)
1840 p = replace_attr (p, link->size, fp, quoted_newlink);
1842 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
1843 link->refresh_timeout);
1845 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
1846 newlink, link->pos, file));
1847 xfree (quoted_newlink);
1851 case CO_NULLIFY_BASE:
1852 /* Change the base href to "". */
1853 p = replace_attr (p, link->size, fp, "");
1861 /* Output the rest of the file. */
1862 if (p - fm->content < fm->length)
1863 fwrite (p, 1, fm->length - (p - fm->content), fp);
1865 read_file_free (fm);
/* Summary: number of links rewritten to local files vs. to URLs.  */
1867 logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
1870 /* Construct and return a malloced copy of the relative link from two
1871 pieces of information: local name S1 of the referring file and
1872 local name S2 of the referred file.
1874 So, if S1 is "jagor.srce.hr/index.html" and S2 is
1875 "jagor.srce.hr/images/news.gif", the function will return
1878 Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
1879 "fly.cc.fer.hr/images/fly.gif", the function will return
1880 "../images/fly.gif".
1882 Caveats: S1 should not begin with `/', unless S2 also begins with
1883 '/'. S1 should not contain things like ".." and such --
1884 construct_relative ("fly/ioccc/../index.html",
1885 "fly/images/fly.gif") will fail. (A workaround is to call
1886 something like path_simplify() on S1). */
1888 construct_relative (const char *s1, const char *s2)
1890 int i, cnt, sepdirs1;
/* Absolute S2 needs no relativization; return a plain copy.  */
1894 return xstrdup (s2);
1895 /* S1 should *not* be absolute, if S2 wasn't. */
1896 assert (*s1 != '/');
1898 /* Skip the directories common to both strings. */
1901 while (s1[i] && s2[i]
/* Remember where S2's first non-shared path component begins.  */
1906 if (s1[i] == '/' && s2[i] == '/')
/* Count the directory separators left in S1; each one costs one
   "../" hop in the result.  */
1911 for (sepdirs1 = 0; s1[i]; i++)
1914 /* Now, construct the file as of:
1915 - ../ repeated sepdirs1 time
1916 - all the non-mutual directories of S2. */
/* 3 bytes per "../" plus the remainder of S2 plus the NUL.  */
1917 res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
1918 for (i = 0; i < sepdirs1; i++)
1919 memcpy (res + 3 * i, "../", 3);
1920 strcpy (res + 3 * i, s2 + cnt);
/* Back up FILE to FILE.orig (or overwrite a trailing "html" with
   "orig" when -E added the extension) before convert_links rewrites
   it.  Only the first call per file actually renames.  */
1925 write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
1927 /* Rather than just writing over the original .html file with the
1928 converted version, save the former to *.orig. Note we only do
1929 this for files we've _successfully_ downloaded, so we don't
1930 clobber .orig files sitting around from previous invocations. */
1932 /* Construct the backup filename as the original name plus ".orig". */
1933 size_t filename_len = strlen(file);
1934 char* filename_plus_orig_suffix;
1935 boolean already_wrote_backup_file = FALSE;
1936 slist* converted_file_ptr;
/* Process-lifetime memo of files already backed up (never freed;
   see the long note further down).  */
1937 static slist* converted_files = NULL;
1939 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
1941 /* Just write "orig" over "html". We need to do it this way
1942 because when we're checking to see if we've downloaded the
1943 file before (to see if we can skip downloading it), we don't
1944 know if it's a text/html file. Therefore we don't know yet
1945 at that stage that -E is going to cause us to tack on
1946 ".html", so we need to compare vs. the original URL plus
1947 ".orig", not the original URL plus ".html.orig". */
/* alloca: buffer is valid only until this function returns, which is
   fine since the name is consumed by rename() below.  Overwriting the
   last 4 characters assumes FILE really ends in "html" (guaranteed by
   the FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED mode).  */
1948 filename_plus_orig_suffix = alloca (filename_len + 1);
1949 strcpy(filename_plus_orig_suffix, file);
1950 strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
1952 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
1954 /* Append ".orig" to the name. */
1955 filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
1956 strcpy(filename_plus_orig_suffix, file);
1957 strcpy(filename_plus_orig_suffix + filename_len, ".orig");
1960 /* We can get called twice on the same URL thanks to the
1961 convert_all_links() call in main(). If we write the .orig file
1962 each time in such a case, it'll end up containing the first-pass
1963 conversion, not the original file. So, see if we've already been
1964 called on this file. */
/* Linear scan is fine: one entry per converted file.  */
1965 converted_file_ptr = converted_files;
1966 while (converted_file_ptr != NULL)
1967 if (strcmp(converted_file_ptr->string, file) == 0)
1969 already_wrote_backup_file = TRUE;
1973 converted_file_ptr = converted_file_ptr->next;
1975 if (!already_wrote_backup_file)
1977 /* Rename <file> to <file>.orig before former gets written over. */
1978 if (rename(file, filename_plus_orig_suffix) != 0)
1979 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
1980 file, filename_plus_orig_suffix, strerror (errno));
1982 /* Remember that we've already written a .orig backup for this file.
1983 Note that we never free this memory since we need it till the
1984 convert_all_links() call, which is one of the last things the
1985 program does before terminating. BTW, I'm not sure if it would be
1986 safe to just set 'converted_file_ptr->string' to 'file' below,
1987 rather than making a copy of the string... Another note is that I
1988 thought I could just add a field to the urlpos structure saying
1989 that we'd written a .orig file for this URL, but that didn't work,
1990 so I had to make this separate list.
1991 -- Dan Harkless <wget@harkless.org>
1993 This [adding a field to the urlpos structure] didn't work
1994 because convert_file() is called from convert_all_links at
1995 the end of the retrieval with a freshly built new urlpos
1997 -- Hrvoje Niksic <hniksic@arsdigita.com>
1999 converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
2000 converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */
2001 converted_file_ptr->next = converted_files;
2002 converted_files = converted_file_ptr;
2006 static int find_fragment PARAMS ((const char *, int, const char **,
2009 /* Replace an attribute's original text with NEW_TEXT. */
/* P points at the attribute value (opening quote, if any); SIZE is
   the length of the original value including its quotes.  Writes the
   replacement to FP and returns the input position just past the
   consumed attribute text.  Any #fragment present in the old value
   is preserved after NEW_TEXT.  */
2012 replace_attr (const char *p, int size, FILE *fp, const char *new_text)
2015 char quote_char = '\"'; /* use "..." for quoting, unless the
2016 original value is quoted, in which
2017 case reuse its quoting char. */
2018 const char *frag_beg, *frag_end;
2020 /* Structure of our string is:
2021 "...old-contents..."
2022 <--- size ---> (with quotes)
2025 <--- size --> (no quotes) */
2027 if (*p == '\"' || *p == '\'')
2032 size -= 2; /* disregard opening and closing quote */
2034 putc (quote_char, fp);
2035 fputs (new_text, fp);
2037 /* Look for fragment identifier, if any. */
2038 if (find_fragment (p, size, &frag_beg, &frag_end))
2039 fwrite (frag_beg, 1, frag_end - frag_beg, fp);
2043 putc (quote_char, fp);
2048 /* The same as REPLACE_ATTR, but used when replacing
2049 <meta http-equiv=refresh content="new_text"> because we need to
2050 append "timeout_value; URL=" before the next_text. */
2053 replace_attr_refresh_hack (const char *p, int size, FILE *fp,
2054 const char *new_text, int timeout)
/* Stack buffer must hold the decimal TIMEOUT, the "; URL=" separator,
   NEW_TEXT, and the terminating NUL.  alloca is safe here: the buffer
   is consumed by replace_attr before we return.  */
2057 char *new_with_timeout = (char *)alloca (numdigit (timeout)
2061 sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
2063 return replace_attr (p, size, fp, new_with_timeout);
2066 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
2067 preceded by '&'. If the character is not found, return zero. If
2068 the character is found, return 1 and set BP and EP to point to the
2069 beginning and end of the region.
2071 This is used for finding the fragment identifiers in URLs. */
/* The '&' guard avoids mistaking SGML character references such as
   "&#38;" for a fragment separator.  */
2074 find_fragment (const char *beg, int size, const char **bp, const char **ep)
2076 const char *end = beg + size;
2078 for (; beg < end; beg++)
2100 /* The idea here was to quote ? as %3F to avoid passing part of the
2101 file name as the parameter when browsing the converted file through
2102 HTTP. However, actually doing that breaks local browsing because
2103 "index.html%3Ffoo=bar" isn't even recognized as an HTML file!
2104 Perhaps this should be controlled by an option, but for now I'm
2105 leaving the question marks.
2107 This is the original docstring of this function:
2109 FILE should be a relative link to a local file. It should be
2110 quoted as HTML because it will be used in HTML context. However,
2111 we need to quote ? as %3F to avoid passing part of the file name as
2112 the parameter. (This is not a problem when viewing locally, but is
2113 if the downloaded and converted tree is served by an HTTP
2116 /* Quote string as HTML. */
2119 local_quote_string (const char *file)
/* Per the note above, question marks are currently left alone: we
   return immediately, and the %3F-quoting code below is retained but
   not reached.  */
2121 return html_quote_string (file);
2124 const char *file_sans_qmark;
2125 int qm = count_char (file, '?');
2129 const char *from = file;
2132 /* qm * 2 because we replace each question mark with "%3F",
2133 i.e. replace one char with three, hence two more. */
2134 int fsqlen = strlen (file) + qm * 2;
2136 to = newname = (char *)alloca (fsqlen + 1);
2137 for (; *from; from++)
/* Verify the copy produced exactly the predicted length.  */
2148 assert (to - newname == fsqlen);
2151 file_sans_qmark = newname;
2154 file_sans_qmark = file;
2156 return html_quote_string (file_sans_qmark);
2160 /* We're storing "modes" of type downloaded_file_t in the hash table.
2161 However, our hash tables only accept pointers for keys and values.
2162 So when we need a pointer, we use the address of a
2163 downloaded_file_t variable of static storage. */
2165 static downloaded_file_t *
2166 downloaded_mode_to_ptr (downloaded_file_t mode)
/* Each enumerator maps to the address of a static variable holding
   that same value, so the returned pointer stays valid for the life
   of the program and is safe to store in the hash table.  */
2168 static downloaded_file_t
2169 v1 = FILE_NOT_ALREADY_DOWNLOADED,
2170 v2 = FILE_DOWNLOADED_NORMALLY,
2171 v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
2172 v4 = CHECK_FOR_FILE;
2176 case FILE_NOT_ALREADY_DOWNLOADED:
2178 case FILE_DOWNLOADED_NORMALLY:
2180 case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
2182 case CHECK_FOR_FILE:
2188 /* This should really be merged with dl_file_url_map and
2189 downloaded_html_files in recur.c. This was originally a list, but
2190 I changed it to a hash table because it was actually taking a lot of
2191 time to find things in it. */
/* Maps local file name (malloc'd string key) to a pointer into the
   static downloaded_file_t storage; see downloaded_mode_to_ptr.  */
2193 static struct hash_table *downloaded_files_hash;
2195 /* Remembers which files have been downloaded. In the standard case, should be
2196 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2197 download successfully (i.e. not for ones we have failures on or that we skip
2200 When we've downloaded a file and tacked on a ".html" extension due to -E,
2201 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2202 FILE_DOWNLOADED_NORMALLY.
2204 If you just want to check if a file has been previously added without adding
2205 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
2206 with local filenames, not remote URLs. */
2208 downloaded_file (downloaded_file_t mode, const char *file)
2210 downloaded_file_t *ptr;
2212 if (mode == CHECK_FOR_FILE)
2214 if (!downloaded_files_hash)
2215 return FILE_NOT_ALREADY_DOWNLOADED;
2216 ptr = hash_table_get (downloaded_files_hash, file);
2218 return FILE_NOT_ALREADY_DOWNLOADED;
2222 if (!downloaded_files_hash)
2223 downloaded_files_hash = make_string_hash_table (0);
2225 ptr = hash_table_get (downloaded_files_hash, file);
2229 ptr = downloaded_mode_to_ptr (mode);
2230 hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2232 return FILE_NOT_ALREADY_DOWNLOADED;
/* hash_table_map callback used when tearing down
   downloaded_files_hash; presumably frees the strdup'd KEY -- TODO
   confirm against the full definition.  */
2236 df_free_mapper (void *key, void *value, void *ignored)
2243 downloaded_files_free (void)
2245 if (downloaded_files_hash)
2247 hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
2248 hash_table_destroy (downloaded_files_hash);
2249 downloaded_files_hash = NULL;