2 Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
29 #include <sys/types.h>
/* Path-component predicates: DOTP(x) is true iff X is exactly ".",
   DDOTP(x) iff X is exactly ".." (NUL-terminated at that point).  */
47 #define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
49 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))
/* Forward declaration; PARAMS is the K&R/ANSI prototype compatibility
   macro used throughout this code base.  */
51 static int urlpath_length PARAMS ((const char *));
18 /* Supported schemes: */
/* NOTE(review): entry order must mirror enum url_scheme -- url_scheme()
   below returns the matching array index cast to that enum.  The array's
   terminating sentinel entry is outside this excerpt -- confirm.  */
61 static struct scheme_data supported_schemes[] =
63 { "http://", DEFAULT_HTTP_PORT, 1 },
65 { "https://", DEFAULT_HTTPS_PORT, 1 },
67 { "ftp://", DEFAULT_FTP_PORT, 1 },
/* Forward declaration for the relative-link constructor.  */
73 static char *construct_relative PARAMS ((const char *, const char *));
24 /* Support for encoding and decoding of URL strings. We determine
77 whether a character is unsafe through static table lookup. This
78 code assumes ASCII character set and 8-bit chars. */
/* Short aliases used to keep the 256-entry table below readable.  */
85 #define R urlchr_reserved
86 #define U urlchr_unsafe
/* Test character C against bit MASK in the lookup table; the cast to
   unsigned char guards against negative plain-char indexing.  */
89 #define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
91 /* rfc1738 reserved chars, preserved from encoding. */
93 #define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
95 /* rfc1738 unsafe chars, plus some more. */
97 #define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
/* Per-character classification table, indexed by unsigned char value.
   R = reserved, U = unsafe, RU = both (RU is defined outside this
   excerpt -- presumably R|U; confirm).
   NOTE(review): `const static' uses the obsolescent specifier order;
   `static const' is preferred (C99 6.11.5).  Left unchanged here.  */
34 const static unsigned char urlchr_table[256] =
101 U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
102 U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
103 U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
104 U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
105 U, 0, U, RU, 0, U, R, 0, /* SP ! " # $ % & ' */
106 0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */
107 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
108 0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
109 RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
110 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
111 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
112 0, 0, 0, U, U, U, U, 0, /* X Y Z [ \ ] ^ _ */
113 U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
114 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
115 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
116 0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */
/* All non-ASCII octets (128-255) are unsafe and get %-encoded.  */
118 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
119 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
120 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
121 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
123 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
124 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
125 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
126 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
59 /* Decodes the forms %xy in a URL to the character the hexadecimal
130 code of which is xy. xy are hexadecimal digits from
131 [0123456789ABCDEF] (case-insensitive). If x or y are not
132 hex-digits or `%' precedes `\0', the sequence is inserted
/* In-place decode: T (write cursor) trails H (read cursor), so the
   string only ever shrinks.  (Interior lines elided in this excerpt.)  */
136 decode_string (char *s)
138 char *t = s; /* t - tortoise */
139 char *h = s; /* h - hare */
66 /* Do nothing if '%' is not followed by two hex digits. */
151 if (!*(h + 1) || !*(h + 2)
152 || !(ISXDIGIT (*(h + 1)) && ISXDIGIT (*(h + 2))))
/* Valid %xy: store the decoded byte at the write cursor.  */
154 *t = (XCHAR_TO_XDIGIT (*(h + 1)) << 4) + XCHAR_TO_XDIGIT (*(h + 2));
70 /* Like encode_string, but return S if there are no unsafe chars. */
/* Two passes: first count unsafe chars to size the result, then copy,
   expanding each unsafe char to %XY.  (Interior lines elided.)  */
164 encode_string_maybe (const char *s)
171 for (p1 = s; *p1; p1++)
172 if (UNSAFE_CHAR (*p1))
173 addition += 2; /* Two more characters (hex digits) */
/* After the loop p1 points at the NUL, so p1 - s is strlen (s).  */
178 newlen = (p1 - s) + addition;
179 newstr = (char *)xmalloc (newlen + 1);
77 if (UNSAFE_CHAR (*p1))
187 unsigned char c = *p1++;
189 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
190 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
/* Sanity check: the second pass wrote exactly the predicted length.  */
196 assert (p2 - newstr == newlen);
82 /* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a
202 given string, returning a malloc-ed %XX encoded string. */
/* Wrapper over encode_string_maybe that always returns fresh storage
   the caller owns (tail of the function elided in this excerpt).  */
205 encode_string (const char *s)
207 char *encoded = encode_string_maybe (s);
86 /* Encode unsafe characters in PTR to %xx. If such encoding is done,
215 the old value of PTR is freed and PTR is made to point to the newly
216 allocated storage. */
/* Multi-statement macro in the conventional do { } while (0) form;
   PTR must be an lvalue holding heap storage.  */
218 #define ENCODE(ptr) do { \
219 char *e_new = encode_string_maybe (ptr); \
/* Per-character decision used by reencode_string's two passes.  */
91 enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
92 /* Decide whether to encode, decode, or pass through the char at P.
230 This used to be a macro, but it got a little too convoluted. */
94 static inline enum copy_method
232 decide_copy_method (const char *p)
/* P points at '%' here (the preceding check is elided in this excerpt);
   look ahead at the next two characters.  */
96 if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
97 /* %xx sequence: decode it, unless it would decode to an
239 unsafe or a reserved char; in that case, leave it as
/* Compute the byte the %xx sequence would decode to.  */
241 char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) +
242 XCHAR_TO_XDIGIT (*(p + 2));
101 if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt))
245 return CM_PASSTHROUGH;
103 /* Garbled %.. sequence: encode `%'. */
/* Ordinary character: encode only if unsafe and not reserved.  */
253 else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p))
256 return CM_PASSTHROUGH;
259 /* Translate a %-quoting (but possibly non-conformant) input string S
260 into a %-quoting (and conformant) output string. If no characters
261 are encoded or decoded, return the same string S; otherwise, return
262 a freshly allocated string with the new contents.
264 After a URL has been run through this function, the protocols that
265 use `%' as the quote character can use the resulting string as-is,
266 while those that don't call decode_string() to get to the intended
267 data. This function is also stable: after an input string is
268 transformed the first time, all further transformations of the
269 result yield the same result string.
271 Let's discuss why this function is needed.
273 Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw
274 space character would mess up the HTTP request, it needs to be
277 GET /abc%20def HTTP/1.0
279 So it appears that the unsafe chars need to be quoted, as with
280 encode_string. But what if we're requested to download
281 `abc%20def'? Remember that %-encoding is valid URL syntax, so what
282 the user meant was a literal space, and he was kind enough to quote
283 it. In that case, Wget should obviously leave the `%20' as is, and
284 send the same request as above. So in this case we may not call
287 But what if the requested URI is `abc%20 def'? If we call
288 encode_string, we end up with `/abc%2520%20def', which is almost
289 certainly not intended. If we don't call encode_string, we are
290 left with the embedded space and cannot send the request. What the
291 user meant was for Wget to request `/abc%20%20def', and this is
292 where reencode_string kicks in.
294 Wget used to solve this by first decoding %-quotes, and then
295 encoding all the "unsafe" characters found in the resulting string.
296 This was wrong because it didn't preserve certain URL special
297 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b
298 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
299 whether we considered `+' reserved (it is). One of these results
300 is inevitable because by the second step we would lose information
301 on whether the `+' was originally encoded or not. Both results
302 were wrong because in CGI parameters + means space, while %2B means
303 literal plus. reencode_string correctly translates the above to
304 "a%2B+b", i.e. returns the original string.
306 This function uses an algorithm proposed by Anon Sricharoenchai:
308 1. Encode all URL_UNSAFE and the "%" that are not followed by 2
311 2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
314 ...except that this code conflates the two steps, and decides
315 whether to encode, decode, or pass through each character in turn.
316 The function still uses two passes, but their logic is the same --
317 the first pass exists merely for the sake of allocation. Another
318 small difference is that we include `+' in URL_RESERVED.
322 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
324 "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"
328 "foo bar" -> "foo%20bar"
329 "foo%20bar" -> "foo%20bar"
330 "foo %20bar" -> "foo%20%20bar"
331 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%')
332 "foo%25%20bar" -> "foo%25%20bar"
333 "foo%2%20bar" -> "foo%252%20bar"
334 "foo+bar" -> "foo+bar" (plus is reserved!)
335 "foo%2b+bar" -> "foo%2b+bar" */
/* Normalize %-quoting in S; returns S itself when nothing changes,
   otherwise a freshly malloc-ed string (see the long comment above).  */
161 reencode_string (const char *s)
344 int encode_count = 0;
345 int decode_count = 0;
164 /* First, pass through the string to see if there's anything to do,
348 and to calculate the new length. */
166 for (p1 = s; *p1; p1++)
351 switch (decide_copy_method (p1))
168 if (!encode_count && !decode_count)
365 /* The string is good as it is. */
366 return (char *)s; /* C const model sucks. */
171 /* Each encoding adds two characters (hex digits), while each
370 decoding removes two characters. */
173 newlen = oldlen + 2 * (encode_count - decode_count);
372 newstr = xmalloc (newlen + 1);
/* Second pass: same decisions as the first, this time copying.  */
175 switch (decide_copy_method (p1))
/* CM_ENCODE: expand the byte to %XY.  */
383 unsigned char c = *p1++;
385 *p2++ = XDIGIT_TO_XCHAR (c >> 4);
386 *p2++ = XDIGIT_TO_XCHAR (c & 0xf);
/* CM_DECODE: collapse %xx to the byte it denotes.  */
390 *p2++ = ((XCHAR_TO_XDIGIT (*(p1 + 1)) << 4)
391 + (XCHAR_TO_XDIGIT (*(p1 + 2))));
392 p1 += 3; /* skip %xx */
/* Both passes must agree on the output length.  */
399 assert (p2 - newstr == newlen);
183 /* Run PTR_VAR through reencode_string. If a new string is consed,
404 free PTR_VAR and make it point to the new storage. Obviously,
405 PTR_VAR needs to be an lvalue. */
/* Pointer identity (rf_new != ptr_var) is how reencode_string signals
   that a new string was allocated.  */
186 #define REENCODE(ptr_var) do { \
408 char *rf_new = reencode_string (ptr_var); \
409 if (rf_new != ptr_var) \
189 /* Returns the scheme type if the scheme is supported, or
417 SCHEME_INVALID if not. */
191 url_scheme (const char *url)
/* Walk the scheme table; loop termination relies on a NULL
   leading_string sentinel entry (not visible in this excerpt).  */
192 for (i = 0; supported_schemes[i].leading_string; i++)
424 if (0 == strncasecmp (url, supported_schemes[i].leading_string,
425 strlen (supported_schemes[i].leading_string)))
/* A matching prefix counts only if the scheme is enabled; the
   array index doubles as the enum url_scheme value.  */
195 if (supported_schemes[i].enabled)
428 return (enum url_scheme) i;
430 return SCHEME_INVALID;
/* No table entry matched the URL's prefix.  */
433 return SCHEME_INVALID;
199 /* Return the number of characters needed to skip the scheme part of
437 the URL, e.g. `http://'. If no scheme is found, returns 0. */
201 url_skip_scheme (const char *url)
202 /* Skip the scheme name. We allow `-' and `+' because of `whois++',
445 while (ISALNUM (*p) || *p == '-' || *p == '+')
204 /* Skip "//" if found. */
205 if (*p == '/' && *(p + 1) == '/')
206 /* Returns 1 if the URL begins with a scheme (supported or
460 unsupported), 0 otherwise. */
208 url_has_scheme (const char *url)
/* Same scheme-name alphabet as url_skip_scheme above; the check for
   the trailing ':' is elided in this excerpt.  */
209 while (ISALNUM (*p) || *p == '-' || *p == '+')
/* Return the default port for SCHEME, per the scheme table.  */
210 scheme_default_port (enum url_scheme scheme)
473 return supported_schemes[scheme].default_port;
/* Disable SCHEME so url_scheme() stops recognizing it.  */
212 scheme_disable (enum url_scheme scheme)
479 supported_schemes[scheme].enabled = 0;
214 /* Skip the username and password, if present here. The function
483 should be called *not* with the complete URL, but with the part
484 right after the scheme.
217 If no username and password are found, return 0. */
218 url_skip_uname (const char *url)
219 /* Look for '@' that comes before '/' or '?'. */
/* strpbrk finds whichever of '/', '?' or '@' comes first; only a
   leading '@' hit means a userinfo part is present.  */
220 p = (const char *)strpbrk (url, "/?@");
/* Split the LEN-byte userinfo STR ("user" or "user:password") into
   freshly allocated *USER and *PASSWD.  Returns zero on failure
   (return statements elided in this excerpt).  */
221 parse_uname (const char *str, int len, char **user, char **passwd)
222 /* Empty user name not allowed. */
/* Optional ':' separates user from password.  */
223 colon = memchr (str, ':', len);
224 /* Empty user name again. */
/* Copy out the password portion after the colon, NUL-terminated.  */
225 int pwlen = len - (colon + 1 - str);
517 *passwd = xmalloc (pwlen + 1);
518 memcpy (*passwd, colon + 1, pwlen);
519 (*passwd)[pwlen] = '\0';
/* Copy out the user name (LEN has been trimmed to the user part by
   code elided from this excerpt -- confirm against full source).  */
229 *user = xmalloc (len + 1);
526 memcpy (*user, str, len);
231 /* Used by main.c: detect URLs written using the "shorthand" URL forms
533 popularized by Netscape and NcFTP. HTTP shorthands look like this:
233 www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file
536 www.foo.com[:port] -> http://www.foo.com[:port]
235 FTP shorthands look like this:
236 foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file
541 foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file
238 If the URL needs not or cannot be rewritten, return NULL. */
239 rewrite_shorthand_url (const char *url)
/* A URL that already has a scheme needs no rewriting.  */
240 if (url_has_scheme (url))
241 /* Look for a ':' or '/'. The former signifies NcFTP syntax, the
242 for (p = url; *p && *p != ':' && *p != '/'; p++)
243 /* If the characters after the colon and before the next slash
565 or end of string are all digits, it's HTTP. */
245 for (pp = p + 1; ISDIGIT (*pp); pp++)
246 if (digits > 0 && (*pp == '/' || *pp == '\0'))
247 /* Prepend "ftp://" to the entire URL... */
/* 6 == strlen ("ftp://"); +1 for the terminating NUL.  */
248 res = xmalloc (6 + strlen (url) + 1);
574 sprintf (res, "ftp://%s", url);
250 /* ...and replace ':' with '/'. */
251 res[6 + (p - url)] = '/';
252 /* Just prepend "http://" to what we have. */
/* 7 == strlen ("http://").  */
253 res = xmalloc (7 + strlen (url) + 1);
585 sprintf (res, "http://%s", url);
/* Forward declaration used by url_parse below.  */
255 static void parse_path PARAMS ((const char *, char **, char **));
/* Like strpbrk, but never returns NULL: when none of the ACCEPT
   characters occur in S, return a pointer to S's terminating '\0'
   instead.  The cast discards const, mirroring strpbrk's contract.  */
static char *
strpbrk_or_eos (const char *s, const char *accept)
{
  char *hit = strpbrk (s, accept);
  if (hit == NULL)
    hit = (char *) s + strlen (s);
  return hit;
}
259 /* Turn STR into lowercase; return non-zero if a character was
/* In-place mutation; the return value lets callers know the host name
   changed (used by url_parse to decide whether to rebuild u->url).  */
260 lowercase_str (char *str)
612 *str = TOLOWER (*str);
/* Error messages indexed by the PE_* codes defined alongside each
   entry; url_error() maps a code back to its string.  */
262 static char *parse_errors[] = {
618 #define PE_NO_ERROR 0
264 #define PE_UNSUPPORTED_SCHEME 1
621 "Unsupported scheme",
266 #define PE_EMPTY_HOST 2
267 #define PE_BAD_PORT_NUMBER 3
268 #define PE_INVALID_USER_NAME 4
/* Store error code V through pointer P iff P is non-NULL (macro body
   continues on lines elided from this excerpt).  */
269 #define SETERR(p, v) do { \
270 Return a new struct url if successful, NULL on error. In case of
638 error, and if ERROR is not NULL, also set *ERROR to the appropriate
272 url_parse (const char *url, int *error)
273 int path_modified, host_modified;
274 enum url_scheme scheme;
/* Begin/end pointer pairs delimiting each URL component inside the
   (possibly reencoded) input string; no copying until the end.  */
275 const char *uname_b, *uname_e;
650 const char *host_b, *host_e;
651 const char *path_b, *path_e;
652 const char *params_b, *params_e;
653 const char *query_b, *query_e;
654 const char *fragment_b, *fragment_e;
281 char *user = NULL, *passwd = NULL;
/* Reject URLs whose scheme we do not support.  */
282 scheme = url_scheme (url);
662 if (scheme == SCHEME_INVALID)
664 SETERR (error, PE_UNSUPPORTED_SCHEME);
/* Normalize %-quoting first; url_encoded == url when unchanged.  */
285 url_encoded = reencode_string (url);
671 p += strlen (supported_schemes[scheme].leading_string);
673 p += url_skip_uname (p);
288 /* scheme://user:pass@host[:port]... */
289 /* We attempt to break down the URL into the components path,
680 params, query, and fragment. They are ordered like this:
291 scheme://host[:port][/path][;params][?query][#fragment] */
292 params_b = params_e = NULL;
685 query_b = query_e = NULL;
686 fragment_b = fragment_e = NULL;
/* Host ends at the first of ':', '/', ';', '?', '#' or NUL.  */
295 p = strpbrk_or_eos (p, ":/;?#");
296 if (host_b == host_e)
694 SETERR (error, PE_EMPTY_HOST);
/* No explicit port: fall back to the scheme's default.  */
298 port = scheme_default_port (scheme);
299 const char *port_b, *port_e, *pp;
300 /* scheme://host:port/tralala */
301 p = strpbrk_or_eos (p, "/;?#");
302 if (port_b == port_e)
303 /* http://host:/whatever */
714 SETERR (error, PE_BAD_PORT_NUMBER);
/* Parse the port digits manually; any non-digit is an error.
   NOTE(review): no overflow guard is visible in this excerpt.  */
305 for (port = 0, pp = port_b; pp < port_e; pp++)
306 /* http://host:12randomgarbage/blah */
724 SETERR (error, PE_BAD_PORT_NUMBER);
727 port = 10 * port + (*pp - '0');
/* Successive delimiter scans peel off path, params, query.  */
309 p = strpbrk_or_eos (p, ";?#");
310 /* Path is not allowed not to exist. */
311 p = strpbrk_or_eos (p, "?#");
312 p = strpbrk_or_eos (p, "#");
/* Userinfo was present before the host: split user/password.  */
313 if (uname_b != uname_e)
314 /* http://user:pass@host */
315 /* uname_b uname_e */
316 if (!parse_uname (uname_b, uname_e - uname_b - 1, &user, &passwd))
774 SETERR (error, PE_INVALID_USER_NAME);
/* All components located; build the result structure.  */
318 u = (struct url *)xmalloc (sizeof (struct url));
780 memset (u, 0, sizeof (*u));
320 u->host = strdupdelim (host_b, host_e);
321 u->path = strdupdelim (path_b, path_e);
789 path_modified = path_simplify (u->path);
790 parse_path (u->path, &u->dir, &u->file);
/* Host names are case-insensitive; canonicalize to lowercase.  */
324 host_modified = lowercase_str (u->host);
325 u->params = strdupdelim (params_b, params_e);
326 u->query = strdupdelim (query_b, query_e);
327 u->fragment = strdupdelim (fragment_b, fragment_e);
328 if (path_modified || u->fragment || host_modified || path_b == path_e)
329 /* If we suspect that a transformation has rendered what
804 url_string might return different from URL_ENCODED, rebuild
805 u->url using url_string. */
332 u->url = url_string (u, 0);
/* url_encoded is a separate allocation only when reencode_string
   actually changed something; free it in that case.  */
333 if (url_encoded != url)
809 xfree ((char *) url_encoded);
/* Otherwise adopt url_encoded (or a copy of URL) as u->url.  */
335 if (url_encoded == url)
814 u->url = xstrdup (url);
337 u->url = url_encoded;
/* Map a PE_* error code from url_parse to its message string.  */
338 url_error (int error_code)
826 assert (error_code >= 0 && error_code < ARRAY_SIZE (parse_errors));
827 return parse_errors[error_code];
/* Split QUOTED_PATH into *DIR and *FILE at the last '/', after
   %-decoding a temporary copy; both outputs are freshly allocated.  */
341 parse_path (const char *quoted_path, char **dir, char **file)
342 char *path, *last_slash;
/* Work on a stack copy so the caller's string is untouched.  */
343 STRDUP_ALLOCA (path, quoted_path);
836 decode_string (path);
345 last_slash = strrchr (path, '/');
/* No slash: the whole path is the file name (dir handling elided).  */
346 *file = xstrdup (path);
/* Slash found: split around it.  */
347 *dir = strdupdelim (path, last_slash);
847 *file = xstrdup (last_slash + 1);
349 /* Note: URL's "full path" is the path with the query string and
852 params appended. The "fragment" (#foo) is intentionally ignored,
853 but that might be changed. For example, if the original URL was
854 "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
855 the full path will be "/foo/bar/baz;bullshit?querystring". */
354 /* Return the length of the full path, without the terminating
355 full_path_length (const struct url *url)
/* Each present component costs its length plus one delimiter char.  */
356 #define FROB(el) if (url->el) len += 1 + strlen (url->el)
357 /* Write out the full path. */
/* Writes into WHERE, which full_path_length() has sized; does not
   NUL-terminate (callers do that -- see url_full_path).  */
358 full_path_write (const struct url *url, char *where)
359 #define FROB(el, chr) do { \
882 char *f_el = url->el; \
361 int l = strlen (f_el); \
886 memcpy (where, f_el, l); \
363 /* Public function for getting the "full path". E.g. if u->path is
899 "foo/bar" and u->query is "param=value", full_path will be
900 "/foo/bar?param=value". */
366 url_full_path (const struct url *url)
/* Size, fill, then terminate -- full_path_write does not write NUL.  */
367 int length = full_path_length (url);
906 char *full_path = (char *)xmalloc(length + 1);
369 full_path_write (url, full_path);
909 full_path[length] = '\0';
371 /* Sync u->path and u->url with u->dir and u->file. */
372 sync_path (struct url *url)
/* Empty dir: path is just the file name.  */
373 newpath = xstrdup (url->file);
/* Otherwise join dir and file with a single '/'.  */
374 int dirlen = strlen (url->dir);
931 int filelen = strlen (url->file);
376 newpath = xmalloc (dirlen + 1 + filelen + 1);
934 memcpy (newpath, url->dir, dirlen);
935 newpath[dirlen] = '/';
936 memcpy (newpath + dirlen + 1, url->file, filelen);
937 newpath[dirlen + 1 + filelen] = '\0';
381 /* Synchronize u->url. */
382 url->url = url_string (url, 0);
383 /* Mutators. Code in ftp.c insists on changing u->dir and u->file.
949 This way we can sync u->path and u->url when they get changed. */
/* Replace u->dir; freeing of the old value and the sync_path call are
   elided from this excerpt.  */
385 url_set_dir (struct url *url, const char *newdir)
955 url->dir = xstrdup (newdir);
/* Replace u->file; same pattern as url_set_dir.  */
387 url_set_file (struct url *url, const char *newfile)
963 url->file = xstrdup (newfile);
/* Release a struct url and its owned strings; FREE_MAYBE handles the
   members that may be NULL.  */
389 url_free (struct url *url)
974 FREE_MAYBE (url->params);
975 FREE_MAYBE (url->query);
976 FREE_MAYBE (url->fragment);
977 FREE_MAYBE (url->user);
978 FREE_MAYBE (url->passwd);
/* Read FILE and return a linked list of urlpos entries, one per
   non-blank line; each line is optionally merged with opt.base_href
   and then parsed.  */
395 get_urls_file (const char *file)
989 struct file_memory *fm;
990 struct urlpos *head, *tail;
991 const char *text, *text_end;
/* Slurp the whole file into memory; bail out with a logged errno
   message on failure.  */
399 fm = read_file (file);
997 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
401 DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
402 text_end = fm->content + fm->length;
403 while (text < text_end)
404 const char *line_beg = text;
/* Lines are '\n'-delimited; a missing final newline is tolerated.  */
405 const char *line_end = memchr (text, '\n', text_end - text);
1010 line_end = text_end;
407 /* Strip whitespace from the beginning and end of line. */
408 while (line_beg < line_end && ISSPACE (*line_beg))
1018 while (line_end > line_beg && ISSPACE (*(line_end - 1)))
/* Skip lines that were all whitespace.  */
410 if (line_end > line_beg)
411 /* URL is in the [line_beg, line_end) region. */
412 struct urlpos *entry;
413 /* We must copy the URL to a zero-terminated string, and we
1031 can't use alloca because we're in a loop. *sigh*. */
415 url_text = strdupdelim (line_beg, line_end);
416 /* Merge opt.base_href with URL. */
417 char *merged = uri_merge (opt.base_href, url_text);
418 url = url_parse (url_text, &up_error_code);
/* Report unparsable URLs but keep processing the rest of the file.  */
419 logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
1046 file, url_text, url_error (up_error_code));
421 entry = (struct urlpos *)xmalloc (sizeof (struct urlpos));
1053 memset (entry, 0, sizeof (*entry));
423 read_file_free (fm);
424 /* Free the linked list of urlpos. */
425 free_urlpos (struct urlpos *l)
/* Save the next pointer before the node is freed.  */
426 struct urlpos *next = l->next;
427 FREE_MAYBE (l->local_name);
428 /* Rotate FNAME opt.backups times */
429 rotate_backups(const char *fname)
/* Buffer big enough for "FNAME.<backups>" plus NUL.  */
430 int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1088 char *from = (char *)alloca (maxlen);
1089 char *to = (char *)alloca (maxlen);
/* Only rotate regular files.  */
433 if (stat (fname, &sb) == 0)
1094 if (S_ISREG (sb.st_mode) == 0)
/* Shift FNAME.(i-1) -> FNAME.i from the oldest down.  */
435 for (i = opt.backups; i > 1; i--)
1099 sprintf (from, "%s.%d", fname, i - 1);
1100 sprintf (to, "%s.%d", fname, i);
438 /* #### This will fail on machines without the rename() system
/* Finally FNAME itself becomes FNAME.1.  */
439 sprintf (to, "%s.%d", fname, 1);
440 /* Create all the necessary directories for PATH (a file). Calls
1111 mkdirhier() internally. */
442 mkalldirs (const char *path)
/* Scan backwards from the end to find the last '/'.  */
443 p = path + strlen (path);
1121 for (; *p != '/' && p != path; p--);
445 /* Don't create if it's just a file. */
446 if ((p == path) && (*p != '/'))
447 t = strdupdelim (path, p);
448 /* Check whether the directory exists. */
449 if ((stat (t, &st) == 0))
450 if (S_ISDIR (st.st_mode))
451 /* If the dir exists as a file name, remove it first. This
1137 is *only* for Wget to work with buggy old CERN http
1138 servers. Here is the scenario: When Wget tries to
1139 retrieve a directory without a slash, e.g.
1140 http://foo/bar (bar being a directory), CERN server will
1141 not redirect it to http://foo/bar/ -- it will generate a
1142 directory listing containing links to bar/file1,
1143 bar/file2, etc. Wget will lose because it saves this
1144 HTML listing to a file `bar', so it cannot create the
1145 directory. To work around this, if the file of the same
1146 name exists, we just remove it and create the directory
462 DEBUGP (("Removing %s because of directory danger!\n", t));
463 res = make_directory (t);
/* Directory creation failed: log it and fall through.  */
464 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1160 count_slashes (const char *s)
466 /* Return the path name of the URL-equivalent file name, with a
1170 remote-like structure of directories. */
468 mkstruct (const struct url *u)
469 char *dir, *dir_preencoding;
470 char *file, *res, *dirpref;
471 char *query = u->query && *u->query ? u->query : NULL;
/* --cut-dirs handling: skip a leading '/' and then drop up to
   opt.cut_dirs leading path components.  */
472 char *ptr = u->dir + (*u->dir == '/');
1182 int slash_count = 1 + count_slashes (ptr);
1183 int cut = MINVAL (opt.cut_dirs, slash_count);
475 for (; cut && *ptr; ptr++)
476 STRDUP_ALLOCA (dir, ptr);
/* No cutting: use u->dir directly, minus any leading '/'.  */
477 dir = u->dir + (*u->dir == '/');
478 /* Check for the true name (or at least a consistent name for saving
1193 to directory) of HOST, reusing the hlist if possible. */
480 if (opt.add_hostdir)
481 /* Add dir_prefix and hostname (if required) to the beginning of
/* Room for prefix, '/', host, ':', port digits (remainder of the
   expression elided in this excerpt).  */
482 dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1
1200 + 1 + numdigit (u->port)
484 if (!DOTP (opt.dir_prefix))
1203 sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host);
486 strcpy (dirpref, u->host);
/* Append ":port" only for non-default ports.  */
487 if (u->port != scheme_default_port (u->scheme))
1209 int len = strlen (dirpref);
1211 number_to_string (dirpref + len + 1, u->port);
490 else /* not add_hostdir */
491 if (!DOTP (opt.dir_prefix))
1217 dirpref = opt.dir_prefix;
493 /* If there is a prefix, prepend it. */
494 char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2);
1226 sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir);
/* Re-quote the directory; reencode_string returns DIR itself when
   nothing changed, hence the pointer comparison at the end.  */
496 dir_preencoding = dir;
1231 dir = reencode_string (dir_preencoding);
/* Strip a trailing '/' from the directory.  */
498 if (l && dir[l - 1] == '/')
/* Empty file component defaults to index.html.  */
499 file = "index.html";
500 /* Finally, construct the full name. */
501 res = (char *)xmalloc (strlen (dir) + 1 + strlen (file)
1244 + (query ? (1 + strlen (query)) : 0)
503 sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file);
504 strcat (res, query);
/* Free the reencoded dir only if it is a fresh allocation.  */
505 if (dir != dir_preencoding)
506 /* Compose a file name out of BASE, an unescaped file name, and QUERY,
1258 an escaped query string. The trick is to make sure that unsafe
1259 characters in BASE are escaped, and that slashes in QUERY are also
509 compose_file_name (char *base, char *query)
510 /* Copy BASE to RESULT and encode all unsafe characters. */
/* RESULT is a fixed-size local buffer; every write is bounds-checked
   against sizeof (result).  NOTE(review): `to - result' is ptrdiff_t
   compared with a size_t -- benign here since TO never precedes
   RESULT, but worth confirming in the full source.  */
511 while (*from && to - result < sizeof (result))
512 if (UNSAFE_CHAR (*from))
/* Expand an unsafe byte to %XY.  */
513 unsigned char c = *from++;
1277 *to++ = XDIGIT_TO_XCHAR (c >> 4);
1278 *to++ = XDIGIT_TO_XCHAR (c & 0xf);
/* Append '?' and the query if one is present and space remains.  */
516 if (query && to - result < sizeof (result))
517 /* Copy QUERY to RESULT and encode all '/' characters. */
518 while (*from && to - result < sizeof (result))
519 if (to - result < sizeof (result))
520 /* Truncate input which is too long, presumably due to a huge
/* Guarantee NUL-termination even on truncation.  */
521 result[sizeof (result) - 1] = '\0';
/* Caller owns the returned heap copy.  */
522 return xstrdup (result);
523 /* Create a unique filename, corresponding to a given URL. Calls
1315 mkstruct if necessary. Does *not* actually create any directories. */
525 url_filename (const struct url *u)
526 int have_prefix = 0; /* whether we must prepend opt.dir_prefix */
/* Directory structure requested: delegate to mkstruct.  */
527 file = mkstruct (u);
/* Flat layout: compose from the file component (or index.html when
   the URL path is empty) plus any query string.  */
528 char *base = *u->file ? u->file : "index.html";
1330 char *query = u->query && *u->query ? u->query : NULL;
530 file = compose_file_name (base, query);
531 /* Check whether the prefix directory is something other than "."
1337 before prepending it. */
533 if (!DOTP (opt.dir_prefix))
534 char *nfile = (char *)xmalloc (strlen (opt.dir_prefix)
1341 + 1 + strlen (file) + 1);
536 sprintf (nfile, "%s/%s", opt.dir_prefix, file);
537 /* DOS-ish file systems don't like `%' signs in them; we change it
538 for (p = file; *p; p++)
539 #endif /* WINDOWS */
540 /* Check the cases in which the unique extensions are not used:
1359 1) Clobbering is turned off (-nc).
1360 2) Retrieval with regetting.
1361 3) Timestamping is used.
1362 4) Hierarchy is built.
545 The exception is the case when file does exist and is a
1365 directory (actually support for bad httpd-s). */
547 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
1367 && !(file_exists_p (file) && !file_non_directory_p (file)))
549 /* Find a unique name. */
550 name = unique_name (file);
551 /* Like strlen(), but stop at the first '?', ';' or '#' -- i.e. return
   the length of the path part of URL, excluding query, params and
   fragment. */
552 urlpath_length (const char *url)
553 const char *q = strpbrk_or_eos (url, "?;#");
554 /* Find the last occurrence of character C in the range [b, e), or
1385 NULL, if none are present. This is almost completely equivalent to
1386 { *e = '\0'; return strrchr(b); }, except that it doesn't change
1387 the contents of the string. */
558 find_last_char (const char *b, const char *e, char c)
559 /* Resolve the result of "linking" a base URI (BASE) to a
1398 link-specified URI (LINK).
561 Either of the URIs may be absolute or relative, complete with the
1401 host name, or path only. This tries to behave "reasonably" in all
1402 foreseeable cases. It employs little specific knowledge about
1403 schemes or URL-specific stuff -- it just works on strings.
565 The parameters LINKLENGTH is useful if LINK is not zero-terminated.
1406 See uri_merge for a gentler interface to this functionality.
567 Perhaps this function should handle `./' and `../' so that the evil
1409 path_simplify can go. */
569 uri_merge_1 (const char *base, const char *link, int linklength, int no_scheme)
/* END marks where BASE's path stops (before '?', ';' or '#').  */
570 const char *end = base + urlpath_length (base);
571 /* Empty LINK points back to BASE, query string and all. */
572 constr = xstrdup (base);
573 else if (*link == '?')
574 /* LINK points to the same location, but changes the query
1427 string. Examples: */
576 /* uri_merge("path", "?new") -> "path?new" */
577 /* uri_merge("path?foo", "?new") -> "path?new" */
578 /* uri_merge("path?foo#bar", "?new") -> "path?new" */
579 /* uri_merge("path#foo", "?new") -> "path?new" */
580 int baselength = end - base;
1433 constr = xmalloc (baselength + linklength + 1);
1434 memcpy (constr, base, baselength);
1435 memcpy (constr + baselength, link, linklength);
1436 constr[baselength + linklength] = '\0';
585 else if (*link == '#')
586 /* uri_merge("path", "#new") -> "path#new" */
587 /* uri_merge("path#foo", "#new") -> "path#new" */
588 /* uri_merge("path?foo", "#new") -> "path?foo#new" */
589 /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
/* Keep everything up to any existing fragment, then append LINK.  */
590 const char *end1 = strchr (base, '#');
1447 end1 = base + strlen (base);
1448 baselength = end1 - base;
1449 constr = xmalloc (baselength + linklength + 1);
1450 memcpy (constr, base, baselength);
1451 memcpy (constr + baselength, link, linklength);
1452 constr[baselength + linklength] = '\0';
597 else if (*link == '/')
598 /* LINK is an absolute path: we need to replace everything
1457 after (and including) the FIRST slash with LINK.
600 So, if BASE is "http://host/whatever/foo/bar", and LINK is
1460 "/qux/xyzzy", our result should be
1461 "http://host/qux/xyzzy". */
603 const char *start_insert = NULL; /* for gcc to shut up. */
604 const char *pos = base;
605 int seen_slash_slash = 0;
606 /* We're looking for the first slash, but want to ignore
607 slash = memchr (pos, '/', end - pos);
608 if (slash && !seen_slash_slash)
609 if (*(slash + 1) == '/')
610 seen_slash_slash = 1;
611 /* At this point, SLASH is the location of the first / after
1480 "//", or the first slash altogether. START_INSERT is the
1481 pointer to the location where LINK will be inserted. When
1482 examining the last two examples, keep in mind that LINK
615 if (!slash && !seen_slash_slash)
616 /* example: "foo" */
617 start_insert = base;
618 else if (!slash && seen_slash_slash)
619 /* example: "http://foo" */
620 else if (slash && !seen_slash_slash)
621 /* example: "foo/bar" */
622 start_insert = base;
623 else if (slash && seen_slash_slash)
624 /* example: "http://something/" */
625 start_insert = slash;
/* Keep [base, start_insert), then append LINK.  */
626 span = start_insert - base;
1503 constr = (char *)xmalloc (span + linklength + 1);
628 memcpy (constr, base, span);
629 memcpy (constr + span, link, linklength);
1508 constr[span + linklength] = '\0';
631 /* LINK is a relative URL: we need to replace everything
1513 after last slash (possibly empty) with LINK.
633 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1516 our result should be "whatever/foo/qux/xyzzy". */
635 int need_explicit_slash = 0;
636 const char *start_insert;
637 const char *last_slash = find_last_char (base, end, '/');
638 /* No slash found at all. Append LINK to what we have,
1524 but we'll need a slash as a separator.
640 Example: if base == "foo" and link == "qux/xyzzy", then
1527 we cannot just append link to base, because we'd get
1528 "fooqux/xyzzy", whereas what we want is
643 To make sure the / gets inserted, we set
1532 need_explicit_slash to 1. We also set start_insert
1533 to end + 1, so that the length calculations work out
1534 correctly for one more (slash) character. Accessing
1535 that character is fine, since it will be the
1536 delimiter, '\0' or '?'. */
649 /* example: "foo?..." */
650 /* ^ ('?' gets changed to '/') */
651 start_insert = end + 1;
1540 need_explicit_slash = 1;
653 else if (last_slash && last_slash != base && *(last_slash - 1) == '/')
654 /* example: http://host" */
655 start_insert = end + 1;
1547 need_explicit_slash = 1;
657 /* example: "whatever/foo/bar" */
658 start_insert = last_slash + 1;
659 span = start_insert - base;
1557 constr = (char *)xmalloc (span + linklength + 1);
661 memcpy (constr, base, span);
/* Overwrite the delimiter slot with the separating '/'.  */
662 if (need_explicit_slash)
1561 constr[span - 1] = '/';
664 memcpy (constr + span, link, linklength);
1564 constr[span + linklength] = '\0';
/* LINK already carries a scheme: it stands on its own.  */
666 else /* !no_scheme */
1569 constr = strdupdelim (link, link + linklength);
668 /* Merge BASE with LINK and return the resulting URI. This is an
1575 interface to uri_merge_1 that assumes that LINK is a
1576 zero-terminated string. */
671 uri_merge (const char *base, const char *link)
672 return uri_merge_1 (base, link, strlen (link), !url_has_scheme (link));
/* Append string S at buffer position P; the (elided) continuation of
   this macro advances P past the copied bytes.  strlen is cached so S
   is measured only once.  NOTE(review): the macro's tail is not
   visible in this excerpt.  */
#define APPEND(p, s) do {			\
  int len = strlen (s);				\
  memcpy (p, s, len);				\
/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  */
#define HIDDEN_PASSWORD "*password*"
/* Recreate the URL string from the data in URL.

   If HIDE_PASSWORD is non-zero (as it is when we're calling this on a
   URL we plan to print, but not when calling it to canonicalize a URL
   for use within the program), the password is replaced by the
   HIDDEN_PASSWORD placeholder.  Unsafe characters in the URL will be
   quoted.  The result is freshly allocated; the caller frees it.  */

url_string (const struct url *url, int hide_password)
  char *quoted_user = NULL, *quoted_passwd = NULL;
  /* Scheme's default port (to decide whether ":port" is needed) and
     its leading string ("http://", "ftp://", ...).  */
  int scheme_port = supported_schemes[url->scheme].default_port;
  char *scheme_str = supported_schemes[url->scheme].leading_string;
  int fplen = full_path_length (url);

  assert (scheme_str != NULL);

  /* Make sure the user name and password are quoted.  The guarding
     conditionals (user/password present, HIDE_PASSWORD) are elided in
     this excerpt.  */
      quoted_user = encode_string_maybe (url->user);
	quoted_passwd = HIDDEN_PASSWORD;
	quoted_passwd = encode_string_maybe (url->passwd);

  /* Compute the exact result size up front so one allocation
     suffices; the assert further down re-checks this accounting.  */
  size = (strlen (scheme_str)
	  + strlen (url->host)
  if (url->port != scheme_port)
    /* ':' plus the decimal digits of the port.  */
    size += 1 + numdigit (url->port);
      size += 1 + strlen (quoted_user);
	size += 1 + strlen (quoted_passwd);

  p = result = xmalloc (size);

  /* Assemble: scheme, [user[:password]@], host, [:port], path.  */
  APPEND (p, scheme_str);
      APPEND (p, quoted_user);
	  APPEND (p, quoted_passwd);
  APPEND (p, url->host);
  if (url->port != scheme_port)
      p = number_to_string (p, url->port);
  full_path_write (url, p);

  assert (p - result == size);

  /* Free the quoted copies only when encode_string_maybe actually
     allocated one (the pointer comparisons below imply it may return
     its argument), and never the static HIDDEN_PASSWORD literal
     (excluded by !hide_password).  */
  if (quoted_user && quoted_user != url->user)
    xfree (quoted_user);
  if (quoted_passwd && !hide_password
      && quoted_passwd != url->passwd)
    xfree (quoted_passwd);
/* Returns proxy host address, in accordance with SCHEME.  Consults
   the command-line options first and falls back to the conventional
   environment variables (http_proxy, https_proxy, ftp_proxy).  */

getproxy (enum url_scheme scheme)
  char *rewritten_url;
  /* Static so the rewritten proxy string survives this call.  */
  static char rewritten_storage[1024];

  /* Per-scheme selection; the switch header and remaining case
     labels are elided in this excerpt.  */
      proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
      proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
      proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
    case SCHEME_INVALID:

  if (!proxy || !*proxy)

  /* Handle shorthands. */
  rewritten_url = rewrite_shorthand_url (proxy);
      /* strncpy does not NUL-terminate on truncation, hence the
	 explicit terminator on the following line.  */
      strncpy (rewritten_storage, rewritten_url, sizeof(rewritten_storage));
      rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
      proxy = rewritten_storage;
/* Should a host be accessed through proxy, concerning no_proxy?
   Returns non-zero when HOST should go through the proxy, i.e. when
   it does not suffix-match any entry of the NO_PROXY list.  */

no_proxy_match (const char *host, const char **no_proxy)
  return !sufmatch (no_proxy, host);
/* Forward declarations for the link-conversion helpers defined
   below.  NOTE(review): the continuation of the replace_attr
   prototype is elided in this excerpt.  */
static void write_backup_file PARAMS ((const char *, downloaded_file_t));
static const char *replace_attr PARAMS ((const char *, int, FILE *,
static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
						      const char *, int));
static char *local_quote_string PARAMS ((const char *));
/* Change the links in one HTML file.  LINKS is a list of links in the
   document, along with their positions and the desired direction of
   the conversion.  Rewrites FILE in place (after optionally backing
   it up via write_backup_file).  */
convert_links (const char *file, struct urlpos *links)
  struct file_memory *fm;
  downloaded_file_t downloaded_file_return;
  struct urlpos *link;
  /* Counters reported at the end ("to-file - to-url"); the increments
     are elided in this excerpt.  */
  int to_url_count = 0, to_file_count = 0;

  logprintf (LOG_VERBOSE, _("Converting %s... "), file);

    /* First we do a "dry run": go through the list L and see whether
       any URL needs to be converted in the first place.  If not, just
       leave the file alone.  */
    struct urlpos *dry = links;
    for (dry = links; dry; dry = dry->next)
      if (dry->convert != CO_NOCONVERT)
	logputs (LOG_VERBOSE, _("nothing to do.\n"));

  /* Read the entire file into memory (possibly mmaped).  */
  fm = read_file (file);
  /* read_file failure path; the guard is elided in this excerpt.  */
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
		 file, strerror (errno));

  downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
  if (opt.backup_converted && downloaded_file_return)
    /* -K: preserve the pristine copy before we overwrite FILE.  */
    write_backup_file (file, downloaded_file_return);

  /* Before opening the file for writing, unlink the file.  This is
     important if the data in FM is mmaped.  In such case, nulling the
     file, which is what fopen() below does, would make us read all
     zeroes from the mmaped region.  */
  if (unlink (file) < 0 && errno != ENOENT)
      logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
		 file, strerror (errno));
      read_file_free (fm);

  /* Now open the file for writing.  */
  fp = fopen (file, "wb");
  /* fopen failure path; the guard is elided in this excerpt.  */
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
		 file, strerror (errno));
      read_file_free (fm);

  /* Here we loop through all the URLs in file, replacing those of
     them that are downloaded with relative references.  */
  for (link = links; link; link = link->next)
      char *url_start = fm->content + link->pos;

      /* A position past the buffer means the LINKS list disagrees
	 with the file contents.  */
      if (link->pos >= fm->length)
	  DEBUGP (("Something strange is going on. Please investigate."));

      /* If the URL is not to be converted, skip it.  */
      if (link->convert == CO_NOCONVERT)
	  DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));

      /* Echo the file contents, up to the offending URL's opening
	 quote, to the outfile.  */
      fwrite (p, 1, url_start - p, fp);

      switch (link->convert)
	  case CO_CONVERT_TO_RELATIVE:
	    /* Convert absolute URL to relative.  */
	      char *newname = construct_relative (file, link->local_name);
	      char *quoted_newname = local_quote_string (newname);

	      if (!link->link_refresh_p)
		p = replace_attr (p, link->size, fp, quoted_newname);
	      /* <meta http-equiv=refresh> needs its timeout kept.  */
		p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
					       link->refresh_timeout);

	      DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
		       link->url->url, newname, link->pos, file));

	      xfree (quoted_newname);

	  case CO_CONVERT_TO_COMPLETE:
	    /* Convert the link to absolute URL.  */
	      char *newlink = link->url->url;
	      char *quoted_newlink = html_quote_string (newlink);

	      if (!link->link_refresh_p)
		p = replace_attr (p, link->size, fp, quoted_newlink);
		p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
					       link->refresh_timeout);

	      DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
		       newlink, link->pos, file));

	      xfree (quoted_newlink);

	  case CO_NULLIFY_BASE:
	    /* Change the base href to "".  */
	    p = replace_attr (p, link->size, fp, "");

  /* Output the rest of the file.  */
  if (p - fm->content < fm->length)
    fwrite (p, 1, fm->length - (p - fm->content), fp);

  read_file_free (fm);
  logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
/* Construct and return a malloced copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.

   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   "images/news.gif".

   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */

construct_relative (const char *s1, const char *s2)
  int i, cnt, sepdirs1;

  /* An absolute S2 is returned as an unchanged copy; the guarding
     test is elided in this excerpt.  */
    return xstrdup (s2);

  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');

  /* Skip the directories common to both strings.  */
  while (s1[i] && s2[i]
  /* CNT ends up just past the last slash of the common prefix.  */
    if (s1[i] == '/' && s2[i] == '/')

  /* Count the remaining directory separators in S1; each costs one
     "../" in the result.  */
  for (sepdirs1 = 0; s1[i]; i++)

  /* Now, construct the file as of:
     - ../ repeated sepdirs1 time
     - all the non-mutual directories of S2.  */
  res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
  for (i = 0; i < sepdirs1; i++)
    memcpy (res + 3 * i, "../", 3);
  strcpy (res + 3 * i, s2 + cnt);
/* Back up FILE to FILE.orig (or, when -E added ".html", with "html"
   replaced by "orig") before convert_links overwrites it.  Done at
   most once per file per run.  */
write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
  /* Rather than just writing over the original .html file with the
     converted version, save the former to *.orig.  Note we only do
     this for files we've _successfully_ downloaded, so we don't
     clobber .orig files sitting around from previous invocations.  */

  /* Construct the backup filename as the original name plus ".orig".  */
  size_t filename_len = strlen(file);
  char* filename_plus_orig_suffix;
  boolean already_wrote_backup_file = FALSE;
  slist* converted_file_ptr;
  /* List of files already backed up this run; deliberately never
     freed (see the long note below).  */
  static slist* converted_files = NULL;

  if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
      /* Just write "orig" over "html".  We need to do it this way
	 because when we're checking to see if we've downloaded the
	 file before (to see if we can skip downloading it), we don't
	 know if it's a text/html file.  Therefore we don't know yet
	 at that stage that -E is going to cause us to tack on
	 ".html", so we need to compare vs. the original URL plus
	 ".orig", not the original URL plus ".html.orig".  */
      filename_plus_orig_suffix = alloca (filename_len + 1);
      strcpy(filename_plus_orig_suffix, file);
      /* Overwrite the trailing "html" (4 chars) in place.  */
      strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
  else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
      /* Append ".orig" to the name.  */
      filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
      strcpy(filename_plus_orig_suffix, file);
      strcpy(filename_plus_orig_suffix + filename_len, ".orig");

  /* We can get called twice on the same URL thanks to the
     convert_all_links() call in main().  If we write the .orig file
     each time in such a case, it'll end up containing the first-pass
     conversion, not the original file.  So, see if we've already been
     called on this file.  */
  converted_file_ptr = converted_files;
  while (converted_file_ptr != NULL)
    if (strcmp(converted_file_ptr->string, file) == 0)
	already_wrote_backup_file = TRUE;
      converted_file_ptr = converted_file_ptr->next;

  if (!already_wrote_backup_file)
      /* Rename <file> to <file>.orig before former gets written over.  */
      if (rename(file, filename_plus_orig_suffix) != 0)
	logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
		   file, filename_plus_orig_suffix, strerror (errno));

      /* Remember that we've already written a .orig backup for this file.
	 Note that we never free this memory since we need it till the
	 convert_all_links() call, which is one of the last things the
	 program does before terminating.  BTW, I'm not sure if it would be
	 safe to just set 'converted_file_ptr->string' to 'file' below,
	 rather than making a copy of the string...  Another note is that I
	 thought I could just add a field to the urlpos structure saying
	 that we'd written a .orig file for this URL, but that didn't work,
	 so I had to make this separate list.
	 -- Dan Harkless <wget@harkless.org>

	 This [adding a field to the urlpos structure] didn't work
	 because convert_file() is called from convert_all_links at
	 the end of the retrieval with a freshly built new urlpos
	 list.
	 -- Hrvoje Niksic <hniksic@arsdigita.com>  */
      converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));
      converted_file_ptr->string = xstrdup(file);   /* die on out-of-mem. */
      converted_file_ptr->next = converted_files;
      converted_files = converted_file_ptr;
/* Forward declaration; defined below.  NOTE(review): the prototype's
   continuation line is elided in this excerpt.  */
static int find_fragment PARAMS ((const char *, int, const char **,
/* Replace an attribute's original text with NEW_TEXT, echoing the
   result (with quoting and any original fragment preserved) to FP.
   Returns a pointer past the replaced span; the return statement is
   elided in this excerpt.  */

replace_attr (const char *p, int size, FILE *fp, const char *new_text)
  char quote_char = '\"';	/* use "..." for quoting, unless the
				   original value is quoted, in which
				   case reuse its quoting char. */
  const char *frag_beg, *frag_end;

  /* Structure of our string is:
       "...old-contents..."
       <---   size   --->      (with quotes)
     or:
       ...old-contents...
       <---  size  -->         (no quotes)  */

  if (*p == '\"' || *p == '\'')
      size -= 2;		/* disregard opening and closing quote */
  putc (quote_char, fp);
  fputs (new_text, fp);

  /* Look for fragment identifier, if any. */
  if (find_fragment (p, size, &frag_beg, &frag_end))
    /* Carry the original "#fragment" over to the new value.  */
    fwrite (frag_beg, 1, frag_end - frag_beg, fp);
  putc (quote_char, fp);
/* The same as replace_attr, but used when replacing
   <meta http-equiv=refresh content="new_text"> because we need to
   prepend "timeout_value; URL=" to NEW_TEXT.  */

replace_attr_refresh_hack (const char *p, int size, FILE *fp,
			   const char *new_text, int timeout)
  /* Stack buffer sized for the decimal TIMEOUT plus the "; URL="
     separator and NEW_TEXT; the rest of the size expression is
     elided in this excerpt.  */
  char *new_with_timeout = (char *)alloca (numdigit (timeout)
  sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);

  return replace_attr (p, size, fp, new_with_timeout);
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
   preceded by '&'.  If the character is not found, return zero.  If
   the character is found, return 1 and set BP and EP to point to the
   beginning and end of the region.

   This is used for finding the fragment identifiers in URLs.  */

find_fragment (const char *beg, int size, const char **bp, const char **ep)
  const char *end = beg + size;
  /* Byte-wise scan; the loop body (the '#' test and BP/EP updates)
     is elided in this excerpt.  */
  for (; beg < end; beg++)
/* Quote FILE for use as local reference to an HTML file.

   We quote ? as %3F to avoid passing part of the file name as the
   parameter when browsing the converted file through HTTP.  However,
   it is safe to do this only when `--html-extension' is turned on.
   This is because converting "index.html?foo=bar" to
   "index.html%3Ffoo=bar" would break local browsing, as the latter
   isn't even recognized as an HTML file!  However, converting
   "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
   safe for both local and HTTP-served browsing.  */

local_quote_string (const char *file)
  const char *file_sans_qmark;

  /* Without -E the rewrite is unsafe (see above): fall back to
     ordinary HTML quoting.  */
  if (!opt.html_extension)
    return html_quote_string (file);

  qm = count_char (file, '?');

      const char *from = file;

      /* qm * 2 because we replace each question mark with "%3F",
	 i.e. replace one char with three, hence two more. */
      int fsqlen = strlen (file) + qm * 2;

      to = newname = (char *)alloca (fsqlen + 1);
      /* The copy loop's body ('?' -> "%3F") is elided in this
	 excerpt.  */
      for (; *from; from++)
      assert (to - newname == fsqlen);

      file_sans_qmark = newname;
    file_sans_qmark = file;

  return html_quote_string (file_sans_qmark);
/* We're storing "modes" of type downloaded_file_t in the hash table.
   However, our hash tables only accept pointers for keys and values.
   So when we need a pointer, we use the address of a
   downloaded_file_t variable of static storage.  */

static downloaded_file_t *
downloaded_mode_to_ptr (downloaded_file_t mode)
  /* One static cell per mode; each address is a stable pointer that
     also encodes which mode it was.  */
  static downloaded_file_t
    v1 = FILE_NOT_ALREADY_DOWNLOADED,
    v2 = FILE_DOWNLOADED_NORMALLY,
    v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,
    v4 = CHECK_FOR_FILE;

  /* Map MODE to the matching cell; the switch header and the return
     statements are elided in this excerpt.  */
    case FILE_NOT_ALREADY_DOWNLOADED:
    case FILE_DOWNLOADED_NORMALLY:
    case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
    case CHECK_FOR_FILE:
/* This should really be merged with dl_file_url_map and
   downloaded_html_files in recur.c.  This was originally a list, but
   I changed it to a hash table because it was actually taking a lot
   of time to find things in it.  */

/* Maps local filename -> pointer to a static downloaded_file_t cell;
   created lazily by downloaded_file() below.  */
static struct hash_table *downloaded_files_hash;
2199 /* Remembers which files have been downloaded. In the standard case, should be
2200 called with mode == FILE_DOWNLOADED_NORMALLY for each file we actually
2201 download successfully (i.e. not for ones we have failures on or that we skip
2204 When we've downloaded a file and tacked on a ".html" extension due to -E,
2205 call this function with FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
2206 FILE_DOWNLOADED_NORMALLY.
2208 If you just want to check if a file has been previously added without adding
2209 it, call with mode == CHECK_FOR_FILE. Please be sure to call this function
2210 with local filenames, not remote URLs. */
2212 downloaded_file (downloaded_file_t mode, const char *file)
2214 downloaded_file_t *ptr;
2216 if (mode == CHECK_FOR_FILE)
2218 if (!downloaded_files_hash)
2219 return FILE_NOT_ALREADY_DOWNLOADED;
2220 ptr = hash_table_get (downloaded_files_hash, file);
2222 return FILE_NOT_ALREADY_DOWNLOADED;
2226 if (!downloaded_files_hash)
2227 downloaded_files_hash = make_string_hash_table (0);
2229 ptr = hash_table_get (downloaded_files_hash, file);
2233 ptr = downloaded_mode_to_ptr (mode);
2234 hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);
2236 return FILE_NOT_ALREADY_DOWNLOADED;
/* hash_table_map callback used by downloaded_files_free to dispose of
   one table entry.  NOTE(review): the body is elided in this excerpt;
   presumably it frees the strdup'ed key -- confirm against the full
   source.  */
df_free_mapper (void *key, void *value, void *ignored)
/* Release the downloaded-files table at shutdown: run df_free_mapper
   over every entry, destroy the table, and clear the global so a
   later call is a no-op.  */
downloaded_files_free (void)
  if (downloaded_files_hash)
      hash_table_map (downloaded_files_hash, df_free_mapper, NULL);
      hash_table_destroy (downloaded_files_hash);
      downloaded_files_hash = NULL;