2 Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001, 2002
3 Free Software Foundation, Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or
10 (at your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 In addition, as a special exception, the Free Software Foundation
22 gives permission to link the code of its release of Wget with the
23 OpenSSL project's "OpenSSL" library (or with modified versions of it
24 that use the same license as the "OpenSSL" library), and distribute
25 the linked executables. You must obey the GNU General Public License
26 in all respects for all of the code used other than "OpenSSL". If you
27 modify this file, you may extend this exception to your version of the
28 file, but you are not obligated to do so. If you do not wish to do
29 so, delete this exception statement from your version. */
35 #include <sys/types.h>
46 #if TIME_WITH_SYS_TIME
47 # include <sys/time.h>
51 # include <sys/time.h>
68 # include "gen_sslfunc.h"
76 extern char *version_string;
77 extern LARGE_INT total_downloaded_bytes;
80 # define MIN(x, y) ((x) > (y) ? (y) : (x))
84 static int cookies_loaded_p;
85 struct cookie_jar *wget_cookie_jar;
87 #define TEXTHTML_S "text/html"
88 #define TEXTXHTML_S "application/xhtml+xml"
90 /* Some status code validation macros: */
91 #define H_20X(x) (((x) >= 200) && ((x) < 300))
92 #define H_PARTIAL(x) ((x) == HTTP_STATUS_PARTIAL_CONTENTS)
93 #define H_REDIRECTED(x) ((x) == HTTP_STATUS_MOVED_PERMANENTLY \
94 || (x) == HTTP_STATUS_MOVED_TEMPORARILY \
95 || (x) == HTTP_STATUS_TEMPORARY_REDIRECT)
97 /* HTTP/1.0 status codes from RFC1945, provided for reference. */
99 #define HTTP_STATUS_OK 200
100 #define HTTP_STATUS_CREATED 201
101 #define HTTP_STATUS_ACCEPTED 202
102 #define HTTP_STATUS_NO_CONTENT 204
103 #define HTTP_STATUS_PARTIAL_CONTENTS 206
105 /* Redirection 3xx. */
106 #define HTTP_STATUS_MULTIPLE_CHOICES 300
107 #define HTTP_STATUS_MOVED_PERMANENTLY 301
108 #define HTTP_STATUS_MOVED_TEMPORARILY 302
109 #define HTTP_STATUS_NOT_MODIFIED 304
110 #define HTTP_STATUS_TEMPORARY_REDIRECT 307
112 /* Client error 4xx. */
113 #define HTTP_STATUS_BAD_REQUEST 400
114 #define HTTP_STATUS_UNAUTHORIZED 401
115 #define HTTP_STATUS_FORBIDDEN 403
116 #define HTTP_STATUS_NOT_FOUND 404
118 /* Server errors 5xx. */
119 #define HTTP_STATUS_INTERNAL 500
120 #define HTTP_STATUS_NOT_IMPLEMENTED 501
121 #define HTTP_STATUS_BAD_GATEWAY 502
122 #define HTTP_STATUS_UNAVAILABLE 503
125 rel_none, rel_name, rel_value, rel_both
132 struct request_header {
134 enum rp release_policy;
136 int hcount, hcapacity;
139 /* Create a new, empty request. At least request_set_method must be
140 called before the request can be used. */
142 static struct request *
145 struct request *req = xnew0 (struct request);
147 req->headers = xnew_array (struct request_header, req->hcapacity);
151 /* Set the request's method and its arguments. METH should be a
152 literal string (or it should outlive the request) because it will
153 not be freed. ARG will be freed by request_free. */
156 request_set_method (struct request *req, const char *meth, char *arg)
162 /* Return the method string passed with the last call to
163 request_set_method. */
166 request_method (const struct request *req)
171 /* Free one header according to the release policy specified with
172 request_set_header. */
175 release_header (struct request_header *hdr)
177 switch (hdr->release_policy)
/* Set the request header named NAME to VALUE. Specifically, this means that
195 a "NAME: VALUE\r\n" header line will be used in the request. If a
196 header with the same name previously existed in the request, its
197 value will be replaced by this one.
199 RELEASE_POLICY determines whether NAME and VALUE should be released
200 (freed) with request_free. Allowed values are:
202 - rel_none - don't free NAME or VALUE
203 - rel_name - free NAME when done
204 - rel_value - free VALUE when done
205 - rel_both - free both NAME and VALUE when done
207 Setting release policy is useful when arguments come from different
208 sources. For example:
210 // Don't free literal strings!
211 request_set_header (req, "Pragma", "no-cache", rel_none);
213 // Don't free a global variable, we'll need it later.
214 request_set_header (req, "Referer", opt.referer, rel_none);
216 // Value freshly allocated, free it when done.
217 request_set_header (req, "Range", aprintf ("bytes=%ld-", hs->restval),
222 request_set_header (struct request *req, char *name, char *value,
223 enum rp release_policy)
225 struct request_header *hdr;
229 for (i = 0; i < req->hcount; i++)
231 hdr = &req->headers[i];
232 if (0 == strcasecmp (name, hdr->name))
234 /* Replace existing header. */
235 release_header (hdr);
238 hdr->release_policy = release_policy;
243 /* Install new header. */
245 if (req->hcount >= req->hcount)
247 req->hcapacity <<= 1;
248 req->headers = xrealloc (req->headers,
249 req->hcapacity * sizeof (struct request_header));
251 hdr = &req->headers[req->hcount++];
254 hdr->release_policy = release_policy;
257 /* Like request_set_header, but sets the whole header line, as
258 provided by the user using the `--header' option. For example,
259 request_set_user_header (req, "Foo: bar") works just like
260 request_set_header (req, "Foo", "bar"). */
263 request_set_user_header (struct request *req, const char *header)
266 const char *p = strchr (header, ':');
269 BOUNDED_TO_ALLOCA (header, p, name);
273 request_set_header (req, xstrdup (name), (char *) p, rel_name);
276 #define APPEND(p, str) do { \
277 int A_len = strlen (str); \
278 memcpy (p, str, A_len); \
282 /* Construct the request and write it to FD using fd_write. */
285 request_send (const struct request *req, int fd)
287 char *request_string, *p;
288 int i, size, write_error;
290 /* Count the request size. */
293 /* METHOD " " ARG " " "HTTP/1.0" "\r\n" */
294 size += strlen (req->method) + 1 + strlen (req->arg) + 1 + 8 + 2;
296 for (i = 0; i < req->hcount; i++)
298 struct request_header *hdr = &req->headers[i];
299 /* NAME ": " VALUE "\r\n" */
300 size += strlen (hdr->name) + 2 + strlen (hdr->value) + 2;
306 p = request_string = alloca_array (char, size);
308 /* Generate the request. */
310 APPEND (p, req->method); *p++ = ' ';
311 APPEND (p, req->arg); *p++ = ' ';
312 memcpy (p, "HTTP/1.0\r\n", 10); p += 10;
314 for (i = 0; i < req->hcount; i++)
316 struct request_header *hdr = &req->headers[i];
317 APPEND (p, hdr->name);
318 *p++ = ':', *p++ = ' ';
319 APPEND (p, hdr->value);
320 *p++ = '\r', *p++ = '\n';
323 *p++ = '\r', *p++ = '\n', *p++ = '\0';
324 assert (p - request_string == size);
328 DEBUGP (("\n---request begin---\n%s---request end---\n", request_string));
330 /* Send the request to the server. */
332 write_error = fd_write (fd, request_string, size - 1, -1);
334 logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"),
339 /* Release the resources used by REQ. */
342 request_free (struct request *req)
345 xfree_null (req->arg);
346 for (i = 0; i < req->hcount; i++)
347 release_header (&req->headers[i]);
348 xfree_null (req->headers);
353 head_terminator (const char *hunk, int oldlen, int peeklen)
355 const char *start, *end;
/* If at first peek, verify whether HUNK starts with "HTTP". If
not, this is an HTTP/0.9 response and we must bail out without
360 if (oldlen == 0 && 0 != memcmp (hunk, "HTTP", MIN (peeklen, 4)))
366 start = hunk + oldlen - 4;
367 end = hunk + oldlen + peeklen;
369 for (; start < end - 1; start++)
376 if (start[1] == '\n')
/* Read the HTTP response head from FD and return it. The error
conditions are the same as with fd_read_hunk.
To support HTTP/0.9 responses, this function tries to make sure
that the data begins with "HTTP". If this is not the case, no data
is read and an empty response head is returned, so that the remaining
data can be treated as body. */
391 fd_read_http_head (int fd)
393 return fd_read_hunk (fd, head_terminator, 512);
397 /* The response data. */
400 /* The array of pointers that indicate where each header starts.
401 For example, given this HTTP response:
408 The headers are located like this:
410 "HTTP/1.0 200 Ok\r\nDescription: some\r\n text\r\nEtag: x\r\n\r\n"
412 headers[0] headers[1] headers[2] headers[3]
I.e. headers[0] points to the beginning of the response,
415 headers[1] points to the end of the first header and the
416 beginning of the second one, etc. */
418 const char **headers;
421 /* Create a new response object from the text of the HTTP response,
422 available in HEAD. That text is automatically split into
423 constituent header lines for fast retrieval using
424 response_header_*. */
426 static struct response *
427 response_new (const char *head)
432 struct response *resp = xnew0 (struct response);
437 /* Empty head means that we're dealing with a headerless
438 (HTTP/0.9) response. In that case, don't set HEADERS at
443 /* Split HEAD into header lines, so that response_header_* functions
444 don't need to do this over and over again. */
450 DO_REALLOC (resp->headers, size, count + 1, const char *);
451 resp->headers[count++] = hdr;
453 /* Break upon encountering an empty line. */
454 if (!hdr[0] || (hdr[0] == '\r' && hdr[1] == '\n') || hdr[0] == '\n')
457 /* Find the end of HDR, including continuations. */
460 const char *end = strchr (hdr, '\n');
466 while (*hdr == ' ' || *hdr == '\t');
468 DO_REALLOC (resp->headers, size, count + 1, const char *);
469 resp->headers[count++] = NULL;
/* Locate the header named NAME in the response data. If found, set
475 *BEGPTR to its starting, and *ENDPTR to its ending position, and
476 return 1. Otherwise return 0.
478 This function is used as a building block for response_header_copy
479 and response_header_strdup. */
482 response_header_bounds (const struct response *resp, const char *name,
483 const char **begptr, const char **endptr)
486 const char **headers = resp->headers;
489 if (!headers || !headers[1])
492 name_len = strlen (name);
494 for (i = 1; headers[i + 1]; i++)
496 const char *b = headers[i];
497 const char *e = headers[i + 1];
499 && b[name_len] == ':'
500 && 0 == strncasecmp (b, name, name_len))
503 while (b < e && ISSPACE (*b))
505 while (b < e && ISSPACE (e[-1]))
515 /* Copy the response header named NAME to buffer BUF, no longer than
516 BUFSIZE (BUFSIZE includes the terminating 0). If the header
517 exists, 1 is returned, otherwise 0. If there should be no limit on
518 the size of the header, use response_header_strdup instead.
520 If BUFSIZE is 0, no data is copied, but the boolean indication of
521 whether the header is present is still returned. */
524 response_header_copy (const struct response *resp, const char *name,
525 char *buf, int bufsize)
528 if (!response_header_bounds (resp, name, &b, &e))
532 int len = MIN (e - b, bufsize);
533 strncpy (buf, b, len);
539 /* Return the value of header named NAME in RESP, allocated with
540 malloc. If such a header does not exist in RESP, return NULL. */
543 response_header_strdup (const struct response *resp, const char *name)
546 if (!response_header_bounds (resp, name, &b, &e))
548 return strdupdelim (b, e);
551 /* Parse the HTTP status line, which is of format:
553 HTTP-Version SP Status-Code SP Reason-Phrase
555 The function returns the status-code, or -1 if the status line
556 appears malformed. The pointer to "reason-phrase" message is
557 returned in *MESSAGE. */
560 response_status (const struct response *resp, char **message)
567 /* For a HTTP/0.9 response, always assume 200 response. */
569 *message = xstrdup (_("No headers, assuming HTTP/0.9"));
573 p = resp->headers[0];
574 end = resp->headers[1];
580 if (end - p < 4 || 0 != strncmp (p, "HTTP", 4))
/* "/x.x" (optional because some Gnutella servers have been reported
as not sending the "/x.x" part). */
586 if (p < end && *p == '/')
589 while (p < end && ISDIGIT (*p))
591 if (p < end && *p == '.')
593 while (p < end && ISDIGIT (*p))
597 while (p < end && ISSPACE (*p))
599 if (end - p < 3 || !ISDIGIT (p[0]) || !ISDIGIT (p[1]) || !ISDIGIT (p[2]))
602 status = 100 * (p[0] - '0') + 10 * (p[1] - '0') + (p[2] - '0');
607 while (p < end && ISSPACE (*p))
609 while (p < end && ISSPACE (end[-1]))
611 *message = strdupdelim (p, end);
617 /* Release the resources used by RESP. */
620 response_free (struct response *resp)
622 xfree_null (resp->headers);
626 /* Print [b, e) to the log, omitting the trailing CRLF. */
629 print_server_response_1 (const char *prefix, const char *b, const char *e)
632 if (b < e && e[-1] == '\n')
634 if (b < e && e[-1] == '\r')
636 BOUNDED_TO_ALLOCA (b, e, ln);
637 logprintf (LOG_VERBOSE, "%s%s\n", prefix, ln);
640 /* Print the server response, line by line, omitting the trailing CR
641 characters, prefixed with PREFIX. */
644 print_server_response (const struct response *resp, const char *prefix)
649 for (i = 0; resp->headers[i + 1]; i++)
650 print_server_response_1 (prefix, resp->headers[i], resp->headers[i + 1]);
653 /* Parse the `Content-Range' header and extract the information it
654 contains. Returns 1 if successful, -1 otherwise. */
656 parse_content_range (const char *hdr, long *first_byte_ptr,
657 long *last_byte_ptr, long *entity_length_ptr)
661 /* Ancient versions of Netscape proxy server, presumably predating
662 rfc2068, sent out `Content-Range' without the "bytes"
664 if (!strncasecmp (hdr, "bytes", 5))
667 /* "JavaWebServer/1.1.1" sends "bytes: x-y/z", contrary to the
671 while (ISSPACE (*hdr))
678 for (num = 0; ISDIGIT (*hdr); hdr++)
679 num = 10 * num + (*hdr - '0');
680 if (*hdr != '-' || !ISDIGIT (*(hdr + 1)))
682 *first_byte_ptr = num;
684 for (num = 0; ISDIGIT (*hdr); hdr++)
685 num = 10 * num + (*hdr - '0');
686 if (*hdr != '/' || !ISDIGIT (*(hdr + 1)))
688 *last_byte_ptr = num;
690 for (num = 0; ISDIGIT (*hdr); hdr++)
691 num = 10 * num + (*hdr - '0');
692 *entity_length_ptr = num;
/* Send the contents of FILE_NAME to SOCK. Make sure that exactly
697 PROMISED_SIZE bytes are sent over the wire -- if the file is
698 longer, read only that much; if the file is shorter, report an error. */
701 post_file (int sock, const char *file_name, long promised_size)
703 static char chunk[8192];
708 DEBUGP (("[writing POST file %s ... ", file_name));
710 fp = fopen (file_name, "rb");
713 while (!feof (fp) && written < promised_size)
716 int length = fread (chunk, 1, sizeof (chunk), fp);
719 towrite = MIN (promised_size - written, length);
720 write_error = fd_write (sock, chunk, towrite, -1);
730 /* If we've written less than was promised, report a (probably
731 nonsensical) error rather than break the promise. */
732 if (written < promised_size)
738 assert (written == promised_size);
739 DEBUGP (("done]\n"));
743 /* Persistent connections. Currently, we cache the most recently used
744 connection as persistent, provided that the HTTP server agrees to
745 make it such. The persistence data is stored in the variables
746 below. Ideally, it should be possible to cache an arbitrary fixed
747 number of these connections. */
749 /* Whether a persistent connection is active. */
750 static int pconn_active;
753 /* The socket of the connection. */
756 /* Host and port of the currently active persistent connection. */
/* Whether an SSL handshake has occurred on this connection. */
764 /* Mark the persistent connection as invalid and free the resources it
765 uses. This is used by the CLOSE_* macros after they forcefully
766 close a registered persistent connection. */
769 invalidate_persistent (void)
771 DEBUGP (("Disabling further reuse of socket %d.\n", pconn.socket));
773 fd_close (pconn.socket);
778 /* Register FD, which should be a TCP/IP connection to HOST:PORT, as
779 persistent. This will enable someone to use the same connection
780 later. In the context of HTTP, this must be called only AFTER the
781 response has been received and the server has promised that the
782 connection will remain alive.
784 If a previous connection was persistent, it is closed. */
787 register_persistent (const char *host, int port, int fd, int ssl)
791 if (pconn.socket == fd)
793 /* The connection FD is already registered. */
798 /* The old persistent connection is still active; close it
799 first. This situation arises whenever a persistent
800 connection exists, but we then connect to a different
801 host, and try to register a persistent connection to that
803 invalidate_persistent ();
809 pconn.host = xstrdup (host);
813 DEBUGP (("Registered socket %d for persistent reuse.\n", fd));
816 /* Return non-zero if a persistent connection is available for
817 connecting to HOST:PORT. */
820 persistent_available_p (const char *host, int port, int ssl,
821 int *host_lookup_failed)
823 /* First, check whether a persistent connection is active at all. */
827 /* If we want SSL and the last connection wasn't or vice versa,
828 don't use it. Checking for host and port is not enough because
829 HTTP and HTTPS can apparently coexist on the same port. */
830 if (ssl != pconn.ssl)
833 /* If we're not connecting to the same port, we're not interested. */
834 if (port != pconn.port)
837 /* If the host is the same, we're in business. If not, there is
838 still hope -- read below. */
839 if (0 != strcasecmp (host, pconn.host))
841 /* If pconn.socket is already talking to HOST, we needn't
842 reconnect. This happens often when both sites are virtual
843 hosts distinguished only by name and served by the same
844 network interface, and hence the same web server (possibly
845 set up by the ISP and serving many different web sites).
846 This admittedly non-standard optimization does not contradict
847 HTTP and works well with popular server software. */
851 struct address_list *al;
854 /* Don't try to talk to two different SSL sites over the same
855 secure connection! (Besides, it's not clear if name-based
856 virtual hosting is even possible with SSL.) */
859 /* If pconn.socket's peer is one of the IP addresses HOST
860 resolves to, pconn.socket is for all intents and purposes
861 already talking to HOST. */
863 if (!socket_ip_address (pconn.socket, &ip, ENDPOINT_PEER))
865 /* Can't get the peer's address -- something must be very
866 wrong with the connection. */
867 invalidate_persistent ();
870 al = lookup_host (host, 0);
873 *host_lookup_failed = 1;
877 found = address_list_contains (al, &ip);
878 address_list_release (al);
883 /* The persistent connection's peer address was found among the
884 addresses HOST resolved to; therefore, pconn.sock is in fact
885 already talking to HOST -- no need to reconnect. */
888 /* Finally, check whether the connection is still open. This is
important because most servers implement a liberal (short) timeout
890 on persistent connections. Wget can of course always reconnect
891 if the connection doesn't work out, but it's nicer to know in
892 advance. This test is a logical followup of the first test, but
893 is "expensive" and therefore placed at the end of the list. */
895 if (!test_socket_open (pconn.socket))
897 /* Oops, the socket is no longer open. Now that we know that,
898 let's invalidate the persistent connection before returning
900 invalidate_persistent ();
907 /* The idea behind these two CLOSE macros is to distinguish between
908 two cases: one when the job we've been doing is finished, and we
909 want to close the connection and leave, and two when something is
910 seriously wrong and we're closing the connection as part of
913 In case of keep_alive, CLOSE_FINISH should leave the connection
914 open, while CLOSE_INVALIDATE should still close it.
916 Note that the semantics of the flag `keep_alive' is "this
917 connection *will* be reused (the server has promised not to close
918 the connection once we're done)", while the semantics of
`pconn_active && (fd) == pconn.socket' is "we're *now* using an
920 active, registered connection". */
922 #define CLOSE_FINISH(fd) do { \
925 if (pconn_active && (fd) == pconn.socket) \
926 invalidate_persistent (); \
932 #define CLOSE_INVALIDATE(fd) do { \
933 if (pconn_active && (fd) == pconn.socket) \
934 invalidate_persistent (); \
942 long len; /* received length */
943 long contlen; /* expected length */
944 long restval; /* the restart value */
945 int res; /* the result of last read */
946 char *newloc; /* new location (redirection) */
947 char *remote_time; /* remote time-stamp string */
948 char *error; /* textual HTTP error */
949 int statcode; /* status code */
950 double dltime; /* time of the download in msecs */
951 int no_truncate; /* whether truncating the file is
953 const char *referer; /* value of the referer header. */
954 char **local_file; /* local file. */
958 free_hstat (struct http_stat *hs)
960 xfree_null (hs->newloc);
961 xfree_null (hs->remote_time);
962 xfree_null (hs->error);
964 /* Guard against being called twice. */
966 hs->remote_time = NULL;
970 static char *create_authorization_line PARAMS ((const char *, const char *,
971 const char *, const char *,
973 static char *basic_authentication_encode PARAMS ((const char *, const char *));
974 static int known_authentication_scheme_p PARAMS ((const char *));
976 time_t http_atotm PARAMS ((const char *));
978 #define BEGINS_WITH(line, string_constant) \
979 (!strncasecmp (line, string_constant, sizeof (string_constant) - 1) \
980 && (ISSPACE (line[sizeof (string_constant) - 1]) \
981 || !line[sizeof (string_constant) - 1]))
983 /* Retrieve a document through HTTP protocol. It recognizes status
984 code, and correctly handles redirections. It closes the network
985 socket. If it receives an error from the functions below it, it
986 will print it if there is enough information to do so (almost
987 always), returning the error to the caller (i.e. http_loop).
989 Various HTTP parameters are stored to hs.
991 If PROXY is non-NULL, the connection will be made to the proxy
992 server, and u->url will be requested. */
994 gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy)
1003 long contlen, contrange;
1009 /* Whether authorization has been already tried. */
1010 int auth_tried_already = 0;
1012 /* Whether our connection to the remote host is through SSL. */
1016 struct response *resp;
1020 /* Whether this connection will be kept alive after the HTTP request
1024 /* Flag that detects having received a keep-alive response. */
1025 int keep_alive_confirmed;
1027 /* Whether keep-alive should be inhibited. */
1028 int inhibit_keep_alive = !opt.http_keep_alive;
1030 /* Headers sent when using POST. */
1031 long post_data_size = 0;
1033 int host_lookup_failed = 0;
1036 /* Initialize the SSL context. After the first run, this is a
1038 switch (ssl_init ())
1040 case SSLERRCTXCREATE:
1042 logprintf (LOG_NOTQUIET, _("Failed to set up an SSL context\n"));
1043 return SSLERRCTXCREATE;
1044 case SSLERRCERTFILE:
1045 /* try without certfile */
1046 logprintf (LOG_NOTQUIET,
1047 _("Failed to load certificates from %s\n"),
1049 logprintf (LOG_NOTQUIET,
1050 _("Trying without the specified certificate\n"));
1053 logprintf (LOG_NOTQUIET,
1054 _("Failed to get certificate key from %s\n"),
1056 logprintf (LOG_NOTQUIET,
1057 _("Trying without the specified certificate\n"));
1062 #endif /* HAVE_SSL */
1064 if (!(*dt & HEAD_ONLY))
1065 /* If we're doing a GET on the URL, as opposed to just a HEAD, we need to
1066 know the local filename so we can save to it. */
1067 assert (*hs->local_file != NULL);
1069 auth_tried_already = 0;
1071 /* Initialize certain elements of struct http_stat. */
1076 hs->remote_time = NULL;
1084 char *proxy_user, *proxy_passwd;
1085 /* For normal username and password, URL components override
1086 command-line/wgetrc parameters. With proxy
1087 authentication, it's the reverse, because proxy URLs are
1088 normally the "permanent" ones, so command-line args
1089 should take precedence. */
1090 if (opt.proxy_user && opt.proxy_passwd)
1092 proxy_user = opt.proxy_user;
1093 proxy_passwd = opt.proxy_passwd;
1097 proxy_user = proxy->user;
1098 proxy_passwd = proxy->passwd;
1100 /* #### This does not appear right. Can't the proxy request,
1101 say, `Digest' authentication? */
1102 if (proxy_user && proxy_passwd)
1103 proxyauth = basic_authentication_encode (proxy_user, proxy_passwd);
1105 /* If we're using a proxy, we will be connecting to the proxy
1110 /* Prepare the request to send. */
1112 req = request_new ();
1114 const char *meth = "GET";
1115 if (*dt & HEAD_ONLY)
1117 else if (opt.post_file_name || opt.post_data)
1119 /* Use the full path, i.e. one that includes the leading slash and
1120 the query string. E.g. if u->path is "foo/bar" and u->query is
1121 "param=value", full_path will be "/foo/bar?param=value". */
1122 request_set_method (req, meth,
1123 proxy ? xstrdup (u->url) : url_full_path (u));
1126 request_set_header (req, "Referer", (char *) hs->referer, rel_none);
1127 if (*dt & SEND_NOCACHE)
1128 request_set_header (req, "Pragma", "no-cache", rel_none);
1130 request_set_header (req, "Range",
1131 aprintf ("bytes=%ld-", hs->restval), rel_value);
1133 request_set_header (req, "User-Agent", opt.useragent, rel_none);
1135 request_set_header (req, "User-Agent",
1136 aprintf ("Wget/%s", version_string), rel_value);
1137 request_set_header (req, "Accept", "*/*", rel_none);
1139 /* Find the username and password for authentication. */
1142 search_netrc (u->host, (const char **)&user, (const char **)&passwd, 0);
1143 user = user ? user : opt.http_user;
1144 passwd = passwd ? passwd : opt.http_passwd;
1148 /* We have the username and the password, but haven't tried
1149 any authorization yet. Let's see if the "Basic" method
1150 works. If not, we'll come back here and construct a
1151 proper authorization method with the right challenges.
1153 If we didn't employ this kind of logic, every URL that
1154 requires authorization would have to be processed twice,
1155 which is very suboptimal and generates a bunch of false
1156 "unauthorized" errors in the server log.
1158 #### But this logic also has a serious problem when used
1159 with stronger authentications: we *first* transmit the
1160 username and the password in clear text, and *then* attempt a
1161 stronger authentication scheme. That cannot be right! We
1162 are only fortunate that almost everyone still uses the
1163 `Basic' scheme anyway.
1165 There should be an option to prevent this from happening, for
1166 those who use strong authentication schemes and value their
1168 request_set_header (req, "Authorization",
1169 basic_authentication_encode (user, passwd),
1174 /* Whether we need to print the host header with braces around
1175 host, e.g. "Host: [3ffe:8100:200:2::2]:1234" instead of the
1176 usual "Host: symbolic-name:1234". */
1177 int squares = strchr (u->host, ':') != NULL;
1178 if (u->port == scheme_default_port (u->scheme))
1179 request_set_header (req, "Host",
1180 aprintf (squares ? "[%s]" : "%s", u->host),
1183 request_set_header (req, "Host",
1184 aprintf (squares ? "[%s]:%d" : "%s:%d",
1189 if (!inhibit_keep_alive)
1190 request_set_header (req, "Connection", "Keep-Alive", rel_none);
1193 request_set_header (req, "Cookie",
1194 cookie_header (wget_cookie_jar,
1195 u->host, u->port, u->path,
1197 u->scheme == SCHEME_HTTPS
1204 if (opt.post_data || opt.post_file_name)
1206 request_set_header (req, "Content-Type",
1207 "application/x-www-form-urlencoded", rel_none);
1209 post_data_size = strlen (opt.post_data);
1212 post_data_size = file_size (opt.post_file_name);
1213 if (post_data_size == -1)
1215 logprintf (LOG_NOTQUIET, "POST data file missing: %s\n",
1216 opt.post_file_name);
1220 request_set_header (req, "Content-Length",
1221 aprintf ("Content-Length: %ld", post_data_size),
1225 /* Add the user headers. */
1226 if (opt.user_headers)
1229 for (i = 0; opt.user_headers[i]; i++)
1230 request_set_user_header (req, opt.user_headers[i]);
1234 /* We need to come back here when the initial attempt to retrieve
1235 without authorization header fails. (Expected to happen at least
1236 for the Digest authorization scheme.) */
1239 keep_alive_confirmed = 0;
1241 /* Establish the connection. */
1243 if (!inhibit_keep_alive)
1245 /* Look for a persistent connection to target host, unless a
1246 proxy is used. The exception is when SSL is in use, in which
1247 case the proxy is nothing but a passthrough to the target
1248 host, registered as a connection to the latter. */
1249 struct url *relevant = conn;
1251 if (u->scheme == SCHEME_HTTPS)
1255 if (persistent_available_p (relevant->host, relevant->port,
1257 relevant->scheme == SCHEME_HTTPS,
1261 &host_lookup_failed))
1263 sock = pconn.socket;
1264 using_ssl = pconn.ssl;
1265 logprintf (LOG_VERBOSE, _("Reusing existing connection to %s:%d.\n"),
1266 pconn.host, pconn.port);
1267 DEBUGP (("Reusing fd %d.\n", sock));
1273 /* In its current implementation, persistent_available_p will
1274 look up conn->host in some cases. If that lookup failed, we
1275 don't need to bother with connect_to_host. */
1276 if (host_lookup_failed)
1279 sock = connect_to_host (conn->host, conn->port);
1283 return (retryable_socket_connect_error (errno)
1284 ? CONERROR : CONIMPOSSIBLE);
1287 if (proxy && u->scheme == SCHEME_HTTPS)
1289 /* When requesting SSL URLs through proxies, use the
1290 CONNECT method to request passthrough. */
1291 struct request *connreq = request_new ();
1292 request_set_method (connreq, "CONNECT",
1293 aprintf ("%s:%d", u->host, u->port));
1296 request_set_header (connreq, "Proxy-Authorization",
1297 proxyauth, rel_value);
1298 /* Now that PROXYAUTH is part of the CONNECT request,
1299 zero it out so we don't send proxy authorization with
1300 the regular request below. */
1304 write_error = request_send (connreq, sock);
1305 request_free (connreq);
1306 if (write_error < 0)
1308 logprintf (LOG_VERBOSE, _("Failed writing to proxy: %s.\n"),
1310 CLOSE_INVALIDATE (sock);
1314 head = fd_read_http_head (sock);
1317 logprintf (LOG_VERBOSE, _("Failed reading proxy response: %s\n"),
1319 CLOSE_INVALIDATE (sock);
1328 DEBUGP (("proxy responded with: [%s]\n", head));
1330 resp = response_new (head);
1331 statcode = response_status (resp, &message);
1332 response_free (resp);
1333 if (statcode != 200)
1336 logprintf (LOG_NOTQUIET, _("Proxy tunneling failed: %s"),
1337 message ? message : "?");
1338 xfree_null (message);
1343 /* SOCK is now *really* connected to u->host, so update CONN
1344 to reflect this. That way register_persistent will
1345 register SOCK as being connected to u->host:u->port. */
1349 if (conn->scheme == SCHEME_HTTPS)
1351 if (!ssl_connect (sock))
1358 #endif /* HAVE_SSL */
1361 /* Send the request to server. */
1362 write_error = request_send (req, sock);
1364 if (write_error >= 0)
1368 DEBUGP (("[POST data: %s]\n", opt.post_data));
1369 write_error = fd_write (sock, opt.post_data, post_data_size, -1);
1371 else if (opt.post_file_name && post_data_size != 0)
1372 write_error = post_file (sock, opt.post_file_name, post_data_size);
1374 DEBUGP (("---request end---\n"));
1376 if (write_error < 0)
1378 logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"),
1380 CLOSE_INVALIDATE (sock);
1384 logprintf (LOG_VERBOSE, _("%s request sent, awaiting response... "),
1385 proxy ? "Proxy" : "HTTP");
1386 contlen = contrange = -1;
1391 head = fd_read_http_head (sock);
1396 logputs (LOG_NOTQUIET, _("No data received.\n"));
1397 CLOSE_INVALIDATE (sock);
1403 logprintf (LOG_NOTQUIET, _("Read error (%s) in headers.\n"),
1405 CLOSE_INVALIDATE (sock);
1410 DEBUGP (("\n---response begin---\n%s---response end---\n", head));
1412 resp = response_new (head);
1414 /* Check for status line. */
1416 statcode = response_status (resp, &message);
1417 if (!opt.server_response)
1418 logprintf (LOG_VERBOSE, "%2d %s\n", statcode, message ? message : "");
1421 logprintf (LOG_VERBOSE, "\n");
1422 print_server_response (resp, " ");
1425 if (statcode == HTTP_STATUS_UNAUTHORIZED)
1427 /* Authorization is required. */
1428 CLOSE_INVALIDATE (sock); /* would be CLOSE_FINISH, but there
1429 might be more bytes in the body. */
1430 if (auth_tried_already || !(user && passwd))
1432 /* If we have tried it already, then there is no point
1434 logputs (LOG_NOTQUIET, _("Authorization failed.\n"));
1438 char *www_authenticate = response_header_strdup (resp,
1439 "WWW-Authenticate");
1440 /* If the authentication scheme is unknown or if it's the
1441 "Basic" authentication (which we try by default), there's
1442 no sense in retrying. */
1443 if (!www_authenticate
1444 || !known_authentication_scheme_p (www_authenticate)
1445 || BEGINS_WITH (www_authenticate, "Basic"))
1447 xfree_null (www_authenticate);
1448 logputs (LOG_NOTQUIET, _("Unknown authentication scheme.\n"));
1453 auth_tried_already = 1;
1454 pth = url_full_path (u);
1455 request_set_header (req, "Authorization",
1456 create_authorization_line (www_authenticate,
1458 request_method (req),
1462 xfree (www_authenticate);
1463 goto retry_with_auth;
1471 hs->statcode = statcode;
1473 hs->error = xstrdup (_("Malformed status line"));
1475 hs->error = xstrdup (_("(no description)"));
1477 hs->error = xstrdup (message);
1479 if (response_header_copy (resp, "Content-Length", hdrval, sizeof (hdrval)))
1480 contlen = strtol (hdrval, NULL, 10);
1481 type = response_header_strdup (resp, "Content-Type");
1484 char *tmp = strchr (type, ';');
1487 while (tmp > type && ISSPACE (tmp[-1]))
1492 hs->newloc = response_header_strdup (resp, "Location");
1493 hs->remote_time = response_header_strdup (resp, "Last-Modified");
1495 char *set_cookie = response_header_strdup (resp, "Set-Cookie");
1498 /* The jar should have been created by now. */
1499 assert (wget_cookie_jar != NULL);
1500 cookie_handle_set_cookie (wget_cookie_jar, u->host, u->port, u->path,
1505 if (response_header_copy (resp, "Content-Range", hdrval, sizeof (hdrval)))
1507 long first_byte_pos, last_byte_pos, entity_length;
1508 if (parse_content_range (hdrval, &first_byte_pos, &last_byte_pos,
1510 contrange = first_byte_pos;
1513 /* Check for keep-alive related responses. */
1514 if (!inhibit_keep_alive && contlen != -1)
1516 if (response_header_copy (resp, "Keep-Alive", NULL, 0))
1518 else if (response_header_copy (resp, "Connection", hdrval,
1521 if (0 == strcasecmp (hdrval, "Keep-Alive"))
1525 response_free (resp);
1528 /* The server has promised that it will not close the connection
1529 when we're done. This means that we can register it. */
1530 register_persistent (conn->host, conn->port, sock, using_ssl);
1532 /* 20x responses are counted among successful by default. */
1533 if (H_20X (statcode))
1536 /* Return if redirected. */
1537 if (H_REDIRECTED (statcode) || statcode == HTTP_STATUS_MULTIPLE_CHOICES)
1539 /* RFC2068 says that in case of the 300 (multiple choices)
1540 response, the server can output a preferred URL through
1541 `Location' header; otherwise, the request should be treated
1542 like GET. So, if the location is set, it will be a
1543 redirection; otherwise, just proceed normally. */
1544 if (statcode == HTTP_STATUS_MULTIPLE_CHOICES && !hs->newloc)
1548 logprintf (LOG_VERBOSE,
1549 _("Location: %s%s\n"),
1550 hs->newloc ? hs->newloc : _("unspecified"),
1551 hs->newloc ? _(" [following]") : "");
1552 CLOSE_INVALIDATE (sock); /* would be CLOSE_FINISH, but there
1553 might be more bytes in the body. */
1559 /* If content-type is not given, assume text/html. This is because
1560 of the multitude of broken CGI's that "forget" to generate the
1563 0 == strncasecmp (type, TEXTHTML_S, strlen (TEXTHTML_S)) ||
1564 0 == strncasecmp (type, TEXTXHTML_S, strlen (TEXTXHTML_S)))
1569 if (opt.html_extension && (*dt & TEXTHTML))
1570 /* -E / --html-extension / html_extension = on was specified, and this is a
1571 text/html file. If some case-insensitive variation on ".htm[l]" isn't
1572 already the file's suffix, tack on ".html". */
1574 char* last_period_in_local_filename = strrchr(*hs->local_file, '.');
1576 if (last_period_in_local_filename == NULL
1577 || !(0 == strcasecmp (last_period_in_local_filename, ".htm")
1578 || 0 == strcasecmp (last_period_in_local_filename, ".html")))
1580 size_t local_filename_len = strlen(*hs->local_file);
1582 *hs->local_file = xrealloc(*hs->local_file,
1583 local_filename_len + sizeof(".html"));
1584 strcpy(*hs->local_file + local_filename_len, ".html");
1586 *dt |= ADDED_HTML_EXTENSION;
1590 if (contrange == -1)
1592 /* We did not get a content-range header. This means that the
1593 server did not honor our `Range' request. Normally, this
1594 means we should reset hs->restval and continue normally. */
1596 /* However, if `-c' is used, we need to be a bit more careful:
1598 1. If `-c' is specified and the file already existed when
1599 Wget was started, it would be a bad idea for us to start
1600 downloading it from scratch, effectively truncating it. I
1601 believe this cannot happen unless `-c' was specified.
1603 2. If `-c' is used on a file that is already fully
1604 downloaded, we're requesting bytes after the end of file,
1605 which can result in server not honoring `Range'. If this is
1606 the case, `Content-Length' will be equal to the length of the
1608 if (opt.always_rest)
1610 /* Check for condition #2. */
1611 if (hs->restval > 0 /* restart was requested. */
1612 && contlen != -1 /* we got content-length. */
1613 && hs->restval >= contlen /* file fully downloaded
1617 logputs (LOG_VERBOSE, _("\
1618 \n The file is already fully retrieved; nothing to do.\n\n"));
1619 /* In case the caller inspects. */
1622 /* Mark as successfully retrieved. */
1625 CLOSE_INVALIDATE (sock); /* would be CLOSE_FINISH, but there
1626 might be more bytes in the body. */
1627 return RETRUNNEEDED;
1630 /* Check for condition #1. */
1631 if (hs->no_truncate)
1633 logprintf (LOG_NOTQUIET,
1636 Continued download failed on this file, which conflicts with `-c'.\n\
1637 Refusing to truncate existing file `%s'.\n\n"), *hs->local_file);
1639 CLOSE_INVALIDATE (sock);
1640 return CONTNOTSUPPORTED;
1648 else if (contrange != hs->restval ||
1649 (H_PARTIAL (statcode) && contrange == -1))
1651 /* This means the whole request was somehow misunderstood by the
1652 server. Bail out. */
1654 CLOSE_INVALIDATE (sock);
1661 contlen += contrange;
1663 contrange = -1; /* If content-length was not sent,
1664 content-range will be ignored. */
1666 hs->contlen = contlen;
1672 /* No need to print this output if the body won't be
1673 downloaded at all, or if the original server response is
1675 logputs (LOG_VERBOSE, _("Length: "));
1678 logputs (LOG_VERBOSE, legible (contlen));
1679 if (contrange != -1)
1680 logprintf (LOG_VERBOSE, _(" (%s to go)"),
1681 legible (contlen - contrange));
1684 logputs (LOG_VERBOSE,
1685 opt.ignore_length ? _("ignored") : _("unspecified"));
1687 logprintf (LOG_VERBOSE, " [%s]\n", type);
1689 logputs (LOG_VERBOSE, "\n");
1693 type = NULL; /* We don't need it any more. */
1695 /* Return if we have no intention of further downloading. */
1696 if (!(*dt & RETROKF) || (*dt & HEAD_ONLY))
1698 /* In case the caller cares to look... */
1702 CLOSE_INVALIDATE (sock); /* would be CLOSE_FINISH, but there
1703 might be more bytes in the body. */
1704 return RETRFINISHED;
1707 /* Open the local file. */
1710 mkalldirs (*hs->local_file);
1712 rotate_backups (*hs->local_file);
1713 fp = fopen (*hs->local_file, hs->restval ? "ab" : "wb");
1716 logprintf (LOG_NOTQUIET, "%s: %s\n", *hs->local_file, strerror (errno));
1717 CLOSE_INVALIDATE (sock); /* would be CLOSE_FINISH, but there
1718 might be more bytes in the body. */
1724 extern int global_download_count;
1726 /* To ensure that repeated "from scratch" downloads work for -O
1727 files, we rewind the file pointer, unless restval is
1728 non-zero. (This works only when -O is used on regular files,
1729 but it's still a valuable feature.)
1731 However, this loses when more than one URL is specified on
1732 the command line: the second rewind eradicates the contents
1733 of the first download. Thus we disable the above trick for
1734 all the downloads except the very first one.
1736 #### A possible solution to this would be to remember the
1737 file position in the output document and to seek to that
1738 position, instead of rewinding.
1740 We don't truncate stdout, since that breaks
1741 "wget -O - [...] >> foo".
1743 if (!hs->restval && global_download_count == 0 && opt.dfp != stdout)
1745 /* This will silently fail for streams that don't correspond
1746 to regular files, but that's OK. */
1748 /* ftruncate is needed because opt.dfp is opened in append
1749 mode if opt.always_rest is set. */
1750 ftruncate (fileno (fp), 0);
1755 /* #### This confuses the code that checks for file size. There
1756 should be some overhead information. */
1757 if (opt.save_headers)
1758 fwrite (head, 1, strlen (head), fp);
1760 /* Get the contents of the document. */
1761 hs->res = fd_read_body (sock, fp, &hs->len, hs->restval,
1762 (contlen != -1 ? contlen : 0),
1763 keep_alive, &hs->dltime);
1766 CLOSE_FINISH (sock);
1768 CLOSE_INVALIDATE (sock);
1771 /* Close or flush the file. We have to be careful to check for
1772 error here. Checking the result of fwrite() is not enough --
1773 errors could go unnoticed! */
1776 flush_res = fclose (fp);
1778 flush_res = fflush (fp);
1779 if (flush_res == EOF)
1784 return RETRFINISHED;
1787 /* The genuine HTTP loop! This is the part where the retrieval is
1788 retried, and retried, and retried, and... */
1790 http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
1791 int *dt, struct url *proxy)
1794 int use_ts, got_head = 0; /* time-stamping info */
1795 char *filename_plus_orig_suffix;
1796 char *local_filename = NULL;
1797 char *tms, *locf, *tmrate;
1799 time_t tml = -1, tmr = -1; /* local and remote time-stamps */
1800 long local_size = 0; /* the size of the local file */
1801 size_t filename_len;
1802 struct http_stat hstat; /* HTTP status */
1806 /* This used to be done in main(), but it's a better idea to do it
1807 here so that we don't go through the hoops if we're just using
1811 if (!wget_cookie_jar)
1812 wget_cookie_jar = cookie_jar_new ();
1813 if (opt.cookies_input && !cookies_loaded_p)
1815 cookie_jar_load (wget_cookie_jar, opt.cookies_input);
1816 cookies_loaded_p = 1;
1822 /* Warn on (likely bogus) wildcard usage in HTTP. Don't use
1823 has_wildcards_p because it would also warn on `?', and we know that
1824 shows up in CGI paths a *lot*. */
1825 if (strchr (u->url, '*'))
1826 logputs (LOG_VERBOSE, _("Warning: wildcards not supported in HTTP.\n"));
1828 /* Determine the local filename. */
1829 if (local_file && *local_file)
1830 hstat.local_file = local_file;
1831 else if (local_file)
1833 *local_file = url_file_name (u);
1834 hstat.local_file = local_file;
1838 dummy = url_file_name (u);
1839 hstat.local_file = &dummy;
1842 if (!opt.output_document)
1843 locf = *hstat.local_file;
1845 locf = opt.output_document;
1847 hstat.referer = referer;
1849 filename_len = strlen (*hstat.local_file);
1850 filename_plus_orig_suffix = alloca (filename_len + sizeof (".orig"));
1852 if (opt.noclobber && file_exists_p (*hstat.local_file))
1854 /* If opt.noclobber is turned on and file already exists, do not
1855 retrieve the file */
1856 logprintf (LOG_VERBOSE, _("\
1857 File `%s' already there, will not retrieve.\n"), *hstat.local_file);
1858 /* If the file is there, we suppose it's retrieved OK. */
1861 /* #### Bogusness alert. */
1862 /* If its suffix is "html" or "htm" or similar, assume text/html. */
1863 if (has_html_suffix_p (*hstat.local_file))
1871 if (opt.timestamping)
1873 int local_dot_orig_file_exists = 0;
1875 if (opt.backup_converted)
1876 /* If -K is specified, we'll act on the assumption that it was specified
1877 last time these files were downloaded as well, and instead of just
1878 comparing local file X against server file X, we'll compare local
1879 file X.orig (if extant, else X) against server file X. If -K
1880 _wasn't_ specified last time, or the server contains files called
1881 *.orig, -N will be back to not operating correctly with -k. */
1883 /* Would a single s[n]printf() call be faster? --dan
1885 Definitely not. sprintf() is horribly slow. It's a
1886 different question whether the difference between the two
1887 affects a program. Usually I'd say "no", but at one
1888 point I profiled Wget, and found that a measurable and
1889 non-negligible amount of time was lost calling sprintf()
1890 in url.c. Replacing sprintf with inline calls to
1891 strcpy() and long_to_string() made a difference.
1893 memcpy (filename_plus_orig_suffix, *hstat.local_file, filename_len);
1894 memcpy (filename_plus_orig_suffix + filename_len,
1895 ".orig", sizeof (".orig"));
1897 /* Try to stat() the .orig file. */
1898 if (stat (filename_plus_orig_suffix, &st) == 0)
1900 local_dot_orig_file_exists = 1;
1901 local_filename = filename_plus_orig_suffix;
1905 if (!local_dot_orig_file_exists)
1906 /* Couldn't stat() <file>.orig, so try to stat() <file>. */
1907 if (stat (*hstat.local_file, &st) == 0)
1908 local_filename = *hstat.local_file;
1910 if (local_filename != NULL)
1911 /* There was a local file, so we'll check later to see if the version
1912 the server has is the same version we already have, allowing us to
1918 /* Modification time granularity is 2 seconds for Windows, so
1919 increase local time by 1 second for later comparison. */
1922 local_size = st.st_size;
1926 /* Reset the counter. */
1928 *dt = 0 | ACCEPTRANGES;
1932 /* Increment the pass counter. */
1934 sleep_between_retrievals (count);
1935 /* Get the current time string. */
1936 tms = time_str (NULL);
1937 /* Print fetch message, if opt.verbose. */
1940 char *hurl = url_string (u, 1);
1944 sprintf (tmp, _("(try:%2d)"), count);
1945 logprintf (LOG_VERBOSE, "--%s-- %s\n %s => `%s'\n",
1946 tms, hurl, tmp, locf);
1948 ws_changetitle (hurl, 1);
1953 /* Default document type is empty. However, if spider mode is
1954 on or time-stamping is employed, the HEAD_ONLY command is
1955 encoded within *dt. */
1956 if (opt.spider || (use_ts && !got_head))
1960 /* Assume no restarting. */
1962 /* Decide whether or not to restart. */
1963 if (((count > 1 && (*dt & ACCEPTRANGES)) || opt.always_rest)
1964 /* #### this calls access() and then stat(); could be optimized. */
1965 && file_exists_p (locf))
1966 if (stat (locf, &st) == 0 && S_ISREG (st.st_mode))
1967 hstat.restval = st.st_size;
1969 /* If `-c' is used and the file exists and is non-empty,
1970 refuse to truncate it if the server doesn't support continued
1972 hstat.no_truncate = 0;
1973 if (opt.always_rest && hstat.restval)
1974 hstat.no_truncate = 1;
1976 /* Decide whether to send the no-cache directive. We send it in
1978 a) we're using a proxy, and we're past our first retrieval.
1979 Some proxies are notorious for caching incomplete data, so
1980 we require a fresh get.
1981 b) caching is explicitly inhibited. */
1982 if ((proxy && count > 1) /* a */
1983 || !opt.allow_cache /* b */
1985 *dt |= SEND_NOCACHE;
1987 *dt &= ~SEND_NOCACHE;
1989 /* Try fetching the document, or at least its head. */
1990 err = gethttp (u, &hstat, dt, proxy);
1992 /* It's unfortunate that wget determines the local filename before finding
1993 out the Content-Type of the file. Barring a major restructuring of the
1994 code, we need to re-set locf here, since gethttp() may have xrealloc()d
1995 *hstat.local_file to tack on ".html". */
1996 if (!opt.output_document)
1997 locf = *hstat.local_file;
1999 locf = opt.output_document;
2002 tms = time_str (NULL);
2003 /* Get the new location (with or without the redirection). */
2005 *newloc = xstrdup (hstat.newloc);
2008 case HERR: case HEOF: case CONSOCKERR: case CONCLOSED:
2009 case CONERROR: case READERR: case WRITEFAILED:
2011 /* Non-fatal errors continue executing the loop, which will
2012 bring them to "while" statement at the end, to judge
2013 whether the number of tries was exceeded. */
2014 free_hstat (&hstat);
2015 printwhat (count, opt.ntry);
2018 case HOSTERR: case CONIMPOSSIBLE: case PROXERR: case AUTHFAILED:
2019 case SSLERRCTXCREATE: case CONTNOTSUPPORTED:
2020 /* Fatal errors just return from the function. */
2021 free_hstat (&hstat);
2025 case FWRITEERR: case FOPENERR:
2026 /* Another fatal error. */
2027 logputs (LOG_VERBOSE, "\n");
2028 logprintf (LOG_NOTQUIET, _("Cannot write to `%s' (%s).\n"),
2029 *hstat.local_file, strerror (errno));
2030 free_hstat (&hstat);
2035 /* Another fatal error. */
2036 logputs (LOG_VERBOSE, "\n");
2037 logprintf (LOG_NOTQUIET, _("Unable to establish SSL connection.\n"));
2038 free_hstat (&hstat);
2043 /* Return the new location to the caller. */
2046 logprintf (LOG_NOTQUIET,
2047 _("ERROR: Redirection (%d) without location.\n"),
2049 free_hstat (&hstat);
2053 free_hstat (&hstat);
2058 /* The file was already fully retrieved. */
2059 free_hstat (&hstat);
2064 /* Deal with you later. */
2067 /* All possibilities should have been exhausted. */
2070 if (!(*dt & RETROKF))
2074 /* #### Ugly ugly ugly! */
2075 char *hurl = url_string (u, 1);
2076 logprintf (LOG_NONVERBOSE, "%s:\n", hurl);
2079 logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
2080 tms, hstat.statcode, hstat.error);
2081 logputs (LOG_VERBOSE, "\n");
2082 free_hstat (&hstat);
2087 /* Did we get the time-stamp? */
2090 if (opt.timestamping && !hstat.remote_time)
2092 logputs (LOG_NOTQUIET, _("\
2093 Last-modified header missing -- time-stamps turned off.\n"));
2095 else if (hstat.remote_time)
2097 /* Convert the date-string into struct tm. */
2098 tmr = http_atotm (hstat.remote_time);
2099 if (tmr == (time_t) (-1))
2100 logputs (LOG_VERBOSE, _("\
2101 Last-modified header invalid -- time-stamp ignored.\n"));
2105 /* The time-stamping section. */
2110 use_ts = 0; /* no more time-stamping */
2111 count = 0; /* the retrieve count for HEAD is
2113 if (hstat.remote_time && tmr != (time_t) (-1))
2115 /* Now time-stamping can be used validly. Time-stamping
2116 means that if the sizes of the local and remote file
2117 match, and local file is newer than the remote file,
2118 it will not be retrieved. Otherwise, the normal
2119 download procedure is resumed. */
2121 (hstat.contlen == -1 || local_size == hstat.contlen))
2123 logprintf (LOG_VERBOSE, _("\
2124 Server file no newer than local file `%s' -- not retrieving.\n\n"),
2126 free_hstat (&hstat);
2130 else if (tml >= tmr)
2131 logprintf (LOG_VERBOSE, _("\
2132 The sizes do not match (local %ld) -- retrieving.\n"), local_size);
2134 logputs (LOG_VERBOSE,
2135 _("Remote file is newer, retrieving.\n"));
2137 free_hstat (&hstat);
2140 if ((tmr != (time_t) (-1))
2142 && ((hstat.len == hstat.contlen) ||
2143 ((hstat.res == 0) &&
2144 ((hstat.contlen == -1) ||
2145 (hstat.len >= hstat.contlen && !opt.kill_longer)))))
2147 /* #### This code repeats in http.c and ftp.c. Move it to a
2149 const char *fl = NULL;
2150 if (opt.output_document)
2152 if (opt.od_known_regular)
2153 fl = opt.output_document;
2156 fl = *hstat.local_file;
2160 /* End of time-stamping section. */
2164 logprintf (LOG_NOTQUIET, "%d %s\n\n", hstat.statcode, hstat.error);
2169 tmrate = retr_rate (hstat.len - hstat.restval, hstat.dltime, 0);
2171 if (hstat.len == hstat.contlen)
2175 logprintf (LOG_VERBOSE,
2176 _("%s (%s) - `%s' saved [%ld/%ld]\n\n"),
2177 tms, tmrate, locf, hstat.len, hstat.contlen);
2178 logprintf (LOG_NONVERBOSE,
2179 "%s URL:%s [%ld/%ld] -> \"%s\" [%d]\n",
2180 tms, u->url, hstat.len, hstat.contlen, locf, count);
2183 total_downloaded_bytes += hstat.len;
2185 /* Remember that we downloaded the file for later ".orig" code. */
2186 if (*dt & ADDED_HTML_EXTENSION)
2187 downloaded_file(FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED, locf);
2189 downloaded_file(FILE_DOWNLOADED_NORMALLY, locf);
2191 free_hstat (&hstat);
2195 else if (hstat.res == 0) /* No read error */
2197 if (hstat.contlen == -1) /* We don't know how much we were supposed
2198 to get, so assume we succeeded. */
2202 logprintf (LOG_VERBOSE,
2203 _("%s (%s) - `%s' saved [%ld]\n\n"),
2204 tms, tmrate, locf, hstat.len);
2205 logprintf (LOG_NONVERBOSE,
2206 "%s URL:%s [%ld] -> \"%s\" [%d]\n",
2207 tms, u->url, hstat.len, locf, count);
2210 total_downloaded_bytes += hstat.len;
2212 /* Remember that we downloaded the file for later ".orig" code. */
2213 if (*dt & ADDED_HTML_EXTENSION)
2214 downloaded_file(FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED, locf);
2216 downloaded_file(FILE_DOWNLOADED_NORMALLY, locf);
2218 free_hstat (&hstat);
2222 else if (hstat.len < hstat.contlen) /* meaning we lost the
2223 connection too soon */
2225 logprintf (LOG_VERBOSE,
2226 _("%s (%s) - Connection closed at byte %ld. "),
2227 tms, tmrate, hstat.len);
2228 printwhat (count, opt.ntry);
2229 free_hstat (&hstat);
2232 else if (!opt.kill_longer) /* meaning we got more than expected */
2234 logprintf (LOG_VERBOSE,
2235 _("%s (%s) - `%s' saved [%ld/%ld])\n\n"),
2236 tms, tmrate, locf, hstat.len, hstat.contlen);
2237 logprintf (LOG_NONVERBOSE,
2238 "%s URL:%s [%ld/%ld] -> \"%s\" [%d]\n",
2239 tms, u->url, hstat.len, hstat.contlen, locf, count);
2241 total_downloaded_bytes += hstat.len;
2243 /* Remember that we downloaded the file for later ".orig" code. */
2244 if (*dt & ADDED_HTML_EXTENSION)
2245 downloaded_file(FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED, locf);
2247 downloaded_file(FILE_DOWNLOADED_NORMALLY, locf);
2249 free_hstat (&hstat);
2253 else /* the same, but not accepted */
2255 logprintf (LOG_VERBOSE,
2256 _("%s (%s) - Connection closed at byte %ld/%ld. "),
2257 tms, tmrate, hstat.len, hstat.contlen);
2258 printwhat (count, opt.ntry);
2259 free_hstat (&hstat);
2263 else /* now hstat.res can only be -1 */
2265 if (hstat.contlen == -1)
2267 logprintf (LOG_VERBOSE,
2268 _("%s (%s) - Read error at byte %ld (%s)."),
2269 tms, tmrate, hstat.len, strerror (errno));
2270 printwhat (count, opt.ntry);
2271 free_hstat (&hstat);
2274 else /* hstat.res == -1 and contlen is given */
2276 logprintf (LOG_VERBOSE,
2277 _("%s (%s) - Read error at byte %ld/%ld (%s). "),
2278 tms, tmrate, hstat.len, hstat.contlen,
2280 printwhat (count, opt.ntry);
2281 free_hstat (&hstat);
2288 while (!opt.ntry || (count < opt.ntry));
2292 /* Converts struct tm to time_t, assuming the data in tm is UTC rather
2293 than local timezone.
2295 mktime is similar but assumes struct tm, also known as the
2296 "broken-down" form of time, is in local time zone. mktime_from_utc
2297 uses mktime to make the conversion understanding that an offset
2298 will be introduced by the local time assumption.
2300 mktime_from_utc then measures the introduced offset by applying
2301 gmtime to the initial result and applying mktime to the resulting
2302 "broken-down" form. The difference between the two mktime results
2303 is the measured offset which is then subtracted from the initial
2304 mktime result to yield a calendar time which is the value returned.
2306 tm_isdst in struct tm is set to 0 to force mktime to introduce a
2307 consistent offset (the non DST offset) since tm and tm+o might be
2308 on opposite sides of a DST change.
2310 Some implementations of mktime return -1 for the nonexistent
2311 localtime hour at the beginning of DST. In this event, use
2312 mktime(tm - 1hr) + 3600.
2316 gmtime(t+o) --> tm+o
2317 mktime(tm+o) --> t+2o
2318 t+o - (t+2o - t+o) = t
2320 Note that glibc contains a function of the same purpose named
2321 `timegm' (reverse of gmtime). But obviously, it is not universally
2322 available, and unfortunately it is not straightforwardly
2323 extractable for use here. Perhaps configure should detect timegm
2324 and use it where available.
2326 Contributed by Roger Beeman <beeman@cisco.com>, with the help of
2327 Mark Baushke <mdb@cisco.com> and the rest of the Gurus at CISCO.
2328 Further improved by Roger with assistance from Edward J. Sabol
2329 based on input by Jamie Zawinski. */
2332 mktime_from_utc (struct tm *t)
2343 return -1; /* can't deal with output from strptime */
2354 return -1; /* can't deal with output from gmtime */
2357 return (tl - (tb - tl));
2360 /* Check whether the result of strptime() indicates success.
2361 strptime() returns the pointer to how far it got to in the string.
2362 The processing has been successful if the string is at `GMT' or
2363 `+X', or at the end of the string.
2365 In extended regexp parlance, the function returns 1 if P matches
2366 "^ *(GMT|[+-][0-9]|$)", 0 otherwise. P being NULL (which strptime
2367 can return) is considered a failure and 0 is returned. */
2369 check_end (const char *p)
2373 while (ISSPACE (*p))
2376 || (p[0] == 'G' && p[1] == 'M' && p[2] == 'T')
2377 || ((p[0] == '+' || p[0] == '-') && ISDIGIT (p[1])))
2383 /* Convert the textual specification of time in TIME_STRING to the
2384 number of seconds since the Epoch.
2386 TIME_STRING can be in any of the three formats RFC2068 allows the
2387 HTTP servers to emit -- RFC1123-date, RFC850-date or asctime-date.
2388 Timezones are ignored, and should be GMT.
2390 Return the computed time_t representation, or -1 if the conversion
2393 This function uses strptime with various string formats for parsing
2394 TIME_STRING. This results in a parser that is not as lenient in
2395 interpreting TIME_STRING as I would like it to be. Being based on
2396 strptime, it always allows shortened months, one-digit days, etc.,
2397 but due to the multitude of formats in which time can be
2398 represented, an ideal HTTP time parser would be even more
2399 forgiving. It should completely ignore things like week days and
2400 concentrate only on the various forms of representing years,
2401 months, days, hours, minutes, and seconds. For example, it would
2402 be nice if it accepted ISO 8601 out of the box.
2404 I've investigated free and PD code for this purpose, but none was
2405 usable. getdate was big and unwieldy, and had potential copyright
2406 issues, or so I was informed. Dr. Marcus Hennecke's atotm(),
2407 distributed with phttpd, is excellent, but we cannot use it because
2408 it is not assigned to the FSF. So I stuck with strptime. */
2411 http_atotm (const char *time_string)
2413 /* NOTE: Solaris strptime man page claims that %n and %t match white
2414 space, but that's not universally available. Instead, we simply
2415 use ` ' to mean "skip all WS", which works under all strptime
2416 implementations I've tested. */
2418 static const char *time_formats[] = {
2419 "%a, %d %b %Y %T", /* RFC1123: Thu, 29 Jan 1998 22:12:57 */
2420 "%A, %d-%b-%y %T", /* RFC850: Thursday, 29-Jan-98 22:12:57 */
2421 "%a, %d-%b-%Y %T", /* pseudo-RFC850: Thu, 29-Jan-1998 22:12:57
2422 (google.com uses this for their cookies.) */
2423 "%a %b %d %T %Y" /* asctime: Thu Jan 29 22:12:57 1998 */
2429 /* According to Roger Beeman, we need to initialize tm_isdst, since
2430 strptime won't do it. */
2433 /* Note that under foreign locales Solaris strptime() fails to
2434 recognize English dates, which renders this function useless. We
2435 solve this by being careful not to affect LC_TIME when
2436 initializing locale.
2438 Another solution would be to temporarily set locale to C, invoke
2439 strptime(), and restore it back. This is slow and dirty,
2440 however, and locale support other than LC_MESSAGES can mess other
2441 things, so I rather chose to stick with just setting LC_MESSAGES.
2443 GNU strptime does not have this problem because it recognizes
2444 both international and local dates. */
2446 for (i = 0; i < countof (time_formats); i++)
2447 if (check_end (strptime (time_string, time_formats[i], &t)))
2448 return mktime_from_utc (&t);
2450 /* All formats have failed. */
2454 /* Authorization support: We support two authorization schemes:
2456 * `Basic' scheme, consisting of base64-ing USER:PASSWORD string;
2458 * `Digest' scheme, added by Junio Hamano <junio@twinsun.com>,
2459 consisting of answering to the server's challenge with the proper
2462 /* Number of characters needed to base64-encode LEN bytes: four
   output characters for every group of three input bytes, with a
   partial final group rounded up to a full one.  The count does not
   include a terminating zero byte, so callers that want a C string
   need one byte more.  LEN is evaluated exactly once.  */
2463 #define BASE64_LENGTH(len) (4 * (((len) + 2) / 3))
2465 /* Encode the string S of length LENGTH to base64 format and place it
2466 in STORE. STORE will be 0-terminated, and must point to a writable
2467 buffer of at least 1+BASE64_LENGTH(length) bytes. */
2469 base64_encode (const char *s, char *store, int length)
2471 /* Conversion table. */
2472 static char tbl[64] = {
2473 'A','B','C','D','E','F','G','H',
2474 'I','J','K','L','M','N','O','P',
2475 'Q','R','S','T','U','V','W','X',
2476 'Y','Z','a','b','c','d','e','f',
2477 'g','h','i','j','k','l','m','n',
2478 'o','p','q','r','s','t','u','v',
2479 'w','x','y','z','0','1','2','3',
2480 '4','5','6','7','8','9','+','/'
2483 unsigned char *p = (unsigned char *)store;
2485 /* Transform the 3x8 bits to 4x6 bits, as required by base64. */
2486 for (i = 0; i < length; i += 3)
2488 *p++ = tbl[s[0] >> 2];
2489 *p++ = tbl[((s[0] & 3) << 4) + (s[1] >> 4)];
2490 *p++ = tbl[((s[1] & 0xf) << 2) + (s[2] >> 6)];
2491 *p++ = tbl[s[2] & 0x3f];
2494 /* Pad the result if necessary... */
2495 if (i == length + 1)
2497 else if (i == length + 2)
2498 *(p - 1) = *(p - 2) = '=';
2499 /* ...and zero-terminate it. */
2503 /* Create the authentication header contents for the `Basic' scheme.
2504 This is done by encoding the string `USER:PASS' in base64 and
2505 prepending `Basic ' to it. */
2507 basic_authentication_encode (const char *user, const char *passwd)
2509 char *t1, *t2, *res;
2510 int len1 = strlen (user) + 1 + strlen (passwd);
2511 int len2 = BASE64_LENGTH (len1);
2513 t1 = (char *)alloca (len1 + 1);
2514 sprintf (t1, "%s:%s", user, passwd);
2516 t2 = (char *)alloca (len2 + 1);
2517 base64_encode (t1, t2, len1);
2519 res = (char *)xmalloc (6 + len2 + 1);
2520 sprintf (res, "Basic %s", t2);
2525 #define SKIP_WS(x) do { \
2526 while (ISSPACE (*(x))) \
2531 /* Parse HTTP `WWW-Authenticate:' header. AU points to the beginning
2532 of a field in such a header. If the field is the one specified by
2533 ATTR_NAME ("realm", "opaque", and "nonce" are used by the current
2534 digest authorization code), extract its value into the (char*)
2535 variable pointed to by RET. Returns negative on a malformed header,
2536 or number of bytes that have been parsed by this call. */
2538 extract_header_attr (const char *au, const char *attr_name, char **ret)
2540 const char *cp, *ep;
2544 if (strncmp (cp, attr_name, strlen (attr_name)) == 0)
2546 cp += strlen (attr_name);
2559 for (ep = cp; *ep && *ep != '\"'; ep++)
2564 *ret = strdupdelim (cp, ep);
2571 /* Dump the hexadecimal representation of HASH to BUF. HASH should be
2572 an array of 16 bytes containing the hash keys, and BUF should be a
2573 buffer of 33 writable characters (32 for hex digits plus one for
2574 zero termination). */
2576 dump_hash (unsigned char *buf, const unsigned char *hash)
2580 for (i = 0; i < MD5_HASHLEN; i++, hash++)
2582 *buf++ = XNUM_TO_digit (*hash >> 4);
2583 *buf++ = XNUM_TO_digit (*hash & 0xf);
2588 /* Take the line apart to find the challenge, and compose a digest
2589 authorization header. See RFC2069 section 2.1.2. */
/* AU is the challenge text beginning with the scheme name (the first
   six characters are skipped as `Digest').  USER, PASSWD, METHOD and
   PATH are the credentials and request details fed into the hashes
   below; if any of them, or the parsed realm or nonce, is missing the
   function takes its failure path.  The header text is built in RES,
   a buffer obtained from xmalloc.
   NOTE(review): the return statements are not visible in this
   excerpt; presumably RES is returned to the caller, who then owns
   it, and NULL is returned on failure -- confirm.  */
2591 digest_authentication_encode (const char *au, const char *user,
2592 const char *passwd, const char *method,
/* These have static storage and are reassigned on every call, so the
   function is not reentrant.  */
2595 static char *realm, *opaque, *nonce;
/* Table entries pair an attribute name with the address of the
   variable that receives its value.  */
2600 { "realm", &realm },
2601 { "opaque", &opaque },
/* Start each call with no parsed values.  */
2606 realm = opaque = nonce = NULL;
2608 au += 6; /* skip over `Digest' */
/* Try every table entry against the text at the current position;
   extract_header_attr stores a matching value through the entry's
   variable pointer and reports how much it consumed.  */
2614 for (i = 0; i < countof (options); i++)
2616 int skip = extract_header_attr (au, options[i].name,
2617 options[i].variable);
/* Cleanup of a parsed value.  NOTE(review): the surrounding condition
   is not visible here; presumably this belongs to the handling of a
   negative (malformed header) result.  */
2621 xfree_null (opaque);
/* The loop above ran through the whole table, i.e. it was not left
   early -- presumably meaning no known attribute matched here, so the
   unknown attribute has to be skipped by hand.  */
2631 if (i == countof (options))
/* Look for the `=' that separates the attribute name from its value.  */
2633 while (*au && *au != '=')
/* Look for the double quote that closes a quoted value.  */
2641 while (*au && *au != '\"')
/* Look for the comma that separates this attribute from the next.  */
2648 while (*au && *au != ',')
/* Everything the digest computation needs must be present.  OPAQUE is
   deliberately absent from this test: it is optional and only echoed
   back when the challenge supplied it.  */
2653 if (!realm || !nonce || !user || !passwd || !path || !method)
2656 xfree_null (opaque);
2661 /* Calculate the digest value. */
2663 ALLOCA_MD5_CONTEXT (ctx);
/* HASH holds a raw digest; the three text buffers each hold its
   hexadecimal form, two characters per byte plus a terminator.  */
2664 unsigned char hash[MD5_HASHLEN];
2665 unsigned char a1buf[MD5_HASHLEN * 2 + 1], a2buf[MD5_HASHLEN * 2 + 1];
2666 unsigned char response_digest[MD5_HASHLEN * 2 + 1];
2668 /* A1BUF = H(user ":" realm ":" password) */
2670 gen_md5_update ((unsigned char *)user, strlen (user), ctx);
2671 gen_md5_update ((unsigned char *)":", 1, ctx);
2672 gen_md5_update ((unsigned char *)realm, strlen (realm), ctx);
2673 gen_md5_update ((unsigned char *)":", 1, ctx);
2674 gen_md5_update ((unsigned char *)passwd, strlen (passwd), ctx);
2675 gen_md5_finish (ctx, hash);
2676 dump_hash (a1buf, hash);
2678 /* A2BUF = H(method ":" path) */
2680 gen_md5_update ((unsigned char *)method, strlen (method), ctx);
2681 gen_md5_update ((unsigned char *)":", 1, ctx);
2682 gen_md5_update ((unsigned char *)path, strlen (path), ctx);
2683 gen_md5_finish (ctx, hash);
2684 dump_hash (a2buf, hash);
2686 /* RESPONSE_DIGEST = H(A1BUF ":" nonce ":" A2BUF) */
/* The hexadecimal text of A1 and A2 is hashed here, not the raw
   digests, and only the MD5_HASHLEN * 2 digit characters are fed in
   (the terminator is left out).  */
2688 gen_md5_update (a1buf, MD5_HASHLEN * 2, ctx);
2689 gen_md5_update ((unsigned char *)":", 1, ctx);
2690 gen_md5_update ((unsigned char *)nonce, strlen (nonce), ctx);
2691 gen_md5_update ((unsigned char *)":", 1, ctx);
2692 gen_md5_update (a2buf, MD5_HASHLEN * 2, ctx);
2693 gen_md5_finish (ctx, hash);
2694 dump_hash (response_digest, hash);
/* Size the output buffer from the lengths of the pieces that go into
   it.  NOTE(review): not every term of the sum is visible in this
   excerpt; the room for the fixed text of the format string has to
   come from one of the terms not shown -- confirm.  */
2696 res = (char*) xmalloc (strlen (user)
2701 + 2 * MD5_HASHLEN /*strlen (response_digest)*/
2702 + (opaque ? strlen (opaque) : 0)
/* Mandatory fields of the header.  No qop, nc or cnonce field is
   written by this format string.  */
2704 sprintf (res, "Digest \
2705 username=\"%s\", realm=\"%s\", nonce=\"%s\", uri=\"%s\", response=\"%s\"",
2706 user, realm, nonce, path, response_digest);
/* P points at the terminating NUL of what has been written so far, so
   the strcat below starts appending right there instead of scanning
   RES from its beginning.  */
2709 char *p = res + strlen (res);
2710 strcat (p, ", opaque=\"");
2717 #endif /* USE_DIGEST */
/* Evaluate to non-zero when LINE starts, ignoring case, with
   STRING_CONSTANT and that prefix is immediately followed by
   whitespace or by the end of the string -- i.e. the prefix is a
   whole word: "Basic" matches "Basic realm=..." and "basic", but not
   "Basicx".

   STRING_CONSTANT must be a string literal (or a char array), because
   its length is taken with sizeof.  LINE is expanded more than once,
   so it must be free of side effects.  Both parameters are
   parenthesized in the expansion so that a pointer expression such as
   `p + 1' can be passed as LINE.  */
#define BEGINS_WITH(line, string_constant)				\
  (!strncasecmp ((line), (string_constant),				\
		 sizeof (string_constant) - 1)				\
   && (ISSPACE ((line)[sizeof (string_constant) - 1])			\
       || !(line)[sizeof (string_constant) - 1]))
/* Return non-zero if AU starts with one of the scheme names tested
   below.  Each test goes through BEGINS_WITH, so the name is compared
   without regard to case and must be followed by whitespace or by the
   end of the string.  The tests are tried in the order written and
   stop at the first one that succeeds.  */
2726 known_authentication_scheme_p (const char *au)
2728 return BEGINS_WITH (au, "Basic")
2729 || BEGINS_WITH (au, "Digest")
2730 || BEGINS_WITH (au, "NTLM");
2735 /* Create the HTTP authorization request header. When the
2736 `WWW-Authenticate' response header is seen, according to the
2737 authorization scheme specified in that header (`Basic' and `Digest'
2738 are supported by the current implementation), produce an
2739 appropriate HTTP authorization request header. */
/* Dispatch on the scheme name at the start of AU and hand the work to
   the matching encoder.
   NOTE(review): only the `Basic' and `Digest' branches are visible in
   this excerpt, and the end of the function is not shown; what is
   returned for any other scheme cannot be told from here.  */
2741 create_authorization_line (const char *au, const char *user,
2742 const char *passwd, const char *method,
/* Only the first five characters are compared, ignoring case; unlike
   BEGINS_WITH, nothing here checks what follows the scheme name.
   The Basic encoder needs just the credentials.  */
2745 if (0 == strncasecmp (au, "Basic", 5))
2746 return basic_authentication_encode (user, passwd);
/* Same kind of test for the six characters of `Digest'.  The whole
   challenge is passed on because the digest encoder parses its
   attributes itself.  */
2748 if (0 == strncasecmp (au, "Digest", 6))
2749 return digest_authentication_encode (au, user, passwd, method, path);
2750 #endif /* USE_DIGEST */