2 Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001, 2002
3 Free Software Foundation, Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or
10 (at your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 In addition, as a special exception, the Free Software Foundation
22 gives permission to link the code of its release of Wget with the
23 OpenSSL project's "OpenSSL" library (or with modified versions of it
24 that use the same license as the "OpenSSL" library), and distribute
25 the linked executables. You must obey the GNU General Public License
26 in all respects for all of the code used other than "OpenSSL". If you
27 modify this file, you may extend this exception to your version of the
28 file, but you are not obligated to do so. If you do not wish to do
29 so, delete this exception statement from your version. */
35 #include <sys/types.h>
46 #if TIME_WITH_SYS_TIME
47 # include <sys/time.h>
51 # include <sys/time.h>
68 # include "gen_sslfunc.h"
76 extern char *version_string;
77 extern LARGE_INT total_downloaded_bytes;
80 # define MIN(x, y) ((x) > (y) ? (y) : (x))
84 static int cookies_loaded_p;
85 struct cookie_jar *wget_cookie_jar;
87 #define TEXTHTML_S "text/html"
88 #define TEXTXHTML_S "application/xhtml+xml"
90 /* Some status code validation macros: */
91 #define H_20X(x) (((x) >= 200) && ((x) < 300))
92 #define H_PARTIAL(x) ((x) == HTTP_STATUS_PARTIAL_CONTENTS)
93 #define H_REDIRECTED(x) ((x) == HTTP_STATUS_MOVED_PERMANENTLY \
94 || (x) == HTTP_STATUS_MOVED_TEMPORARILY \
95 || (x) == HTTP_STATUS_TEMPORARY_REDIRECT)
97 /* HTTP/1.0 status codes from RFC1945, provided for reference. */
99 #define HTTP_STATUS_OK 200
100 #define HTTP_STATUS_CREATED 201
101 #define HTTP_STATUS_ACCEPTED 202
102 #define HTTP_STATUS_NO_CONTENT 204
103 #define HTTP_STATUS_PARTIAL_CONTENTS 206
105 /* Redirection 3xx. */
106 #define HTTP_STATUS_MULTIPLE_CHOICES 300
107 #define HTTP_STATUS_MOVED_PERMANENTLY 301
108 #define HTTP_STATUS_MOVED_TEMPORARILY 302
109 #define HTTP_STATUS_NOT_MODIFIED 304
110 #define HTTP_STATUS_TEMPORARY_REDIRECT 307
112 /* Client error 4xx. */
113 #define HTTP_STATUS_BAD_REQUEST 400
114 #define HTTP_STATUS_UNAUTHORIZED 401
115 #define HTTP_STATUS_FORBIDDEN 403
116 #define HTTP_STATUS_NOT_FOUND 404
118 /* Server errors 5xx. */
119 #define HTTP_STATUS_INTERNAL 500
120 #define HTTP_STATUS_NOT_IMPLEMENTED 501
121 #define HTTP_STATUS_BAD_GATEWAY 502
122 #define HTTP_STATUS_UNAVAILABLE 503
125 rel_none, rel_name, rel_value, rel_both
132 struct request_header {
134 enum rp release_policy;
136 int hcount, hcapacity;
139 /* Create a new, empty request. At least request_set_method must be
140 called before the request can be used. */
142 static struct request *
145 struct request *req = xnew0 (struct request);
147 req->headers = xnew_array (struct request_header, req->hcapacity);
151 /* Set the request's method and its arguments. METH should be a
152 literal string (or it should outlive the request) because it will
153 not be freed. ARG will be freed by request_free. */
156 request_set_method (struct request *req, const char *meth, char *arg)
162 /* Return the method string passed with the last call to
163 request_set_method. */
166 request_method (const struct request *req)
171 /* Free one header according to the release policy specified with
172 request_set_header. */
175 release_header (struct request_header *hdr)
177 switch (hdr->release_policy)
194 /* Set the request named NAME to VALUE. Specifically, this means that
195 a "NAME: VALUE\r\n" header line will be used in the request. If a
196 header with the same name previously existed in the request, its
197 value will be replaced by this one.
199 RELEASE_POLICY determines whether NAME and VALUE should be released
200 (freed) with request_free. Allowed values are:
202 - rel_none - don't free NAME or VALUE
203 - rel_name - free NAME when done
204 - rel_value - free VALUE when done
205 - rel_both - free both NAME and VALUE when done
207 Setting release policy is useful when arguments come from different
208 sources. For example:
210 // Don't free literal strings!
211 request_set_header (req, "Pragma", "no-cache", rel_none);
213 // Don't free a global variable, we'll need it later.
214 request_set_header (req, "Referer", opt.referer, rel_none);
216 // Value freshly allocated, free it when done.
217 request_set_header (req, "Range", aprintf ("bytes=%ld-", hs->restval),
222 request_set_header (struct request *req, char *name, char *value,
223 enum rp release_policy)
225 struct request_header *hdr;
229 for (i = 0; i < req->hcount; i++)
231 hdr = &req->headers[i];
232 if (0 == strcasecmp (name, hdr->name))
234 /* Replace existing header. */
235 release_header (hdr);
238 hdr->release_policy = release_policy;
243 /* Install new header. */
245 if (req->hcount >= req->hcount)
247 req->hcapacity <<= 1;
248 req->headers = xrealloc (req->headers,
249 req->hcapacity * sizeof (struct request_header));
251 hdr = &req->headers[req->hcount++];
254 hdr->release_policy = release_policy;
257 /* Like request_set_header, but sets the whole header line, as
258 provided by the user using the `--header' option. For example,
259 request_set_user_header (req, "Foo: bar") works just like
260 request_set_header (req, "Foo", "bar"). */
263 request_set_user_header (struct request *req, const char *header)
266 const char *p = strchr (header, ':');
269 BOUNDED_TO_ALLOCA (header, p, name);
273 request_set_header (req, xstrdup (name), (char *) p, rel_name);
276 #define APPEND(p, str) do { \
277 int A_len = strlen (str); \
278 memcpy (p, str, A_len); \
282 /* Construct the request and write it to FD using fd_write. */
285 request_send (const struct request *req, int fd)
287 char *request_string, *p;
288 int i, size, write_error;
290 /* Count the request size. */
293 /* METHOD " " ARG " " "HTTP/1.0" "\r\n" */
294 size += strlen (req->method) + 1 + strlen (req->arg) + 1 + 8 + 2;
296 for (i = 0; i < req->hcount; i++)
298 struct request_header *hdr = &req->headers[i];
299 /* NAME ": " VALUE "\r\n" */
300 size += strlen (hdr->name) + 2 + strlen (hdr->value) + 2;
306 p = request_string = alloca_array (char, size);
308 /* Generate the request. */
310 APPEND (p, req->method); *p++ = ' ';
311 APPEND (p, req->arg); *p++ = ' ';
312 memcpy (p, "HTTP/1.0\r\n", 10); p += 10;
314 for (i = 0; i < req->hcount; i++)
316 struct request_header *hdr = &req->headers[i];
317 APPEND (p, hdr->name);
318 *p++ = ':', *p++ = ' ';
319 APPEND (p, hdr->value);
320 *p++ = '\r', *p++ = '\n';
323 *p++ = '\r', *p++ = '\n', *p++ = '\0';
324 assert (p - request_string == size);
328 DEBUGP (("\n---request begin---\n%s---request end---\n", request_string));
330 /* Send the request to the server. */
332 write_error = fd_write (fd, request_string, size - 1, -1);
334 logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"),
339 /* Release the resources used by REQ. */
342 request_free (struct request *req)
345 xfree_null (req->arg);
346 for (i = 0; i < req->hcount; i++)
347 release_header (&req->headers[i]);
348 xfree_null (req->headers);
352 /* Send the contents of FILE_NAME to SOCK/SSL. Make sure that exactly
353 PROMISED_SIZE bytes are sent over the wire -- if the file is
354 longer, read only that much; if the file is shorter, report an error. */
357 post_file (int sock, const char *file_name, long promised_size)
359 static char chunk[8192];
364 DEBUGP (("[writing POST file %s ... ", file_name));
366 fp = fopen (file_name, "rb");
369 while (!feof (fp) && written < promised_size)
372 int length = fread (chunk, 1, sizeof (chunk), fp);
375 towrite = MIN (promised_size - written, length);
376 write_error = fd_write (sock, chunk, towrite, -1);
386 /* If we've written less than was promised, report a (probably
387 nonsensical) error rather than break the promise. */
388 if (written < promised_size)
394 assert (written == promised_size);
395 DEBUGP (("done]\n"));
400 head_terminator (const char *hunk, int oldlen, int peeklen)
402 const char *start, *end;
404 /* If at first peek, verify whether HUNK starts with "HTTP". If
405 not, this is a HTTP/0.9 response and we must bail out without
407 if (oldlen == 0 && 0 != memcmp (hunk, "HTTP", MIN (peeklen, 4)))
413 start = hunk + oldlen - 4;
414 end = hunk + oldlen + peeklen;
416 for (; start < end - 1; start++)
423 if (start[1] == '\n')
429 /* Read the HTTP response head from FD and return it. The error
430 conditions are the same as with fd_read_hunk.
432 To support HTTP/0.9 responses, this function tries to make sure
433 that the data begins with "HTTP". If this is not the case, no data
434 is read and an empty head is returned, so that the remaining
435 data can be treated as body. */
438 fd_read_http_head (int fd)
440 return fd_read_hunk (fd, head_terminator, 512);
444 /* The response data. */
447 /* The array of pointers that indicate where each header starts.
448 For example, given this HTTP response:
455 The headers are located like this:
457 "HTTP/1.0 200 Ok\r\nDescription: some\r\n text\r\nEtag: x\r\n\r\n"
459 headers[0] headers[1] headers[2] headers[3]
461 I.e. headers[0] points to the beginning of the response,
462 headers[1] points to the end of the first header and the
463 beginning of the second one, etc. */
465 const char **headers;
468 /* Create a new response object from the text of the HTTP response,
469 available in HEAD. That text is automatically split into
470 constituent header lines for fast retrieval using
471 response_header_*. */
473 static struct response *
474 response_new (const char *head)
479 struct response *resp = xnew0 (struct response);
484 /* Empty head means that we're dealing with a headerless
485 (HTTP/0.9) response. In that case, don't set HEADERS at
490 /* Split HEAD into header lines, so that response_header_* functions
491 don't need to do this over and over again. */
497 DO_REALLOC (resp->headers, size, count + 1, const char *);
498 resp->headers[count++] = hdr;
500 /* Break upon encountering an empty line. */
501 if (!hdr[0] || (hdr[0] == '\r' && hdr[1] == '\n') || hdr[0] == '\n')
504 /* Find the end of HDR, including continuations. */
507 const char *end = strchr (hdr, '\n');
513 while (*hdr == ' ' || *hdr == '\t');
515 DO_REALLOC (resp->headers, size, count + 1, const char *);
516 resp->headers[count++] = NULL;
521 /* Locate the header named NAME in the response data. If found, set
522 *BEGPTR to its starting, and *ENDPTR to its ending position, and
523 return 1. Otherwise return 0.
525 This function is used as a building block for response_header_copy
526 and response_header_strdup. */
529 response_header_bounds (const struct response *resp, const char *name,
530 const char **begptr, const char **endptr)
533 const char **headers = resp->headers;
536 if (!headers || !headers[1])
539 name_len = strlen (name);
541 for (i = 1; headers[i + 1]; i++)
543 const char *b = headers[i];
544 const char *e = headers[i + 1];
546 && b[name_len] == ':'
547 && 0 == strncasecmp (b, name, name_len))
550 while (b < e && ISSPACE (*b))
552 while (b < e && ISSPACE (e[-1]))
/* Copy the value of the response header named NAME to buffer BUF,
   storing no more than BUFSIZE bytes including the terminating 0;
   longer values are truncated.  If the header exists, 1 is returned,
   otherwise 0.  If there should be no limit on the size of the
   header, use response_header_strdup instead.

   If BUFSIZE is 0, no data is copied (BUF may then be NULL), but the
   boolean indication of whether the header is present is still
   returned.  */

static int
response_header_copy (const struct response *resp, const char *name,
		      char *buf, int bufsize)
{
  const char *b, *e;

  if (!response_header_bounds (resp, name, &b, &e))
    return 0;

  if (bufsize > 0)
    {
      /* Reserve one byte for the terminator.  The header value comes
	 from the server, so it must never be allowed to fill (or
	 overrun) the caller's buffer without being NUL-terminated.  */
      int len = MIN (e - b, bufsize - 1);
      memcpy (buf, b, len);
      buf[len] = '\0';
    }
  return 1;
}
586 /* Return the value of header named NAME in RESP, allocated with
587 malloc. If such a header does not exist in RESP, return NULL. */
590 response_header_strdup (const struct response *resp, const char *name)
593 if (!response_header_bounds (resp, name, &b, &e))
595 return strdupdelim (b, e);
598 /* Parse the HTTP status line, which is of format:
600 HTTP-Version SP Status-Code SP Reason-Phrase
602 The function returns the status-code, or -1 if the status line
603 appears malformed. The pointer to "reason-phrase" message is
604 returned in *MESSAGE. */
607 response_status (const struct response *resp, char **message)
614 /* For a HTTP/0.9 response, assume status 200. */
616 *message = xstrdup (_("No headers, assuming HTTP/0.9"));
620 p = resp->headers[0];
621 end = resp->headers[1];
627 if (end - p < 4 || 0 != strncmp (p, "HTTP", 4))
631 /* Match the HTTP version. This is optional because Gnutella
632 servers have been reported to not specify HTTP version. */
633 if (p < end && *p == '/')
636 while (p < end && ISDIGIT (*p))
638 if (p < end && *p == '.')
640 while (p < end && ISDIGIT (*p))
644 while (p < end && ISSPACE (*p))
646 if (end - p < 3 || !ISDIGIT (p[0]) || !ISDIGIT (p[1]) || !ISDIGIT (p[2]))
649 status = 100 * (p[0] - '0') + 10 * (p[1] - '0') + (p[2] - '0');
654 while (p < end && ISSPACE (*p))
656 while (p < end && ISSPACE (end[-1]))
658 *message = strdupdelim (p, end);
664 /* Release the resources used by RESP. */
667 response_free (struct response *resp)
669 xfree_null (resp->headers);
673 /* Print [b, e) to the log, omitting the trailing CRLF. */
676 print_server_response_1 (const char *prefix, const char *b, const char *e)
679 if (b < e && e[-1] == '\n')
681 if (b < e && e[-1] == '\r')
683 BOUNDED_TO_ALLOCA (b, e, ln);
684 logprintf (LOG_VERBOSE, "%s%s\n", prefix, ln);
687 /* Print the server response, line by line, omitting the trailing CR
688 characters, prefixed with PREFIX. */
691 print_server_response (const struct response *resp, const char *prefix)
696 for (i = 0; resp->headers[i + 1]; i++)
697 print_server_response_1 (prefix, resp->headers[i], resp->headers[i + 1]);
700 /* Parse the `Content-Range' header and extract the information it
701 contains. Returns 1 if successful, -1 otherwise. */
703 parse_content_range (const char *hdr, long *first_byte_ptr,
704 long *last_byte_ptr, long *entity_length_ptr)
708 /* Ancient versions of Netscape proxy server, presumably predating
709 rfc2068, sent out `Content-Range' without the "bytes"
711 if (!strncasecmp (hdr, "bytes", 5))
714 /* "JavaWebServer/1.1.1" sends "bytes: x-y/z", contrary to the
718 while (ISSPACE (*hdr))
725 for (num = 0; ISDIGIT (*hdr); hdr++)
726 num = 10 * num + (*hdr - '0');
727 if (*hdr != '-' || !ISDIGIT (*(hdr + 1)))
729 *first_byte_ptr = num;
731 for (num = 0; ISDIGIT (*hdr); hdr++)
732 num = 10 * num + (*hdr - '0');
733 if (*hdr != '/' || !ISDIGIT (*(hdr + 1)))
735 *last_byte_ptr = num;
737 for (num = 0; ISDIGIT (*hdr); hdr++)
738 num = 10 * num + (*hdr - '0');
739 *entity_length_ptr = num;
743 /* Read the body of the response, but don't store it anywhere. This is
744 useful when reading error responses that are not logged anywhere,
745 but which need to be read so the same connection can be reused. */
748 skip_body (int fd, long contlen)
753 /* Skipping the body doesn't make sense if the content length is
754 unknown because, in that case, persistent connections cannot be
755 used. (#### This is not the case with HTTP/1.1 where they can
756 still be used with the magic of the "chunked" transfer!) */
760 oldverbose = opt.verbose;
762 fd_read_body (fd, NULL, &dummy, 0, contlen, 1, NULL);
763 opt.verbose = oldverbose;
766 /* Persistent connections. Currently, we cache the most recently used
767 connection as persistent, provided that the HTTP server agrees to
768 make it such. The persistence data is stored in the variables
769 below. Ideally, it should be possible to cache an arbitrary fixed
770 number of these connections. */
772 /* Whether a persistent connection is active. */
773 static int pconn_active;
776 /* The socket of the connection. */
779 /* Host and port of the currently active persistent connection. */
783 /* Whether an SSL handshake has occurred on this connection. */
787 /* Mark the persistent connection as invalid and free the resources it
788 uses. This is used by the CLOSE_* macros after they forcefully
789 close a registered persistent connection. */
792 invalidate_persistent (void)
794 DEBUGP (("Disabling further reuse of socket %d.\n", pconn.socket));
796 fd_close (pconn.socket);
801 /* Register FD, which should be a TCP/IP connection to HOST:PORT, as
802 persistent. This will enable someone to use the same connection
803 later. In the context of HTTP, this must be called only AFTER the
804 response has been received and the server has promised that the
805 connection will remain alive.
807 If a previous connection was persistent, it is closed. */
810 register_persistent (const char *host, int port, int fd, int ssl)
814 if (pconn.socket == fd)
816 /* The connection FD is already registered. */
821 /* The old persistent connection is still active; close it
822 first. This situation arises whenever a persistent
823 connection exists, but we then connect to a different
824 host, and try to register a persistent connection to that
826 invalidate_persistent ();
832 pconn.host = xstrdup (host);
836 DEBUGP (("Registered socket %d for persistent reuse.\n", fd));
839 /* Return non-zero if a persistent connection is available for
840 connecting to HOST:PORT. */
843 persistent_available_p (const char *host, int port, int ssl,
844 int *host_lookup_failed)
846 /* First, check whether a persistent connection is active at all. */
850 /* If we want SSL and the last connection wasn't or vice versa,
851 don't use it. Checking for host and port is not enough because
852 HTTP and HTTPS can apparently coexist on the same port. */
853 if (ssl != pconn.ssl)
856 /* If we're not connecting to the same port, we're not interested. */
857 if (port != pconn.port)
860 /* If the host is the same, we're in business. If not, there is
861 still hope -- read below. */
862 if (0 != strcasecmp (host, pconn.host))
864 /* If pconn.socket is already talking to HOST, we needn't
865 reconnect. This happens often when both sites are virtual
866 hosts distinguished only by name and served by the same
867 network interface, and hence the same web server (possibly
868 set up by the ISP and serving many different web sites).
869 This admittedly non-standard optimization does not contradict
870 HTTP and works well with popular server software. */
874 struct address_list *al;
877 /* Don't try to talk to two different SSL sites over the same
878 secure connection! (Besides, it's not clear if name-based
879 virtual hosting is even possible with SSL.) */
882 /* If pconn.socket's peer is one of the IP addresses HOST
883 resolves to, pconn.socket is for all intents and purposes
884 already talking to HOST. */
886 if (!socket_ip_address (pconn.socket, &ip, ENDPOINT_PEER))
888 /* Can't get the peer's address -- something must be very
889 wrong with the connection. */
890 invalidate_persistent ();
893 al = lookup_host (host, 0);
896 *host_lookup_failed = 1;
900 found = address_list_contains (al, &ip);
901 address_list_release (al);
906 /* The persistent connection's peer address was found among the
907 addresses HOST resolved to; therefore, pconn.sock is in fact
908 already talking to HOST -- no need to reconnect. */
911 /* Finally, check whether the connection is still open. This is
912 important because most servers implement a liberal (short) timeout
913 on persistent connections. Wget can of course always reconnect
914 if the connection doesn't work out, but it's nicer to know in
915 advance. This test is a logical followup of the first test, but
916 is "expensive" and therefore placed at the end of the list. */
918 if (!test_socket_open (pconn.socket))
920 /* Oops, the socket is no longer open. Now that we know that,
921 let's invalidate the persistent connection before returning
923 invalidate_persistent ();
930 /* The idea behind these two CLOSE macros is to distinguish between
931 two cases: one when the job we've been doing is finished, and we
932 want to close the connection and leave, and two when something is
933 seriously wrong and we're closing the connection as part of
936 In case of keep_alive, CLOSE_FINISH should leave the connection
937 open, while CLOSE_INVALIDATE should still close it.
939 Note that the semantics of the flag `keep_alive' is "this
940 connection *will* be reused (the server has promised not to close
941 the connection once we're done)", while the semantics of
942 `pconn_active && (fd) == pconn.socket' is "we're *now* using an
943 active, registered connection". */
945 #define CLOSE_FINISH(fd) do { \
948 if (pconn_active && (fd) == pconn.socket) \
949 invalidate_persistent (); \
958 #define CLOSE_INVALIDATE(fd) do { \
959 if (pconn_active && (fd) == pconn.socket) \
960 invalidate_persistent (); \
968 long len; /* received length */
969 long contlen; /* expected length */
970 long restval; /* the restart value */
971 int res; /* the result of last read */
972 char *newloc; /* new location (redirection) */
973 char *remote_time; /* remote time-stamp string */
974 char *error; /* textual HTTP error */
975 int statcode; /* status code */
976 double dltime; /* time of the download in msecs */
977 int no_truncate; /* whether truncating the file is
979 const char *referer; /* value of the referer header. */
980 char **local_file; /* local file. */
984 free_hstat (struct http_stat *hs)
986 xfree_null (hs->newloc);
987 xfree_null (hs->remote_time);
988 xfree_null (hs->error);
990 /* Guard against being called twice. */
992 hs->remote_time = NULL;
996 static char *create_authorization_line PARAMS ((const char *, const char *,
997 const char *, const char *,
999 static char *basic_authentication_encode PARAMS ((const char *, const char *));
1000 static int known_authentication_scheme_p PARAMS ((const char *));
1002 time_t http_atotm PARAMS ((const char *));
1004 #define BEGINS_WITH(line, string_constant) \
1005 (!strncasecmp (line, string_constant, sizeof (string_constant) - 1) \
1006 && (ISSPACE (line[sizeof (string_constant) - 1]) \
1007 || !line[sizeof (string_constant) - 1]))
1009 /* Retrieve a document through HTTP protocol. It recognizes status
1010 code, and correctly handles redirections. It closes the network
1011 socket. If it receives an error from the functions below it, it
1012 will print it if there is enough information to do so (almost
1013 always), returning the error to the caller (i.e. http_loop).
1015 Various HTTP parameters are stored to hs.
1017 If PROXY is non-NULL, the connection will be made to the proxy
1018 server, and u->url will be requested. */
1020 gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy)
1022 struct request *req;
1025 char *user, *passwd;
1029 long contlen, contrange;
1035 /* Whether authorization has been already tried. */
1036 int auth_tried_already = 0;
1038 /* Whether our connection to the remote host is through SSL. */
1042 struct response *resp;
1046 /* Whether this connection will be kept alive after the HTTP request
1050 /* Whether keep-alive should be inhibited. */
1051 int inhibit_keep_alive = !opt.http_keep_alive;
1053 /* Headers sent when using POST. */
1054 long post_data_size = 0;
1056 int host_lookup_failed = 0;
1059 if (u->scheme == SCHEME_HTTPS)
1061 /* Initialize the SSL context. After this has once been done,
1062 it becomes a no-op. */
1063 switch (ssl_init ())
1065 case SSLERRCTXCREATE:
1067 logprintf (LOG_NOTQUIET, _("Failed to set up an SSL context\n"));
1068 return SSLERRCTXCREATE;
1069 case SSLERRCERTFILE:
1070 /* try without certfile */
1071 logprintf (LOG_NOTQUIET,
1072 _("Failed to load certificates from %s\n"),
1074 logprintf (LOG_NOTQUIET,
1075 _("Trying without the specified certificate\n"));
1078 logprintf (LOG_NOTQUIET,
1079 _("Failed to get certificate key from %s\n"),
1081 logprintf (LOG_NOTQUIET,
1082 _("Trying without the specified certificate\n"));
1088 #endif /* HAVE_SSL */
1090 if (!(*dt & HEAD_ONLY))
1091 /* If we're doing a GET on the URL, as opposed to just a HEAD, we need to
1092 know the local filename so we can save to it. */
1093 assert (*hs->local_file != NULL);
1095 auth_tried_already = 0;
1097 /* Initialize certain elements of struct http_stat. */
1102 hs->remote_time = NULL;
1110 char *proxy_user, *proxy_passwd;
1111 /* For normal username and password, URL components override
1112 command-line/wgetrc parameters. With proxy
1113 authentication, it's the reverse, because proxy URLs are
1114 normally the "permanent" ones, so command-line args
1115 should take precedence. */
1116 if (opt.proxy_user && opt.proxy_passwd)
1118 proxy_user = opt.proxy_user;
1119 proxy_passwd = opt.proxy_passwd;
1123 proxy_user = proxy->user;
1124 proxy_passwd = proxy->passwd;
1126 /* #### This does not appear right. Can't the proxy request,
1127 say, `Digest' authentication? */
1128 if (proxy_user && proxy_passwd)
1129 proxyauth = basic_authentication_encode (proxy_user, proxy_passwd);
1131 /* If we're using a proxy, we will be connecting to the proxy
1136 /* Prepare the request to send. */
1138 req = request_new ();
1140 const char *meth = "GET";
1141 if (*dt & HEAD_ONLY)
1143 else if (opt.post_file_name || opt.post_data)
1145 /* Use the full path, i.e. one that includes the leading slash and
1146 the query string. E.g. if u->path is "foo/bar" and u->query is
1147 "param=value", full_path will be "/foo/bar?param=value". */
1148 request_set_method (req, meth,
1149 proxy ? xstrdup (u->url) : url_full_path (u));
1152 request_set_header (req, "Referer", (char *) hs->referer, rel_none);
1153 if (*dt & SEND_NOCACHE)
1154 request_set_header (req, "Pragma", "no-cache", rel_none);
1156 request_set_header (req, "Range",
1157 aprintf ("bytes=%ld-", hs->restval), rel_value);
1159 request_set_header (req, "User-Agent", opt.useragent, rel_none);
1161 request_set_header (req, "User-Agent",
1162 aprintf ("Wget/%s", version_string), rel_value);
1163 request_set_header (req, "Accept", "*/*", rel_none);
1165 /* Find the username and password for authentication. */
1168 search_netrc (u->host, (const char **)&user, (const char **)&passwd, 0);
1169 user = user ? user : opt.http_user;
1170 passwd = passwd ? passwd : opt.http_passwd;
1174 /* We have the username and the password, but haven't tried
1175 any authorization yet. Let's see if the "Basic" method
1176 works. If not, we'll come back here and construct a
1177 proper authorization method with the right challenges.
1179 If we didn't employ this kind of logic, every URL that
1180 requires authorization would have to be processed twice,
1181 which is very suboptimal and generates a bunch of false
1182 "unauthorized" errors in the server log.
1184 #### But this logic also has a serious problem when used
1185 with stronger authentications: we *first* transmit the
1186 username and the password in clear text, and *then* attempt a
1187 stronger authentication scheme. That cannot be right! We
1188 are only fortunate that almost everyone still uses the
1189 `Basic' scheme anyway.
1191 There should be an option to prevent this from happening, for
1192 those who use strong authentication schemes and value their
1194 request_set_header (req, "Authorization",
1195 basic_authentication_encode (user, passwd),
1200 /* Whether we need to print the host header with braces around
1201 host, e.g. "Host: [3ffe:8100:200:2::2]:1234" instead of the
1202 usual "Host: symbolic-name:1234". */
1203 int squares = strchr (u->host, ':') != NULL;
1204 if (u->port == scheme_default_port (u->scheme))
1205 request_set_header (req, "Host",
1206 aprintf (squares ? "[%s]" : "%s", u->host),
1209 request_set_header (req, "Host",
1210 aprintf (squares ? "[%s]:%d" : "%s:%d",
1215 if (!inhibit_keep_alive)
1216 request_set_header (req, "Connection", "Keep-Alive", rel_none);
1219 request_set_header (req, "Cookie",
1220 cookie_header (wget_cookie_jar,
1221 u->host, u->port, u->path,
1223 u->scheme == SCHEME_HTTPS
1230 if (opt.post_data || opt.post_file_name)
1232 request_set_header (req, "Content-Type",
1233 "application/x-www-form-urlencoded", rel_none);
1235 post_data_size = strlen (opt.post_data);
1238 post_data_size = file_size (opt.post_file_name);
1239 if (post_data_size == -1)
1241 logprintf (LOG_NOTQUIET, "POST data file missing: %s\n",
1242 opt.post_file_name);
1246 request_set_header (req, "Content-Length",
1247 aprintf ("Content-Length: %ld", post_data_size),
1251 /* Add the user headers. */
1252 if (opt.user_headers)
1255 for (i = 0; opt.user_headers[i]; i++)
1256 request_set_user_header (req, opt.user_headers[i]);
1260 /* We need to come back here when the initial attempt to retrieve
1261 without authorization header fails. (Expected to happen at least
1262 for the Digest authorization scheme.) */
1266 /* Establish the connection. */
1268 if (!inhibit_keep_alive)
1270 /* Look for a persistent connection to target host, unless a
1271 proxy is used. The exception is when SSL is in use, in which
1272 case the proxy is nothing but a passthrough to the target
1273 host, registered as a connection to the latter. */
1274 struct url *relevant = conn;
1276 if (u->scheme == SCHEME_HTTPS)
1280 if (persistent_available_p (relevant->host, relevant->port,
1282 relevant->scheme == SCHEME_HTTPS,
1286 &host_lookup_failed))
1288 sock = pconn.socket;
1289 using_ssl = pconn.ssl;
1290 logprintf (LOG_VERBOSE, _("Reusing existing connection to %s:%d.\n"),
1291 pconn.host, pconn.port);
1292 DEBUGP (("Reusing fd %d.\n", sock));
1298 /* In its current implementation, persistent_available_p will
1299 look up conn->host in some cases. If that lookup failed, we
1300 don't need to bother with connect_to_host. */
1301 if (host_lookup_failed)
1304 sock = connect_to_host (conn->host, conn->port);
1308 return (retryable_socket_connect_error (errno)
1309 ? CONERROR : CONIMPOSSIBLE);
1312 if (proxy && u->scheme == SCHEME_HTTPS)
1314 /* When requesting SSL URLs through proxies, use the
1315 CONNECT method to request passthrough. */
1316 struct request *connreq = request_new ();
1317 request_set_method (connreq, "CONNECT",
1318 aprintf ("%s:%d", u->host, u->port));
1321 request_set_header (connreq, "Proxy-Authorization",
1322 proxyauth, rel_value);
1323 /* Now that PROXYAUTH is part of the CONNECT request,
1324 zero it out so we don't send proxy authorization with
1325 the regular request below. */
1329 write_error = request_send (connreq, sock);
1330 request_free (connreq);
1331 if (write_error < 0)
1333 logprintf (LOG_VERBOSE, _("Failed writing to proxy: %s.\n"),
1335 CLOSE_INVALIDATE (sock);
1339 head = fd_read_http_head (sock);
1342 logprintf (LOG_VERBOSE, _("Failed reading proxy response: %s\n"),
1344 CLOSE_INVALIDATE (sock);
1353 DEBUGP (("proxy responded with: [%s]\n", head));
1355 resp = response_new (head);
1356 statcode = response_status (resp, &message);
1357 response_free (resp);
1358 if (statcode != 200)
1361 logprintf (LOG_NOTQUIET, _("Proxy tunneling failed: %s"),
1362 message ? message : "?");
1363 xfree_null (message);
1368 /* SOCK is now *really* connected to u->host, so update CONN
1369 to reflect this. That way register_persistent will
1370 register SOCK as being connected to u->host:u->port. */
1374 if (conn->scheme == SCHEME_HTTPS)
1376 if (!ssl_connect (sock))
1383 #endif /* HAVE_SSL */
1386 /* Send the request to server. */
1387 write_error = request_send (req, sock);
1389 if (write_error >= 0)
1393 DEBUGP (("[POST data: %s]\n", opt.post_data));
1394 write_error = fd_write (sock, opt.post_data, post_data_size, -1);
1396 else if (opt.post_file_name && post_data_size != 0)
1397 write_error = post_file (sock, opt.post_file_name, post_data_size);
1400 if (write_error < 0)
1402 logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"),
1404 CLOSE_INVALIDATE (sock);
1408 logprintf (LOG_VERBOSE, _("%s request sent, awaiting response... "),
1409 proxy ? "Proxy" : "HTTP");
1410 contlen = contrange = -1;
1415 head = fd_read_http_head (sock);
1420 logputs (LOG_NOTQUIET, _("No data received.\n"));
1421 CLOSE_INVALIDATE (sock);
1427 logprintf (LOG_NOTQUIET, _("Read error (%s) in headers.\n"),
1429 CLOSE_INVALIDATE (sock);
1434 DEBUGP (("\n---response begin---\n%s---response end---\n", head));
1436 resp = response_new (head);
1438 /* Check for status line. */
1440 statcode = response_status (resp, &message);
1441 if (!opt.server_response)
1442 logprintf (LOG_VERBOSE, "%2d %s\n", statcode, message ? message : "");
1445 logprintf (LOG_VERBOSE, "\n");
1446 print_server_response (resp, " ");
1449 if (response_header_copy (resp, "Content-Length", hdrval, sizeof (hdrval)))
1450 contlen = strtol (hdrval, NULL, 10);
1452 /* Check for keep-alive related responses. */
1453 if (!inhibit_keep_alive && contlen != -1)
1455 if (response_header_copy (resp, "Keep-Alive", NULL, 0))
1457 else if (response_header_copy (resp, "Connection", hdrval,
1460 if (0 == strcasecmp (hdrval, "Keep-Alive"))
1465 /* The server has promised that it will not close the connection
1466 when we're done. This means that we can register it. */
1467 register_persistent (conn->host, conn->port, sock, using_ssl);
1469 if (statcode == HTTP_STATUS_UNAUTHORIZED)
1471 /* Authorization is required. */
1472 skip_body (sock, contlen);
1473 CLOSE_FINISH (sock);
1474 if (auth_tried_already || !(user && passwd))
1476 /* If we have tried it already, then there is no point
1478 logputs (LOG_NOTQUIET, _("Authorization failed.\n"));
1482 char *www_authenticate = response_header_strdup (resp,
1483 "WWW-Authenticate");
1484 /* If the authentication scheme is unknown or if it's the
1485 "Basic" authentication (which we try by default), there's
1486 no sense in retrying. */
1487 if (!www_authenticate
1488 || !known_authentication_scheme_p (www_authenticate)
1489 || BEGINS_WITH (www_authenticate, "Basic"))
1491 xfree_null (www_authenticate);
1492 logputs (LOG_NOTQUIET, _("Unknown authentication scheme.\n"));
1497 auth_tried_already = 1;
1498 pth = url_full_path (u);
1499 request_set_header (req, "Authorization",
1500 create_authorization_line (www_authenticate,
1502 request_method (req),
1506 xfree (www_authenticate);
1507 goto retry_with_auth;
1515 hs->statcode = statcode;
1517 hs->error = xstrdup (_("Malformed status line"));
1519 hs->error = xstrdup (_("(no description)"));
1521 hs->error = xstrdup (message);
1523 type = response_header_strdup (resp, "Content-Type");
1526 char *tmp = strchr (type, ';');
1529 while (tmp > type && ISSPACE (tmp[-1]))
1534 hs->newloc = response_header_strdup (resp, "Location");
1535 hs->remote_time = response_header_strdup (resp, "Last-Modified");
1537 char *set_cookie = response_header_strdup (resp, "Set-Cookie");
1540 /* The jar should have been created by now. */
1541 assert (wget_cookie_jar != NULL);
1542 cookie_handle_set_cookie (wget_cookie_jar, u->host, u->port, u->path,
1547 if (response_header_copy (resp, "Content-Range", hdrval, sizeof (hdrval)))
1549 long first_byte_pos, last_byte_pos, entity_length;
1550 if (parse_content_range (hdrval, &first_byte_pos, &last_byte_pos,
1552 contrange = first_byte_pos;
1554 response_free (resp);
1556 /* 20x responses are counted among successful by default. */
1557 if (H_20X (statcode))
1560 /* Return if redirected. */
1561 if (H_REDIRECTED (statcode) || statcode == HTTP_STATUS_MULTIPLE_CHOICES)
1563 /* RFC2068 says that in case of the 300 (multiple choices)
1564 response, the server can output a preferred URL through
1565 `Location' header; otherwise, the request should be treated
1566 like GET. So, if the location is set, it will be a
1567 redirection; otherwise, just proceed normally. */
1568 if (statcode == HTTP_STATUS_MULTIPLE_CHOICES && !hs->newloc)
1572 logprintf (LOG_VERBOSE,
1573 _("Location: %s%s\n"),
1574 hs->newloc ? hs->newloc : _("unspecified"),
1575 hs->newloc ? _(" [following]") : "");
1577 skip_body (sock, contlen);
1578 CLOSE_FINISH (sock);
1584 /* If content-type is not given, assume text/html. This is because
1585 of the multitude of broken CGI's that "forget" to generate the
1588 0 == strncasecmp (type, TEXTHTML_S, strlen (TEXTHTML_S)) ||
1589 0 == strncasecmp (type, TEXTXHTML_S, strlen (TEXTXHTML_S)))
1594 if (opt.html_extension && (*dt & TEXTHTML))
1595 /* -E / --html-extension / html_extension = on was specified, and this is a
1596 text/html file. If some case-insensitive variation on ".htm[l]" isn't
1597 already the file's suffix, tack on ".html". */
1599 char* last_period_in_local_filename = strrchr(*hs->local_file, '.');
1601 if (last_period_in_local_filename == NULL
1602 || !(0 == strcasecmp (last_period_in_local_filename, ".htm")
1603 || 0 == strcasecmp (last_period_in_local_filename, ".html")))
1605 size_t local_filename_len = strlen(*hs->local_file);
1607 *hs->local_file = xrealloc(*hs->local_file,
1608 local_filename_len + sizeof(".html"));
1609 strcpy(*hs->local_file + local_filename_len, ".html");
1611 *dt |= ADDED_HTML_EXTENSION;
1615 if (contrange == -1)
1617 /* We did not get a content-range header. This means that the
1618 server did not honor our `Range' request. Normally, this
1619 means we should reset hs->restval and continue normally. */
1621 /* However, if `-c' is used, we need to be a bit more careful:
1623 1. If `-c' is specified and the file already existed when
1624 Wget was started, it would be a bad idea for us to start
1625 downloading it from scratch, effectively truncating it. I
1626 believe this cannot happen unless `-c' was specified.
1628 2. If `-c' is used on a file that is already fully
1629 downloaded, we're requesting bytes after the end of file,
1630 which can result in server not honoring `Range'. If this is
1631 the case, `Content-Length' will be equal to the length of the
1633 if (opt.always_rest)
1635 /* Check for condition #2. */
1636 if (hs->restval > 0 /* restart was requested. */
1637 && contlen != -1 /* we got content-length. */
1638 && hs->restval >= contlen /* file fully downloaded
1642 logputs (LOG_VERBOSE, _("\
1643 \n The file is already fully retrieved; nothing to do.\n\n"));
1644 /* In case the caller inspects. */
1647 /* Mark as successfully retrieved. */
1650 CLOSE_INVALIDATE (sock); /* would be CLOSE_FINISH, but there
1651 might be more bytes in the body. */
1652 return RETRUNNEEDED;
1655 /* Check for condition #1. */
1656 if (hs->no_truncate)
1658 logprintf (LOG_NOTQUIET,
1661 Continued download failed on this file, which conflicts with `-c'.\n\
1662 Refusing to truncate existing file `%s'.\n\n"), *hs->local_file);
1664 CLOSE_INVALIDATE (sock); /* see above */
1665 return CONTNOTSUPPORTED;
1673 else if (contrange != hs->restval ||
1674 (H_PARTIAL (statcode) && contrange == -1))
1676 /* This means the whole request was somehow misunderstood by the
1677 server. Bail out. */
1679 CLOSE_INVALIDATE (sock);
1686 contlen += contrange;
1688 contrange = -1; /* If content-length was not sent,
1689 content-range will be ignored. */
1691 hs->contlen = contlen;
1697 /* No need to print this output if the body won't be
1698 downloaded at all, or if the original server response is
1700 logputs (LOG_VERBOSE, _("Length: "));
1703 logputs (LOG_VERBOSE, legible (contlen));
1704 if (contrange != -1)
1705 logprintf (LOG_VERBOSE, _(" (%s to go)"),
1706 legible (contlen - contrange));
1709 logputs (LOG_VERBOSE,
1710 opt.ignore_length ? _("ignored") : _("unspecified"));
1712 logprintf (LOG_VERBOSE, " [%s]\n", type);
1714 logputs (LOG_VERBOSE, "\n");
1718 type = NULL; /* We don't need it any more. */
1720 /* Return if we have no intention of further downloading. */
1721 if (!(*dt & RETROKF) || (*dt & HEAD_ONLY))
1723 /* In case the caller cares to look... */
1727 /* Pre-1.10 Wget used CLOSE_INVALIDATE here. Now we trust the
1728 servers not to send body in response to a HEAD request. If
1729 you encounter such a server (more likely a broken CGI), use
1730 `--no-http-keep-alive'. */
1731 CLOSE_FINISH (sock);
1732 return RETRFINISHED;
1735 /* Open the local file. */
1738 mkalldirs (*hs->local_file);
1740 rotate_backups (*hs->local_file);
1741 fp = fopen (*hs->local_file, hs->restval ? "ab" : "wb");
1744 logprintf (LOG_NOTQUIET, "%s: %s\n", *hs->local_file, strerror (errno));
1745 CLOSE_INVALIDATE (sock); /* would be CLOSE_FINISH, but there
1746 might be more bytes in the body. */
1752 extern int global_download_count;
1754 /* To ensure that repeated "from scratch" downloads work for -O
1755 files, we rewind the file pointer, unless restval is
1756 non-zero. (This works only when -O is used on regular files,
1757 but it's still a valuable feature.)
1759 However, this loses when more than one URL is specified on
1760 the command line: the second rewind eradicates the contents
1761 of the first download. Thus we disable the above trick for
1762 all the downloads except the very first one.
1764 #### A possible solution to this would be to remember the
1765 file position in the output document and to seek to that
1766 position, instead of rewinding.
1768 We don't truncate stdout, since that breaks
1769 "wget -O - [...] >> foo".
1771 if (!hs->restval && global_download_count == 0 && opt.dfp != stdout)
1773 /* This will silently fail for streams that don't correspond
1774 to regular files, but that's OK. */
1776 /* ftruncate is needed because opt.dfp is opened in append
1777 mode if opt.always_rest is set. */
1778 ftruncate (fileno (fp), 0);
1783 /* #### This confuses the code that checks for file size. There
1784 should be some overhead information. */
1785 if (opt.save_headers)
1786 fwrite (head, 1, strlen (head), fp);
1788 /* Get the contents of the document. */
1789 hs->res = fd_read_body (sock, fp, &hs->len, hs->restval,
1790 (contlen != -1 ? contlen : 0),
1791 keep_alive, &hs->dltime);
1794 CLOSE_FINISH (sock);
1796 CLOSE_INVALIDATE (sock);
1799 /* Close or flush the file. We have to be careful to check for
1800 error here. Checking the result of fwrite() is not enough --
1801 errors could go unnoticed! */
1804 flush_res = fclose (fp);
1806 flush_res = fflush (fp);
1807 if (flush_res == EOF)
1812 return RETRFINISHED;
1815 /* The genuine HTTP loop! This is the part where the retrieval is
1816 retried, and retried, and retried, and... */
1818 http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
1819 int *dt, struct url *proxy)
1822 int use_ts, got_head = 0; /* time-stamping info */
1823 char *filename_plus_orig_suffix;
1824 char *local_filename = NULL;
1825 char *tms, *locf, *tmrate;
1827 time_t tml = -1, tmr = -1; /* local and remote time-stamps */
1828 long local_size = 0; /* the size of the local file */
1829 size_t filename_len;
1830 struct http_stat hstat; /* HTTP status */
1834 /* This used to be done in main(), but it's a better idea to do it
1835 here so that we don't go through the hoops if we're just using
1839 if (!wget_cookie_jar)
1840 wget_cookie_jar = cookie_jar_new ();
1841 if (opt.cookies_input && !cookies_loaded_p)
1843 cookie_jar_load (wget_cookie_jar, opt.cookies_input);
1844 cookies_loaded_p = 1;
1850 /* Warn on (likely bogus) wildcard usage in HTTP. Don't use
1851 has_wildcards_p because it would also warn on `?', and we know that
1852 shows up in CGI paths a *lot*. */
1853 if (strchr (u->url, '*'))
1854 logputs (LOG_VERBOSE, _("Warning: wildcards not supported in HTTP.\n"));
1856 /* Determine the local filename. */
1857 if (local_file && *local_file)
1858 hstat.local_file = local_file;
1859 else if (local_file)
1861 *local_file = url_file_name (u);
1862 hstat.local_file = local_file;
1866 dummy = url_file_name (u);
1867 hstat.local_file = &dummy;
1870 if (!opt.output_document)
1871 locf = *hstat.local_file;
1873 locf = opt.output_document;
1875 hstat.referer = referer;
1877 filename_len = strlen (*hstat.local_file);
1878 filename_plus_orig_suffix = alloca (filename_len + sizeof (".orig"));
1880 if (opt.noclobber && file_exists_p (*hstat.local_file))
1882 /* If opt.noclobber is turned on and file already exists, do not
1883 retrieve the file */
1884 logprintf (LOG_VERBOSE, _("\
1885 File `%s' already there, will not retrieve.\n"), *hstat.local_file);
1886 /* If the file is there, we suppose it's retrieved OK. */
1889 /* #### Bogusness alert. */
1890 /* If its suffix is "html" or "htm" or similar, assume text/html. */
1891 if (has_html_suffix_p (*hstat.local_file))
1899 if (opt.timestamping)
1901 int local_dot_orig_file_exists = 0;
1903 if (opt.backup_converted)
1904 /* If -K is specified, we'll act on the assumption that it was specified
1905 last time these files were downloaded as well, and instead of just
1906 comparing local file X against server file X, we'll compare local
1907 file X.orig (if extant, else X) against server file X. If -K
1908 _wasn't_ specified last time, or the server contains files called
1909 *.orig, -N will be back to not operating correctly with -k. */
1911 /* Would a single s[n]printf() call be faster? --dan
1913 Definitely not. sprintf() is horribly slow. It's a
1914 different question whether the difference between the two
1915 affects a program. Usually I'd say "no", but at one
1916 point I profiled Wget, and found that a measurable and
1917 non-negligible amount of time was lost calling sprintf()
1918 in url.c. Replacing sprintf with inline calls to
1919 strcpy() and long_to_string() made a difference.
1921 memcpy (filename_plus_orig_suffix, *hstat.local_file, filename_len);
1922 memcpy (filename_plus_orig_suffix + filename_len,
1923 ".orig", sizeof (".orig"));
1925 /* Try to stat() the .orig file. */
1926 if (stat (filename_plus_orig_suffix, &st) == 0)
1928 local_dot_orig_file_exists = 1;
1929 local_filename = filename_plus_orig_suffix;
1933 if (!local_dot_orig_file_exists)
1934 /* Couldn't stat() <file>.orig, so try to stat() <file>. */
1935 if (stat (*hstat.local_file, &st) == 0)
1936 local_filename = *hstat.local_file;
1938 if (local_filename != NULL)
1939 /* There was a local file, so we'll check later to see if the version
1940 the server has is the same version we already have, allowing us to
1946 /* Modification time granularity is 2 seconds for Windows, so
1947 increase local time by 1 second for later comparison. */
1950 local_size = st.st_size;
1954 /* Reset the counter. */
1956 *dt = 0 | ACCEPTRANGES;
1960 /* Increment the pass counter. */
1962 sleep_between_retrievals (count);
1963 /* Get the current time string. */
1964 tms = time_str (NULL);
1965 /* Print fetch message, if opt.verbose. */
1968 char *hurl = url_string (u, 1);
1972 sprintf (tmp, _("(try:%2d)"), count);
1973 logprintf (LOG_VERBOSE, "--%s-- %s\n %s => `%s'\n",
1974 tms, hurl, tmp, locf);
1976 ws_changetitle (hurl, 1);
1981 /* Default document type is empty. However, if spider mode is
1982 on or time-stamping is employed, the HEAD_ONLY command is
1983 encoded within *dt. */
1984 if (opt.spider || (use_ts && !got_head))
1988 /* Assume no restarting. */
1990 /* Decide whether or not to restart. */
1991 if (((count > 1 && (*dt & ACCEPTRANGES)) || opt.always_rest)
1992 /* #### this calls access() and then stat(); could be optimized. */
1993 && file_exists_p (locf))
1994 if (stat (locf, &st) == 0 && S_ISREG (st.st_mode))
1995 hstat.restval = st.st_size;
1997 /* If `-c' is used and the file exists and is non-empty,
1998 refuse to truncate it if the server doesn't support continued
2000 hstat.no_truncate = 0;
2001 if (opt.always_rest && hstat.restval)
2002 hstat.no_truncate = 1;
2004 /* Decide whether to send the no-cache directive. We send it in
2006 a) we're using a proxy, and we're past our first retrieval.
2007 Some proxies are notorious for caching incomplete data, so
2008 we require a fresh get.
2009 b) caching is explicitly inhibited. */
2010 if ((proxy && count > 1) /* a */
2011 || !opt.allow_cache /* b */
2013 *dt |= SEND_NOCACHE;
2015 *dt &= ~SEND_NOCACHE;
2017 /* Try fetching the document, or at least its head. */
2018 err = gethttp (u, &hstat, dt, proxy);
2020 /* It's unfortunate that wget determines the local filename before finding
2021 out the Content-Type of the file. Barring a major restructuring of the
2022 code, we need to re-set locf here, since gethttp() may have xrealloc()d
2023 *hstat.local_file to tack on ".html". */
2024 if (!opt.output_document)
2025 locf = *hstat.local_file;
2027 locf = opt.output_document;
2030 tms = time_str (NULL);
2031 /* Get the new location (with or without the redirection). */
2033 *newloc = xstrdup (hstat.newloc);
2036 case HERR: case HEOF: case CONSOCKERR: case CONCLOSED:
2037 case CONERROR: case READERR: case WRITEFAILED:
2039 /* Non-fatal errors continue executing the loop, which will
2040 bring them to "while" statement at the end, to judge
2041 whether the number of tries was exceeded. */
2042 free_hstat (&hstat);
2043 printwhat (count, opt.ntry);
2046 case HOSTERR: case CONIMPOSSIBLE: case PROXERR: case AUTHFAILED:
2047 case SSLERRCTXCREATE: case CONTNOTSUPPORTED:
2048 /* Fatal errors just return from the function. */
2049 free_hstat (&hstat);
2053 case FWRITEERR: case FOPENERR:
2054 /* Another fatal error. */
2055 logputs (LOG_VERBOSE, "\n");
2056 logprintf (LOG_NOTQUIET, _("Cannot write to `%s' (%s).\n"),
2057 *hstat.local_file, strerror (errno));
2058 free_hstat (&hstat);
2063 /* Another fatal error. */
2064 logputs (LOG_VERBOSE, "\n");
2065 logprintf (LOG_NOTQUIET, _("Unable to establish SSL connection.\n"));
2066 free_hstat (&hstat);
2071 /* Return the new location to the caller. */
2074 logprintf (LOG_NOTQUIET,
2075 _("ERROR: Redirection (%d) without location.\n"),
2077 free_hstat (&hstat);
2081 free_hstat (&hstat);
2086 /* The file was already fully retrieved. */
2087 free_hstat (&hstat);
2092 /* Deal with you later. */
2095 /* All possibilities should have been exhausted. */
2098 if (!(*dt & RETROKF))
2102 /* #### Ugly ugly ugly! */
2103 char *hurl = url_string (u, 1);
2104 logprintf (LOG_NONVERBOSE, "%s:\n", hurl);
2107 logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
2108 tms, hstat.statcode, hstat.error);
2109 logputs (LOG_VERBOSE, "\n");
2110 free_hstat (&hstat);
2115 /* Did we get the time-stamp? */
2118 if (opt.timestamping && !hstat.remote_time)
2120 logputs (LOG_NOTQUIET, _("\
2121 Last-modified header missing -- time-stamps turned off.\n"));
2123 else if (hstat.remote_time)
2125 /* Convert the date-string into struct tm. */
2126 tmr = http_atotm (hstat.remote_time);
2127 if (tmr == (time_t) (-1))
2128 logputs (LOG_VERBOSE, _("\
2129 Last-modified header invalid -- time-stamp ignored.\n"));
2133 /* The time-stamping section. */
2138 use_ts = 0; /* no more time-stamping */
2139 count = 0; /* the retrieve count for HEAD is
2141 if (hstat.remote_time && tmr != (time_t) (-1))
2143 /* Now time-stamping can be used validly. Time-stamping
2144 means that if the sizes of the local and remote file
2145 match, and local file is newer than the remote file,
2146 it will not be retrieved. Otherwise, the normal
2147 download procedure is resumed. */
2149 (hstat.contlen == -1 || local_size == hstat.contlen))
2151 logprintf (LOG_VERBOSE, _("\
2152 Server file no newer than local file `%s' -- not retrieving.\n\n"),
2154 free_hstat (&hstat);
2158 else if (tml >= tmr)
2159 logprintf (LOG_VERBOSE, _("\
2160 The sizes do not match (local %ld) -- retrieving.\n"), local_size);
2162 logputs (LOG_VERBOSE,
2163 _("Remote file is newer, retrieving.\n"));
2165 free_hstat (&hstat);
2168 if ((tmr != (time_t) (-1))
2170 && ((hstat.len == hstat.contlen) ||
2171 ((hstat.res == 0) &&
2172 ((hstat.contlen == -1) ||
2173 (hstat.len >= hstat.contlen && !opt.kill_longer)))))
2175 /* #### This code repeats in http.c and ftp.c. Move it to a
2177 const char *fl = NULL;
2178 if (opt.output_document)
2180 if (opt.od_known_regular)
2181 fl = opt.output_document;
2184 fl = *hstat.local_file;
2188 /* End of time-stamping section. */
2192 logprintf (LOG_NOTQUIET, "%d %s\n\n", hstat.statcode, hstat.error);
2197 tmrate = retr_rate (hstat.len - hstat.restval, hstat.dltime, 0);
2199 if (hstat.len == hstat.contlen)
2203 logprintf (LOG_VERBOSE,
2204 _("%s (%s) - `%s' saved [%ld/%ld]\n\n"),
2205 tms, tmrate, locf, hstat.len, hstat.contlen);
2206 logprintf (LOG_NONVERBOSE,
2207 "%s URL:%s [%ld/%ld] -> \"%s\" [%d]\n",
2208 tms, u->url, hstat.len, hstat.contlen, locf, count);
2211 total_downloaded_bytes += hstat.len;
2213 /* Remember that we downloaded the file for later ".orig" code. */
2214 if (*dt & ADDED_HTML_EXTENSION)
2215 downloaded_file(FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED, locf);
2217 downloaded_file(FILE_DOWNLOADED_NORMALLY, locf);
2219 free_hstat (&hstat);
2223 else if (hstat.res == 0) /* No read error */
2225 if (hstat.contlen == -1) /* We don't know how much we were supposed
2226 to get, so assume we succeeded. */
2230 logprintf (LOG_VERBOSE,
2231 _("%s (%s) - `%s' saved [%ld]\n\n"),
2232 tms, tmrate, locf, hstat.len);
2233 logprintf (LOG_NONVERBOSE,
2234 "%s URL:%s [%ld] -> \"%s\" [%d]\n",
2235 tms, u->url, hstat.len, locf, count);
2238 total_downloaded_bytes += hstat.len;
2240 /* Remember that we downloaded the file for later ".orig" code. */
2241 if (*dt & ADDED_HTML_EXTENSION)
2242 downloaded_file(FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED, locf);
2244 downloaded_file(FILE_DOWNLOADED_NORMALLY, locf);
2246 free_hstat (&hstat);
2250 else if (hstat.len < hstat.contlen) /* meaning we lost the
2251 connection too soon */
2253 logprintf (LOG_VERBOSE,
2254 _("%s (%s) - Connection closed at byte %ld. "),
2255 tms, tmrate, hstat.len);
2256 printwhat (count, opt.ntry);
2257 free_hstat (&hstat);
2260 else if (!opt.kill_longer) /* meaning we got more than expected */
2262 logprintf (LOG_VERBOSE,
2263 _("%s (%s) - `%s' saved [%ld/%ld])\n\n"),
2264 tms, tmrate, locf, hstat.len, hstat.contlen);
2265 logprintf (LOG_NONVERBOSE,
2266 "%s URL:%s [%ld/%ld] -> \"%s\" [%d]\n",
2267 tms, u->url, hstat.len, hstat.contlen, locf, count);
2269 total_downloaded_bytes += hstat.len;
2271 /* Remember that we downloaded the file for later ".orig" code. */
2272 if (*dt & ADDED_HTML_EXTENSION)
2273 downloaded_file(FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED, locf);
2275 downloaded_file(FILE_DOWNLOADED_NORMALLY, locf);
2277 free_hstat (&hstat);
2281 else /* the same, but not accepted */
2283 logprintf (LOG_VERBOSE,
2284 _("%s (%s) - Connection closed at byte %ld/%ld. "),
2285 tms, tmrate, hstat.len, hstat.contlen);
2286 printwhat (count, opt.ntry);
2287 free_hstat (&hstat);
2291 else /* now hstat.res can only be -1 */
2293 if (hstat.contlen == -1)
2295 logprintf (LOG_VERBOSE,
2296 _("%s (%s) - Read error at byte %ld (%s)."),
2297 tms, tmrate, hstat.len, strerror (errno));
2298 printwhat (count, opt.ntry);
2299 free_hstat (&hstat);
2302 else /* hstat.res == -1 and contlen is given */
2304 logprintf (LOG_VERBOSE,
2305 _("%s (%s) - Read error at byte %ld/%ld (%s). "),
2306 tms, tmrate, hstat.len, hstat.contlen,
2308 printwhat (count, opt.ntry);
2309 free_hstat (&hstat);
2316 while (!opt.ntry || (count < opt.ntry));
2320 /* Converts struct tm to time_t, assuming the data in tm is UTC rather
2321 than local timezone.
2323 mktime is similar but assumes struct tm, also known as the
2324 "broken-down" form of time, is in local time zone. mktime_from_utc
2325 uses mktime to make the conversion understanding that an offset
2326 will be introduced by the local time assumption.
2328 mktime_from_utc then measures the introduced offset by applying
2329 gmtime to the initial result and applying mktime to the resulting
2330 "broken-down" form. The difference between the two mktime results
2331 is the measured offset which is then subtracted from the initial
2332 mktime result to yield a calendar time which is the value returned.
2334 tm_isdst in struct tm is set to 0 to force mktime to introduce a
2335 consistent offset (the non DST offset) since tm and tm+o might be
2336 on opposite sides of a DST change.
2338 Some implementations of mktime return -1 for the nonexistent
2339 localtime hour at the beginning of DST. In this event, use
2340 mktime(tm - 1hr) + 3600.
2344 gmtime(t+o) --> tm+o
2345 mktime(tm+o) --> t+2o
2346 t+o - (t+2o - t+o) = t
2348 Note that glibc contains a function of the same purpose named
2349 `timegm' (reverse of gmtime). But obviously, it is not universally
2350 available, and unfortunately it is not straightforwardly
2351 extractable for use here. Perhaps configure should detect timegm
2352 and use it where available.
2354 Contributed by Roger Beeman <beeman@cisco.com>, with the help of
2355 Mark Baushke <mdb@cisco.com> and the rest of the Gurus at CISCO.
2356 Further improved by Roger with assistance from Edward J. Sabol
2357 based on input by Jamie Zawinski. */
2360 mktime_from_utc (struct tm *t)
2371 return -1; /* can't deal with output from strptime */
2382 return -1; /* can't deal with output from gmtime */
2385 return (tl - (tb - tl));
2388 /* Check whether the result of strptime() indicates success.
2389 strptime() returns the pointer to how far it got to in the string.
2390 The processing has been successful if the string is at `GMT' or
2391 `+X', or at the end of the string.
2393 In extended regexp parlance, the function returns 1 if P matches
2394 "^ *(GMT|[+-][0-9]|$)", 0 otherwise. P being NULL (which strptime
2395 can return) is considered a failure and 0 is returned. */
2397 check_end (const char *p)
2401 while (ISSPACE (*p))
2404 || (p[0] == 'G' && p[1] == 'M' && p[2] == 'T')
2405 || ((p[0] == '+' || p[0] == '-') && ISDIGIT (p[1])))
2411 /* Convert the textual specification of time in TIME_STRING to the
2412 number of seconds since the Epoch.
2414 TIME_STRING can be in any of the three formats RFC2068 allows the
2415 HTTP servers to emit -- RFC1123-date, RFC850-date or asctime-date.
2416 Timezones are ignored, and should be GMT.
2418 Return the computed time_t representation, or -1 if the conversion
2421 This function uses strptime with various string formats for parsing
2422 TIME_STRING. This results in a parser that is not as lenient in
2423 interpreting TIME_STRING as I would like it to be. Being based on
2424 strptime, it always allows shortened months, one-digit days, etc.,
2425 but due to the multitude of formats in which time can be
2426 represented, an ideal HTTP time parser would be even more
2427 forgiving. It should completely ignore things like week days and
2428 concentrate only on the various forms of representing years,
2429 months, days, hours, minutes, and seconds. For example, it would
2430 be nice if it accepted ISO 8601 out of the box.
2432 I've investigated free and PD code for this purpose, but none was
2433 usable. getdate was big and unwieldy, and had potential copyright
2434 issues, or so I was informed. Dr. Marcus Hennecke's atotm(),
2435 distributed with phttpd, is excellent, but we cannot use it because
2436 it is not assigned to the FSF. So I stuck it with strptime. */
time_t
http_atotm (const char *time_string)
{
  /* NOTE: Solaris strptime man page claims that %n and %t match white
     space, but that's not universally available.  Instead, we simply
     use ` ' to mean "skip all WS", which works under all strptime
     implementations I've tested.  */

  static const char *time_formats[] = {
    "%a, %d %b %Y %T",		/* RFC1123: Thu, 29 Jan 1998 22:12:57 */
    "%A, %d-%b-%y %T",		/* RFC850:  Thursday, 29-Jan-98 22:12:57 */
    "%a, %d-%b-%Y %T",		/* pseudo-RFC850:  Thu, 29-Jan-1998 22:12:57
				   (google.com uses this for their cookies.) */
    "%a %b %d %T %Y"		/* asctime: Thu Jan 29 22:12:57 1998 */
  };
  int i;

  /* Note that under foreign locales Solaris strptime() fails to
     recognize English dates, which renders this function useless.  We
     solve this by being careful not to affect LC_TIME when
     initializing locale.

     Another solution would be to temporarily set locale to C, invoke
     strptime(), and restore it back.  This is slow and dirty,
     however, and locale support other than LC_MESSAGES can mess other
     things, so I rather chose to stick with just setting LC_MESSAGES.

     GNU strptime does not have this problem because it recognizes
     both international and local dates.  */

  /* Try each known format in turn; the first one that strptime()
     parses up to an acceptable end (see check_end) wins.  */
  for (i = 0; i < countof (time_formats); i++)
    {
      /* Start every attempt from an all-zero struct tm.  strptime()
	 only stores the fields named by the format, so without this
	 the remaining fields would be indeterminate (or left over from
	 a previous, failed attempt) when handed to mktime_from_utc.
	 Zeroing also sets tm_isdst to 0, which, according to Roger
	 Beeman, strptime won't initialize for us.  */
      struct tm t = { 0 };

      if (check_end (strptime (time_string, time_formats[i], &t)))
	return mktime_from_utc (&t);
    }

  /* All formats have failed.  */
  return -1;
}
2482 /* Authorization support: We support two authorization schemes:
2484 * `Basic' scheme, consisting of base64-ing USER:PASSWORD string;
2486 * `Digest' scheme, added by Junio Hamano <junio@twinsun.com>,
2487 consisting of answering to the server's challenge with the proper
2490 /* How many bytes it will take to store LEN bytes in base64. */
2491 #define BASE64_LENGTH(len) (4 * (((len) + 2) / 3))
2493 /* Encode the string S of length LENGTH to base64 format and place it
2494 to STORE. STORE will be 0-terminated, and must point to a writable
2495 buffer of at least 1+BASE64_LENGTH(length) bytes. */
static void
base64_encode (const char *s, char *store, int length)
{
  /* Conversion table.  */
  static const char tbl[64] = {
    'A','B','C','D','E','F','G','H',
    'I','J','K','L','M','N','O','P',
    'Q','R','S','T','U','V','W','X',
    'Y','Z','a','b','c','d','e','f',
    'g','h','i','j','k','l','m','n',
    'o','p','q','r','s','t','u','v',
    'w','x','y','z','0','1','2','3',
    '4','5','6','7','8','9','+','/'
  };
  /* Read the input as unsigned bytes: with a signed `char', bytes
     >= 0x80 would otherwise yield negative table indices.  */
  const unsigned char *in = (const unsigned char *) s;
  char *p = store;
  int i;

  /* Transform each complete 3x8-bit group to 4x6 bits, as required by
     base64.  Only whole groups are handled here so that no byte at or
     beyond S + LENGTH is ever read.  */
  for (i = 0; i + 2 < length; i += 3, in += 3)
    {
      *p++ = tbl[in[0] >> 2];
      *p++ = tbl[((in[0] & 3) << 4) + (in[1] >> 4)];
      *p++ = tbl[((in[1] & 0xf) << 2) + (in[2] >> 6)];
      *p++ = tbl[in[2] & 0x3f];
    }

  /* Encode the one or two trailing bytes, if any, and pad the final
     quantum with `=' up to four characters.  */
  switch (length - i)
    {
    case 1:
      *p++ = tbl[in[0] >> 2];
      *p++ = tbl[(in[0] & 3) << 4];
      *p++ = '=';
      *p++ = '=';
      break;
    case 2:
      *p++ = tbl[in[0] >> 2];
      *p++ = tbl[((in[0] & 3) << 4) + (in[1] >> 4)];
      *p++ = tbl[(in[1] & 0xf) << 2];
      *p++ = '=';
      break;
    default:
      /* LENGTH was a multiple of three (or not positive): no tail.  */
      break;
    }

  /* ...and zero-terminate it.  */
  *p = '\0';
}
2531 /* Create the authentication header contents for the `Basic' scheme.
2532 This is done by encoding the string `USER:PASS' in base64 and
2533 prepending `Basic ' to it. */
static char *
basic_authentication_encode (const char *user, const char *passwd)
{
  /* Lengths of "USER:PASSWD" and of its base64 form, both excluding
     the terminating NUL.  */
  int plain_len = strlen (user) + 1 + strlen (passwd);
  int encoded_len = BASE64_LENGTH (plain_len);

  /* Both scratch buffers live on the stack and vanish on return.  */
  char *plain = (char *)alloca (plain_len + 1);
  char *encoded = (char *)alloca (encoded_len + 1);
  char *header;

  sprintf (plain, "%s:%s", user, passwd);
  base64_encode (plain, encoded, plain_len);

  /* 6 is the length of the "Basic " prefix; the result is allocated
     with xmalloc and handed back to the caller.  */
  header = (char *)xmalloc (6 + encoded_len + 1);
  sprintf (header, "Basic %s", encoded);

  return header;
}
/* Advance the pointer lvalue X past any leading white space.  X is
   evaluated more than once, so it must not have side effects.  */
#define SKIP_WS(x) do {				\
  while (ISSPACE (*(x)))			\
    (x)++;					\
} while (0)
2559 /* Parse HTTP `WWW-Authenticate:' header. AU points to the beginning
2560 of a field in such a header. If the field is the one specified by
2561 ATTR_NAME ("realm", "opaque", and "nonce" are used by the current
2562 digest authorization code), extract its value in the (char*)
2563 variable pointed by RET. Returns negative on a malformed header,
2564 or number of bytes that have been parsed by this call. */
2566 extract_header_attr (const char *au, const char *attr_name, char **ret)
2568 const char *cp, *ep;
2572 if (strncmp (cp, attr_name, strlen (attr_name)) == 0)
2574 cp += strlen (attr_name);
2587 for (ep = cp; *ep && *ep != '\"'; ep++)
2592 *ret = strdupdelim (cp, ep);
2599 /* Dump the hexadecimal representation of HASH to BUF. HASH should be
2600 an array of 16 bytes containing the hash keys, and BUF should be a
2601 buffer of 33 writable characters (32 for hex digits plus one for
2602 zero termination). */
2604 dump_hash (unsigned char *buf, const unsigned char *hash)
2608 for (i = 0; i < MD5_HASHLEN; i++, hash++)
2610 *buf++ = XNUM_TO_digit (*hash >> 4);
2611 *buf++ = XNUM_TO_digit (*hash & 0xf);
2616 /* Take the line apart to find the challenge, and compose a digest
2617 authorization header. See RFC2069 section 2.1.2. */
/* NOTE(review): this definition is only partly visible in this excerpt
   (the return type, the last parameter, the declaration of the options
   table, several braces and statements are missing).  The comments
   below describe only the visible lines.  */
2619 digest_authentication_encode (const char *au, const char *user,
2620 const char *passwd, const char *method,
/* These three are function-static, so the table entries below can
   take their addresses; as a consequence the parsed values live in
   storage shared by every call of this function.  */
2623 static char *realm, *opaque, *nonce;
/* Table entries pairing an attribute name with the variable that
   receives its value.  */
2628 { "realm", &realm },
2629 { "opaque", &opaque },
/* Start every call with no parsed values.  */
2634 realm = opaque = nonce = NULL;
2636 au += 6; /* skip over `Digest' */
/* Try each table entry against the text at AU.  */
2642 for (i = 0; i < countof (options); i++)
/* SKIP is extract_header_attr's result: per that function's
   description, negative for a malformed header, otherwise the number
   of bytes it parsed.  */
2644 int skip = extract_header_attr (au, options[i].name,
2645 options[i].variable);
/* Releases OPAQUE if set -- presumably part of an error path that
   also releases the other values; the surrounding lines are not
   visible.  */
2649 xfree_null (opaque);
/* True only when the loop above ran through every table entry without
   leaving early, i.e. the text at AU is not one of the known
   attributes.  */
2659 if (i == countof (options))
/* Move forward to the '=' of the unknown attribute (or end of
   string).  */
2661 while (*au && *au != '=')
/* Move forward to a double quote (or end of string).  */
2669 while (*au && *au != '\"')
/* Move forward to the ',' that separates fields (or end of string).  */
2676 while (*au && *au != ',')
/* REALM and NONCE must have been found in the header, and all caller
   supplied strings must be non-NULL; OPAQUE is not required.  */
2681 if (!realm || !nonce || !user || !passwd || !path || !method)
2684 xfree_null (opaque);
2689 /* Calculate the digest value. */
/* NOTE(review): ALLOCA_MD5_CONTEXT is defined elsewhere; it presumably
   declares CTX for the gen_md5_* calls below -- confirm.  No MD5
   initialisation call is visible before any of the three hashes.  */
2691 ALLOCA_MD5_CONTEXT (ctx);
2692 unsigned char hash[MD5_HASHLEN];
/* Each text buffer holds two characters per hash byte plus one more
   byte.  */
2693 unsigned char a1buf[MD5_HASHLEN * 2 + 1], a2buf[MD5_HASHLEN * 2 + 1];
2694 unsigned char response_digest[MD5_HASHLEN * 2 + 1];
2696 /* A1BUF = H(user ":" realm ":" password) */
2698 gen_md5_update ((unsigned char *)user, strlen (user), ctx);
2699 gen_md5_update ((unsigned char *)":", 1, ctx);
2700 gen_md5_update ((unsigned char *)realm, strlen (realm), ctx);
2701 gen_md5_update ((unsigned char *)":", 1, ctx);
2702 gen_md5_update ((unsigned char *)passwd, strlen (passwd), ctx);
2703 gen_md5_finish (ctx, hash);
2704 dump_hash (a1buf, hash);
2706 /* A2BUF = H(method ":" path) */
2708 gen_md5_update ((unsigned char *)method, strlen (method), ctx);
2709 gen_md5_update ((unsigned char *)":", 1, ctx);
2710 gen_md5_update ((unsigned char *)path, strlen (path), ctx);
2711 gen_md5_finish (ctx, hash);
2712 dump_hash (a2buf, hash);
2714 /* RESPONSE_DIGEST = H(A1BUF ":" nonce ":" A2BUF) */
/* Only the MD5_HASHLEN * 2 text characters of A1BUF and A2BUF are
   hashed, not the extra trailing byte.  */
2716 gen_md5_update (a1buf, MD5_HASHLEN * 2, ctx);
2717 gen_md5_update ((unsigned char *)":", 1, ctx);
2718 gen_md5_update ((unsigned char *)nonce, strlen (nonce), ctx);
2719 gen_md5_update ((unsigned char *)":", 1, ctx);
2720 gen_md5_update (a2buf, MD5_HASHLEN * 2, ctx);
2721 gen_md5_finish (ctx, hash);
2722 dump_hash (response_digest, hash);
/* Size the result from the lengths of its variable parts.
   NOTE(review): several terms of this sum and its closing part are
   not visible in this excerpt, so the total cannot be checked against
   the format string below from here.  */
2724 res = (char*) xmalloc (strlen (user)
2729 + 2 * MD5_HASHLEN /*strlen (response_digest)*/
2730 + (opaque ? strlen (opaque) : 0)
2732 sprintf (res, "Digest \
2733 username=\"%s\", realm=\"%s\", nonce=\"%s\", uri=\"%s\", response=\"%s\"",
2734 user, realm, nonce, path, response_digest);
/* P points at the terminating zero of RES, so the strcat below
   appends the opaque attribute directly after the text written
   above.  */
2737 char *p = res + strlen (res);
2738 strcat (p, ", opaque=\"");
2745 #endif /* USE_DIGEST */
/* Non-zero when LINE starts with STRING_CONSTANT, compared without
   regard to case, and the character right after that prefix is either
   whitespace or the end of LINE.  The prefix length is taken as
   sizeof (string_constant) - 1, so STRING_CONSTANT has to be an array
   such as a string literal, not a pointer.  LINE is expanded more than
   once.  */
2748 #define BEGINS_WITH(line, string_constant) \
2749 (!strncasecmp (line, string_constant, sizeof (string_constant) - 1) \
2750 && (ISSPACE (line[sizeof (string_constant) - 1]) \
2751 || !line[sizeof (string_constant) - 1]))
/* Return non-zero if AU names one of the schemes listed below, i.e.
   it begins (case-insensitively, per BEGINS_WITH) with "Basic",
   "Digest" or "NTLM" followed by whitespace or the end of the string.
   NOTE(review): the return type and braces are not visible in this
   excerpt.  */
2754 known_authentication_scheme_p (const char *au)
2756 return BEGINS_WITH (au, "Basic")
2757 || BEGINS_WITH (au, "Digest")
2758 || BEGINS_WITH (au, "NTLM");
2763 /* Create the HTTP authorization request header. When the
2764 `WWW-Authenticate' response header is seen, according to the
2765 authorization scheme specified in that header (`Basic' and `Digest'
2766 are supported by the current implementation), produce an
2767 appropriate HTTP authorization request header. */
2769 create_authorization_line (const char *au, const char *user,
2770 const char *passwd, const char *method,
2773 if (0 == strncasecmp (au, "Basic", 5))
2774 return basic_authentication_encode (user, passwd);
2776 if (0 == strncasecmp (au, "Digest", 6))
2777 return digest_authentication_encode (au, user, passwd, method, path);
2778 #endif /* USE_DIGEST */