2 Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001, 2002
3 Free Software Foundation, Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or
10 (at your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 In addition, as a special exception, the Free Software Foundation
22 gives permission to link the code of its release of Wget with the
23 OpenSSL project's "OpenSSL" library (or with modified versions of it
24 that use the same license as the "OpenSSL" library), and distribute
25 the linked executables. You must obey the GNU General Public License
26 in all respects for all of the code used other than "OpenSSL". If you
27 modify this file, you may extend this exception to your version of the
28 file, but you are not obligated to do so. If you do not wish to do
29 so, delete this exception statement from your version. */
35 #include <sys/types.h>
46 #if TIME_WITH_SYS_TIME
47 # include <sys/time.h>
51 # include <sys/time.h>
68 # include "gen_sslfunc.h"
76 extern char *version_string;
77 extern LARGE_INT total_downloaded_bytes;
80 # define MIN(x, y) ((x) > (y) ? (y) : (x))
84 static int cookies_loaded_p;
85 struct cookie_jar *wget_cookie_jar;
87 #define TEXTHTML_S "text/html"
88 #define TEXTXHTML_S "application/xhtml+xml"
90 /* Some status code validation macros: */
91 #define H_20X(x) (((x) >= 200) && ((x) < 300))
92 #define H_PARTIAL(x) ((x) == HTTP_STATUS_PARTIAL_CONTENTS)
93 #define H_REDIRECTED(x) ((x) == HTTP_STATUS_MOVED_PERMANENTLY \
94 || (x) == HTTP_STATUS_MOVED_TEMPORARILY \
95 || (x) == HTTP_STATUS_TEMPORARY_REDIRECT)
97 /* HTTP/1.0 status codes from RFC1945, provided for reference. */
99 #define HTTP_STATUS_OK 200
100 #define HTTP_STATUS_CREATED 201
101 #define HTTP_STATUS_ACCEPTED 202
102 #define HTTP_STATUS_NO_CONTENT 204
103 #define HTTP_STATUS_PARTIAL_CONTENTS 206
105 /* Redirection 3xx. */
106 #define HTTP_STATUS_MULTIPLE_CHOICES 300
107 #define HTTP_STATUS_MOVED_PERMANENTLY 301
108 #define HTTP_STATUS_MOVED_TEMPORARILY 302
109 #define HTTP_STATUS_NOT_MODIFIED 304
110 #define HTTP_STATUS_TEMPORARY_REDIRECT 307
112 /* Client error 4xx. */
113 #define HTTP_STATUS_BAD_REQUEST 400
114 #define HTTP_STATUS_UNAUTHORIZED 401
115 #define HTTP_STATUS_FORBIDDEN 403
116 #define HTTP_STATUS_NOT_FOUND 404
118 /* Server errors 5xx. */
119 #define HTTP_STATUS_INTERNAL 500
120 #define HTTP_STATUS_NOT_IMPLEMENTED 501
121 #define HTTP_STATUS_BAD_GATEWAY 502
122 #define HTTP_STATUS_UNAVAILABLE 503
125 rel_none, rel_name, rel_value, rel_both
132 struct request_header {
134 enum rp release_policy;
136 int hcount, hcapacity;
139 /* Create a new, empty request. At least request_set_method must be
140 called before the request can be used. */
142 static struct request *
145 struct request *req = xnew0 (struct request);
147 req->headers = xnew_array (struct request_header, req->hcapacity);
151 /* Set the request's method and its arguments. METH should be a
152 literal string (or it should outlive the request) because it will
153 not be freed. ARG will be freed by request_free. */
156 request_set_method (struct request *req, const char *meth, char *arg)
162 /* Return the method string passed with the last call to
163 request_set_method. */
166 request_method (const struct request *req)
171 /* Free one header according to the release policy specified with
172 request_set_header. */
175 release_header (struct request_header *hdr)
177 switch (hdr->release_policy)
194 /* Set the request named NAME to VALUE. Specifically, this means that
195 a "NAME: VALUE\r\n" header line will be used in the request. If a
196 header with the same name previously existed in the request, its
197 value will be replaced by this one.
199 RELEASE_POLICY determines whether NAME and VALUE should be released
200 (freed) with request_free. Allowed values are:
202 - rel_none - don't free NAME or VALUE
203 - rel_name - free NAME when done
204 - rel_value - free VALUE when done
205 - rel_both - free both NAME and VALUE when done
207 Setting release policy is useful when arguments come from different
208 sources. For example:
210 // Don't free literal strings!
211 request_set_header (req, "Pragma", "no-cache", rel_none);
213 // Don't free a global variable, we'll need it later.
214 request_set_header (req, "Referer", opt.referer, rel_none);
216 // Value freshly allocated, free it when done.
217 request_set_header (req, "Range", aprintf ("bytes=%ld-", hs->restval),
222 request_set_header (struct request *req, char *name, char *value,
223 enum rp release_policy)
225 struct request_header *hdr;
229 for (i = 0; i < req->hcount; i++)
231 hdr = &req->headers[i];
232 if (0 == strcasecmp (name, hdr->name))
234 /* Replace existing header. */
235 release_header (hdr);
238 hdr->release_policy = release_policy;
243 /* Install new header. */
245 if (req->hcount >= req->hcount)
247 req->hcapacity <<= 1;
248 req->headers = xrealloc (req->headers,
249 req->hcapacity * sizeof (struct request_header));
251 hdr = &req->headers[req->hcount++];
254 hdr->release_policy = release_policy;
257 /* Like request_set_header, but sets the whole header line, as
258 provided by the user using the `--header' option. For example,
259 request_set_user_header (req, "Foo: bar") works just like
260 request_set_header (req, "Foo", "bar"). */
263 request_set_user_header (struct request *req, const char *header)
266 const char *p = strchr (header, ':');
269 BOUNDED_TO_ALLOCA (header, p, name);
273 request_set_header (req, xstrdup (name), (char *) p, rel_name);
276 #define APPEND(p, str) do { \
277 int A_len = strlen (str); \
278 memcpy (p, str, A_len); \
282 /* Construct the request and write it to FD using fd_write. */
285 request_send (const struct request *req, int fd)
287 char *request_string, *p;
288 int i, size, write_error;
290 /* Count the request size. */
293 /* METHOD " " ARG " " "HTTP/1.0" "\r\n" */
294 size += strlen (req->method) + 1 + strlen (req->arg) + 1 + 8 + 2;
296 for (i = 0; i < req->hcount; i++)
298 struct request_header *hdr = &req->headers[i];
299 /* NAME ": " VALUE "\r\n" */
300 size += strlen (hdr->name) + 2 + strlen (hdr->value) + 2;
306 p = request_string = alloca_array (char, size);
308 /* Generate the request. */
310 APPEND (p, req->method); *p++ = ' ';
311 APPEND (p, req->arg); *p++ = ' ';
312 memcpy (p, "HTTP/1.0\r\n", 10); p += 10;
314 for (i = 0; i < req->hcount; i++)
316 struct request_header *hdr = &req->headers[i];
317 APPEND (p, hdr->name);
318 *p++ = ':', *p++ = ' ';
319 APPEND (p, hdr->value);
320 *p++ = '\r', *p++ = '\n';
323 *p++ = '\r', *p++ = '\n', *p++ = '\0';
324 assert (p - request_string == size);
328 DEBUGP (("\n---request begin---\n%s---request end---\n", request_string));
330 /* Send the request to the server. */
332 write_error = fd_write (fd, request_string, size - 1, -1);
334 logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"),
339 /* Release the resources used by REQ. */
342 request_free (struct request *req)
345 xfree_null (req->arg);
346 for (i = 0; i < req->hcount; i++)
347 release_header (&req->headers[i]);
348 xfree_null (req->headers);
353 head_terminator (const char *hunk, int oldlen, int peeklen)
355 const char *start, *end;
357 /* If at first peek, verify whether HUNK starts with "HTTP". If
358 not, this is an HTTP/0.9 response and we must bail out without
360 if (oldlen == 0 && 0 != memcmp (hunk, "HTTP", MIN (peeklen, 4)))
366 start = hunk + oldlen - 4;
367 end = hunk + oldlen + peeklen;
369 for (; start < end - 1; start++)
376 if (start[1] == '\n')
382 /* Read the HTTP response head from FD and return it. The error
383 conditions are the same as with fd_read_hunk.
385 To support HTTP/0.9 responses, this function tries to make sure
386 that the data begins with "HTTP". If this is not the case, no data
387 is read and an empty response head is returned, so that the remaining
388 data can be treated as body. */
391 fd_read_http_head (int fd)
393 return fd_read_hunk (fd, head_terminator, 512);
397 /* The response data. */
400 /* The array of pointers that indicate where each header starts.
401 For example, given this HTTP response:
408 The headers are located like this:
410 "HTTP/1.0 200 Ok\r\nDescription: some\r\n text\r\nEtag: x\r\n\r\n"
412 headers[0] headers[1] headers[2] headers[3]
414 I.e. headers[0] points to the beginning of the response,
415 headers[1] points to the end of the first header and the
416 beginning of the second one, etc. */
418 const char **headers;
421 /* Create a new response object from the text of the HTTP response,
422 available in HEAD. That text is automatically split into
423 constituent header lines for fast retrieval using
424 response_header_*. */
426 static struct response *
427 response_new (const char *head)
432 struct response *resp = xnew0 (struct response);
437 /* Empty head means that we're dealing with a headerless
438 (HTTP/0.9) response. In that case, don't set HEADERS at
443 /* Split HEAD into header lines, so that response_header_* functions
444 don't need to do this over and over again. */
450 DO_REALLOC (resp->headers, size, count + 1, const char *);
451 resp->headers[count++] = hdr;
453 /* Break upon encountering an empty line. */
454 if (!hdr[0] || (hdr[0] == '\r' && hdr[1] == '\n') || hdr[0] == '\n')
457 /* Find the end of HDR, including continuations. */
460 const char *end = strchr (hdr, '\n');
466 while (*hdr == ' ' || *hdr == '\t');
468 DO_REALLOC (resp->headers, size, count + 1, const char *);
469 resp->headers[count++] = NULL;
474 /* Locate the header named NAME in the response data. If found, set
475 *BEGPTR to its starting, and *ENDPTR to its ending position, and
476 return 1. Otherwise return 0.
478 This function is used as a building block for response_header_copy
479 and response_header_strdup. */
482 response_header_bounds (const struct response *resp, const char *name,
483 const char **begptr, const char **endptr)
486 const char **headers = resp->headers;
489 if (!headers || !headers[1])
492 name_len = strlen (name);
494 for (i = 1; headers[i + 1]; i++)
496 const char *b = headers[i];
497 const char *e = headers[i + 1];
499 && b[name_len] == ':'
500 && 0 == strncasecmp (b, name, name_len))
503 while (b < e && ISSPACE (*b))
505 while (b < e && ISSPACE (e[-1]))
/* Copy the value of the response header named NAME to buffer BUF, no
   longer than BUFSIZE (BUFSIZE includes the terminating 0).  A value
   that does not fit is truncated; BUF is always zero-terminated when
   BUFSIZE is positive.  If the header exists, 1 is returned,
   otherwise 0.  If there should be no limit on the size of the
   header, use response_header_strdup instead.

   If BUFSIZE is 0, no data is copied, but the boolean indication of
   whether the header is present is still returned.  */

static int
response_header_copy (const struct response *resp, const char *name,
                      char *buf, int bufsize)
{
  const char *b, *e;

  if (!response_header_bounds (resp, name, &b, &e))
    return 0;

  if (bufsize > 0)
    {
      /* Reserve one byte for the terminator, so that a value of
         BUFSIZE or more characters neither overruns BUF nor loses
         its terminating 0.  */
      int len = MIN (e - b, bufsize - 1);
      memcpy (buf, b, len);
      buf[len] = '\0';
    }
  return 1;
}
539 /* Return the value of header named NAME in RESP, allocated with
540 malloc. If such a header does not exist in RESP, return NULL. */
543 response_header_strdup (const struct response *resp, const char *name)
546 if (!response_header_bounds (resp, name, &b, &e))
548 return strdupdelim (b, e);
551 /* Parse the HTTP status line, which is of format:
553 HTTP-Version SP Status-Code SP Reason-Phrase
555 The function returns the status-code, or -1 if the status line
556 appears malformed. The pointer to "reason-phrase" message is
557 returned in *MESSAGE. */
560 response_status (const struct response *resp, char **message)
567 /* For a HTTP/0.9 response, assume status 200. */
569 *message = xstrdup (_("No headers, assuming HTTP/0.9"));
573 p = resp->headers[0];
574 end = resp->headers[1];
580 if (end - p < 4 || 0 != strncmp (p, "HTTP", 4))
584 /* Match the HTTP version. This is optional because Gnutella
585 servers have been reported to not specify HTTP version. */
586 if (p < end && *p == '/')
589 while (p < end && ISDIGIT (*p))
591 if (p < end && *p == '.')
593 while (p < end && ISDIGIT (*p))
597 while (p < end && ISSPACE (*p))
599 if (end - p < 3 || !ISDIGIT (p[0]) || !ISDIGIT (p[1]) || !ISDIGIT (p[2]))
602 status = 100 * (p[0] - '0') + 10 * (p[1] - '0') + (p[2] - '0');
607 while (p < end && ISSPACE (*p))
609 while (p < end && ISSPACE (end[-1]))
611 *message = strdupdelim (p, end);
617 /* Release the resources used by RESP. */
620 response_free (struct response *resp)
622 xfree_null (resp->headers);
626 /* Print [b, e) to the log, omitting the trailing CRLF. */
629 print_server_response_1 (const char *prefix, const char *b, const char *e)
632 if (b < e && e[-1] == '\n')
634 if (b < e && e[-1] == '\r')
636 BOUNDED_TO_ALLOCA (b, e, ln);
637 logprintf (LOG_VERBOSE, "%s%s\n", prefix, ln);
640 /* Print the server response, line by line, omitting the trailing CR
641 characters, prefixed with PREFIX. */
644 print_server_response (const struct response *resp, const char *prefix)
649 for (i = 0; resp->headers[i + 1]; i++)
650 print_server_response_1 (prefix, resp->headers[i], resp->headers[i + 1]);
653 /* Parse the `Content-Range' header and extract the information it
654 contains. Returns 1 if successful, -1 otherwise. */
656 parse_content_range (const char *hdr, long *first_byte_ptr,
657 long *last_byte_ptr, long *entity_length_ptr)
661 /* Ancient versions of Netscape proxy server, presumably predating
662 rfc2068, sent out `Content-Range' without the "bytes"
664 if (!strncasecmp (hdr, "bytes", 5))
667 /* "JavaWebServer/1.1.1" sends "bytes: x-y/z", contrary to the
671 while (ISSPACE (*hdr))
678 for (num = 0; ISDIGIT (*hdr); hdr++)
679 num = 10 * num + (*hdr - '0');
680 if (*hdr != '-' || !ISDIGIT (*(hdr + 1)))
682 *first_byte_ptr = num;
684 for (num = 0; ISDIGIT (*hdr); hdr++)
685 num = 10 * num + (*hdr - '0');
686 if (*hdr != '/' || !ISDIGIT (*(hdr + 1)))
688 *last_byte_ptr = num;
690 for (num = 0; ISDIGIT (*hdr); hdr++)
691 num = 10 * num + (*hdr - '0');
692 *entity_length_ptr = num;
696 /* Send the contents of FILE_NAME to SOCK/SSL. Make sure that exactly
697 PROMISED_SIZE bytes are sent over the wire -- if the file is
698 longer, read only that much; if the file is shorter, report an error. */
701 post_file (int sock, const char *file_name, long promised_size)
703 static char chunk[8192];
708 DEBUGP (("[writing POST file %s ... ", file_name));
710 fp = fopen (file_name, "rb");
713 while (!feof (fp) && written < promised_size)
716 int length = fread (chunk, 1, sizeof (chunk), fp);
719 towrite = MIN (promised_size - written, length);
720 write_error = fd_write (sock, chunk, towrite, -1);
730 /* If we've written less than was promised, report a (probably
731 nonsensical) error rather than break the promise. */
732 if (written < promised_size)
738 assert (written == promised_size);
739 DEBUGP (("done]\n"));
743 /* Persistent connections. Currently, we cache the most recently used
744 connection as persistent, provided that the HTTP server agrees to
745 make it such. The persistence data is stored in the variables
746 below. Ideally, it should be possible to cache an arbitrary fixed
747 number of these connections. */
749 /* Whether a persistent connection is active. */
750 static int pconn_active;
753 /* The socket of the connection. */
756 /* Host and port of the currently active persistent connection. */
760 /* Whether an SSL handshake has occurred on this connection. */
764 /* Mark the persistent connection as invalid and free the resources it
765 uses. This is used by the CLOSE_* macros after they forcefully
766 close a registered persistent connection. */
769 invalidate_persistent (void)
771 DEBUGP (("Disabling further reuse of socket %d.\n", pconn.socket));
773 fd_close (pconn.socket);
778 /* Register FD, which should be a TCP/IP connection to HOST:PORT, as
779 persistent. This will enable someone to use the same connection
780 later. In the context of HTTP, this must be called only AFTER the
781 response has been received and the server has promised that the
782 connection will remain alive.
784 If a previous connection was persistent, it is closed. */
787 register_persistent (const char *host, int port, int fd, int ssl)
791 if (pconn.socket == fd)
793 /* The connection FD is already registered. */
798 /* The old persistent connection is still active; close it
799 first. This situation arises whenever a persistent
800 connection exists, but we then connect to a different
801 host, and try to register a persistent connection to that
803 invalidate_persistent ();
809 pconn.host = xstrdup (host);
813 DEBUGP (("Registered socket %d for persistent reuse.\n", fd));
816 /* Return non-zero if a persistent connection is available for
817 connecting to HOST:PORT. */
820 persistent_available_p (const char *host, int port, int ssl,
821 int *host_lookup_failed)
823 /* First, check whether a persistent connection is active at all. */
827 /* If we want SSL and the last connection wasn't or vice versa,
828 don't use it. Checking for host and port is not enough because
829 HTTP and HTTPS can apparently coexist on the same port. */
830 if (ssl != pconn.ssl)
833 /* If we're not connecting to the same port, we're not interested. */
834 if (port != pconn.port)
837 /* If the host is the same, we're in business. If not, there is
838 still hope -- read below. */
839 if (0 != strcasecmp (host, pconn.host))
841 /* If pconn.socket is already talking to HOST, we needn't
842 reconnect. This happens often when both sites are virtual
843 hosts distinguished only by name and served by the same
844 network interface, and hence the same web server (possibly
845 set up by the ISP and serving many different web sites).
846 This admittedly non-standard optimization does not contradict
847 HTTP and works well with popular server software. */
851 struct address_list *al;
854 /* Don't try to talk to two different SSL sites over the same
855 secure connection! (Besides, it's not clear if name-based
856 virtual hosting is even possible with SSL.) */
859 /* If pconn.socket's peer is one of the IP addresses HOST
860 resolves to, pconn.socket is for all intents and purposes
861 already talking to HOST. */
863 if (!socket_ip_address (pconn.socket, &ip, ENDPOINT_PEER))
865 /* Can't get the peer's address -- something must be very
866 wrong with the connection. */
867 invalidate_persistent ();
870 al = lookup_host (host, 0);
873 *host_lookup_failed = 1;
877 found = address_list_contains (al, &ip);
878 address_list_release (al);
883 /* The persistent connection's peer address was found among the
884 addresses HOST resolved to; therefore, pconn.sock is in fact
885 already talking to HOST -- no need to reconnect. */
888 /* Finally, check whether the connection is still open. This is
889 important because most servers implement a liberal (short) timeout
890 on persistent connections. Wget can of course always reconnect
891 if the connection doesn't work out, but it's nicer to know in
892 advance. This test is a logical followup of the first test, but
893 is "expensive" and therefore placed at the end of the list. */
895 if (!test_socket_open (pconn.socket))
897 /* Oops, the socket is no longer open. Now that we know that,
898 let's invalidate the persistent connection before returning
900 invalidate_persistent ();
907 /* The idea behind these two CLOSE macros is to distinguish between
908 two cases: one when the job we've been doing is finished, and we
909 want to close the connection and leave, and two when something is
910 seriously wrong and we're closing the connection as part of
913 In case of keep_alive, CLOSE_FINISH should leave the connection
914 open, while CLOSE_INVALIDATE should still close it.
916 Note that the semantics of the flag `keep_alive' is "this
917 connection *will* be reused (the server has promised not to close
918 the connection once we're done)", while the semantics of
919 `pconn_active && (fd) == pconn.socket' is "we're *now* using an
920 active, registered connection". */
922 #define CLOSE_FINISH(fd) do { \
925 if (pconn_active && (fd) == pconn.socket) \
926 invalidate_persistent (); \
932 #define CLOSE_INVALIDATE(fd) do { \
933 if (pconn_active && (fd) == pconn.socket) \
934 invalidate_persistent (); \
942 long len; /* received length */
943 long contlen; /* expected length */
944 long restval; /* the restart value */
945 int res; /* the result of last read */
946 char *newloc; /* new location (redirection) */
947 char *remote_time; /* remote time-stamp string */
948 char *error; /* textual HTTP error */
949 int statcode; /* status code */
950 double dltime; /* time of the download in msecs */
951 int no_truncate; /* whether truncating the file is
953 const char *referer; /* value of the referer header. */
954 char **local_file; /* local file. */
958 free_hstat (struct http_stat *hs)
960 xfree_null (hs->newloc);
961 xfree_null (hs->remote_time);
962 xfree_null (hs->error);
964 /* Guard against being called twice. */
966 hs->remote_time = NULL;
970 static char *create_authorization_line PARAMS ((const char *, const char *,
971 const char *, const char *,
973 static char *basic_authentication_encode PARAMS ((const char *, const char *));
974 static int known_authentication_scheme_p PARAMS ((const char *));
976 time_t http_atotm PARAMS ((const char *));
/* Non-zero if LINE begins with STRING_CONSTANT, compared without
   regard to case, and the match is followed by whitespace or by the
   end of the string.  STRING_CONSTANT must be a string literal,
   because its length is taken with sizeof.  LINE is evaluated more
   than once, so it must not have side effects.  */
#define BEGINS_WITH(line, string_constant)                                \
  (!strncasecmp ((line), (string_constant), sizeof (string_constant) - 1) \
   && (ISSPACE ((line)[sizeof (string_constant) - 1])                     \
       || !(line)[sizeof (string_constant) - 1]))
983 /* Retrieve a document through HTTP protocol. It recognizes status
984 code, and correctly handles redirections. It closes the network
985 socket. If it receives an error from the functions below it, it
986 will print it if there is enough information to do so (almost
987 always), returning the error to the caller (i.e. http_loop).
989 Various HTTP parameters are stored to hs.
991 If PROXY is non-NULL, the connection will be made to the proxy
992 server, and u->url will be requested. */
994 gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy)
1003 long contlen, contrange;
1009 /* Whether authorization has been already tried. */
1010 int auth_tried_already = 0;
1012 /* Whether our connection to the remote host is through SSL. */
1016 struct response *resp;
1020 /* Whether this connection will be kept alive after the HTTP request
1024 /* Flag that detects having received a keep-alive response. */
1025 int keep_alive_confirmed;
1027 /* Whether keep-alive should be inhibited. */
1028 int inhibit_keep_alive = !opt.http_keep_alive;
1030 /* Headers sent when using POST. */
1031 long post_data_size = 0;
1033 int host_lookup_failed = 0;
1036 if (u->scheme == SCHEME_HTTPS)
1038 /* Initialize the SSL context. After this has once been done,
1039 it becomes a no-op. */
1040 switch (ssl_init ())
1042 case SSLERRCTXCREATE:
1044 logprintf (LOG_NOTQUIET, _("Failed to set up an SSL context\n"));
1045 return SSLERRCTXCREATE;
1046 case SSLERRCERTFILE:
1047 /* try without certfile */
1048 logprintf (LOG_NOTQUIET,
1049 _("Failed to load certificates from %s\n"),
1051 logprintf (LOG_NOTQUIET,
1052 _("Trying without the specified certificate\n"));
1055 logprintf (LOG_NOTQUIET,
1056 _("Failed to get certificate key from %s\n"),
1058 logprintf (LOG_NOTQUIET,
1059 _("Trying without the specified certificate\n"));
1065 #endif /* HAVE_SSL */
1067 if (!(*dt & HEAD_ONLY))
1068 /* If we're doing a GET on the URL, as opposed to just a HEAD, we need to
1069 know the local filename so we can save to it. */
1070 assert (*hs->local_file != NULL);
1072 auth_tried_already = 0;
1074 /* Initialize certain elements of struct http_stat. */
1079 hs->remote_time = NULL;
1087 char *proxy_user, *proxy_passwd;
1088 /* For normal username and password, URL components override
1089 command-line/wgetrc parameters. With proxy
1090 authentication, it's the reverse, because proxy URLs are
1091 normally the "permanent" ones, so command-line args
1092 should take precedence. */
1093 if (opt.proxy_user && opt.proxy_passwd)
1095 proxy_user = opt.proxy_user;
1096 proxy_passwd = opt.proxy_passwd;
1100 proxy_user = proxy->user;
1101 proxy_passwd = proxy->passwd;
1103 /* #### This does not appear right. Can't the proxy request,
1104 say, `Digest' authentication? */
1105 if (proxy_user && proxy_passwd)
1106 proxyauth = basic_authentication_encode (proxy_user, proxy_passwd);
1108 /* If we're using a proxy, we will be connecting to the proxy
1113 /* Prepare the request to send. */
1115 req = request_new ();
1117 const char *meth = "GET";
1118 if (*dt & HEAD_ONLY)
1120 else if (opt.post_file_name || opt.post_data)
1122 /* Use the full path, i.e. one that includes the leading slash and
1123 the query string. E.g. if u->path is "foo/bar" and u->query is
1124 "param=value", full_path will be "/foo/bar?param=value". */
1125 request_set_method (req, meth,
1126 proxy ? xstrdup (u->url) : url_full_path (u));
1129 request_set_header (req, "Referer", (char *) hs->referer, rel_none);
1130 if (*dt & SEND_NOCACHE)
1131 request_set_header (req, "Pragma", "no-cache", rel_none);
1133 request_set_header (req, "Range",
1134 aprintf ("bytes=%ld-", hs->restval), rel_value);
1136 request_set_header (req, "User-Agent", opt.useragent, rel_none);
1138 request_set_header (req, "User-Agent",
1139 aprintf ("Wget/%s", version_string), rel_value);
1140 request_set_header (req, "Accept", "*/*", rel_none);
1142 /* Find the username and password for authentication. */
1145 search_netrc (u->host, (const char **)&user, (const char **)&passwd, 0);
1146 user = user ? user : opt.http_user;
1147 passwd = passwd ? passwd : opt.http_passwd;
1151 /* We have the username and the password, but haven't tried
1152 any authorization yet. Let's see if the "Basic" method
1153 works. If not, we'll come back here and construct a
1154 proper authorization method with the right challenges.
1156 If we didn't employ this kind of logic, every URL that
1157 requires authorization would have to be processed twice,
1158 which is very suboptimal and generates a bunch of false
1159 "unauthorized" errors in the server log.
1161 #### But this logic also has a serious problem when used
1162 with stronger authentications: we *first* transmit the
1163 username and the password in clear text, and *then* attempt a
1164 stronger authentication scheme. That cannot be right! We
1165 are only fortunate that almost everyone still uses the
1166 `Basic' scheme anyway.
1168 There should be an option to prevent this from happening, for
1169 those who use strong authentication schemes and value their
1171 request_set_header (req, "Authorization",
1172 basic_authentication_encode (user, passwd),
1177 /* Whether we need to print the host header with braces around
1178 host, e.g. "Host: [3ffe:8100:200:2::2]:1234" instead of the
1179 usual "Host: symbolic-name:1234". */
1180 int squares = strchr (u->host, ':') != NULL;
1181 if (u->port == scheme_default_port (u->scheme))
1182 request_set_header (req, "Host",
1183 aprintf (squares ? "[%s]" : "%s", u->host),
1186 request_set_header (req, "Host",
1187 aprintf (squares ? "[%s]:%d" : "%s:%d",
1192 if (!inhibit_keep_alive)
1193 request_set_header (req, "Connection", "Keep-Alive", rel_none);
1196 request_set_header (req, "Cookie",
1197 cookie_header (wget_cookie_jar,
1198 u->host, u->port, u->path,
1200 u->scheme == SCHEME_HTTPS
1207 if (opt.post_data || opt.post_file_name)
1209 request_set_header (req, "Content-Type",
1210 "application/x-www-form-urlencoded", rel_none);
1212 post_data_size = strlen (opt.post_data);
1215 post_data_size = file_size (opt.post_file_name);
1216 if (post_data_size == -1)
1218 logprintf (LOG_NOTQUIET, "POST data file missing: %s\n",
1219 opt.post_file_name);
1223 request_set_header (req, "Content-Length",
1224 aprintf ("Content-Length: %ld", post_data_size),
1228 /* Add the user headers. */
1229 if (opt.user_headers)
1232 for (i = 0; opt.user_headers[i]; i++)
1233 request_set_user_header (req, opt.user_headers[i]);
1237 /* We need to come back here when the initial attempt to retrieve
1238 without authorization header fails. (Expected to happen at least
1239 for the Digest authorization scheme.) */
1242 keep_alive_confirmed = 0;
1244 /* Establish the connection. */
1246 if (!inhibit_keep_alive)
1248 /* Look for a persistent connection to target host, unless a
1249 proxy is used. The exception is when SSL is in use, in which
1250 case the proxy is nothing but a passthrough to the target
1251 host, registered as a connection to the latter. */
1252 struct url *relevant = conn;
1254 if (u->scheme == SCHEME_HTTPS)
1258 if (persistent_available_p (relevant->host, relevant->port,
1260 relevant->scheme == SCHEME_HTTPS,
1264 &host_lookup_failed))
1266 sock = pconn.socket;
1267 using_ssl = pconn.ssl;
1268 logprintf (LOG_VERBOSE, _("Reusing existing connection to %s:%d.\n"),
1269 pconn.host, pconn.port);
1270 DEBUGP (("Reusing fd %d.\n", sock));
1276 /* In its current implementation, persistent_available_p will
1277 look up conn->host in some cases. If that lookup failed, we
1278 don't need to bother with connect_to_host. */
1279 if (host_lookup_failed)
1282 sock = connect_to_host (conn->host, conn->port);
1286 return (retryable_socket_connect_error (errno)
1287 ? CONERROR : CONIMPOSSIBLE);
1290 if (proxy && u->scheme == SCHEME_HTTPS)
1292 /* When requesting SSL URLs through proxies, use the
1293 CONNECT method to request passthrough. */
1294 struct request *connreq = request_new ();
1295 request_set_method (connreq, "CONNECT",
1296 aprintf ("%s:%d", u->host, u->port));
1299 request_set_header (connreq, "Proxy-Authorization",
1300 proxyauth, rel_value);
1301 /* Now that PROXYAUTH is part of the CONNECT request,
1302 zero it out so we don't send proxy authorization with
1303 the regular request below. */
1307 write_error = request_send (connreq, sock);
1308 request_free (connreq);
1309 if (write_error < 0)
1311 logprintf (LOG_VERBOSE, _("Failed writing to proxy: %s.\n"),
1313 CLOSE_INVALIDATE (sock);
1317 head = fd_read_http_head (sock);
1320 logprintf (LOG_VERBOSE, _("Failed reading proxy response: %s\n"),
1322 CLOSE_INVALIDATE (sock);
1331 DEBUGP (("proxy responded with: [%s]\n", head));
1333 resp = response_new (head);
1334 statcode = response_status (resp, &message);
1335 response_free (resp);
1336 if (statcode != 200)
1339 logprintf (LOG_NOTQUIET, _("Proxy tunneling failed: %s"),
1340 message ? message : "?");
1341 xfree_null (message);
1346 /* SOCK is now *really* connected to u->host, so update CONN
1347 to reflect this. That way register_persistent will
1348 register SOCK as being connected to u->host:u->port. */
1352 if (conn->scheme == SCHEME_HTTPS)
1354 if (!ssl_connect (sock))
1361 #endif /* HAVE_SSL */
1364 /* Send the request to server. */
1365 write_error = request_send (req, sock);
1367 if (write_error >= 0)
1371 DEBUGP (("[POST data: %s]\n", opt.post_data));
1372 write_error = fd_write (sock, opt.post_data, post_data_size, -1);
1374 else if (opt.post_file_name && post_data_size != 0)
1375 write_error = post_file (sock, opt.post_file_name, post_data_size);
1377 DEBUGP (("---request end---\n"));
1379 if (write_error < 0)
1381 logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"),
1383 CLOSE_INVALIDATE (sock);
1387 logprintf (LOG_VERBOSE, _("%s request sent, awaiting response... "),
1388 proxy ? "Proxy" : "HTTP");
1389 contlen = contrange = -1;
1394 head = fd_read_http_head (sock);
1399 logputs (LOG_NOTQUIET, _("No data received.\n"));
1400 CLOSE_INVALIDATE (sock);
1406 logprintf (LOG_NOTQUIET, _("Read error (%s) in headers.\n"),
1408 CLOSE_INVALIDATE (sock);
1413 DEBUGP (("\n---response begin---\n%s---response end---\n", head));
1415 resp = response_new (head);
1417 /* Check for status line. */
1419 statcode = response_status (resp, &message);
1420 if (!opt.server_response)
1421 logprintf (LOG_VERBOSE, "%2d %s\n", statcode, message ? message : "");
1424 logprintf (LOG_VERBOSE, "\n");
1425 print_server_response (resp, " ");
1428 if (statcode == HTTP_STATUS_UNAUTHORIZED)
1430 /* Authorization is required. */
1431 CLOSE_INVALIDATE (sock); /* would be CLOSE_FINISH, but there
1432 might be more bytes in the body. */
1433 if (auth_tried_already || !(user && passwd))
1435 /* If we have tried it already, then there is no point
1437 logputs (LOG_NOTQUIET, _("Authorization failed.\n"));
1441 char *www_authenticate = response_header_strdup (resp,
1442 "WWW-Authenticate");
1443 /* If the authentication scheme is unknown or if it's the
1444 "Basic" authentication (which we try by default), there's
1445 no sense in retrying. */
1446 if (!www_authenticate
1447 || !known_authentication_scheme_p (www_authenticate)
1448 || BEGINS_WITH (www_authenticate, "Basic"))
1450 xfree_null (www_authenticate);
1451 logputs (LOG_NOTQUIET, _("Unknown authentication scheme.\n"));
1456 auth_tried_already = 1;
1457 pth = url_full_path (u);
1458 request_set_header (req, "Authorization",
1459 create_authorization_line (www_authenticate,
1461 request_method (req),
1465 xfree (www_authenticate);
1466 goto retry_with_auth;
1474 hs->statcode = statcode;
1476 hs->error = xstrdup (_("Malformed status line"));
1478 hs->error = xstrdup (_("(no description)"));
1480 hs->error = xstrdup (message);
1482 if (response_header_copy (resp, "Content-Length", hdrval, sizeof (hdrval)))
1483 contlen = strtol (hdrval, NULL, 10);
1484 type = response_header_strdup (resp, "Content-Type");
1487 char *tmp = strchr (type, ';');
1490 while (tmp > type && ISSPACE (tmp[-1]))
1495 hs->newloc = response_header_strdup (resp, "Location");
1496 hs->remote_time = response_header_strdup (resp, "Last-Modified");
1498 char *set_cookie = response_header_strdup (resp, "Set-Cookie");
1501 /* The jar should have been created by now. */
1502 assert (wget_cookie_jar != NULL);
1503 cookie_handle_set_cookie (wget_cookie_jar, u->host, u->port, u->path,
1508 if (response_header_copy (resp, "Content-Range", hdrval, sizeof (hdrval)))
1510 long first_byte_pos, last_byte_pos, entity_length;
1511 if (parse_content_range (hdrval, &first_byte_pos, &last_byte_pos,
1513 contrange = first_byte_pos;
1516 /* Check for keep-alive related responses. */
1517 if (!inhibit_keep_alive && contlen != -1)
1519 if (response_header_copy (resp, "Keep-Alive", NULL, 0))
1521 else if (response_header_copy (resp, "Connection", hdrval,
1524 if (0 == strcasecmp (hdrval, "Keep-Alive"))
1528 response_free (resp);
1531 /* The server has promised that it will not close the connection
1532 when we're done. This means that we can register it. */
1533 register_persistent (conn->host, conn->port, sock, using_ssl);
1535 /* 20x responses are counted among successful by default. */
1536 if (H_20X (statcode))
1539 /* Return if redirected. */
1540 if (H_REDIRECTED (statcode) || statcode == HTTP_STATUS_MULTIPLE_CHOICES)
1542 /* RFC2068 says that in case of the 300 (multiple choices)
1543 response, the server can output a preferred URL through
1544 `Location' header; otherwise, the request should be treated
1545 like GET. So, if the location is set, it will be a
1546 redirection; otherwise, just proceed normally. */
1547 if (statcode == HTTP_STATUS_MULTIPLE_CHOICES && !hs->newloc)
1551 logprintf (LOG_VERBOSE,
1552 _("Location: %s%s\n"),
1553 hs->newloc ? hs->newloc : _("unspecified"),
1554 hs->newloc ? _(" [following]") : "");
1555 CLOSE_INVALIDATE (sock); /* would be CLOSE_FINISH, but there
1556 might be more bytes in the body. */
1562 /* If content-type is not given, assume text/html. This is because
1563 of the multitude of broken CGI's that "forget" to generate the
1566 0 == strncasecmp (type, TEXTHTML_S, strlen (TEXTHTML_S)) ||
1567 0 == strncasecmp (type, TEXTXHTML_S, strlen (TEXTXHTML_S)))
1572 if (opt.html_extension && (*dt & TEXTHTML))
1573 /* -E / --html-extension / html_extension = on was specified, and this is a
1574 text/html file. If some case-insensitive variation on ".htm[l]" isn't
1575 already the file's suffix, tack on ".html". */
1577 char* last_period_in_local_filename = strrchr(*hs->local_file, '.');
1579 if (last_period_in_local_filename == NULL
1580 || !(0 == strcasecmp (last_period_in_local_filename, ".htm")
1581 || 0 == strcasecmp (last_period_in_local_filename, ".html")))
1583 size_t local_filename_len = strlen(*hs->local_file);
1585 *hs->local_file = xrealloc(*hs->local_file,
1586 local_filename_len + sizeof(".html"));
1587 strcpy(*hs->local_file + local_filename_len, ".html");
1589 *dt |= ADDED_HTML_EXTENSION;
1593 if (contrange == -1)
1595 /* We did not get a content-range header. This means that the
1596 server did not honor our `Range' request. Normally, this
1597 means we should reset hs->restval and continue normally. */
1599 /* However, if `-c' is used, we need to be a bit more careful:
1601 1. If `-c' is specified and the file already existed when
1602 Wget was started, it would be a bad idea for us to start
1603 downloading it from scratch, effectively truncating it. I
1604 believe this cannot happen unless `-c' was specified.
1606 2. If `-c' is used on a file that is already fully
1607 downloaded, we're requesting bytes after the end of file,
1608 which can result in server not honoring `Range'. If this is
1609 the case, `Content-Length' will be equal to the length of the
1611 if (opt.always_rest)
1613 /* Check for condition #2. */
1614 if (hs->restval > 0 /* restart was requested. */
1615 && contlen != -1 /* we got content-length. */
1616 && hs->restval >= contlen /* file fully downloaded
1620 logputs (LOG_VERBOSE, _("\
1621 \n The file is already fully retrieved; nothing to do.\n\n"));
1622 /* In case the caller inspects. */
1625 /* Mark as successfully retrieved. */
1628 CLOSE_INVALIDATE (sock); /* would be CLOSE_FINISH, but there
1629 might be more bytes in the body. */
1630 return RETRUNNEEDED;
1633 /* Check for condition #1. */
1634 if (hs->no_truncate)
1636 logprintf (LOG_NOTQUIET,
1639 Continued download failed on this file, which conflicts with `-c'.\n\
1640 Refusing to truncate existing file `%s'.\n\n"), *hs->local_file);
1642 CLOSE_INVALIDATE (sock);
1643 return CONTNOTSUPPORTED;
1651 else if (contrange != hs->restval ||
1652 (H_PARTIAL (statcode) && contrange == -1))
1654 /* This means the whole request was somehow misunderstood by the
1655 server. Bail out. */
1657 CLOSE_INVALIDATE (sock);
1664 contlen += contrange;
1666 contrange = -1; /* If conent-length was not sent,
1667 content-range will be ignored. */
1669 hs->contlen = contlen;
1675 /* No need to print this output if the body won't be
1676 downloaded at all, or if the original server response is
1678 logputs (LOG_VERBOSE, _("Length: "));
1681 logputs (LOG_VERBOSE, legible (contlen));
1682 if (contrange != -1)
1683 logprintf (LOG_VERBOSE, _(" (%s to go)"),
1684 legible (contlen - contrange));
1687 logputs (LOG_VERBOSE,
1688 opt.ignore_length ? _("ignored") : _("unspecified"));
1690 logprintf (LOG_VERBOSE, " [%s]\n", type);
1692 logputs (LOG_VERBOSE, "\n");
1696 type = NULL; /* We don't need it any more. */
1698 /* Return if we have no intention of further downloading. */
1699 if (!(*dt & RETROKF) || (*dt & HEAD_ONLY))
1701 /* In case the caller cares to look... */
1705 CLOSE_INVALIDATE (sock); /* would be CLOSE_FINISH, but there
1706 might be more bytes in the body. */
1707 return RETRFINISHED;
1710 /* Open the local file. */
1713 mkalldirs (*hs->local_file);
1715 rotate_backups (*hs->local_file);
1716 fp = fopen (*hs->local_file, hs->restval ? "ab" : "wb");
1719 logprintf (LOG_NOTQUIET, "%s: %s\n", *hs->local_file, strerror (errno));
1720 CLOSE_INVALIDATE (sock); /* would be CLOSE_FINISH, but there
1721 might be more bytes in the body. */
1727 extern int global_download_count;
1729 /* To ensure that repeated "from scratch" downloads work for -O
1730 files, we rewind the file pointer, unless restval is
1731 non-zero. (This works only when -O is used on regular files,
1732 but it's still a valuable feature.)
1734 However, this loses when more than one URL is specified on
1735 the command line: the second rewind eradicates the contents
1736 of the first download. Thus we disable the above trick for
1737 all the downloads except the very first one.
1739 #### A possible solution to this would be to remember the
1740 file position in the output document and to seek to that
1741 position, instead of rewinding.
1743 We don't truncate stdout, since that breaks
1744 "wget -O - [...] >> foo".
1746 if (!hs->restval && global_download_count == 0 && opt.dfp != stdout)
1748 /* This will silently fail for streams that don't correspond
1749 to regular files, but that's OK. */
1751 /* ftruncate is needed because opt.dfp is opened in append
1752 mode if opt.always_rest is set. */
1753 ftruncate (fileno (fp), 0);
1758 /* #### This confuses the code that checks for file size. There
1759 should be some overhead information. */
1760 if (opt.save_headers)
1761 fwrite (head, 1, strlen (head), fp);
1763 /* Get the contents of the document. */
1764 hs->res = fd_read_body (sock, fp, &hs->len, hs->restval,
1765 (contlen != -1 ? contlen : 0),
1766 keep_alive, &hs->dltime);
1769 CLOSE_FINISH (sock);
1771 CLOSE_INVALIDATE (sock);
1774 /* Close or flush the file. We have to be careful to check for
1775 error here. Checking the result of fwrite() is not enough --
1776 errors could go unnoticed! */
1779 flush_res = fclose (fp);
1781 flush_res = fflush (fp);
1782 if (flush_res == EOF)
1787 return RETRFINISHED;
1790 /* The genuine HTTP loop! This is the part where the retrieval is
1791 retried, and retried, and retried, and... */
1793 http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
1794 int *dt, struct url *proxy)
1797 int use_ts, got_head = 0; /* time-stamping info */
1798 char *filename_plus_orig_suffix;
1799 char *local_filename = NULL;
1800 char *tms, *locf, *tmrate;
1802 time_t tml = -1, tmr = -1; /* local and remote time-stamps */
1803 long local_size = 0; /* the size of the local file */
1804 size_t filename_len;
1805 struct http_stat hstat; /* HTTP status */
1809 /* This used to be done in main(), but it's a better idea to do it
1810 here so that we don't go through the hoops if we're just using
1814 if (!wget_cookie_jar)
1815 wget_cookie_jar = cookie_jar_new ();
1816 if (opt.cookies_input && !cookies_loaded_p)
1818 cookie_jar_load (wget_cookie_jar, opt.cookies_input);
1819 cookies_loaded_p = 1;
1825 /* Warn on (likely bogus) wildcard usage in HTTP. Don't use
1826 has_wildcards_p because it would also warn on `?', and we know that
1827 shows up in CGI paths a *lot*. */
1828 if (strchr (u->url, '*'))
1829 logputs (LOG_VERBOSE, _("Warning: wildcards not supported in HTTP.\n"));
1831 /* Determine the local filename. */
1832 if (local_file && *local_file)
1833 hstat.local_file = local_file;
1834 else if (local_file)
1836 *local_file = url_file_name (u);
1837 hstat.local_file = local_file;
1841 dummy = url_file_name (u);
1842 hstat.local_file = &dummy;
1845 if (!opt.output_document)
1846 locf = *hstat.local_file;
1848 locf = opt.output_document;
1850 hstat.referer = referer;
1852 filename_len = strlen (*hstat.local_file);
1853 filename_plus_orig_suffix = alloca (filename_len + sizeof (".orig"));
1855 if (opt.noclobber && file_exists_p (*hstat.local_file))
1857 /* If opt.noclobber is turned on and file already exists, do not
1858 retrieve the file */
1859 logprintf (LOG_VERBOSE, _("\
1860 File `%s' already there, will not retrieve.\n"), *hstat.local_file);
1861 /* If the file is there, we suppose it's retrieved OK. */
1864 /* #### Bogusness alert. */
1865 /* If its suffix is "html" or "htm" or similar, assume text/html. */
1866 if (has_html_suffix_p (*hstat.local_file))
1874 if (opt.timestamping)
1876 int local_dot_orig_file_exists = 0;
1878 if (opt.backup_converted)
1879 /* If -K is specified, we'll act on the assumption that it was specified
1880 last time these files were downloaded as well, and instead of just
1881 comparing local file X against server file X, we'll compare local
1882 file X.orig (if extant, else X) against server file X. If -K
1883 _wasn't_ specified last time, or the server contains files called
1884 *.orig, -N will be back to not operating correctly with -k. */
1886 /* Would a single s[n]printf() call be faster? --dan
1888 Definitely not. sprintf() is horribly slow. It's a
1889 different question whether the difference between the two
1890 affects a program. Usually I'd say "no", but at one
1891 point I profiled Wget, and found that a measurable and
1892 non-negligible amount of time was lost calling sprintf()
1893 in url.c. Replacing sprintf with inline calls to
1894 strcpy() and long_to_string() made a difference.
1896 memcpy (filename_plus_orig_suffix, *hstat.local_file, filename_len);
1897 memcpy (filename_plus_orig_suffix + filename_len,
1898 ".orig", sizeof (".orig"));
1900 /* Try to stat() the .orig file. */
1901 if (stat (filename_plus_orig_suffix, &st) == 0)
1903 local_dot_orig_file_exists = 1;
1904 local_filename = filename_plus_orig_suffix;
1908 if (!local_dot_orig_file_exists)
1909 /* Couldn't stat() <file>.orig, so try to stat() <file>. */
1910 if (stat (*hstat.local_file, &st) == 0)
1911 local_filename = *hstat.local_file;
1913 if (local_filename != NULL)
1914 /* There was a local file, so we'll check later to see if the version
1915 the server has is the same version we already have, allowing us to
1921 /* Modification time granularity is 2 seconds for Windows, so
1922 increase local time by 1 second for later comparison. */
1925 local_size = st.st_size;
1929 /* Reset the counter. */
1931 *dt = 0 | ACCEPTRANGES;
1935 /* Increment the pass counter. */
1937 sleep_between_retrievals (count);
1938 /* Get the current time string. */
1939 tms = time_str (NULL);
1940 /* Print fetch message, if opt.verbose. */
1943 char *hurl = url_string (u, 1);
1947 sprintf (tmp, _("(try:%2d)"), count);
1948 logprintf (LOG_VERBOSE, "--%s-- %s\n %s => `%s'\n",
1949 tms, hurl, tmp, locf);
1951 ws_changetitle (hurl, 1);
1956 /* Default document type is empty. However, if spider mode is
1957 on or time-stamping is employed, the HEAD_ONLY command is
1958 encoded within *dt. */
1959 if (opt.spider || (use_ts && !got_head))
1963 /* Assume no restarting. */
1965 /* Decide whether or not to restart. */
1966 if (((count > 1 && (*dt & ACCEPTRANGES)) || opt.always_rest)
1967 /* #### this calls access() and then stat(); could be optimized. */
1968 && file_exists_p (locf))
1969 if (stat (locf, &st) == 0 && S_ISREG (st.st_mode))
1970 hstat.restval = st.st_size;
1972 /* If `-c' is used and the file exists and is non-empty,
1973 refuse to truncate it if the server doesn't support continued
1975 hstat.no_truncate = 0;
1976 if (opt.always_rest && hstat.restval)
1977 hstat.no_truncate = 1;
1979 /* Decide whether to send the no-cache directive. We send it in
1981 a) we're using a proxy, and we're past our first retrieval.
1982 Some proxies are notorious for caching incomplete data, so
1983 we require a fresh get.
1984 b) caching is explicitly inhibited. */
1985 if ((proxy && count > 1) /* a */
1986 || !opt.allow_cache /* b */
1988 *dt |= SEND_NOCACHE;
1990 *dt &= ~SEND_NOCACHE;
1992 /* Try fetching the document, or at least its head. */
1993 err = gethttp (u, &hstat, dt, proxy);
1995 /* It's unfortunate that wget determines the local filename before finding
1996 out the Content-Type of the file. Barring a major restructuring of the
1997 code, we need to re-set locf here, since gethttp() may have xrealloc()d
1998 *hstat.local_file to tack on ".html". */
1999 if (!opt.output_document)
2000 locf = *hstat.local_file;
2002 locf = opt.output_document;
2005 tms = time_str (NULL);
2006 /* Get the new location (with or without the redirection). */
2008 *newloc = xstrdup (hstat.newloc);
2011 case HERR: case HEOF: case CONSOCKERR: case CONCLOSED:
2012 case CONERROR: case READERR: case WRITEFAILED:
2014 /* Non-fatal errors continue executing the loop, which will
2015 bring them to "while" statement at the end, to judge
2016 whether the number of tries was exceeded. */
2017 free_hstat (&hstat);
2018 printwhat (count, opt.ntry);
2021 case HOSTERR: case CONIMPOSSIBLE: case PROXERR: case AUTHFAILED:
2022 case SSLERRCTXCREATE: case CONTNOTSUPPORTED:
2023 /* Fatal errors just return from the function. */
2024 free_hstat (&hstat);
2028 case FWRITEERR: case FOPENERR:
2029 /* Another fatal error. */
2030 logputs (LOG_VERBOSE, "\n");
2031 logprintf (LOG_NOTQUIET, _("Cannot write to `%s' (%s).\n"),
2032 *hstat.local_file, strerror (errno));
2033 free_hstat (&hstat);
2038 /* Another fatal error. */
2039 logputs (LOG_VERBOSE, "\n");
2040 logprintf (LOG_NOTQUIET, _("Unable to establish SSL connection.\n"));
2041 free_hstat (&hstat);
2046 /* Return the new location to the caller. */
2049 logprintf (LOG_NOTQUIET,
2050 _("ERROR: Redirection (%d) without location.\n"),
2052 free_hstat (&hstat);
2056 free_hstat (&hstat);
2061 /* The file was already fully retrieved. */
2062 free_hstat (&hstat);
2067 /* Deal with you later. */
2070 /* All possibilities should have been exhausted. */
2073 if (!(*dt & RETROKF))
2077 /* #### Ugly ugly ugly! */
2078 char *hurl = url_string (u, 1);
2079 logprintf (LOG_NONVERBOSE, "%s:\n", hurl);
2082 logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
2083 tms, hstat.statcode, hstat.error);
2084 logputs (LOG_VERBOSE, "\n");
2085 free_hstat (&hstat);
2090 /* Did we get the time-stamp? */
2093 if (opt.timestamping && !hstat.remote_time)
2095 logputs (LOG_NOTQUIET, _("\
2096 Last-modified header missing -- time-stamps turned off.\n"));
2098 else if (hstat.remote_time)
2100 /* Convert the date-string into struct tm. */
2101 tmr = http_atotm (hstat.remote_time);
2102 if (tmr == (time_t) (-1))
2103 logputs (LOG_VERBOSE, _("\
2104 Last-modified header invalid -- time-stamp ignored.\n"));
2108 /* The time-stamping section. */
2113 use_ts = 0; /* no more time-stamping */
2114 count = 0; /* the retrieve count for HEAD is
2116 if (hstat.remote_time && tmr != (time_t) (-1))
2118 /* Now time-stamping can be used validly. Time-stamping
2119 means that if the sizes of the local and remote file
2120 match, and local file is newer than the remote file,
2121 it will not be retrieved. Otherwise, the normal
2122 download procedure is resumed. */
2124 (hstat.contlen == -1 || local_size == hstat.contlen))
2126 logprintf (LOG_VERBOSE, _("\
2127 Server file no newer than local file `%s' -- not retrieving.\n\n"),
2129 free_hstat (&hstat);
2133 else if (tml >= tmr)
2134 logprintf (LOG_VERBOSE, _("\
2135 The sizes do not match (local %ld) -- retrieving.\n"), local_size);
2137 logputs (LOG_VERBOSE,
2138 _("Remote file is newer, retrieving.\n"));
2140 free_hstat (&hstat);
2143 if ((tmr != (time_t) (-1))
2145 && ((hstat.len == hstat.contlen) ||
2146 ((hstat.res == 0) &&
2147 ((hstat.contlen == -1) ||
2148 (hstat.len >= hstat.contlen && !opt.kill_longer)))))
2150 /* #### This code repeats in http.c and ftp.c. Move it to a
2152 const char *fl = NULL;
2153 if (opt.output_document)
2155 if (opt.od_known_regular)
2156 fl = opt.output_document;
2159 fl = *hstat.local_file;
2163 /* End of time-stamping section. */
2167 logprintf (LOG_NOTQUIET, "%d %s\n\n", hstat.statcode, hstat.error);
2172 tmrate = retr_rate (hstat.len - hstat.restval, hstat.dltime, 0);
2174 if (hstat.len == hstat.contlen)
2178 logprintf (LOG_VERBOSE,
2179 _("%s (%s) - `%s' saved [%ld/%ld]\n\n"),
2180 tms, tmrate, locf, hstat.len, hstat.contlen);
2181 logprintf (LOG_NONVERBOSE,
2182 "%s URL:%s [%ld/%ld] -> \"%s\" [%d]\n",
2183 tms, u->url, hstat.len, hstat.contlen, locf, count);
2186 total_downloaded_bytes += hstat.len;
2188 /* Remember that we downloaded the file for later ".orig" code. */
2189 if (*dt & ADDED_HTML_EXTENSION)
2190 downloaded_file(FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED, locf);
2192 downloaded_file(FILE_DOWNLOADED_NORMALLY, locf);
2194 free_hstat (&hstat);
2198 else if (hstat.res == 0) /* No read error */
2200 if (hstat.contlen == -1) /* We don't know how much we were supposed
2201 to get, so assume we succeeded. */
2205 logprintf (LOG_VERBOSE,
2206 _("%s (%s) - `%s' saved [%ld]\n\n"),
2207 tms, tmrate, locf, hstat.len);
2208 logprintf (LOG_NONVERBOSE,
2209 "%s URL:%s [%ld] -> \"%s\" [%d]\n",
2210 tms, u->url, hstat.len, locf, count);
2213 total_downloaded_bytes += hstat.len;
2215 /* Remember that we downloaded the file for later ".orig" code. */
2216 if (*dt & ADDED_HTML_EXTENSION)
2217 downloaded_file(FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED, locf);
2219 downloaded_file(FILE_DOWNLOADED_NORMALLY, locf);
2221 free_hstat (&hstat);
2225 else if (hstat.len < hstat.contlen) /* meaning we lost the
2226 connection too soon */
2228 logprintf (LOG_VERBOSE,
2229 _("%s (%s) - Connection closed at byte %ld. "),
2230 tms, tmrate, hstat.len);
2231 printwhat (count, opt.ntry);
2232 free_hstat (&hstat);
2235 else if (!opt.kill_longer) /* meaning we got more than expected */
2237 logprintf (LOG_VERBOSE,
2238 _("%s (%s) - `%s' saved [%ld/%ld])\n\n"),
2239 tms, tmrate, locf, hstat.len, hstat.contlen);
2240 logprintf (LOG_NONVERBOSE,
2241 "%s URL:%s [%ld/%ld] -> \"%s\" [%d]\n",
2242 tms, u->url, hstat.len, hstat.contlen, locf, count);
2244 total_downloaded_bytes += hstat.len;
2246 /* Remember that we downloaded the file for later ".orig" code. */
2247 if (*dt & ADDED_HTML_EXTENSION)
2248 downloaded_file(FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED, locf);
2250 downloaded_file(FILE_DOWNLOADED_NORMALLY, locf);
2252 free_hstat (&hstat);
2256 else /* the same, but not accepted */
2258 logprintf (LOG_VERBOSE,
2259 _("%s (%s) - Connection closed at byte %ld/%ld. "),
2260 tms, tmrate, hstat.len, hstat.contlen);
2261 printwhat (count, opt.ntry);
2262 free_hstat (&hstat);
2266 else /* now hstat.res can only be -1 */
2268 if (hstat.contlen == -1)
2270 logprintf (LOG_VERBOSE,
2271 _("%s (%s) - Read error at byte %ld (%s)."),
2272 tms, tmrate, hstat.len, strerror (errno));
2273 printwhat (count, opt.ntry);
2274 free_hstat (&hstat);
2277 else /* hstat.res == -1 and contlen is given */
2279 logprintf (LOG_VERBOSE,
2280 _("%s (%s) - Read error at byte %ld/%ld (%s). "),
2281 tms, tmrate, hstat.len, hstat.contlen,
2283 printwhat (count, opt.ntry);
2284 free_hstat (&hstat);
2291 while (!opt.ntry || (count < opt.ntry));
/* Converts struct tm to time_t, assuming the data in tm is UTC rather
   than local timezone.

   mktime is similar but assumes struct tm, also known as the
   "broken-down" form of time, is in local time zone.  mktime_from_utc
   uses mktime to make the conversion understanding that an offset
   will be introduced by the local time assumption.

   mktime_from_utc then measures the introduced offset by applying
   gmtime to the initial result and applying mktime to the resulting
   "broken-down" form.  The difference between the two mktime results
   is the measured offset which is then subtracted from the initial
   mktime result to yield a calendar time which is the value returned.

   tm_isdst in struct tm is set to 0 to force mktime to introduce a
   consistent offset (the non DST offset) since tm and tm+o might be
   on opposite sides of a DST change.

   Some implementations of mktime return -1 for the nonexistent
   localtime hour at the beginning of DST.  In this event, use
   mktime(tm - 1hr) + 3600.

   Schematically, with o being the offset introduced by mktime:
     mktime(tm)   --> t+o
     gmtime(t+o)  --> tm+o
     mktime(tm+o) --> t+2o
     t+o - (t+2o - t+o) = t

   T is passed to mktime and may be modified by it (and by the
   one-hour fallback above).  Returns the calendar time, or -1 if
   either conversion step fails.

   Note that glibc contains a function of the same purpose named
   `timegm' (reverse of gmtime).  But obviously, it is not universally
   available, and unfortunately it is not straightforwardly
   extractable for use here.  Perhaps configure should detect timegm
   and use it where available.

   Contributed by Roger Beeman <beeman@cisco.com>, with the help of
   Mark Baushke <mdb@cisco.com> and the rest of the Gurus at CISCO.
   Further improved by Roger with assistance from Edward J. Sabol
   based on input by Jamie Zawinski.  */
static time_t
mktime_from_utc (struct tm *t)
{
  time_t tl, tb;
  struct tm *tg;

  /* First pass: interpret T as local time, yielding t+o.  */
  tl = mktime (t);
  if (tl == (time_t) -1)
    {
      /* Possibly the nonexistent hour at the start of DST; retry one
         hour earlier and add the hour back.  */
      t->tm_hour--;
      tl = mktime (t);
      if (tl == (time_t) -1)
        return -1; /* can't deal with output from strptime */
      tl += 3600;
    }

  /* Second pass: break t+o down as UTC and run it through mktime
     again, yielding t+2o.  gmtime may return NULL when the value
     cannot be represented, so check before touching the result.  */
  tg = gmtime (&tl);
  if (tg == NULL)
    return -1;
  tg->tm_isdst = 0;
  tb = mktime (tg);
  if (tb == (time_t) -1)
    {
      tg->tm_hour--;
      tb = mktime (tg);
      if (tb == (time_t) -1)
        return -1; /* can't deal with output from gmtime */
      tb += 3600;
    }

  /* (t+o) - ((t+2o) - (t+o)) == t  */
  return (tl - (tb - tl));
}
2363 /* Check whether the result of strptime() indicates success.
2364 strptime() returns the pointer to how far it got to in the string.
2365 The processing has been successful if the string is at `GMT' or
2366 `+X', or at the end of the string.
2368 In extended regexp parlance, the function returns 1 if P matches
2369 "^ *(GMT|[+-][0-9]|$)", 0 otherwise. P being NULL (which strptime
2370 can return) is considered a failure and 0 is returned. */
2372 check_end (const char *p)
2376 while (ISSPACE (*p))
2379 || (p[0] == 'G' && p[1] == 'M' && p[2] == 'T')
2380 || ((p[0] == '+' || p[0] == '-') && ISDIGIT (p[1])))
/* Convert the textual specification of time in TIME_STRING to the
   number of seconds since the Epoch.

   TIME_STRING can be in any of the three formats RFC2068 allows the
   HTTP servers to emit -- RFC1123-date, RFC850-date or asctime-date.
   Timezones are ignored, and should be GMT.

   Return the computed time_t representation, or -1 if the conversion
   fails.

   This function uses strptime with various string formats for parsing
   TIME_STRING.  This results in a parser that is not as lenient in
   interpreting TIME_STRING as I would like it to be.  Being based on
   strptime, it always allows shortened months, one-digit days, etc.,
   but due to the multitude of formats in which time can be
   represented, an ideal HTTP time parser would be even more
   forgiving.  It should completely ignore things like week days and
   concentrate only on the various forms of representing years,
   months, days, hours, minutes, and seconds.  For example, it would
   be nice if it accepted ISO 8601 out of the box.

   I've investigated free and PD code for this purpose, but none was
   usable.  getdate was big and unwieldy, and had potential copyright
   issues, or so I was informed.  Dr. Marcus Hennecke's atotm(),
   distributed with phttpd, is excellent, but we cannot use it because
   it is not assigned to the FSF.  So I stuck with strptime.  */
time_t
http_atotm (const char *time_string)
{
  /* NOTE: Solaris strptime man page claims that %n and %t match white
     space, but that's not universally available.  Instead, we simply
     use ` ' to mean "skip all WS", which works under all strptime
     implementations I've tested.  */

  static const char *time_formats[] = {
    "%a, %d %b %Y %T",          /* RFC1123: Thu, 29 Jan 1998 22:12:57 */
    "%A, %d-%b-%y %T",          /* RFC850:  Thursday, 29-Jan-98 22:12:57 */
    "%a, %d-%b-%Y %T",          /* pseudo-RFC850:  Thu, 29-Jan-1998 22:12:57
                                   (google.com uses this for their cookies.) */
    "%a %b %d %T %Y"            /* asctime: Thu Jan 29 22:12:57 1998 */
  };
  int i;

  /* Note that under foreign locales Solaris strptime() fails to
     recognize English dates, which renders this function useless.  We
     solve this by being careful not to affect LC_TIME when
     initializing locale.

     Another solution would be to temporarily set locale to C, invoke
     strptime(), and restore it back.  This is slow and dirty,
     however, and locale support other than LC_MESSAGES can mess other
     things, so I rather chose to stick with just setting LC_MESSAGES.

     GNU strptime does not have this problem because it recognizes
     both international and local dates.  */

  for (i = 0; i < countof (time_formats); i++)
    {
      struct tm t;

      /* Start every attempt from a zeroed structure.  strptime only
         stores the fields named by the format, and a failed attempt
         may already have stored some, so reusing one structure lets
         stale values leak into the next format's result.  Zeroing
         also initializes tm_isdst, which, according to Roger Beeman,
         strptime won't do.  */
      memset (&t, 0, sizeof (t));

      if (check_end (strptime (time_string, time_formats[i], &t)))
        return mktime_from_utc (&t);
    }

  /* All formats have failed.  */
  return -1;
}
2457 /* Authorization support: We support two authorization schemes:
2459 * `Basic' scheme, consisting of base64-ing USER:PASSWORD string;
2461 * `Digest' scheme, added by Junio Hamano <junio@twinsun.com>,
2462 consisting of answering the server's challenge with the proper MD5 digests. */
/* How many bytes it will take to store LEN bytes in base64.  */
#define BASE64_LENGTH(len) (4 * (((len) + 2) / 3))

/* Encode the LENGTH bytes starting at S to base64 format and place
   the result in STORE.  STORE will be 0-terminated, and must point to
   a writable buffer of at least 1+BASE64_LENGTH(length) bytes.

   Exactly LENGTH bytes of S are read; S need not be 0-terminated.  A
   LENGTH of zero (or less) produces the empty string.  */
static void
base64_encode (const char *s, char *store, int length)
{
  /* Conversion table.  */
  static const char tbl[64] = {
    'A','B','C','D','E','F','G','H',
    'I','J','K','L','M','N','O','P',
    'Q','R','S','T','U','V','W','X',
    'Y','Z','a','b','c','d','e','f',
    'g','h','i','j','k','l','m','n',
    'o','p','q','r','s','t','u','v',
    'w','x','y','z','0','1','2','3',
    '4','5','6','7','8','9','+','/'
  };
  /* Work on unsigned bytes: with plain (possibly signed) char, any
     byte >= 0x80 would shift to a negative table index.  */
  const unsigned char *in = (const unsigned char *) s;
  char *out = store;
  int left = length;

  /* Transform each complete 3x8-bit group to 4x6 bits, as required by
     base64.  */
  for (; left >= 3; left -= 3, in += 3)
    {
      *out++ = tbl[in[0] >> 2];
      *out++ = tbl[((in[0] & 3) << 4) | (in[1] >> 4)];
      *out++ = tbl[((in[1] & 0xf) << 2) | (in[2] >> 6)];
      *out++ = tbl[in[2] & 0x3f];
    }

  /* Encode the trailing one or two bytes, padding with `='.  The
     missing input bytes are treated as zero bits instead of being
     read from beyond the end of the input.  */
  if (left == 2)
    {
      *out++ = tbl[in[0] >> 2];
      *out++ = tbl[((in[0] & 3) << 4) | (in[1] >> 4)];
      *out++ = tbl[(in[1] & 0xf) << 2];
      *out++ = '=';
    }
  else if (left == 1)
    {
      *out++ = tbl[in[0] >> 2];
      *out++ = tbl[(in[0] & 3) << 4];
      *out++ = '=';
      *out++ = '=';
    }

  /* ...and zero-terminate it.  */
  *out = '\0';
}
/* Create the authentication header contents for the `Basic' scheme.
   This is done by encoding the string `USER:PASSWD' in base64 and
   prepending `Basic ' to it.

   The result is allocated with xmalloc.  Only the value is produced;
   no header name is included.  */
static char *
basic_authentication_encode (const char *user, const char *passwd)
{
  char *plain, *res;
  int len1 = strlen (user) + 1 + strlen (passwd);
  int len2 = BASE64_LENGTH (len1);

  /* The credentials can be arbitrarily long, so the scratch copy goes
     on the heap rather than on the stack via alloca.  */
  plain = (char *)xmalloc (len1 + 1);
  sprintf (plain, "%s:%s", user, passwd);

  /* 6 is strlen ("Basic ").  base64_encode writes the encoded text
     and its terminating zero directly after the prefix.  */
  res = (char *)xmalloc (6 + len2 + 1);
  memcpy (res, "Basic ", 6);
  base64_encode (plain, res + 6, len1);

  xfree (plain);
  return res;
}
/* Advance the pointer X past any whitespace it currently points at.
   X is modified in place and evaluated more than once, so it must be
   a plain pointer variable without side effects.  */
#define SKIP_WS(x) do {                         \
  while (ISSPACE (*(x)))                        \
    ++(x);                                      \
} while (0)
2534 /* Parse HTTP `WWW-Authenticate:' header. AU points to the beginning
2535 of a field in such a header. If the field is the one specified by
2536 ATTR_NAME ("realm", "opaque", and "nonce" are used by the current
2537 digest authorization code), extract its value in the (char*)
2538 variable pointed by RET. Returns negative on a malformed header,
2539 or number of bytes that have been parsed by this call. */
2541 extract_header_attr (const char *au, const char *attr_name, char **ret)
2543 const char *cp, *ep;
2547 if (strncmp (cp, attr_name, strlen (attr_name)) == 0)
2549 cp += strlen (attr_name);
2562 for (ep = cp; *ep && *ep != '\"'; ep++)
2567 *ret = strdupdelim (cp, ep);
2574 /* Dump the hexadecimal representation of HASH to BUF. HASH should be
2575 an array of 16 bytes containing the hash keys, and BUF should be a
2576 buffer of 33 writable characters (32 for hex digits plus one for
2577 zero termination). */
2579 dump_hash (unsigned char *buf, const unsigned char *hash)
2583 for (i = 0; i < MD5_HASHLEN; i++, hash++)
2585 *buf++ = XNUM_TO_digit (*hash >> 4);
2586 *buf++ = XNUM_TO_digit (*hash & 0xf);
2591 /* Take the line apart to find the challenge, and compose a digest
2592 authorization header. See RFC2069 section 2.1.2. */
2594 digest_authentication_encode (const char *au, const char *user,
2595 const char *passwd, const char *method,
/* Challenge attributes extracted from AU.  They are declared static,
   so they are shared by all calls; the table entries below take their
   addresses, which is presumably the reason -- confirm.  All three are
   reset to NULL further down, before parsing starts.  */
2598 static char *realm, *opaque, *nonce;
/* Table pairing each recognized attribute name with the variable that
   receives its value.
   NOTE(review): only two entries are visible in this excerpt; nonce is
   presumably listed as well -- confirm.  */
2603 { "realm", &realm },
2604 { "opaque", &opaque },
2609 realm = opaque = nonce = NULL;
2611 au += 6; /* skip over `Digest' */
/* Try every known attribute against the field AU currently points at.  */
2617 for (i = 0; i < countof (options); i++)
2619 int skip = extract_header_attr (au, options[i].name,
2620 options[i].variable);
/* NOTE(review): this release appears to belong to an error path taken
   when extract_header_attr reports a malformed header (its header
   comment says it returns a negative value in that case); the
   surrounding lines are not visible here -- confirm.  */
2624 xfree_null (opaque);
/* I equals countof (options) only when the loop above ran to
   completion, i.e. presumably no known attribute matched.  The loops
   below then step over the unrecognized field: up to its equals sign,
   then up to a double quote, then up to the next comma.  Each loop also
   stops at the end of the string.  */
2634 if (i == countof (options))
2636 while (*au && *au != '=')
2644 while (*au && *au != '\"')
2651 while (*au && *au != ',')
/* Every input except OPAQUE is required to compute the response;
   OPAQUE is optional and is tested separately further down.  */
2656 if (!realm || !nonce || !user || !passwd || !path || !method)
2659 xfree_null (opaque);
2664 /* Calculate the digest value. */
2666 ALLOCA_MD5_CONTEXT (ctx);
2667 unsigned char hash[MD5_HASHLEN];
/* Each hex buffer holds two characters per hash byte plus one extra
   byte, matching the buffer size dump_hash is documented to need.  */
2668 unsigned char a1buf[MD5_HASHLEN * 2 + 1], a2buf[MD5_HASHLEN * 2 + 1];
2669 unsigned char response_digest[MD5_HASHLEN * 2 + 1];
/* NOTE(review): each of the three hashing stages below presumably
   starts with a context initialization call on a line that is not
   visible in this excerpt -- confirm.  */
2671 /* A1BUF = H(user ":" realm ":" password) */
2673 gen_md5_update ((unsigned char *)user, strlen (user), ctx);
2674 gen_md5_update ((unsigned char *)":", 1, ctx);
2675 gen_md5_update ((unsigned char *)realm, strlen (realm), ctx);
2676 gen_md5_update ((unsigned char *)":", 1, ctx);
2677 gen_md5_update ((unsigned char *)passwd, strlen (passwd), ctx);
2678 gen_md5_finish (ctx, hash);
2679 dump_hash (a1buf, hash);
2681 /* A2BUF = H(method ":" path) */
2683 gen_md5_update ((unsigned char *)method, strlen (method), ctx);
2684 gen_md5_update ((unsigned char *)":", 1, ctx);
2685 gen_md5_update ((unsigned char *)path, strlen (path), ctx);
2686 gen_md5_finish (ctx, hash);
2687 dump_hash (a2buf, hash);
2689 /* RESPONSE_DIGEST = H(A1BUF ":" nonce ":" A2BUF) */
/* Only the MD5_HASHLEN * 2 hex characters of A1BUF and A2BUF are fed
   to the hash, not the byte that follows them.  */
2691 gen_md5_update (a1buf, MD5_HASHLEN * 2, ctx);
2692 gen_md5_update ((unsigned char *)":", 1, ctx);
2693 gen_md5_update ((unsigned char *)nonce, strlen (nonce), ctx);
2694 gen_md5_update ((unsigned char *)":", 1, ctx);
2695 gen_md5_update (a2buf, MD5_HASHLEN * 2, ctx);
2696 gen_md5_finish (ctx, hash);
2697 dump_hash (response_digest, hash);
/* Size the result buffer from the lengths of the substituted values.
   NOTE(review): several terms of this sum are not visible here; verify
   that the total also covers the fixed text of the format string, the
   optional opaque suffix and the terminating zero, because the buffer
   is filled with unbounded sprintf and strcat calls below.  */
2699 res = (char*) xmalloc (strlen (user)
2704 + 2 * MD5_HASHLEN /*strlen (response_digest)*/
2705 + (opaque ? strlen (opaque) : 0)
/* The values are substituted verbatim with %s; the visible code does
   no escaping of double quotes embedded in USER, REALM, NONCE or PATH.  */
2707 sprintf (res, "Digest \
2708 username=\"%s\", realm=\"%s\", nonce=\"%s\", uri=\"%s\", response=\"%s\"",
2709 user, realm, nonce, path, response_digest);
/* Append the opaque attribute after the text already stored in RES; P
   points at the current end of RES so strcat need not rescan it.
   NOTE(review): presumably guarded by a check that OPAQUE is non-NULL
   on a line that is not visible here -- confirm.  */
2712 char *p = res + strlen (res);
2713 strcat (p, ", opaque=\"");
2720 #endif /* USE_DIGEST */
/* BEGINS_WITH(line, string_constant): true when LINE starts with
   STRING_CONSTANT, compared case-insensitively with strncasecmp, and
   the character right after the matched prefix is either whitespace
   (per ISSPACE) or the terminating zero.

   STRING_CONSTANT has to be a string literal or a char array, because
   its length is taken with sizeof.  LINE is expanded several times and
   without surrounding parentheses, so pass a simple pointer expression
   that has no side effects.

   The character after the prefix is inspected only when the prefix
   comparison succeeded, thanks to the short-circuit of the && operator.  */
2723 #define BEGINS_WITH(line, string_constant) \
2724 (!strncasecmp (line, string_constant, sizeof (string_constant) - 1) \
2725 && (ISSPACE (line[sizeof (string_constant) - 1]) \
2726 || !line[sizeof (string_constant) - 1]))
/* Evaluate to true when AU begins with one of the scheme names Basic,
   Digest or NTLM.  Each test goes through BEGINS_WITH, so the match is
   case-insensitive and the name must be followed by whitespace or by
   the end of the string.
   NOTE(review): the return type is declared on a line that is not
   visible in this excerpt.
   NOTE(review): NTLM is accepted here, while the comment on
   create_authorization_line mentions only Basic and Digest; confirm
   that NTLM challenges are actually handled by the caller.  */
2729 known_authentication_scheme_p (const char *au)
2731 return BEGINS_WITH (au, "Basic")
2732 || BEGINS_WITH (au, "Digest")
2733 || BEGINS_WITH (au, "NTLM");
2738 /* Create the HTTP authorization request header. When the
2739 `WWW-Authenticate' response header is seen, according to the
2740 authorization scheme specified in that header (`Basic' and `Digest'
2741 are supported by the current implementation), produce an
2742 appropriate HTTP authorization request header. */
2744 create_authorization_line (const char *au, const char *user,
2745 const char *passwd, const char *method,
2748 if (0 == strncasecmp (au, "Basic", 5))
2749 return basic_authentication_encode (user, passwd);
2751 if (0 == strncasecmp (au, "Digest", 6))
2752 return digest_authentication_encode (au, user, passwd, method, path);
2753 #endif /* USE_DIGEST */