2 Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
3 2005, 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
5 This file is part of GNU Wget.
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Wget. If not, see <http://www.gnu.org/licenses/>.
20 Additional permission under GNU GPL version 3 section 7
22 If you modify this program, or any covered work, by linking or
23 combining it with the OpenSSL project's OpenSSL library (or a
24 modified version of that library), containing parts covered by the
25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26 grants you additional permission to convey the resulting work.
27 Corresponding Source for a non-source form of such a combination
28 shall include the source code for the parts of OpenSSL used as well
29 as that of the covered work. */
54 # include "http-ntlm.h"
67 #endif /* def __VMS */
69 extern char *version_string;
73 static char *create_authorization_line (const char *, const char *,
74 const char *, const char *,
75 const char *, bool *);
76 static char *basic_authentication_encode (const char *, const char *);
77 static bool known_authentication_scheme_p (const char *, const char *);
78 static void ensure_extension (struct http_stat *, const char *, int *);
79 static void load_cookies (void);
82 # define MIN(x, y) ((x) > (y) ? (y) : (x))
86 static bool cookies_loaded_p;
87 static struct cookie_jar *wget_cookie_jar;
89 #define TEXTHTML_S "text/html"
90 #define TEXTXHTML_S "application/xhtml+xml"
91 #define TEXTCSS_S "text/css"
93 /* Some status code validation macros: */
94 #define H_10X(x) (((x) >= 100) && ((x) < 200))
95 #define H_20X(x) (((x) >= 200) && ((x) < 300))
96 #define H_PARTIAL(x) ((x) == HTTP_STATUS_PARTIAL_CONTENTS)
97 #define H_REDIRECTED(x) ((x) == HTTP_STATUS_MOVED_PERMANENTLY \
98 || (x) == HTTP_STATUS_MOVED_TEMPORARILY \
99 || (x) == HTTP_STATUS_SEE_OTHER \
100 || (x) == HTTP_STATUS_TEMPORARY_REDIRECT)
102 /* HTTP/1.0 status codes from RFC1945, provided for reference. */
103 /* Successful 2xx. */
104 #define HTTP_STATUS_OK 200
105 #define HTTP_STATUS_CREATED 201
106 #define HTTP_STATUS_ACCEPTED 202
107 #define HTTP_STATUS_NO_CONTENT 204
108 #define HTTP_STATUS_PARTIAL_CONTENTS 206
110 /* Redirection 3xx. */
111 #define HTTP_STATUS_MULTIPLE_CHOICES 300
112 #define HTTP_STATUS_MOVED_PERMANENTLY 301
113 #define HTTP_STATUS_MOVED_TEMPORARILY 302
114 #define HTTP_STATUS_SEE_OTHER 303 /* from HTTP/1.1 */
115 #define HTTP_STATUS_NOT_MODIFIED 304
116 #define HTTP_STATUS_TEMPORARY_REDIRECT 307 /* from HTTP/1.1 */
118 /* Client error 4xx. */
119 #define HTTP_STATUS_BAD_REQUEST 400
120 #define HTTP_STATUS_UNAUTHORIZED 401
121 #define HTTP_STATUS_FORBIDDEN 403
122 #define HTTP_STATUS_NOT_FOUND 404
123 #define HTTP_STATUS_RANGE_NOT_SATISFIABLE 416
125 /* Server errors 5xx. */
126 #define HTTP_STATUS_INTERNAL 500
127 #define HTTP_STATUS_NOT_IMPLEMENTED 501
128 #define HTTP_STATUS_BAD_GATEWAY 502
129 #define HTTP_STATUS_UNAVAILABLE 503
132 rel_none, rel_name, rel_value, rel_both
139 struct request_header {
141 enum rp release_policy;
143 int hcount, hcapacity;
148 /* Create a new, empty request. At least request_set_method must be
149 called before the request can be used. */
151 static struct request *
154 struct request *req = xnew0 (struct request);
156 req->headers = xnew_array (struct request_header, req->hcapacity);
160 /* Set the request's method and its arguments. METH should be a
161 literal string (or it should outlive the request) because it will
162 not be freed. ARG will be freed by request_free. */
165 request_set_method (struct request *req, const char *meth, char *arg)
171 /* Return the method string passed with the last call to
172 request_set_method. */
175 request_method (const struct request *req)
180 /* Free one header according to the release policy specified with
181 request_set_header. */
184 release_header (struct request_header *hdr)
186 switch (hdr->release_policy)
203 /* Set the request header named NAME to VALUE. Specifically, this means that
204 a "NAME: VALUE\r\n" header line will be used in the request. If a
205 header with the same name previously existed in the request, its
206 value will be replaced by this one. A NULL value means do nothing.
208 RELEASE_POLICY determines whether NAME and VALUE should be released
209 (freed) with request_free. Allowed values are:
211 - rel_none - don't free NAME or VALUE
212 - rel_name - free NAME when done
213 - rel_value - free VALUE when done
214 - rel_both - free both NAME and VALUE when done
216 Setting release policy is useful when arguments come from different
217 sources. For example:
219 // Don't free literal strings!
220 request_set_header (req, "Pragma", "no-cache", rel_none);
222 // Don't free a global variable, we'll need it later.
223 request_set_header (req, "Referer", opt.referer, rel_none);
225 // Value freshly allocated, free it when done.
226 request_set_header (req, "Range",
227 aprintf ("bytes=%s-", number_to_static_string (hs->restval)),
232 request_set_header (struct request *req, char *name, char *value,
233 enum rp release_policy)
235 struct request_header *hdr;
240 /* A NULL value is a no-op; if freeing the name is requested,
241 free it now to avoid leaks. */
242 if (release_policy == rel_name || release_policy == rel_both)
247 for (i = 0; i < req->hcount; i++)
249 hdr = &req->headers[i];
250 if (0 == strcasecmp (name, hdr->name))
252 /* Replace existing header. */
253 release_header (hdr);
256 hdr->release_policy = release_policy;
261 /* Install new header. */
263 if (req->hcount >= req->hcapacity)
265 req->hcapacity <<= 1;
266 req->headers = xrealloc (req->headers, req->hcapacity * sizeof (*hdr));
268 hdr = &req->headers[req->hcount++];
271 hdr->release_policy = release_policy;
274 /* Like request_set_header, but sets the whole header line, as
275 provided by the user using the `--header' option. For example,
276 request_set_user_header (req, "Foo: bar") works just like
277 request_set_header (req, "Foo", "bar"). */
280 request_set_user_header (struct request *req, const char *header)
283 const char *p = strchr (header, ':');
286 BOUNDED_TO_ALLOCA (header, p, name);
288 while (c_isspace (*p))
290 request_set_header (req, xstrdup (name), (char *) p, rel_name);
293 /* Remove the header with specified name from REQ. Returns true if
294 the header was actually removed, false otherwise. */
297 request_remove_header (struct request *req, char *name)
300 for (i = 0; i < req->hcount; i++)
302 struct request_header *hdr = &req->headers[i];
303 if (0 == strcasecmp (name, hdr->name))
305 release_header (hdr);
306 /* Move the remaining headers by one. */
307 if (i < req->hcount - 1)
308 memmove (hdr, hdr + 1, (req->hcount - i - 1) * sizeof (*hdr));
316 #define APPEND(p, str) do { \
317 int A_len = strlen (str); \
318 memcpy (p, str, A_len); \
322 /* Construct the request and write it to FD using fd_write. */
325 request_send (const struct request *req, int fd)
327 char *request_string, *p;
328 int i, size, write_error;
330 /* Count the request size. */
333 /* METHOD " " ARG " " "HTTP/1.1" "\r\n" */
334 size += strlen (req->method) + 1 + strlen (req->arg) + 1 + 8 + 2;
336 for (i = 0; i < req->hcount; i++)
338 struct request_header *hdr = &req->headers[i];
339 /* NAME ": " VALUE "\r\n" */
340 size += strlen (hdr->name) + 2 + strlen (hdr->value) + 2;
346 p = request_string = alloca_array (char, size);
348 /* Generate the request. */
350 APPEND (p, req->method); *p++ = ' ';
351 APPEND (p, req->arg); *p++ = ' ';
352 memcpy (p, "HTTP/1.1\r\n", 10); p += 10;
354 for (i = 0; i < req->hcount; i++)
356 struct request_header *hdr = &req->headers[i];
357 APPEND (p, hdr->name);
358 *p++ = ':', *p++ = ' ';
359 APPEND (p, hdr->value);
360 *p++ = '\r', *p++ = '\n';
363 *p++ = '\r', *p++ = '\n', *p++ = '\0';
364 assert (p - request_string == size);
368 DEBUGP (("\n---request begin---\n%s---request end---\n", request_string));
370 /* Send the request to the server. */
372 write_error = fd_write (fd, request_string, size - 1, -1);
374 logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"),
379 /* Release the resources used by REQ. */
382 request_free (struct request *req)
385 xfree_null (req->arg);
386 for (i = 0; i < req->hcount; i++)
387 release_header (&req->headers[i]);
388 xfree_null (req->headers);
392 static struct hash_table *basic_authed_hosts;
394 /* Find out if this host has issued a Basic challenge yet; if so, give
395 * it the username, password. A temporary measure until we can get
396 * proper authentication in place. */
399 maybe_send_basic_creds (const char *hostname, const char *user,
400 const char *passwd, struct request *req)
402 bool do_challenge = false;
404 if (opt.auth_without_challenge)
406 DEBUGP (("Auth-without-challenge set, sending Basic credentials.\n"));
409 else if (basic_authed_hosts
410 && hash_table_contains(basic_authed_hosts, hostname))
412 DEBUGP (("Found %s in basic_authed_hosts.\n", quote (hostname)));
417 DEBUGP (("Host %s has not issued a general basic challenge.\n",
422 request_set_header (req, "Authorization",
423 basic_authentication_encode (user, passwd),
430 register_basic_auth_host (const char *hostname)
432 if (!basic_authed_hosts)
434 basic_authed_hosts = make_nocase_string_hash_table (1);
436 if (!hash_table_contains(basic_authed_hosts, hostname))
438 hash_table_put (basic_authed_hosts, xstrdup(hostname), NULL);
439 DEBUGP (("Inserted %s into basic_authed_hosts\n", quote (hostname)));
444 /* Send the contents of FILE_NAME to SOCK. Make sure that exactly
445 PROMISED_SIZE bytes are sent over the wire -- if the file is
446 longer, read only that much; if the file is shorter, report an error. */
449 post_file (int sock, const char *file_name, wgint promised_size)
451 static char chunk[8192];
456 DEBUGP (("[writing POST file %s ... ", file_name));
458 fp = fopen (file_name, "rb");
461 while (!feof (fp) && written < promised_size)
464 int length = fread (chunk, 1, sizeof (chunk), fp);
467 towrite = MIN (promised_size - written, length);
468 write_error = fd_write (sock, chunk, towrite, -1);
478 /* If we've written less than was promised, report a (probably
479 nonsensical) error rather than break the promise. */
480 if (written < promised_size)
486 assert (written == promised_size);
487 DEBUGP (("done]\n"));
491 /* Determine whether [START, PEEKED + PEEKLEN) contains an empty line.
492 If so, return the pointer to the position after the line, otherwise
493 return NULL. This is used as callback to fd_read_hunk. The data
494 between START and PEEKED has been read and cannot be "unread"; the
495 data after PEEKED has only been peeked. */
498 response_head_terminator (const char *start, const char *peeked, int peeklen)
502 /* If at first peek, verify whether HUNK starts with "HTTP". If
503 not, this is an HTTP/0.9 response and we must bail out without
505 if (start == peeked && 0 != memcmp (start, "HTTP", MIN (peeklen, 4)))
508 /* Look for "\n[\r]\n", and return the following position if found.
509 Start two chars before the current to cover the possibility that
510 part of the terminator (e.g. "\n\r") arrived in the previous
512 p = peeked - start < 2 ? start : peeked - 2;
513 end = peeked + peeklen;
515 /* Check for \n\r\n or \n\n anywhere in [p, end-2). */
516 for (; p < end - 2; p++)
519 if (p[1] == '\r' && p[2] == '\n')
521 else if (p[1] == '\n')
524 /* p==end-2: check for \n\n directly preceding END. */
525 if (p[0] == '\n' && p[1] == '\n')
531 /* The maximum size of a single HTTP response we care to read. Rather
532 than being a limit of the reader implementation, this limit
533 prevents Wget from slurping all available memory upon encountering
534 malicious or buggy server output, thus protecting the user. Define
535 it to 0 to remove the limit. */
537 #define HTTP_RESPONSE_MAX_SIZE 65536
539 /* Read the HTTP response head from FD and return it. The error
540 conditions are the same as with fd_read_hunk.
542 To support HTTP/0.9 responses, this function tries to make sure
543 that the data begins with "HTTP". If this is not the case, no data
544 is read and an empty response head is returned, so that the remaining
545 data can be treated as body. */
548 read_http_response_head (int fd)
550 return fd_read_hunk (fd, response_head_terminator, 512,
551 HTTP_RESPONSE_MAX_SIZE);
555 /* The response data. */
558 /* The array of pointers that indicate where each header starts.
559 For example, given this HTTP response:
566 The headers are located like this:
568 "HTTP/1.0 200 Ok\r\nDescription: some\r\n text\r\nEtag: x\r\n\r\n"
570 headers[0] headers[1] headers[2] headers[3]
572 I.e. headers[0] points to the beginning of the response,
573 headers[1] points to the end of the first header and the
574 beginning of the second one, etc. */
576 const char **headers;
579 /* Create a new response object from the text of the HTTP response,
580 available in HEAD. That text is automatically split into
581 constituent header lines for fast retrieval using
584 static struct response *
585 resp_new (const char *head)
590 struct response *resp = xnew0 (struct response);
595 /* Empty head means that we're dealing with a headerless
596 (HTTP/0.9) response. In that case, don't set HEADERS at
601 /* Split HEAD into header lines, so that resp_header_* functions
602 don't need to do this over and over again. */
608 DO_REALLOC (resp->headers, size, count + 1, const char *);
609 resp->headers[count++] = hdr;
611 /* Break upon encountering an empty line. */
612 if (!hdr[0] || (hdr[0] == '\r' && hdr[1] == '\n') || hdr[0] == '\n')
615 /* Find the end of HDR, including continuations. */
618 const char *end = strchr (hdr, '\n');
624 while (*hdr == ' ' || *hdr == '\t');
626 DO_REALLOC (resp->headers, size, count + 1, const char *);
627 resp->headers[count] = NULL;
632 /* Locate the header named NAME in the response data, starting with
633 position START. This allows the code to loop through the response
634 data, filtering for all headers of a given name. Returns the
635 found position, or -1 for failure. The code that uses this
636 function typically looks like this:
638 for (pos = 0; (pos = resp_header_locate (...)) != -1; pos++)
639 ... do something with header ...
641 If you only care about one header, use resp_header_get instead of
645 resp_header_locate (const struct response *resp, const char *name, int start,
646 const char **begptr, const char **endptr)
649 const char **headers = resp->headers;
652 if (!headers || !headers[1])
655 name_len = strlen (name);
661 for (; headers[i + 1]; i++)
663 const char *b = headers[i];
664 const char *e = headers[i + 1];
666 && b[name_len] == ':'
667 && 0 == strncasecmp (b, name, name_len))
670 while (b < e && c_isspace (*b))
672 while (b < e && c_isspace (e[-1]))
682 /* Find and retrieve the header named NAME in the response data. If
683 found, set *BEGPTR to its starting, and *ENDPTR to its ending
684 position, and return true. Otherwise return false.
686 This function is used as a building block for resp_header_copy
687 and resp_header_strdup. */
690 resp_header_get (const struct response *resp, const char *name,
691 const char **begptr, const char **endptr)
693 int pos = resp_header_locate (resp, name, 0, begptr, endptr);
697 /* Copy the response header named NAME to buffer BUF, no longer than
698 BUFSIZE (BUFSIZE includes the terminating 0). If the header
699 exists, true is returned, false otherwise. If there should be no
700 limit on the size of the header, use resp_header_strdup instead.
702 If BUFSIZE is 0, no data is copied, but the boolean indication of
703 whether the header is present is still returned. */
706 resp_header_copy (const struct response *resp, const char *name,
707 char *buf, int bufsize)
710 if (!resp_header_get (resp, name, &b, &e))
714 int len = MIN (e - b, bufsize - 1);
715 memcpy (buf, b, len);
721 /* Return the value of header named NAME in RESP, allocated with
722 malloc. If such a header does not exist in RESP, return NULL. */
725 resp_header_strdup (const struct response *resp, const char *name)
728 if (!resp_header_get (resp, name, &b, &e))
730 return strdupdelim (b, e);
733 /* Parse the HTTP status line, which is of format:
735 HTTP-Version SP Status-Code SP Reason-Phrase
737 The function returns the status-code, or -1 if the status line
738 appears malformed. The pointer to "reason-phrase" message is
739 returned in *MESSAGE. */
742 resp_status (const struct response *resp, char **message)
749 /* For a HTTP/0.9 response, assume status 200. */
751 *message = xstrdup (_("No headers, assuming HTTP/0.9"));
755 p = resp->headers[0];
756 end = resp->headers[1];
762 if (end - p < 4 || 0 != strncmp (p, "HTTP", 4))
766 /* Match the HTTP version. This is optional because Gnutella
767 servers have been reported to not specify HTTP version. */
768 if (p < end && *p == '/')
771 while (p < end && c_isdigit (*p))
773 if (p < end && *p == '.')
775 while (p < end && c_isdigit (*p))
779 while (p < end && c_isspace (*p))
781 if (end - p < 3 || !c_isdigit (p[0]) || !c_isdigit (p[1]) || !c_isdigit (p[2]))
784 status = 100 * (p[0] - '0') + 10 * (p[1] - '0') + (p[2] - '0');
789 while (p < end && c_isspace (*p))
791 while (p < end && c_isspace (end[-1]))
793 *message = strdupdelim (p, end);
799 /* Release the resources used by RESP. */
802 resp_free (struct response *resp)
804 xfree_null (resp->headers);
808 /* Print a single line of response, the characters [b, e). We tried
810 logprintf (LOG_VERBOSE, "%s%.*s\n", prefix, (int) (e - b), b);
811 but that failed to escape the non-printable characters and, in fact,
812 caused crashes in UTF-8 locales. */
815 print_response_line(const char *prefix, const char *b, const char *e)
818 BOUNDED_TO_ALLOCA(b, e, copy);
819 logprintf (LOG_ALWAYS, "%s%s\n", prefix,
820 quotearg_style (escape_quoting_style, copy));
823 /* Print the server response, line by line, omitting the trailing CRLF
824 from individual header lines, and prefixed with PREFIX. */
827 print_server_response (const struct response *resp, const char *prefix)
832 for (i = 0; resp->headers[i + 1]; i++)
834 const char *b = resp->headers[i];
835 const char *e = resp->headers[i + 1];
837 if (b < e && e[-1] == '\n')
839 if (b < e && e[-1] == '\r')
841 print_response_line(prefix, b, e);
845 /* Parse the `Content-Range' header and extract the information it
846 contains. Returns true if successful, false otherwise. */
848 parse_content_range (const char *hdr, wgint *first_byte_ptr,
849 wgint *last_byte_ptr, wgint *entity_length_ptr)
853 /* Ancient versions of Netscape proxy server, presumably predating
854 rfc2068, sent out `Content-Range' without the "bytes"
856 if (0 == strncasecmp (hdr, "bytes", 5))
859 /* "JavaWebServer/1.1.1" sends "bytes: x-y/z", contrary to the
863 while (c_isspace (*hdr))
868 if (!c_isdigit (*hdr))
870 for (num = 0; c_isdigit (*hdr); hdr++)
871 num = 10 * num + (*hdr - '0');
872 if (*hdr != '-' || !c_isdigit (*(hdr + 1)))
874 *first_byte_ptr = num;
876 for (num = 0; c_isdigit (*hdr); hdr++)
877 num = 10 * num + (*hdr - '0');
878 if (*hdr != '/' || !c_isdigit (*(hdr + 1)))
880 *last_byte_ptr = num;
885 for (num = 0; c_isdigit (*hdr); hdr++)
886 num = 10 * num + (*hdr - '0');
887 *entity_length_ptr = num;
891 /* Read the body of the request, but don't store it anywhere and don't
892 display a progress gauge. This is useful for reading the bodies of
893 administrative responses to which we will soon issue another
894 request. The response is not useful to the user, but reading it
895 allows us to continue using the same connection to the server.
897 If reading fails, false is returned, true otherwise. In debug
898 mode, the body is displayed for debugging purposes. */
901 skip_short_body (int fd, wgint contlen, bool chunked)
904 SKIP_SIZE = 512, /* size of the download buffer */
905 SKIP_THRESHOLD = 4096 /* the largest size we read */
907 wgint remaining_chunk_size = 0;
908 char dlbuf[SKIP_SIZE + 1];
909 dlbuf[SKIP_SIZE] = '\0'; /* so DEBUGP can safely print it */
911 assert (contlen != -1 || contlen);
913 /* If the body is too large, it makes more sense to simply close the
914 connection than to try to read the body. */
915 if (contlen > SKIP_THRESHOLD)
918 while (contlen > 0 || chunked)
923 if (remaining_chunk_size == 0)
925 char *line = fd_read_line (fd);
930 remaining_chunk_size = strtol (line, &endl, 16);
931 if (remaining_chunk_size == 0)
938 contlen = MIN (remaining_chunk_size, SKIP_SIZE);
941 DEBUGP (("Skipping %s bytes of body: [", number_to_static_string (contlen)));
943 ret = fd_read (fd, dlbuf, MIN (contlen, SKIP_SIZE), -1);
946 /* Don't normally report the error since this is an
947 optimization that should be invisible to the user. */
948 DEBUGP (("] aborting (%s).\n",
949 ret < 0 ? fd_errstr (fd) : "EOF received"));
956 remaining_chunk_size -= ret;
957 if (remaining_chunk_size == 0)
958 if (fd_read_line (fd) == NULL)
962 /* Safe even if %.*s bogusly expects terminating \0 because
963 we've zero-terminated dlbuf above. */
964 DEBUGP (("%.*s", ret, dlbuf));
967 DEBUGP (("] done.\n"));
971 #define NOT_RFC2231 0
972 #define RFC2231_NOENCODING 1
973 #define RFC2231_ENCODING 2
975 /* extract_param extracts the parameter name into NAME.
976 However, if the parameter name is in RFC2231 format then
977 this function adjusts NAME by stripping off the trailing
978 characters that are not part of the name but are present to
979 indicate the presence of encoding information in the value
980 or a fragment of a long parameter value
983 modify_param_name(param_token *name)
985 const char *delim1 = memchr (name->b, '*', name->e - name->b);
986 const char *delim2 = memrchr (name->b, '*', name->e - name->b);
992 result = NOT_RFC2231;
994 else if(delim1 == delim2)
996 if ((name->e - 1) == delim1)
998 result = RFC2231_ENCODING;
1002 result = RFC2231_NOENCODING;
1009 result = RFC2231_ENCODING;
1014 /* extract_param extracts the parameter value into VALUE.
1015 Like modify_param_name this function modifies VALUE by
1016 stripping off the encoding information from the actual value
1019 modify_param_value (param_token *value, int encoding_type )
1021 if (RFC2231_ENCODING == encoding_type)
1023 const char *delim = memrchr (value->b, '\'', value->e - value->b);
1024 if ( delim != NULL )
1026 value->b = (delim+1);
1031 /* Extract a parameter from the string (typically an HTTP header) at
1032 **SOURCE and advance SOURCE to the next parameter. Return false
1033 when there are no more parameters to extract. The name of the
1034 parameter is returned in NAME, and the value in VALUE. If the
1035 parameter has no value, the token's value is zeroed out.
1037 For example, if *SOURCE points to the string "attachment;
1038 filename=\"foo bar\"", the first call to this function will return
1039 the token named "attachment" and no value, and the second call will
1040 return the token named "filename" and value "foo bar". The third
1041 call will return false, indicating no more valid tokens. */
1044 extract_param (const char **source, param_token *name, param_token *value,
1047 const char *p = *source;
1049 while (c_isspace (*p)) ++p;
1053 return false; /* no error; nothing more to extract */
1058 while (*p && !c_isspace (*p) && *p != '=' && *p != separator) ++p;
1060 if (name->b == name->e)
1061 return false; /* empty name: error */
1062 while (c_isspace (*p)) ++p;
1063 if (*p == separator || !*p) /* no value */
1066 if (*p == separator) ++p;
1071 return false; /* error */
1073 /* *p is '=', extract value */
1075 while (c_isspace (*p)) ++p;
1076 if (*p == '"') /* quoted */
1079 while (*p && *p != '"') ++p;
1083 /* Currently at closing quote; find the end of param. */
1084 while (c_isspace (*p)) ++p;
1085 while (*p && *p != separator) ++p;
1086 if (*p == separator)
1089 /* garbage after closed quote, e.g. foo="bar"baz */
1095 while (*p && *p != separator) ++p;
1097 while (value->e != value->b && c_isspace (value->e[-1]))
1099 if (*p == separator) ++p;
1103 int param_type = modify_param_name(name);
1104 if (NOT_RFC2231 != param_type)
1106 modify_param_value(value, param_type);
1112 #undef RFC2231_NOENCODING
1113 #undef RFC2231_ENCODING
1115 /* Appends the string represented by VALUE to FILENAME */
1118 append_value_to_filename (char **filename, param_token const * const value)
1120 int original_length = strlen(*filename);
1121 int new_length = strlen(*filename) + (value->e - value->b);
1122 *filename = xrealloc (*filename, new_length+1);
1123 memcpy (*filename + original_length, value->b, (value->e - value->b));
1124 (*filename)[new_length] = '\0';
1128 #define MAX(p, q) ((p) > (q) ? (p) : (q))
1130 /* Parse the contents of the `Content-Disposition' header, extracting
1131 the information useful to Wget. Content-Disposition is a header
1132 borrowed from MIME; when used in HTTP, it typically serves for
1133 specifying the desired file name of the resource. For example:
1135 Content-Disposition: attachment; filename="flora.jpg"
1137 Wget will skip the tokens it doesn't care about, such as
1138 "attachment" in the previous example; it will also skip other
1139 unrecognized params. If the header is syntactically correct and
1140 contains a file name, a copy of the file name is stored in
1141 *filename and true is returned. Otherwise, the function returns
1144 The file name is stripped of directory components and must not be
1147 Historically, this function returned filename prefixed with opt.dir_prefix,
1148 now that logic is handled by the caller, new code should pay attention,
1149 changed by crq, Sep 2010.
1153 parse_content_disposition (const char *hdr, char **filename)
1155 param_token name, value;
1157 while (extract_param (&hdr, &name, &value, ';'))
1159 int isFilename = BOUNDED_EQUAL_NO_CASE ( name.b, name.e, "filename" );
1160 if ( isFilename && value.b != NULL)
1162 /* Make the file name begin at the last slash or backslash. */
1163 const char *last_slash = memrchr (value.b, '/', value.e - value.b);
1164 const char *last_bs = memrchr (value.b, '\\', value.e - value.b);
1165 if (last_slash && last_bs)
1166 value.b = 1 + MAX (last_slash, last_bs);
1167 else if (last_slash || last_bs)
1168 value.b = 1 + (last_slash ? last_slash : last_bs);
1169 if (value.b == value.e)
1173 append_value_to_filename (filename, &value);
1175 *filename = strdupdelim (value.b, value.e);
1186 /* Persistent connections. Currently, we cache the most recently used
1187 connection as persistent, provided that the HTTP server agrees to
1188 make it such. The persistence data is stored in the variables
1189 below. Ideally, it should be possible to cache an arbitrary fixed
1190 number of these connections. */
1192 /* Whether a persistent connection is active. */
1193 static bool pconn_active;
1196 /* The socket of the connection. */
1199 /* Host and port of the currently active persistent connection. */
1203 /* Whether a ssl handshake has occurred on this connection. */
1206 /* Whether the connection was authorized. This is only done by
1207 NTLM, which authorizes *connections* rather than individual
1208 requests. (That practice is peculiar for HTTP, but it is a
1209 useful optimization.) */
1213 /* NTLM data of the current connection. */
1214 struct ntlmdata ntlm;
1218 /* Mark the persistent connection as invalid and free the resources it
1219 uses. This is used by the CLOSE_* macros after they forcefully
1220 close a registered persistent connection. */
1223 invalidate_persistent (void)
1225 DEBUGP (("Disabling further reuse of socket %d.\n", pconn.socket));
1226 pconn_active = false;
1227 fd_close (pconn.socket);
1232 /* Register FD, which should be a TCP/IP connection to HOST:PORT, as
1233 persistent. This will enable someone to use the same connection
1234 later. In the context of HTTP, this must be called only AFTER the
1235 response has been received and the server has promised that the
1236 connection will remain alive.
1238 If a previous connection was persistent, it is closed. */
1241 register_persistent (const char *host, int port, int fd, bool ssl)
1245 if (pconn.socket == fd)
1247 /* The connection FD is already registered. */
1252 /* The old persistent connection is still active; close it
1253 first. This situation arises whenever a persistent
1254 connection exists, but we then connect to a different
1255 host, and try to register a persistent connection to that
1257 invalidate_persistent ();
1261 pconn_active = true;
1263 pconn.host = xstrdup (host);
1266 pconn.authorized = false;
1268 DEBUGP (("Registered socket %d for persistent reuse.\n", fd));
1271 /* Return true if a persistent connection is available for connecting
1275 persistent_available_p (const char *host, int port, bool ssl,
1276 bool *host_lookup_failed)
1278 /* First, check whether a persistent connection is active at all. */
1282 /* If we want SSL and the last connection wasn't or vice versa,
1283 don't use it. Checking for host and port is not enough because
1284 HTTP and HTTPS can apparently coexist on the same port. */
1285 if (ssl != pconn.ssl)
1288 /* If we're not connecting to the same port, we're not interested. */
1289 if (port != pconn.port)
1292 /* If the host is the same, we're in business. If not, there is
1293 still hope -- read below. */
1294 if (0 != strcasecmp (host, pconn.host))
1296 /* Check if pconn.socket is talking to HOST under another name.
1297 This happens often when both sites are virtual hosts
1298 distinguished only by name and served by the same network
1299 interface, and hence the same web server (possibly set up by
1300 the ISP and serving many different web sites). This
1301 admittedly unconventional optimization does not contradict
1302 HTTP and works well with popular server software. */
1306 struct address_list *al;
1309 /* Don't try to talk to two different SSL sites over the same
1310 secure connection! (Besides, it's not clear that
1311 name-based virtual hosting is even possible with SSL.) */
1314 /* If pconn.socket's peer is one of the IP addresses HOST
1315 resolves to, pconn.socket is for all intents and purposes
1316 already talking to HOST. */
1318 if (!socket_ip_address (pconn.socket, &ip, ENDPOINT_PEER))
1320 /* Can't get the peer's address -- something must be very
1321 wrong with the connection. */
1322 invalidate_persistent ();
1325 al = lookup_host (host, 0);
1328 *host_lookup_failed = true;
1332 found = address_list_contains (al, &ip);
1333 address_list_release (al);
1338 /* The persistent connection's peer address was found among the
1339 addresses HOST resolved to; therefore, pconn.socket is in fact
1340 already talking to HOST -- no need to reconnect. */
1343 /* Finally, check whether the connection is still open. This is
1344 important because most servers implement liberal (short) timeout
1345 on persistent connections. Wget can of course always reconnect
1346 if the connection doesn't work out, but it's nicer to know in
1347 advance. This test is a logical followup of the first test, but
1348 is "expensive" and therefore placed at the end of the list.
1350 (Current implementation of test_socket_open has a nice side
1351 effect that it treats sockets with pending data as "closed".
1352 This is exactly what we want: if a broken server sends message
1353 body in response to HEAD, or if it sends more than content-length
1354 data, we won't reuse the corrupted connection.) */
1356 if (!test_socket_open (pconn.socket))
1358 /* Oops, the socket is no longer open. Now that we know that,
1359 let's invalidate the persistent connection before returning
1361 invalidate_persistent ();
1368 /* The idea behind these two CLOSE macros is to distinguish between
1369 two cases: one when the job we've been doing is finished, and we
1370 want to close the connection and leave, and two when something is
1371 seriously wrong and we're closing the connection as part of
1374 In case of keep_alive, CLOSE_FINISH should leave the connection
1375 open, while CLOSE_INVALIDATE should still close it.
1377 Note that the semantics of the flag `keep_alive' is "this
1378 connection *will* be reused (the server has promised not to close
1379 the connection once we're done)", while the semantics of
1380 `pconn_active && (fd) == pconn.socket' is "we're *now* using an
1381 active, registered connection".
In the code below, that test is what selects the
invalidate_persistent () call in both macros. */
1383 #define CLOSE_FINISH(fd) do { \
1386 if (pconn_active && (fd) == pconn.socket) \
1387 invalidate_persistent (); \
/* Close FD because something went wrong.  When FD is the registered
   persistent socket (pconn_active && FD == pconn.socket), the
   persistent connection is dropped through invalidate_persistent ().  */
1396 #define CLOSE_INVALIDATE(fd) do { \
1397 if (pconn_active && (fd) == pconn.socket) \
1398 invalidate_persistent (); \
/* Per-retrieval HTTP state.  gethttp () reads and fills these fields
   through its HS argument; free_hstat () releases the string members.  */
1406 wgint len; /* received length */
1407 wgint contlen; /* expected length */
1408 wgint restval; /* the restart value; gethttp sends it as "Range: bytes=RESTVAL-" */
1409 int res; /* the result of last read (return value of fd_read_body) */
1410 char *rderrmsg; /* error message from read error */
1411 char *newloc; /* new location (redirection), copied from the Location header */
1412 char *remote_time; /* remote time-stamp string, copied from Last-Modified */
1413 char *error; /* textual HTTP error */
1414 int statcode; /* status code */
1415 char *message; /* status message */
1416 wgint rd_size; /* amount of data read from socket */
1417 double dltime; /* time it took to download the data */
1418 const char *referer; /* value of the referer header. */
1419 char *local_file; /* local file name. */
1420 bool existence_checked; /* true if we already checked for a file's
1421 existence after having begun to download
1422 (needed in gethttp for when connection is
1423 interrupted/restarted). */
1424 bool timestamp_checked; /* true if pre-download time-stamping checks
1425 * have already been performed */
1426 char *orig_file_name; /* name of file to compare for time-stamping
1427 * (might be != local_file if -K is set) */
1428 wgint orig_file_size; /* size of file to compare for time-stamping */
1429 time_t orig_file_tstamp; /* time-stamp of file to compare for
/* Release the string members of HS (newloc, remote_time, error,
   rderrmsg, local_file, orig_file_name and message) with xfree_null.
   Members that are already NULL are presumably skipped by xfree_null
   -- confirm against its definition.  */
1434 free_hstat (struct http_stat *hs)
1436 xfree_null (hs->newloc);
1437 xfree_null (hs->remote_time);
1438 xfree_null (hs->error);
1439 xfree_null (hs->rderrmsg);
1440 xfree_null (hs->local_file);
1441 xfree_null (hs->orig_file_name);
1442 xfree_null (hs->message);
1444 /* Guard against being called twice. */
1446 hs->remote_time = NULL;
/* Non-zero if LINE starts with STRING_CONSTANT, compared without
   regard to case, and the match is followed either by whitespace or
   by the end of LINE.  The length is taken with sizeof, so
   STRING_CONSTANT has to be a string literal (or a char array), not
   a pointer.  LINE is expanded several times; pass an expression
   without side effects.  */
1450 #define BEGINS_WITH(line, string_constant) \
1451 (!strncasecmp (line, string_constant, sizeof (string_constant) - 1) \
1452 && (c_isspace (line[sizeof (string_constant) - 1]) \
1453 || !line[sizeof (string_constant) - 1]))
/* Add the User-Agent header to REQ.  If opt.useragent is not set, a
   "Wget/VERSION (...)" string is generated with aprintf: the VMS
   variant reports vms_arch () and vms_vers (), the other variant
   reports OS_TYPE.  If opt.useragent is set and non-empty it is sent
   as is.  Neither visible branch handles an empty opt.useragent, so
   presumably no User-Agent header is sent in that case -- confirm.  */
1456 #define SET_USER_AGENT(req) do { \
1457 if (!opt.useragent) \
1458 request_set_header (req, "User-Agent", \
1459 aprintf ("Wget/%s (VMS %s %s)", \
1460 version_string, vms_arch(), vms_vers()), \
1462 else if (*opt.useragent) \
1463 request_set_header (req, "User-Agent", opt.useragent, rel_none); \
1465 #else /* def __VMS */
1466 #define SET_USER_AGENT(req) do { \
1467 if (!opt.useragent) \
1468 request_set_header (req, "User-Agent", \
1469 aprintf ("Wget/%s (%s)", \
1470 version_string, OS_TYPE), \
1472 else if (*opt.useragent) \
1473 request_set_header (req, "User-Agent", opt.useragent, rel_none); \
1475 #endif /* def __VMS [else] */
/* NOTE(review): gethttp tests this macro in two places.  When it is
   false and the target file already exists, a unique_name () is
   chosen; when it is true (or the retry count is > 0), the output is
   opened with "wb".  */
1477 /* The flags that allow clobbering the file (opening with "wb").
1478 Defined here to avoid repetition later. #### This will require
1480 #define ALLOW_CLOBBER (opt.noclobber || opt.always_rest || opt.timestamping \
1481 || opt.dirstruct || opt.output_document)
1483 /* Retrieve a document through HTTP protocol. It recognizes status
1484 code, and correctly handles redirections. It closes the network
1485 socket. If it receives an error from the functions below it, it
1486 will print it if there is enough information to do so (almost
1487 always), returning the error to the caller (i.e. http_loop).
1489 Various HTTP parameters are stored to hs.
1491 If PROXY is non-NULL, the connection will be made to the proxy
1492 server, and u->url will be requested. */
/* Parameters, as used in the body below:
   U     - the URL to fetch; its scheme, host, port, path, user and
           url members are read.
   HS    - in/out retrieval state.  restval, referer, local_file and
           the *_checked flags are read; statcode, message, error,
           newloc, remote_time, contlen, len, res, rd_size, dltime
           and rderrmsg are stored.
   DT    - flag word.  HEAD_ONLY and SEND_NOCACHE are tested while the
           request is built, TEXTCSS when opt.adjust_extension is set,
           and RETROKF before the body is downloaded.
   PROXY - proxy URL, or NULL for a direct connection.
   IRI   - handed to set_content_encoding () when a charset is parsed
           out of the Content-Type header.
   COUNT - a value > 0 lets the output file be opened with "wb" even
           when ALLOW_CLOBBER is false.

   Return values visible below include SSLINITFAILED, CONERROR,
   CONIMPOSSIBLE, VERIFCERTERR, RETRUNNEEDED, FOPEN_EXCL_ERR and
   RETRFINISHED.  */
1494 gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
1495 struct iri *iri, int count)
1497 struct request *req;
1500 char *user, *passwd;
1504 wgint contlen, contrange;
1511 /* Set to true when the authorization has already been sent and should
1512 not be tried again. */
1513 bool auth_finished = false;
1515 /* Set to true when just globally-set Basic authorization has been sent;
1516 * should prevent further Basic negotiations, but not other
1518 bool basic_auth_finished = false;
1520 /* Whether NTLM authentication is used for this request. */
1521 bool ntlm_seen = false;
1523 /* Whether our connection to the remote host is through SSL. */
1524 bool using_ssl = false;
1526 /* Whether a HEAD request will be issued (as opposed to GET or
1528 bool head_only = !!(*dt & HEAD_ONLY);
1531 struct response *resp;
1535 /* Whether this connection will be kept alive after the HTTP request
1539 /* Is the server using the chunked transfer encoding? */
1540 bool chunked_transfer_encoding = false;
1542 /* Whether keep-alive should be inhibited. */
1543 bool inhibit_keep_alive =
1544 !opt.http_keep_alive || opt.ignore_length;
1546 /* Size of the POST body; sent below as the Content-Length header. */
1547 wgint post_data_size = 0;
1549 bool host_lookup_failed = false;
1552 if (u->scheme == SCHEME_HTTPS)
1554 /* Initialize the SSL context. After this has once been done,
1555 it becomes a no-op. */
1558 scheme_disable (SCHEME_HTTPS);
1559 logprintf (LOG_NOTQUIET,
1560 _("Disabling SSL due to encountered errors.\n"));
1561 return SSLINITFAILED;
1564 #endif /* HAVE_SSL */
1566 /* Initialize certain elements of struct http_stat. */
1570 hs->rderrmsg = NULL;
1572 hs->remote_time = NULL;
1578 /* Prepare the request to send. */
1580 req = request_new ();
1583 const char *meth = "GET";
1586 else if (opt.post_file_name || opt.post_data)
1588 /* Use the full path, i.e. one that includes the leading slash and
1589 the query string. E.g. if u->path is "foo/bar" and u->query is
1590 "param=value", full_path will be "/foo/bar?param=value". */
1593 /* When using SSL over proxy, CONNECT establishes a direct
1594 connection to the HTTPS server. Therefore use the same
1595 argument as when talking to the server directly. */
1596 && u->scheme != SCHEME_HTTPS
1599 meth_arg = xstrdup (u->url);
1601 meth_arg = url_full_path (u);
1602 request_set_method (req, meth, meth_arg);
1605 request_set_header (req, "Referer", (char *) hs->referer, rel_none);
1606 if (*dt & SEND_NOCACHE)
1607 request_set_header (req, "Pragma", "no-cache", rel_none);
1608 if (hs->restval && !opt.timestamping)
1609 request_set_header (req, "Range",
1610 aprintf ("bytes=%s-",
1611 number_to_static_string (hs->restval)),
1613 SET_USER_AGENT (req);
1614 request_set_header (req, "Accept", "*/*", rel_none);
1616 /* Find the username and password for authentication. */
1619 search_netrc (u->host, (const char **)&user, (const char **)&passwd, 0);
1620 user = user ? user : (opt.http_user ? opt.http_user : opt.user);
1621 passwd = passwd ? passwd : (opt.http_passwd ? opt.http_passwd : opt.passwd);
1623 /* We only do "site-wide" authentication with "global" user/password
1624 * values unless --auth-no-challenge has been requested; URL user/password
1625 * info overrides. */
1626 if (user && passwd && (!u->user || opt.auth_without_challenge))
1628 /* If this is a host for which we've already received a Basic
1629 * challenge, we'll go ahead and send Basic authentication creds. */
1630 basic_auth_finished = maybe_send_basic_creds(u->host, user, passwd, req);
1633 /* Generate the Host header, HOST:PORT. Take into account that:
1635 - Broken server-side software often doesn't recognize the PORT
1636 argument, so we must generate "Host: www.server.com" instead of
1637 "Host: www.server.com:80" (and likewise for https port).
1639 - IPv6 addresses contain ":", so "Host: 3ffe:8100:200:2::2:1234"
1640 becomes ambiguous and needs to be rewritten as "Host:
1641 [3ffe:8100:200:2::2]:1234". */
1643 /* Formats arranged for hfmt[add_port][add_squares]. */
1644 static const char *hfmt[][2] = {
1645 { "%s", "[%s]" }, { "%s:%d", "[%s]:%d" }
1647 int add_port = u->port != scheme_default_port (u->scheme);
1648 int add_squares = strchr (u->host, ':') != NULL;
1649 request_set_header (req, "Host",
1650 aprintf (hfmt[add_port][add_squares], u->host, u->port),
1654 if (inhibit_keep_alive)
1655 request_set_header (req, "Connection", "Close", rel_none);
1659 request_set_header (req, "Connection", "Keep-Alive", rel_none);
1662 request_set_header (req, "Connection", "Close", rel_none);
1663 request_set_header (req, "Proxy-Connection", "Keep-Alive", rel_none);
1667 if (opt.post_data || opt.post_file_name)
1669 request_set_header (req, "Content-Type",
1670 "application/x-www-form-urlencoded", rel_none);
1672 post_data_size = strlen (opt.post_data);
1675 post_data_size = file_size (opt.post_file_name);
1676 if (post_data_size == -1)
1678 logprintf (LOG_NOTQUIET, _("POST data file %s missing: %s\n"),
1679 quote (opt.post_file_name), strerror (errno));
1683 request_set_header (req, "Content-Length",
1684 xstrdup (number_to_static_string (post_data_size)),
1689 /* We need to come back here when the initial attempt to retrieve
1690 without authorization header fails. (Expected to happen at least
1691 for the Digest authorization scheme.) */
1694 request_set_header (req, "Cookie",
1695 cookie_header (wget_cookie_jar,
1696 u->host, u->port, u->path,
1698 u->scheme == SCHEME_HTTPS
1705 /* Add the user headers. */
1706 if (opt.user_headers)
1709 for (i = 0; opt.user_headers[i]; i++)
1710 request_set_user_header (req, opt.user_headers[i]);
1716 char *proxy_user, *proxy_passwd;
1717 /* For normal username and password, URL components override
1718 command-line/wgetrc parameters. With proxy
1719 authentication, it's the reverse, because proxy URLs are
1720 normally the "permanent" ones, so command-line args
1721 should take precedence. */
1722 if (opt.proxy_user && opt.proxy_passwd)
1724 proxy_user = opt.proxy_user;
1725 proxy_passwd = opt.proxy_passwd;
1729 proxy_user = proxy->user;
1730 proxy_passwd = proxy->passwd;
1732 /* #### This does not appear right. Can't the proxy request,
1733 say, `Digest' authentication? */
1734 if (proxy_user && proxy_passwd)
1735 proxyauth = basic_authentication_encode (proxy_user, proxy_passwd);
1737 /* If we're using a proxy, we will be connecting to the proxy
1741 /* Proxy authorization over SSL is handled below. */
1743 if (u->scheme != SCHEME_HTTPS)
1745 request_set_header (req, "Proxy-Authorization", proxyauth, rel_value);
1750 /* Establish the connection. */
1752 if (inhibit_keep_alive)
1756 /* Look for a persistent connection to target host, unless a
1757 proxy is used. The exception is when SSL is in use, in which
1758 case the proxy is nothing but a passthrough to the target
1759 host, registered as a connection to the latter. */
1760 struct url *relevant = conn;
1762 if (u->scheme == SCHEME_HTTPS)
1766 if (persistent_available_p (relevant->host, relevant->port,
1768 relevant->scheme == SCHEME_HTTPS,
1772 &host_lookup_failed))
1774 sock = pconn.socket;
1775 using_ssl = pconn.ssl;
1776 logprintf (LOG_VERBOSE, _("Reusing existing connection to %s:%d.\n"),
1777 quotearg_style (escape_quoting_style, pconn.host),
1779 DEBUGP (("Reusing fd %d.\n", sock));
1780 if (pconn.authorized)
1781 /* If the connection is already authorized, the "Basic"
1782 authorization added by code above is unnecessary and
1784 request_remove_header (req, "Authorization");
1786 else if (host_lookup_failed)
1789 logprintf(LOG_NOTQUIET,
1790 _("%s: unable to resolve host address %s\n"),
1791 exec_name, quote (relevant->host));
1798 sock = connect_to_host (conn->host, conn->port);
1807 return (retryable_socket_connect_error (errno)
1808 ? CONERROR : CONIMPOSSIBLE);
1812 if (proxy && u->scheme == SCHEME_HTTPS)
1814 /* When requesting SSL URLs through proxies, use the
1815 CONNECT method to request passthrough. */
1816 struct request *connreq = request_new ();
1817 request_set_method (connreq, "CONNECT",
1818 aprintf ("%s:%d", u->host, u->port));
1819 SET_USER_AGENT (connreq);
1822 request_set_header (connreq, "Proxy-Authorization",
1823 proxyauth, rel_value);
1824 /* Now that PROXYAUTH is part of the CONNECT request,
1825 zero it out so we don't send proxy authorization with
1826 the regular request below. */
1829 /* Examples in rfc2817 use the Host header in CONNECT
1830 requests. I don't see how that gains anything, given
1831 that the contents of Host would be exactly the same as
1832 the contents of CONNECT. */
1834 write_error = request_send (connreq, sock);
1835 request_free (connreq);
1836 if (write_error < 0)
1838 CLOSE_INVALIDATE (sock);
1842 head = read_http_response_head (sock);
1845 logprintf (LOG_VERBOSE, _("Failed reading proxy response: %s\n"),
1847 CLOSE_INVALIDATE (sock);
1856 DEBUGP (("proxy responded with: [%s]\n", head));
1858 resp = resp_new (head);
1859 statcode = resp_status (resp, &message);
1862 char *tms = datetime_str (time (NULL));
1863 logprintf (LOG_VERBOSE, "%d\n", statcode);
1864 logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"), tms, statcode,
1865 quotearg_style (escape_quoting_style,
1866 _("Malformed status line")));
1870 hs->message = xstrdup (message);
1873 if (statcode != 200)
1876 logprintf (LOG_NOTQUIET, _("Proxy tunneling failed: %s"),
1877 message ? quotearg_style (escape_quoting_style, message) : "?");
1878 xfree_null (message);
1881 xfree_null (message);
1883 /* SOCK is now *really* connected to u->host, so update CONN
1884 to reflect this. That way register_persistent will
1885 register SOCK as being connected to u->host:u->port. */
1889 if (conn->scheme == SCHEME_HTTPS)
1891 if (!ssl_connect_wget (sock))
1896 else if (!ssl_check_certificate (sock, u->host))
1899 return VERIFCERTERR;
1903 #endif /* HAVE_SSL */
1906 /* Send the request to server. */
1907 write_error = request_send (req, sock);
1909 if (write_error >= 0)
1913 DEBUGP (("[POST data: %s]\n", opt.post_data));
1914 write_error = fd_write (sock, opt.post_data, post_data_size, -1);
1916 else if (opt.post_file_name && post_data_size != 0)
1917 write_error = post_file (sock, opt.post_file_name, post_data_size);
1920 if (write_error < 0)
1922 CLOSE_INVALIDATE (sock);
1926 logprintf (LOG_VERBOSE, _("%s request sent, awaiting response... "),
1927 proxy ? "Proxy" : "HTTP");
1933 head = read_http_response_head (sock);
1938 logputs (LOG_NOTQUIET, _("No data received.\n"));
1939 CLOSE_INVALIDATE (sock);
1945 logprintf (LOG_NOTQUIET, _("Read error (%s) in headers.\n"),
1947 CLOSE_INVALIDATE (sock);
1952 DEBUGP (("\n---response begin---\n%s---response end---\n", head));
1954 resp = resp_new (head);
1956 /* Check for status line. */
1958 statcode = resp_status (resp, &message);
1961 char *tms = datetime_str (time (NULL));
1962 logprintf (LOG_VERBOSE, "%d\n", statcode);
1963 logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"), tms, statcode,
1964 quotearg_style (escape_quoting_style,
1965 _("Malformed status line")));
1966 CLOSE_INVALIDATE (sock);
1971 if (H_10X (statcode))
1973 DEBUGP (("Ignoring response\n"));
1977 hs->message = xstrdup (message);
1978 if (!opt.server_response)
1979 logprintf (LOG_VERBOSE, "%2d %s\n", statcode,
1980 message ? quotearg_style (escape_quoting_style, message) : "");
1983 logprintf (LOG_VERBOSE, "\n");
1984 print_server_response (resp, " ");
1987 if (!opt.ignore_length
1988 && resp_header_copy (resp, "Content-Length", hdrval, sizeof (hdrval)))
1992 parsed = str_to_wgint (hdrval, NULL, 10);
1993 if (parsed == WGINT_MAX && errno == ERANGE)
1996 #### If Content-Length is out of range, it most likely
1997 means that the file is larger than 2G and that we're
1998 compiled without LFS. In that case we should probably
1999 refuse to even attempt to download the file. */
2002 else if (parsed < 0)
2004 /* Negative Content-Length; nonsensical, so we can't
2005 assume any information about the content to receive. */
2012 /* Check for keep-alive related responses. */
2013 if (!inhibit_keep_alive && contlen != -1)
2015 if (resp_header_copy (resp, "Connection", hdrval, sizeof (hdrval)))
2017 if (0 == strcasecmp (hdrval, "Close"))
/* NOTE(review): the return value of resp_header_copy is ignored on the
   next line.  When the response has no Transfer-Encoding header,
   HDRVAL presumably still holds whatever an earlier call left in it,
   and that stale text is what gets compared with "chunked" -- confirm
   and make the comparison conditional on the copy succeeding. */
2022 resp_header_copy (resp, "Transfer-Encoding", hdrval, sizeof (hdrval));
2023 if (0 == strcasecmp (hdrval, "chunked"))
2024 chunked_transfer_encoding = true;
2026 /* Handle (possibly multiple instances of) the Set-Cookie header. */
2030 const char *scbeg, *scend;
2031 /* The jar should have been created by now. */
2032 assert (wget_cookie_jar != NULL);
2034 (scpos = resp_header_locate (resp, "Set-Cookie", scpos,
2035 &scbeg, &scend)) != -1;
2038 char *set_cookie; BOUNDED_TO_ALLOCA (scbeg, scend, set_cookie);
2039 cookie_handle_set_cookie (wget_cookie_jar, u->host, u->port,
2040 u->path, set_cookie);
2045 /* The server has promised that it will not close the connection
2046 when we're done. This means that we can register it. */
2047 register_persistent (conn->host, conn->port, sock, using_ssl);
2049 if (statcode == HTTP_STATUS_UNAUTHORIZED)
2051 /* Authorization is required. */
2052 if (keep_alive && !head_only
2053 && skip_short_body (sock, contlen, chunked_transfer_encoding))
2054 CLOSE_FINISH (sock);
2056 CLOSE_INVALIDATE (sock);
2057 pconn.authorized = false;
2058 if (!auth_finished && (user && passwd))
2060 /* IIS sends multiple copies of WWW-Authenticate, one with
2061 the value "negotiate", and other(s) with data. Loop over
2062 all the occurrences and pick the one we recognize. */
2064 const char *wabeg, *waend;
2065 char *www_authenticate = NULL;
2067 (wapos = resp_header_locate (resp, "WWW-Authenticate", wapos,
2068 &wabeg, &waend)) != -1;
2070 if (known_authentication_scheme_p (wabeg, waend))
2072 BOUNDED_TO_ALLOCA (wabeg, waend, www_authenticate);
2076 if (!www_authenticate)
2078 /* If the authentication header is missing or
2079 unrecognized, there's no sense in retrying. */
2080 logputs (LOG_NOTQUIET, _("Unknown authentication scheme.\n"));
2082 else if (!basic_auth_finished
2083 || !BEGINS_WITH (www_authenticate, "Basic"))
2086 pth = url_full_path (u);
2087 request_set_header (req, "Authorization",
2088 create_authorization_line (www_authenticate,
2090 request_method (req),
2094 if (BEGINS_WITH (www_authenticate, "NTLM"))
2096 else if (!u->user && BEGINS_WITH (www_authenticate, "Basic"))
2098 /* Need to register this host as using basic auth,
2099 * so we automatically send creds next time. */
2100 register_basic_auth_host (u->host);
2103 xfree_null (message);
2106 goto retry_with_auth;
2110 /* We already did Basic auth, and it failed. Gotta
2114 logputs (LOG_NOTQUIET, _("Authorization failed.\n"));
2116 xfree_null (message);
2121 else /* statcode != HTTP_STATUS_UNAUTHORIZED */
2123 /* Kludge: if NTLM is used, mark the TCP connection as authorized. */
2125 pconn.authorized = true;
2128 /* Determine the local filename if needed. Notice that if -O is used
2129 * hstat.local_file is set by http_loop to the argument of -O. */
2130 if (!hs->local_file)
2132 char *local_file = NULL;
2134 /* Honor Content-Disposition whenever possible. */
2135 if (!opt.content_disposition
2136 || !resp_header_copy (resp, "Content-Disposition",
2137 hdrval, sizeof (hdrval))
2138 || !parse_content_disposition (hdrval, &local_file))
2140 /* The Content-Disposition header is missing or broken.
2141 * Choose unique file name according to given URL. */
2142 hs->local_file = url_file_name (u, NULL);
2146 DEBUGP (("Parsed filename from Content-Disposition: %s\n",
2148 hs->local_file = url_file_name (u, local_file);
2152 /* TODO: perform this check only once. */
2153 if (!hs->existence_checked && file_exists_p (hs->local_file))
2155 if (opt.noclobber && !opt.output_document)
2157 /* If opt.noclobber is turned on and file already exists, do not
2158 retrieve the file. But if the output_document was given, then this
2159 test was already done and the file didn't exist. Hence the !opt.output_document */
2160 logprintf (LOG_VERBOSE, _("\
2161 File %s already there; not retrieving.\n\n"), quote (hs->local_file));
2162 /* If the file is there, we suppose it's retrieved OK. */
2165 /* #### Bogusness alert. */
2166 /* If its suffix is "html" or "htm" or similar, assume text/html. */
2167 if (has_html_suffix_p (hs->local_file))
2171 xfree_null (message);
2172 return RETRUNNEEDED;
2174 else if (!ALLOW_CLOBBER)
2176 char *unique = unique_name (hs->local_file, true);
2177 if (unique != hs->local_file)
2178 xfree (hs->local_file);
2179 hs->local_file = unique;
2182 hs->existence_checked = true;
2184 /* Support timestamping */
2185 /* TODO: move this code out of gethttp. */
2186 if (opt.timestamping && !hs->timestamp_checked)
2188 size_t filename_len = strlen (hs->local_file);
2189 char *filename_plus_orig_suffix = alloca (filename_len + sizeof (ORIG_SFX));
2190 bool local_dot_orig_file_exists = false;
2191 char *local_filename = NULL;
2194 if (opt.backup_converted)
2195 /* If -K is specified, we'll act on the assumption that it was specified
2196 last time these files were downloaded as well, and instead of just
2197 comparing local file X against server file X, we'll compare local
2198 file X.orig (if extant, else X) against server file X. If -K
2199 _wasn't_ specified last time, or the server contains files called
2200 *.orig, -N will be back to not operating correctly with -k. */
2202 /* Would a single s[n]printf() call be faster? --dan
2204 Definitely not. sprintf() is horribly slow. It's a
2205 different question whether the difference between the two
2206 affects a program. Usually I'd say "no", but at one
2207 point I profiled Wget, and found that a measurable and
2208 non-negligible amount of time was lost calling sprintf()
2209 in url.c. Replacing sprintf with inline calls to
2210 strcpy() and number_to_string() made a difference.
2212 memcpy (filename_plus_orig_suffix, hs->local_file, filename_len);
2213 memcpy (filename_plus_orig_suffix + filename_len,
2214 ORIG_SFX, sizeof (ORIG_SFX));
2216 /* Try to stat() the .orig file. */
2217 if (stat (filename_plus_orig_suffix, &st) == 0)
2219 local_dot_orig_file_exists = true;
2220 local_filename = filename_plus_orig_suffix;
2224 if (!local_dot_orig_file_exists)
2225 /* Couldn't stat() <file>.orig, so try to stat() <file>. */
2226 if (stat (hs->local_file, &st) == 0)
2227 local_filename = hs->local_file;
2229 if (local_filename != NULL)
2230 /* There was a local file, so we'll check later to see if the version
2231 the server has is the same version we already have, allowing us to
2234 hs->orig_file_name = xstrdup (local_filename);
2235 hs->orig_file_size = st.st_size;
2236 hs->orig_file_tstamp = st.st_mtime;
2238 /* Modification time granularity is 2 seconds for Windows, so
2239 increase local time by 1 second for later comparison. */
2240 ++hs->orig_file_tstamp;
2247 hs->statcode = statcode;
2249 hs->error = xstrdup (_("Malformed status line"));
2251 hs->error = xstrdup (_("(no description)"));
2253 hs->error = xstrdup (message);
2254 xfree_null (message);
2256 type = resp_header_strdup (resp, "Content-Type");
2259 char *tmp = strchr (type, ';');
2262 /* sXXXav: only needed if IRI support is enabled */
2263 char *tmp2 = tmp + 1;
2265 while (tmp > type && c_isspace (tmp[-1]))
2269 /* Try to get remote encoding if needed */
2270 if (opt.enable_iri && !opt.encoding_remote)
2272 tmp = parse_charset (tmp2);
2274 set_content_encoding (iri, tmp);
2278 hs->newloc = resp_header_strdup (resp, "Location");
2279 hs->remote_time = resp_header_strdup (resp, "Last-Modified");
2281 if (resp_header_copy (resp, "Content-Range", hdrval, sizeof (hdrval)))
2283 wgint first_byte_pos, last_byte_pos, entity_length;
2284 if (parse_content_range (hdrval, &first_byte_pos, &last_byte_pos,
2287 contrange = first_byte_pos;
2288 contlen = last_byte_pos - first_byte_pos + 1;
2293 /* 20x responses are counted among successful by default. */
2294 if (H_20X (statcode))
2297 /* Return if redirected. */
2298 if (H_REDIRECTED (statcode) || statcode == HTTP_STATUS_MULTIPLE_CHOICES)
2300 /* RFC2068 says that in case of the 300 (multiple choices)
2301 response, the server can output a preferred URL through
2302 `Location' header; otherwise, the request should be treated
2303 like GET. So, if the location is set, it will be a
2304 redirection; otherwise, just proceed normally. */
2305 if (statcode == HTTP_STATUS_MULTIPLE_CHOICES && !hs->newloc)
2309 logprintf (LOG_VERBOSE,
2310 _("Location: %s%s\n"),
2311 hs->newloc ? escnonprint_uri (hs->newloc) : _("unspecified"),
2312 hs->newloc ? _(" [following]") : "");
2313 if (keep_alive && !head_only
2314 && skip_short_body (sock, contlen, chunked_transfer_encoding))
2315 CLOSE_FINISH (sock);
2317 CLOSE_INVALIDATE (sock);
2324 /* If content-type is not given, assume text/html. This is because
2325 of the multitude of broken CGI's that "forget" to generate the
2328 0 == strncasecmp (type, TEXTHTML_S, strlen (TEXTHTML_S)) ||
2329 0 == strncasecmp (type, TEXTXHTML_S, strlen (TEXTXHTML_S)))
2335 0 == strncasecmp (type, TEXTCSS_S, strlen (TEXTCSS_S)))
2340 if (opt.adjust_extension)
2343 /* -E / --adjust-extension / adjust_extension = on was specified,
2344 and this is a text/html file. If some case-insensitive
2345 variation on ".htm[l]" isn't already the file's suffix,
2348 ensure_extension (hs, ".html", dt);
2350 else if (*dt & TEXTCSS)
2352 ensure_extension (hs, ".css", dt);
2356 if (statcode == HTTP_STATUS_RANGE_NOT_SATISFIABLE
2357 || (!opt.timestamping && hs->restval > 0 && statcode == HTTP_STATUS_OK
2358 && contrange == 0 && contlen >= 0 && hs->restval >= contlen))
2360 /* If `-c' is in use and the file has been fully downloaded (or
2361 the remote file has shrunk), Wget effectively requests bytes
2362 after the end of file and the server responds with 416
2363 (or 200 with a <= Content-Length). */
2364 logputs (LOG_VERBOSE, _("\
2365 \n The file is already fully retrieved; nothing to do.\n\n"));
2366 /* In case the caller inspects. */
2369 /* Mark as successfully retrieved. */
2372 CLOSE_INVALIDATE (sock); /* would be CLOSE_FINISH, but there
2373 might be more bytes in the body. */
2375 return RETRUNNEEDED;
2377 if ((contrange != 0 && contrange != hs->restval)
2378 || (H_PARTIAL (statcode) && !contrange))
2380 /* The Range request was somehow misunderstood by the server.
2383 CLOSE_INVALIDATE (sock);
2390 hs->contlen = contlen + contrange;
2396 /* No need to print this output if the body won't be
2397 downloaded at all, or if the original server response is
2399 logputs (LOG_VERBOSE, _("Length: "));
2402 logputs (LOG_VERBOSE, number_to_static_string (contlen + contrange));
2403 if (contlen + contrange >= 1024)
2404 logprintf (LOG_VERBOSE, " (%s)",
2405 human_readable (contlen + contrange));
2408 if (contlen >= 1024)
2409 logprintf (LOG_VERBOSE, _(", %s (%s) remaining"),
2410 number_to_static_string (contlen),
2411 human_readable (contlen));
2413 logprintf (LOG_VERBOSE, _(", %s remaining"),
2414 number_to_static_string (contlen));
2418 logputs (LOG_VERBOSE,
2419 opt.ignore_length ? _("ignored") : _("unspecified"));
2421 logprintf (LOG_VERBOSE, " [%s]\n", quotearg_style (escape_quoting_style, type));
2423 logputs (LOG_VERBOSE, "\n");
2427 type = NULL; /* We don't need it any more. */
2429 /* Return if we have no intention of further downloading. */
2430 if (!(*dt & RETROKF) || head_only)
2432 /* In case the caller cares to look... */
2437 /* Pre-1.10 Wget used CLOSE_INVALIDATE here. Now we trust the
2438 servers not to send body in response to a HEAD request, and
2439 those that do will likely be caught by test_socket_open.
2440 If not, they can be worked around using
2441 `--no-http-keep-alive'. */
2442 CLOSE_FINISH (sock);
2444 && skip_short_body (sock, contlen, chunked_transfer_encoding))
2445 /* Successfully skipped the body; also keep using the socket. */
2446 CLOSE_FINISH (sock);
2448 CLOSE_INVALIDATE (sock);
2450 return RETRFINISHED;
2454 For VMS, define common fopen() optional arguments.
2457 # define FOPEN_OPT_ARGS "fop=sqo", "acc", acc_cb, &open_id
2458 # define FOPEN_BIN_FLAG 3
2459 #else /* def __VMS */
2460 # define FOPEN_BIN_FLAG true
2461 #endif /* def __VMS [else] */
2463 /* Open the local file. */
2466 mkalldirs (hs->local_file);
2468 rotate_backups (hs->local_file);
2475 fp = fopen (hs->local_file, "ab", FOPEN_OPT_ARGS);
2476 #else /* def __VMS */
2477 fp = fopen (hs->local_file, "ab");
2478 #endif /* def __VMS [else] */
2480 else if (ALLOW_CLOBBER || count > 0)
2482 if (opt.unlink && file_exists_p (hs->local_file))
2484 int res = unlink (hs->local_file);
2487 logprintf (LOG_NOTQUIET, "%s: %s\n", hs->local_file,
2489 CLOSE_INVALIDATE (sock);
2499 fp = fopen (hs->local_file, "wb", FOPEN_OPT_ARGS);
2500 #else /* def __VMS */
2501 fp = fopen (hs->local_file, "wb");
2502 #endif /* def __VMS [else] */
2506 fp = fopen_excl (hs->local_file, FOPEN_BIN_FLAG);
2507 if (!fp && errno == EEXIST)
2509 /* We cannot just invent a new name and use it (which is
2510 what functions like unique_create typically do)
2511 because we told the user we'd use this name.
2512 Instead, return and retry the download. */
2513 logprintf (LOG_NOTQUIET,
2514 _("%s has sprung into existence.\n"),
2516 CLOSE_INVALIDATE (sock);
2518 return FOPEN_EXCL_ERR;
2523 logprintf (LOG_NOTQUIET, "%s: %s\n", hs->local_file, strerror (errno));
2524 CLOSE_INVALIDATE (sock);
2532 /* Print fetch message, if opt.verbose. */
2535 logprintf (LOG_NOTQUIET, _("Saving to: %s\n"),
2536 HYPHENP (hs->local_file) ? quote ("STDOUT") : quote (hs->local_file));
2539 /* This confuses the timestamping code that checks for file size.
2540 #### The timestamping code should be smarter about file size. */
/* NOTE(review): the fwrite () return value below is not checked, so a
   short write of the saved headers goes unnoticed. */
2541 if (opt.save_headers && hs->restval == 0)
2542 fwrite (head, 1, strlen (head), fp);
2544 /* Now we no longer need to store the response header. */
2547 /* Download the request body. */
2550 /* If content-length is present, read that much; otherwise, read
2551 until EOF. The HTTP spec doesn't require the server to
2552 actually close the connection when it's done sending data. */
2553 flags |= rb_read_exactly;
2554 if (hs->restval > 0 && contrange == 0)
2555 /* If the server ignored our range request, instruct fd_read_body
2556 to skip the first RESTVAL bytes of body. */
2557 flags |= rb_skip_startpos;
2559 if (chunked_transfer_encoding)
2560 flags |= rb_chunked_transfer_encoding;
2562 hs->len = hs->restval;
2564 hs->res = fd_read_body (sock, fp, contlen != -1 ? contlen : 0,
2565 hs->restval, &hs->rd_size, &hs->len, &hs->dltime,
2569 CLOSE_FINISH (sock);
2573 hs->rderrmsg = xstrdup (fd_errstr (sock));
2574 CLOSE_INVALIDATE (sock);
2581 return RETRFINISHED;
2584 /* The genuine HTTP loop! This is the part where the retrieval is
2585 retried, and retried, and retried, and...

   Each pass calls gethttp () for U and then acts on the error code it
   returned.  Passes are repeated while opt.ntry is zero or COUNT is
   still below opt.ntry.  When time-stamping or spider mode calls for
   it, a preliminary HEAD request is sent before the real download
   (see SEND_HEAD_FIRST below).  If the final result is RETROK and
   LOCAL_FILE is non-NULL, a copy of hstat.local_file is stored in
   *LOCAL_FILE; HSTAT is released with free_hstat () before leaving. */
2587 http_loop (struct url *u, struct url *original_url, char **newloc,
2588 char **local_file, const char *referer, int *dt, struct url *proxy,
2592 bool got_head = false; /* used for time-stamping and filename detection */
2593 bool time_came_from_head = false; /* remote time-stamp was parsed during a HEAD_ONLY pass */
2594 bool got_name = false;
2597 uerr_t err, ret = TRYLIMEXC;
2598 time_t tmr = -1; /* remote time-stamp */
2599 struct http_stat hstat; /* HTTP status */
2601 bool send_head_first = true; /* issue a HEAD request before the real one */
2603 bool force_full_retrieve = false; /* set once the remote file is found to be newer */
2605 /* Assert that no value for *LOCAL_FILE was passed. */
2606 assert (local_file == NULL || *local_file == NULL);
2608 /* Set LOCAL_FILE parameter. */
2609 if (local_file && opt.output_document)
2610 *local_file = HYPHENP (opt.output_document) ? NULL : xstrdup (opt.output_document);
2612 /* Reset NEWLOC parameter. */
2615 /* This used to be done in main(), but it's a better idea to do it
2616 here so that we don't go through the hoops if we're just using
2621 /* Warn on (likely bogus) wildcard usage in HTTP. */
2622 if (opt.ftp_glob && has_wildcards_p (u->path))
2623 logputs (LOG_VERBOSE, _("Warning: wildcards not supported in HTTP.\n"));
2625 /* Setup hstat struct. */
2627 hstat.referer = referer;
2629 if (opt.output_document)
2631 hstat.local_file = xstrdup (opt.output_document);
2634 else if (!opt.content_disposition)
2637 url_file_name (opt.trustservernames ? u : original_url, NULL);
2641 /* TODO: Ick! This code is now in both gethttp and http_loop, and is
2642 * screaming for some refactoring. */
2643 if (got_name && file_exists_p (hstat.local_file) && opt.noclobber && !opt.output_document)
2645 /* If opt.noclobber is turned on and file already exists, do not
2646 retrieve the file. But if the output_document was given, then this
2647 test was already done and the file didn't exist. Hence the !opt.output_document */
2648 logprintf (LOG_VERBOSE, _("\
2649 File %s already there; not retrieving.\n\n"),
2650 quote (hstat.local_file));
2651 /* If the file is there, we suppose it's retrieved OK. */
2654 /* #### Bogusness alert. */
2655 /* If its suffix is "html" or "htm" or similar, assume text/html. */
2656 if (has_html_suffix_p (hstat.local_file))
2663 /* Reset the counter. */
2666 /* Reset the document type. */
2669 /* Skip preliminary HEAD request if we're not in spider mode. */
2671 send_head_first = false;
2673 /* Send preliminary HEAD request if -N is given and we have an existing
2674 * destination file. */
2675 file_name = url_file_name (opt.trustservernames ? u : original_url, NULL);
2676 if (opt.timestamping && (file_exists_p (file_name)
2677 || opt.content_disposition))
2678 send_head_first = true;
2684 /* Increment the pass counter. */
2686 sleep_between_retrievals (count);
2688 /* Get the current time string. */
2689 tms = datetime_str (time (NULL));
2691 if (opt.spider && !got_head)
2692 logprintf (LOG_VERBOSE, _("\
2693 Spider mode enabled. Check if remote file exists.\n"));
2695 /* Print fetch message, if opt.verbose. */
2698 char *hurl = url_string (u, URL_AUTH_HIDE_PASSWD);
2703 sprintf (tmp, _("(try:%2d)"), count);
2704 logprintf (LOG_NOTQUIET, "--%s-- %s %s\n",
2709 logprintf (LOG_NOTQUIET, "--%s-- %s\n",
2714 ws_changetitle (hurl);
2719 /* Default document type is empty. However, if spider mode is
2720 on or time-stamping is employed, the HEAD_ONLY command is
2721 encoded within *dt. */
2722 if (send_head_first && !got_head)
2727 /* Decide whether or not to restart. */
2728 if (force_full_retrieve)
2729 hstat.restval = hstat.len;
2730 else if (opt.always_rest
2732 && stat (hstat.local_file, &st) == 0
2733 && S_ISREG (st.st_mode))
2734 /* When -c is used, continue from on-disk size. (Can't use
2735 hstat.len even if count>1 because we don't want a failed
2736 first attempt to clobber existing data.) */
2737 hstat.restval = st.st_size;
2739 /* otherwise, continue where the previous try left off */
2740 hstat.restval = hstat.len;
2744 /* Decide whether to send the no-cache directive. We send it in two cases:
2746 a) we're using a proxy, and we're past our first retrieval.
2747 Some proxies are notorious for caching incomplete data, so
2748 we require a fresh get.
2749 b) caching is explicitly inhibited. */
2750 if ((proxy && count > 1) /* a */
2751 || !opt.allow_cache) /* b */
2752 *dt |= SEND_NOCACHE;
2754 *dt &= ~SEND_NOCACHE;
2756 /* Try fetching the document, or at least its head. */
2757 err = gethttp (u, &hstat, dt, proxy, iri, count);
2760 tms = datetime_str (time (NULL));
2762 /* Get the new location (with or without the redirection). */
2764 *newloc = xstrdup (hstat.newloc);
2768 case HERR: case HEOF: case CONSOCKERR: case CONCLOSED:
2769 case CONERROR: case READERR: case WRITEFAILED:
2770 case RANGEERR: case FOPEN_EXCL_ERR:
2771 /* Non-fatal errors continue executing the loop, which will
2772 bring them to "while" statement at the end, to judge
2773 whether the number of tries was exceeded. */
2774 printwhat (count, opt.ntry);
2776 case FWRITEERR: case FOPENERR:
2777 /* Another fatal error. */
2778 logputs (LOG_VERBOSE, "\n");
2779 logprintf (LOG_NOTQUIET, _("Cannot write to %s (%s).\n"),
2780 quote (hstat.local_file), strerror (errno));
2781 case HOSTERR: case CONIMPOSSIBLE: case PROXERR: case AUTHFAILED:
2782 case SSLINITFAILED: case CONTNOTSUPPORTED: case VERIFCERTERR:
2783 /* Fatal errors just return from the function. */
2787 /* Another fatal error. */
2788 logprintf (LOG_NOTQUIET, _("Unable to establish SSL connection.\n"));
2792 /* Another fatal error. */
2793 logputs (LOG_VERBOSE, "\n");
2794 logprintf (LOG_NOTQUIET, _("Cannot unlink %s (%s).\n"),
2795 quote (hstat.local_file), strerror (errno));
2799 /* Return the new location to the caller. */
2802 logprintf (LOG_NOTQUIET,
2803 _("ERROR: Redirection (%d) without location.\n"),
2813 /* The file was already fully retrieved. */
2817 /* Deal with you later. */
2820 /* All possibilities should have been exhausted. */
2824 if (!(*dt & RETROKF))
2829 /* #### Ugly ugly ugly! */
2830 hurl = url_string (u, URL_AUTH_HIDE_PASSWD);
2831 logprintf (LOG_NONVERBOSE, "%s:\n", hurl);
2834 /* Fall back to GET if HEAD fails with a 500 or 501 error code. */
2836 && (hstat.statcode == 500 || hstat.statcode == 501))
2841 /* Maybe we should always keep track of broken links, not just in spider mode.
2843 * Don't log error if it was UTF-8 encoded because we will try
2844 * once unencoded. */
2845 else if (opt.spider && !iri->utf8_encode)
2847 /* #### Again: ugly ugly ugly! */
2849 hurl = url_string (u, URL_AUTH_HIDE_PASSWD);
2850 nonexisting_url (hurl);
2851 logprintf (LOG_NOTQUIET, _("\
2852 Remote file does not exist -- broken link!!!\n"));
2856 logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
2857 tms, hstat.statcode,
2858 quotearg_style (escape_quoting_style, hstat.error));
2860 logputs (LOG_VERBOSE, "\n");
2866 /* Did we get the time-stamp? */
2869 got_head = true; /* no more time-stamping */
2871 if (opt.timestamping && !hstat.remote_time)
2873 logputs (LOG_NOTQUIET, _("\
2874 Last-modified header missing -- time-stamps turned off.\n"));
2876 else if (hstat.remote_time)
2878 /* Convert the date-string into struct tm. */
2879 tmr = http_atotm (hstat.remote_time);
2880 if (tmr == (time_t) (-1))
2881 logputs (LOG_VERBOSE, _("\
2882 Last-modified header invalid -- time-stamp ignored.\n"));
2883 if (*dt & HEAD_ONLY)
2884 time_came_from_head = true;
2887 if (send_head_first)
2889 /* The time-stamping section. */
2890 if (opt.timestamping)
2892 if (hstat.orig_file_name) /* Perform the following
2893 checks only if the file we are to
2895 download already exists. */
2897 if (hstat.remote_time &&
2898 tmr != (time_t) (-1))
2900 /* Now time-stamping can be used validly.
2901 Time-stamping means that if the sizes of
2902 the local and remote file match, and local
2903 file is newer than the remote file, it will
2904 not be retrieved. Otherwise, the normal
2905 download procedure is resumed. */
2906 if (hstat.orig_file_tstamp >= tmr)
2908 if (hstat.contlen == -1
2909 || hstat.orig_file_size == hstat.contlen)
2911 logprintf (LOG_VERBOSE, _("\
2912 Server file no newer than local file %s -- not retrieving.\n\n"),
2913 quote (hstat.orig_file_name));
2919 logprintf (LOG_VERBOSE, _("\
2920 The sizes do not match (local %s) -- retrieving.\n"),
2921 number_to_static_string (hstat.orig_file_size));
2926 force_full_retrieve = true;
2927 logputs (LOG_VERBOSE,
2928 _("Remote file is newer, retrieving.\n"));
2931 logputs (LOG_VERBOSE, "\n");
2935 /* free_hstat (&hstat); */
2936 hstat.timestamp_checked = true;
2941 bool finished = true;
2946 logputs (LOG_VERBOSE, _("\
2947 Remote file exists and could contain links to other resources -- retrieving.\n\n"));
2952 logprintf (LOG_VERBOSE, _("\
2953 Remote file exists but does not contain any link -- not retrieving.\n\n"));
2954 ret = RETROK; /* RETRUNNEEDED is not for caller. */
2961 logprintf (LOG_VERBOSE, _("\
2962 Remote file exists and could contain further links,\n\
2963 but recursion is disabled -- not retrieving.\n\n"));
2967 logprintf (LOG_VERBOSE, _("\
2968 Remote file exists.\n\n"));
2970 ret = RETROK; /* RETRUNNEEDED is not for caller. */
2975 logprintf (LOG_NONVERBOSE,
2976 _("%s URL: %s %2d %s\n"),
2977 tms, u->url, hstat.statcode,
2978 hstat.message ? quotearg_style (escape_quoting_style, hstat.message) : "");
2985 count = 0; /* the retrieve count for HEAD is reset */
2987 } /* send_head_first */
2990 if (opt.useservertimestamps
2991 && (tmr != (time_t) (-1))
2992 && ((hstat.len == hstat.contlen) ||
2993 ((hstat.res == 0) && (hstat.contlen == -1))))
2995 const char *fl = NULL;
2996 set_local_file (&fl, hstat.local_file);
3000 /* Reparse time header, in case it's changed. */
3001 if (time_came_from_head
3002 && hstat.remote_time && hstat.remote_time[0])
3004 newtmr = http_atotm (hstat.remote_time);
3005 if (newtmr != (time_t)-1)
3011 /* End of time-stamping section. */
3013 tmrate = retr_rate (hstat.rd_size, hstat.dltime);
3014 total_download_time += hstat.dltime;
3016 if (hstat.len == hstat.contlen)
3020 bool write_to_stdout = (opt.output_document && HYPHENP (opt.output_document));
3022 logprintf (LOG_VERBOSE,
3024 ? _("%s (%s) - written to stdout %s[%s/%s]\n\n")
3025 : _("%s (%s) - %s saved [%s/%s]\n\n"),
3027 write_to_stdout ? "" : quote (hstat.local_file),
3028 number_to_static_string (hstat.len),
3029 number_to_static_string (hstat.contlen));
3030 logprintf (LOG_NONVERBOSE,
3031 "%s URL:%s [%s/%s] -> \"%s\" [%d]\n",
3033 number_to_static_string (hstat.len),
3034 number_to_static_string (hstat.contlen),
3035 hstat.local_file, count);
3038 total_downloaded_bytes += hstat.rd_size;
3040 /* Remember that we downloaded the file for later ".orig" code. */
3041 if (*dt & ADDED_HTML_EXTENSION)
3042 downloaded_file (FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED, hstat.local_file);
3044 downloaded_file (FILE_DOWNLOADED_NORMALLY, hstat.local_file);
3049 else if (hstat.res == 0) /* No read error */
3051 if (hstat.contlen == -1) /* We don't know how much we were supposed
3052 to get, so assume we succeeded. */
3056 bool write_to_stdout = (opt.output_document && HYPHENP (opt.output_document));
3058 logprintf (LOG_VERBOSE,
3060 ? _("%s (%s) - written to stdout %s[%s]\n\n")
3061 : _("%s (%s) - %s saved [%s]\n\n"),
3063 write_to_stdout ? "" : quote (hstat.local_file),
3064 number_to_static_string (hstat.len));
3065 logprintf (LOG_NONVERBOSE,
3066 "%s URL:%s [%s] -> \"%s\" [%d]\n",
3067 tms, u->url, number_to_static_string (hstat.len),
3068 hstat.local_file, count);
3071 total_downloaded_bytes += hstat.rd_size;
3073 /* Remember that we downloaded the file for later ".orig" code. */
3074 if (*dt & ADDED_HTML_EXTENSION)
3075 downloaded_file (FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED, hstat.local_file);
3077 downloaded_file (FILE_DOWNLOADED_NORMALLY, hstat.local_file);
3082 else if (hstat.len < hstat.contlen) /* meaning we lost the
3083 connection too soon */
3085 logprintf (LOG_VERBOSE,
3086 _("%s (%s) - Connection closed at byte %s. "),
3087 tms, tmrate, number_to_static_string (hstat.len));
3088 printwhat (count, opt.ntry);
3091 else if (hstat.len != hstat.restval)
3092 /* Getting here would mean reading more data than
3093 requested with content-length, which we never do. */
3097 /* Getting here probably means that the content-length was
3098 * _less_ than the original, local size. We should probably
3099 * truncate or re-read, or something. FIXME */
3104 else /* from now on hstat.res can only be -1 */
3106 if (hstat.contlen == -1)
3108 logprintf (LOG_VERBOSE,
3109 _("%s (%s) - Read error at byte %s (%s)."),
3110 tms, tmrate, number_to_static_string (hstat.len),
3112 printwhat (count, opt.ntry);
3115 else /* hstat.res == -1 and contlen is given */
3117 logprintf (LOG_VERBOSE,
3118 _("%s (%s) - Read error at byte %s/%s (%s). "),
3120 number_to_static_string (hstat.len),
3121 number_to_static_string (hstat.contlen),
3123 printwhat (count, opt.ntry);
3129 while (!opt.ntry || (count < opt.ntry));
3132 if (ret == RETROK && local_file)
3133 *local_file = xstrdup (hstat.local_file);
3134 free_hstat (&hstat);
/* Decide whether strptime() consumed everything that matters.

   P is the pointer returned by strptime(), i.e. the first character it
   did not parse.  After skipping leading whitespace, the parse counts
   as successful when what remains is empty, starts with `GMT', or
   starts with a sign followed by a digit (a numeric zone offset).

   As an extended regexp: the result is true if P matches
   "^ *(GMT|[+-][0-9]|$)" and false otherwise.  A NULL P, which
   strptime() returns on failure, yields false. */
static bool
check_end (const char *p)
{
  const char *rest = p;

  if (rest == NULL)
    return false;

  /* Step over any whitespace strptime() left behind. */
  for (; c_isspace (*rest); rest++)
    ;

  if (*rest == '\0')
    return true;
  if (rest[0] == 'G' && rest[1] == 'M' && rest[2] == 'T')
    return true;
  if ((rest[0] == '+' || rest[0] == '-') && c_isdigit (rest[1]))
    return true;

  return false;
}
3162 /* Convert the textual specification of time in TIME_STRING to the
3163 number of seconds since the Epoch.
3165 TIME_STRING can be in any of the three formats RFC2616 allows the
3166 HTTP servers to emit -- RFC1123-date, RFC850-date or asctime-date,
3167 as well as the time format used in the Set-Cookie header.
3168 Timezones are ignored, and should be GMT.
3170 Return the computed time_t representation, or -1 if the conversion fails.
3173 This function uses strptime with various string formats for parsing
3174 TIME_STRING. This results in a parser that is not as lenient in
3175 interpreting TIME_STRING as I would like it to be. Being based on
3176 strptime, it always allows shortened months, one-digit days, etc.,
3177 but due to the multitude of formats in which time can be
3178 represented, an ideal HTTP time parser would be even more
3179 forgiving. It should completely ignore things like week days and
3180 concentrate only on the various forms of representing years,
3181 months, days, hours, minutes, and seconds. For example, it would
3182 be nice if it accepted ISO 8601 out of the box.
3184 I've investigated free and PD code for this purpose, but none was
3185 usable. getdate was big and unwieldy, and had potential copyright
3186 issues, or so I was informed. Dr. Marcus Hennecke's atotm(),
3187 distributed with phttpd, is excellent, but we cannot use it because
3188 it is not assigned to the FSF. So I stuck it with strptime. */
3191 http_atotm (const char *time_string)
3193 /* NOTE: Solaris strptime man page claims that %n and %t match white
3194 space, but that's not universally available. Instead, we simply
3195 use ` ' to mean "skip all WS", which works under all strptime
3196 implementations I've tested. */
/* The formats are tried in the order listed. */
3198 static const char *time_formats[] = {
3199 "%a, %d %b %Y %T", /* rfc1123: Thu, 29 Jan 1998 22:12:57 */
3200 "%A, %d-%b-%y %T", /* rfc850: Thursday, 29-Jan-98 22:12:57 */
3201 "%a %b %d %T %Y", /* asctime: Thu Jan 29 22:12:57 1998 */
3202 "%a, %d-%b-%Y %T" /* cookies: Thu, 29-Jan-1998 22:12:57
3203 (used in Set-Cookie, defined in the
3204 Netscape cookie specification.) */
3206 const char *oldlocale;
3207 char savedlocale[256];
3209 time_t ret = (time_t) -1;
3211 /* Solaris strptime fails to recognize English month names in
3212 non-English locales, which we work around by temporarily setting
3213 locale to C before invoking strptime. */
3214 oldlocale = setlocale (LC_TIME, NULL);
/* Copy the current locale name into SAVEDLOCALE before the locale is
   switched below.  A name too long for the buffer is not saved;
   SAVEDLOCALE is then left as an empty string. */
3217 size_t l = strlen (oldlocale) + 1;
3218 if (l >= sizeof savedlocale)
3219 savedlocale[0] = '\0';
3221 memcpy (savedlocale, oldlocale, l);
3223 else savedlocale[0] = '\0';
3225 setlocale (LC_TIME, "C");
/* Try each known format; check_end () decides whether strptime
   parsed far enough into TIME_STRING for the match to count. */
3227 for (i = 0; i < countof (time_formats); i++)
3231 /* Some versions of strptime use the existing contents of struct
3232 tm to recalculate the date according to format. Zero it out
3233 to prevent stack garbage from influencing strptime. */
3236 if (check_end (strptime (time_string, time_formats[i], &t)))
3243 /* Restore the previous locale. */
3245 setlocale (LC_TIME, savedlocale);
3250 /* Authorization support: We support three authorization schemes:
3252 * `Basic' scheme, consisting of base64-ing USER:PASSWORD string;
3254 * `Digest' scheme, added by Junio Hamano <junio@twinsun.com>,
3255 consisting of answering the server's challenge with the proper MD5 digests;
3258 * `NTLM' ("NT Lan Manager") scheme, based on code written by Daniel
3259 Stenberg for libcurl. Like digest, NTLM is based on a
3260 challenge-response mechanism, but unlike digest, it is non-standard
3261 (authenticates TCP connections rather than requests), undocumented
3262 and Microsoft-specific. */
/* Build the header value for the `Basic' authentication scheme.

   The credentials are joined as "USER:PASSWD", base64-encoded, and
   prefixed with "Basic ".  Both scratch buffers live on the stack
   (alloca); only the string produced by concat_strings () is
   returned to the caller. */
static char *
basic_authentication_encode (const char *user, const char *passwd)
{
  size_t user_len = strlen (user);
  size_t passwd_len = strlen (passwd);
  int plain_len = user_len + 1 + passwd_len;
  char *plain = (char *) alloca (plain_len + 1);
  char *encoded;

  /* Assemble "USER:PASSWD"; the second copy brings the trailing NUL. */
  memcpy (plain, user, user_len);
  plain[user_len] = ':';
  memcpy (plain + user_len + 1, passwd, passwd_len + 1);

  encoded = (char *) alloca (BASE64_LENGTH (plain_len) + 1);
  base64_encode (plain, plain_len, encoded);

  return concat_strings ("Basic ", encoded, (char *) 0);
}
/* SKIP_WS (X): statement-like macro (do { ... } form) built around a
   loop that runs while *X is whitespace according to c_isspace ().
   X is expanded inside the loop condition, so it is evaluated on
   every iteration.
   NOTE(review): the loop body and the end of the macro are not
   visible here -- presumably it advances X past the whitespace;
   confirm. */
3283 #define SKIP_WS(x) do { \
3284 while (c_isspace (*(x))) \
3288 #ifdef ENABLE_DIGEST
3289 /* Dump the hexadecimal representation of HASH to BUF. HASH should be
3290 an array of 16 bytes containing the hash keys, and BUF should be a
3291 buffer of 33 writable characters (32 for hex digits plus one for
3292 zero termination). */
3294 dump_hash (char *buf, const unsigned char *hash)
3298 for (i = 0; i < MD5_DIGEST_SIZE; i++, hash++)
3300 *buf++ = XNUM_TO_digit (*hash >> 4);
3301 *buf++ = XNUM_TO_digit (*hash & 0xf);
3306 /* Take the line apart to find the challenge, and compose a digest
3307 authorization header. See RFC2069 section 2.1.2.

   AU is the challenge text and starts with the word `Digest', which
   is skipped.  The remaining comma-separated parameters are read with
   extract_param () and matched by name against the OPTIONS table
   below; a match stores a copy of the value in the corresponding
   variable.  USER, PASSWD, METHOD and PATH are the other inputs to
   the MD5 computation.  The header text is written into a buffer
   obtained from xmalloc. */
3309 digest_authentication_encode (const char *au, const char *user,
3310 const char *passwd, const char *method,
/* Static storage: shared between calls, and reset to NULL on entry
   further down. */
3313 static char *realm, *opaque, *nonce;
3318 { "realm", &realm },
3319 { "opaque", &opaque },
3323 param_token name, value;
3325 realm = opaque = nonce = NULL;
3327 au += 6; /* skip over `Digest' */
3328 while (extract_param (&au, &name, &value, ','))
3331 size_t namelen = name.e - name.b;
/* NAME is delimited by [name.b, name.e) rather than zero-terminated,
   so compare the lengths first and then that many characters. */
3332 for (i = 0; i < countof (options); i++)
3333 if (namelen == strlen (options[i].name)
3334 && 0 == strncmp (name.b, options[i].name,
3337 *options[i].variable = strdupdelim (value.b, value.e);
/* Every one of these strings is used by the computation below;
   OPAQUE is the only optional one. */
3341 if (!realm || !nonce || !user || !passwd || !path || !method)
3344 xfree_null (opaque);
3349 /* Calculate the digest value. */
3352 unsigned char hash[MD5_DIGEST_SIZE];
3353 char a1buf[MD5_DIGEST_SIZE * 2 + 1], a2buf[MD5_DIGEST_SIZE * 2 + 1];
3354 char response_digest[MD5_DIGEST_SIZE * 2 + 1];
3356 /* A1BUF = H(user ":" realm ":" password) */
3357 md5_init_ctx (&ctx);
3358 md5_process_bytes ((unsigned char *)user, strlen (user), &ctx);
3359 md5_process_bytes ((unsigned char *)":", 1, &ctx);
3360 md5_process_bytes ((unsigned char *)realm, strlen (realm), &ctx);
3361 md5_process_bytes ((unsigned char *)":", 1, &ctx);
3362 md5_process_bytes ((unsigned char *)passwd, strlen (passwd), &ctx);
3363 md5_finish_ctx (&ctx, hash);
3364 dump_hash (a1buf, hash);
3366 /* A2BUF = H(method ":" path) */
3367 md5_init_ctx (&ctx);
3368 md5_process_bytes ((unsigned char *)method, strlen (method), &ctx);
3369 md5_process_bytes ((unsigned char *)":", 1, &ctx);
3370 md5_process_bytes ((unsigned char *)path, strlen (path), &ctx);
3371 md5_finish_ctx (&ctx, hash);
3372 dump_hash (a2buf, hash);
3374 /* RESPONSE_DIGEST = H(A1BUF ":" nonce ":" A2BUF) */
3375 md5_init_ctx (&ctx);
3376 md5_process_bytes ((unsigned char *)a1buf, MD5_DIGEST_SIZE * 2, &ctx);
3377 md5_process_bytes ((unsigned char *)":", 1, &ctx);
3378 md5_process_bytes ((unsigned char *)nonce, strlen (nonce), &ctx);
3379 md5_process_bytes ((unsigned char *)":", 1, &ctx);
3380 md5_process_bytes ((unsigned char *)a2buf, MD5_DIGEST_SIZE * 2, &ctx);
3381 md5_finish_ctx (&ctx, hash);
3382 dump_hash (response_digest, hash);
/* Size the result from the lengths of the strings formatted into it
   below, including the optional opaque value. */
3384 res = xmalloc (strlen (user)
3389 + 2 * MD5_DIGEST_SIZE /*strlen (response_digest)*/
3390 + (opaque ? strlen (opaque) : 0)
3392 sprintf (res, "Digest \
3393 username=\"%s\", realm=\"%s\", nonce=\"%s\", uri=\"%s\", response=\"%s\"",
3394 user, realm, nonce, path, response_digest);
/* P points at the terminating zero of RES, so strcat appends there
   without rescanning the text already written. */
3397 char *p = res + strlen (res);
3398 strcat (p, ", opaque=\"");
3405 #endif /* ENABLE_DIGEST */
3407 /* STRSIZE (LITERAL) is the length of the string literal LITERAL:
3408 sizeof counts the terminating \0, so one is subtracted. */
3409 #define STRSIZE(literal) (sizeof (literal) - 1)
3411 /* Whether chars in [b, e) begin with the literal string provided as
3412 first argument and are followed by whitespace or terminating \0.
3413 The comparison is case-insensitive.

   The visible conditions require that [b, e) be at least as long as
   LITERAL, that its first STRSIZE (LITERAL) characters equal LITERAL
   under strncasecmp, and that either the range ends right there or
   the next character is whitespace according to c_isspace ().

   B and E are expanded several times, and B and LITERAL appear
   without surrounding parentheses, so pass simple side-effect-free
   expressions.
   NOTE(review): the first line of the expansion is not visible here;
   confirm what it checks. */
3414 #define STARTS(literal, b, e) \
3416 && ((size_t) ((e) - (b))) >= STRSIZE (literal) \
3417 && 0 == strncasecmp (b, literal, STRSIZE (literal)) \
3418 && ((size_t) ((e) - (b)) == STRSIZE (literal) \
3419 || c_isspace (b[STRSIZE (literal)])))
/* Report whether the authentication challenge in [HDRBEG, HDREND)
   names a scheme this build can answer.

   `Basic' is always recognized; `Digest' and `NTLM' are recognized
   only when the corresponding support is compiled in.  Matching is
   done with STARTS, so it is case-insensitive and the scheme name
   must be followed by whitespace or the end of the header. */
static bool
known_authentication_scheme_p (const char *hdrbeg, const char *hdrend)
{
  if (STARTS ("Basic", hdrbeg, hdrend))
    return true;
#ifdef ENABLE_DIGEST
  if (STARTS ("Digest", hdrbeg, hdrend))
    return true;
#endif
#ifdef ENABLE_NTLM
  if (STARTS ("NTLM", hdrbeg, hdrend))
    return true;
#endif
  return false;
}
3436 /* Create the HTTP authorization request header. When the
3437 `WWW-Authenticate' response header is seen, according to the
3438 authorization scheme specified in that header, produce an
3439 appropriate HTTP authorization request header.

   AU is the challenge; the scheme is chosen from its first character,
   compared in upper case: `B' selects basic_authentication_encode (),
   `D' selects digest_authentication_encode () when ENABLE_DIGEST is
   defined, and `N' feeds AU to ntlm_input () and then returns the
   result of ntlm_output ().  USER and PASSWD are the credentials;
   METHOD and PATH are passed on only to the digest encoder.
   FINISHED is handed to ntlm_output () in the NTLM case.
3440 Callers must pass only schemes accepted by known_authentication_scheme_p. */
3442 create_authorization_line (const char *au, const char *user,
3443 const char *passwd, const char *method,
3444 const char *path, bool *finished)
3446 /* We are called only with known schemes, so we can dispatch on the
3448 switch (c_toupper (*au))
3450 case 'B': /* Basic */
3452 return basic_authentication_encode (user, passwd);
3453 #ifdef ENABLE_DIGEST
3454 case 'D': /* Digest */
3456 return digest_authentication_encode (au, user, passwd, method, path);
3459 case 'N': /* NTLM */
3460 if (!ntlm_input (&pconn.ntlm, au))
3465 return ntlm_output (&pconn.ntlm, user, passwd, finished);
3468 /* We shouldn't get here -- this function should be only called
3469 with values approved by known_authentication_scheme_p. */
/* Create the shared cookie jar the first time it is needed.  Then, if
   a cookie input file is configured (opt.cookies_input), load it into
   the jar; cookies_loaded_p guards the load so it happens only once. */
3477 if (!wget_cookie_jar)
3478 wget_cookie_jar = cookie_jar_new ();
3479 if (opt.cookies_input && !cookies_loaded_p)
3481 cookie_jar_load (wget_cookie_jar, opt.cookies_input);
3482 cookies_loaded_p = true;
/* Save the cookie jar to opt.cookies_output.  Nothing is done when no
   jar has been created.
   NOTE(review): the enclosing function header is not visible here. */
3489 if (wget_cookie_jar)
3490 cookie_jar_save (wget_cookie_jar, opt.cookies_output);
/* Release pconn.host through xfree_null () and delete the cookie jar
   if one has been created.
   NOTE(review): the enclosing function header is not visible here. */
3496 xfree_null (pconn.host);
3497 if (wget_cookie_jar)
3498 cookie_jar_delete (wget_cookie_jar);
3502 ensure_extension (struct http_stat *hs, const char *ext, int *dt)
3504 char *last_period_in_local_filename = strrchr (hs->local_file, '.');
3506 int len = strlen (ext);
3509 strncpy (shortext, ext, len - 1);
3510 shortext[len - 2] = '\0';
3513 if (last_period_in_local_filename == NULL
3514 || !(0 == strcasecmp (last_period_in_local_filename, shortext)
3515 || 0 == strcasecmp (last_period_in_local_filename, ext)))
3517 int local_filename_len = strlen (hs->local_file);
3518 /* Resize the local file, allowing for ".html" preceded by
3519 optional ".NUMBER". */
3520 hs->local_file = xrealloc (hs->local_file,
3521 local_filename_len + 24 + len);
3522 strcpy (hs->local_file + local_filename_len, ext);
3523 /* If clobbering is not allowed and the file, as named,
3524 exists, tack on ".NUMBER.html" instead. */
3525 if (!ALLOW_CLOBBER && file_exists_p (hs->local_file))
3529 sprintf (hs->local_file + local_filename_len,
3530 ".%d%s", ext_num++, ext);
3531 while (file_exists_p (hs->local_file));
3533 *dt |= ADDED_HTML_EXTENSION;
/* Unit test for parse_content_disposition ().  Each table entry holds
   a Content-Disposition header value, the file name expected from it,
   and the expected return value.  The header is passed to
   parse_content_disposition () and the outcome is checked with
   mu_assert: the return value must match, and the extracted file name
   is compared with strcmp.
   NOTE(review): part of the assertion condition is not visible here;
   confirm when the file name comparison applies. */
3541 test_parse_content_disposition()
3549 { "filename=\"file.ext\"", "file.ext", true },
3550 { "attachment; filename=\"file.ext\"", "file.ext", true },
3551 { "attachment; filename=\"file.ext\"; dummy", "file.ext", true },
3552 { "attachment", NULL, false },
3553 { "attachement; filename*=UTF-8'en-US'hello.txt", "hello.txt", true },
3554 { "attachement; filename*0=\"hello\"; filename*1=\"world.txt\"", "helloworld.txt", true },
3557 for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
3562 res = parse_content_disposition (test_array[i].hdrval, &filename);
3564 mu_assert ("test_parse_content_disposition: wrong result",
3565 res == test_array[i].result
3567 || 0 == strcmp (test_array[i].filename, filename)));
3573 #endif /* TESTING */
3576 * vim: et sts=2 sw=2 cino+={s