X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Fhttp.c;h=9f274dc618abc8c65acbfe2e001206acf7ff7c54;hp=142b9219442a8c5f2888ac8bea9edb76910786d0;hb=e9cc8b2f7c4678b832ad56f7119bba86a8db08ef;hpb=2f6aa1d7417df1dfc58597777686fbd77179b9fd diff --git a/src/http.c b/src/http.c index 142b9219..9f274dc6 100644 --- a/src/http.c +++ b/src/http.c @@ -1,6 +1,6 @@ /* HTTP support. Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, - 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, + 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. This file is part of GNU Wget. @@ -58,6 +58,7 @@ as that of the covered work. */ #include "md5.h" #include "convert.h" #include "spider.h" +#include "warc.h" #ifdef TESTING #include "test.h" @@ -146,27 +147,20 @@ struct request { extern int numurls; -/* Create a new, empty request. At least request_set_method must be - called before the request can be used. */ +/* Create a new, empty request. Set the request's method and its + arguments. METHOD should be a literal string (or it should outlive + the request) because it will not be freed. ARG will be freed by + request_free. */ static struct request * -request_new (void) +request_new (const char *method, char *arg) { struct request *req = xnew0 (struct request); req->hcapacity = 8; req->headers = xnew_array (struct request_header, req->hcapacity); - return req; -} - -/* Set the request's method and its arguments. METH should be a - literal string (or it should outlive the request) because it will - not be freed. ARG will be freed by request_free. */ - -static void -request_set_method (struct request *req, const char *meth, char *arg) -{ - req->method = meth; + req->method = method; req->arg = arg; + return req; } /* Return the method string passed with the last call to @@ -230,7 +224,7 @@ release_header (struct request_header *hdr) */ static void -request_set_header (struct request *req, char *name, char *value, +request_set_header (struct request *req, const char *name, const char *value, enum rp release_policy) { struct request_header *hdr; @@ -241,7 +235,7 @@ request_set_header (struct request *req, char *name, char *value, /* A NULL value is a no-op; if freeing the name is requested, free it now to avoid leaks. */ if (release_policy == rel_name || release_policy == rel_both) - xfree (name); + xfree ((void *)name); return; } @@ -252,8 +246,8 @@ request_set_header (struct request *req, char *name, char *value, { /* Replace existing header. */ release_header (hdr); - hdr->name = name; - hdr->value = value; + hdr->name = (void *)name; + hdr->value = (void *)value; hdr->release_policy = release_policy; return; } @@ -267,8 +261,8 @@ request_set_header (struct request *req, char *name, char *value, req->headers = xrealloc (req->headers, req->hcapacity * sizeof (*hdr)); } hdr = &req->headers[req->hcount++]; - hdr->name = name; - hdr->value = value; + hdr->name = (void *)name; + hdr->value = (void *)value; hdr->release_policy = release_policy; } @@ -295,7 +289,7 @@ request_set_user_header (struct request *req, const char *header) the header was actually removed, false otherwise. */ static bool -request_remove_header (struct request *req, char *name) +request_remove_header (struct request *req, const char *name) { int i; for (i = 0; i < req->hcount; i++) @@ -320,10 +314,12 @@ request_remove_header (struct request *req, char *name) p += A_len; \ } while (0) -/* Construct the request and write it to FD using fd_write. */ +/* Construct the request and write it to FD using fd_write. + If warc_tmp is set to a file pointer, the request string will + also be written to that file. */ static int -request_send (const struct request *req, int fd) +request_send (const struct request *req, int fd, FILE *warc_tmp) { char *request_string, *p; int i, size, write_error; @@ -374,6 +370,13 @@ request_send (const struct request *req, int fd) if (write_error < 0) logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"), fd_errstr (fd)); + else if (warc_tmp != NULL) + { + /* Write a copy of the data to the WARC record. */ + int warc_tmp_written = fwrite (request_string, 1, size - 1, warc_tmp); + if (warc_tmp_written != size - 1) + return -2; + } return write_error; } @@ -444,17 +447,19 @@ register_basic_auth_host (const char *hostname) /* Send the contents of FILE_NAME to SOCK. Make sure that exactly PROMISED_SIZE bytes are sent over the wire -- if the file is - longer, read only that much; if the file is shorter, report an error. */ + longer, read only that much; if the file is shorter, report an error. + If warc_tmp is set to a file pointer, the post data will + also be written to that file. */ static int -post_file (int sock, const char *file_name, wgint promised_size) +body_file_send (int sock, const char *file_name, wgint promised_size, FILE *warc_tmp) { static char chunk[8192]; wgint written = 0; int write_error; FILE *fp; - DEBUGP (("[writing POST file %s ... ", file_name)); + DEBUGP (("[writing BODY file %s ... ", file_name)); fp = fopen (file_name, "rb"); if (!fp) @@ -472,6 +477,16 @@ post_file (int sock, const char *file_name, wgint promised_size) fclose (fp); return -1; } + if (warc_tmp != NULL) + { + /* Write a copy of the data to the WARC record. */ + int warc_tmp_written = fwrite (chunk, 1, towrite, warc_tmp); + if (warc_tmp_written != towrite) + { + fclose (fp); + return -2; + } + } written += towrite; } fclose (fp); @@ -929,9 +944,12 @@ skip_short_body (int fd, wgint contlen, bool chunked) break; remaining_chunk_size = strtol (line, &endl, 16); + xfree (line); + if (remaining_chunk_size == 0) { - fd_read_line (fd); + line = fd_read_line (fd); + xfree_null (line); break; } } @@ -956,8 +974,13 @@ skip_short_body (int fd, wgint contlen, bool chunked) { remaining_chunk_size -= ret; if (remaining_chunk_size == 0) - if (fd_read_line (fd) == NULL) - return false; + { + char *line = fd_read_line (fd); + if (line == NULL) + return false; + else + xfree (line); + } } /* Safe even if %.*s bogusly expects terminating \0 because @@ -1448,6 +1471,149 @@ free_hstat (struct http_stat *hs) hs->error = NULL; } +static void +get_file_flags (const char *filename, int *dt) +{ + logprintf (LOG_VERBOSE, _("\ +File %s already there; not retrieving.\n\n"), quote (filename)); + /* If the file is there, we suppose it's retrieved OK. */ + *dt |= RETROKF; + + /* #### Bogusness alert. */ + /* If its suffix is "html" or "htm" or similar, assume text/html. */ + if (has_html_suffix_p (filename)) + *dt |= TEXTHTML; +} + +/* Download the response body from the socket and writes it to + an output file. The headers have already been read from the + socket. If WARC is enabled, the response body will also be + written to a WARC response record. + + hs, contlen, contrange, chunked_transfer_encoding and url are + parameters from the gethttp method. fp is a pointer to the + output file. + + url, warc_timestamp_str, warc_request_uuid, warc_ip, type + and statcode will be saved in the headers of the WARC record. + The head parameter contains the HTTP headers of the response. + + If fp is NULL and WARC is enabled, the response body will be + written only to the WARC file. If WARC is disabled and fp + is a file pointer, the data will be written to the file. + If fp is a file pointer and WARC is enabled, the body will + be written to both destinations. + + Returns the error code. */ +static int +read_response_body (struct http_stat *hs, int sock, FILE *fp, wgint contlen, + wgint contrange, bool chunked_transfer_encoding, + char *url, char *warc_timestamp_str, char *warc_request_uuid, + ip_address *warc_ip, char *type, int statcode, char *head) +{ + int warc_payload_offset = 0; + FILE *warc_tmp = NULL; + int warcerr = 0; + + if (opt.warc_filename != NULL) + { + /* Open a temporary file where we can write the response before we + add it to the WARC record. */ + warc_tmp = warc_tempfile (); + if (warc_tmp == NULL) + warcerr = WARC_TMP_FOPENERR; + + if (warcerr == 0) + { + /* We should keep the response headers for the WARC record. */ + int head_len = strlen (head); + int warc_tmp_written = fwrite (head, 1, head_len, warc_tmp); + if (warc_tmp_written != head_len) + warcerr = WARC_TMP_FWRITEERR; + warc_payload_offset = head_len; + } + + if (warcerr != 0) + { + if (warc_tmp != NULL) + fclose (warc_tmp); + return warcerr; + } + } + + if (fp != NULL) + { + /* This confuses the timestamping code that checks for file size. + #### The timestamping code should be smarter about file size. */ + if (opt.save_headers && hs->restval == 0) + fwrite (head, 1, strlen (head), fp); + } + + /* Read the response body. */ + int flags = 0; + if (contlen != -1) + /* If content-length is present, read that much; otherwise, read + until EOF. The HTTP spec doesn't require the server to + actually close the connection when it's done sending data. */ + flags |= rb_read_exactly; + if (fp != NULL && hs->restval > 0 && contrange == 0) + /* If the server ignored our range request, instruct fd_read_body + to skip the first RESTVAL bytes of body. */ + flags |= rb_skip_startpos; + if (chunked_transfer_encoding) + flags |= rb_chunked_transfer_encoding; + + hs->len = hs->restval; + hs->rd_size = 0; + /* Download the response body and write it to fp. + If we are working on a WARC file, we simultaneously write the + response body to warc_tmp. */ + hs->res = fd_read_body (sock, fp, contlen != -1 ? contlen : 0, + hs->restval, &hs->rd_size, &hs->len, &hs->dltime, + flags, warc_tmp); + if (hs->res >= 0) + { + if (warc_tmp != NULL) + { + /* Create a response record and write it to the WARC file. + Note: per the WARC standard, the request and response should share + the same date header. We re-use the timestamp of the request. + The response record should also refer to the uuid of the request. */ + bool r = warc_write_response_record (url, warc_timestamp_str, + warc_request_uuid, warc_ip, + warc_tmp, warc_payload_offset, + type, statcode, hs->newloc); + + /* warc_write_response_record has closed warc_tmp. */ + + if (! r) + return WARC_ERR; + } + + return RETRFINISHED; + } + + if (warc_tmp != NULL) + fclose (warc_tmp); + + if (hs->res == -2) + { + /* Error while writing to fd. */ + return FWRITEERR; + } + else if (hs->res == -3) + { + /* Error while writing to warc_tmp. */ + return WARC_TMP_FWRITEERR; + } + else + { + /* A read error! */ + hs->rderrmsg = xstrdup (fd_errstr (sock)); + return RETRFINISHED; + } +} + #define BEGINS_WITH(line, string_constant) \ (!strncasecmp (line, string_constant, sizeof (string_constant) - 1) \ && (c_isspace (line[sizeof (string_constant) - 1]) \ @@ -1505,9 +1671,9 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, wgint contlen, contrange; struct url *conn; FILE *fp; + int err; int sock = -1; - int flags; /* Set to 1 when the authorization has already been sent and should not be tried again. */ @@ -1533,6 +1699,14 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, char hdrval[256]; char *message; + /* Declare WARC variables. */ + bool warc_enabled = (opt.warc_filename != NULL); + FILE *warc_tmp = NULL; + char warc_timestamp_str [21]; + char warc_request_uuid [48]; + ip_address *warc_ip = NULL; + off_t warc_payload_offset = -1; + /* Whether this connection will be kept alive after the HTTP request is done. */ bool keep_alive; @@ -1545,7 +1719,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, !opt.http_keep_alive || opt.ignore_length; /* Headers sent when using POST. */ - wgint post_data_size = 0; + wgint body_data_size = 0; bool host_lookup_failed = false; @@ -1577,15 +1751,13 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, conn = u; /* Prepare the request to send. */ - - req = request_new (); { char *meth_arg; const char *meth = "GET"; if (head_only) meth = "HEAD"; - else if (opt.post_file_name || opt.post_data) - meth = "POST"; + else if (opt.method) + meth = opt.method; /* Use the full path, i.e. one that includes the leading slash and the query string. E.g. if u->path is "foo/bar" and u->query is "param=value", full_path will be "/foo/bar?param=value". */ @@ -1600,13 +1772,19 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, meth_arg = xstrdup (u->url); else meth_arg = url_full_path (u); - request_set_method (req, meth, meth_arg); + req = request_new (meth, meth_arg); } request_set_header (req, "Referer", (char *) hs->referer, rel_none); if (*dt & SEND_NOCACHE) - request_set_header (req, "Pragma", "no-cache", rel_none); - if (hs->restval && !opt.timestamping) + { + /* Cache-Control MUST be obeyed by all HTTP/1.1 caching mechanisms... */ + request_set_header (req, "Cache-Control", "no-cache, must-revalidate", rel_none); + + /* ... but some HTTP/1.0 caches doesn't implement Cache-Control. */ + request_set_header (req, "Pragma", "no-cache", rel_none); + } + if (hs->restval) request_set_header (req, "Range", aprintf ("bytes=%s-", number_to_static_string (hs->restval)), @@ -1665,25 +1843,30 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, } } - if (opt.post_data || opt.post_file_name) + if (opt.method) { - request_set_header (req, "Content-Type", - "application/x-www-form-urlencoded", rel_none); - if (opt.post_data) - post_data_size = strlen (opt.post_data); - else + + if (opt.body_data || opt.body_file) { - post_data_size = file_size (opt.post_file_name); - if (post_data_size == -1) + request_set_header (req, "Content-Type", + "application/x-www-form-urlencoded", rel_none); + + if (opt.body_data) + body_data_size = strlen (opt.body_data); + else { - logprintf (LOG_NOTQUIET, _("POST data file %s missing: %s\n"), - quote (opt.post_file_name), strerror (errno)); - post_data_size = 0; + body_data_size = file_size (opt.body_file); + if (body_data_size == -1) + { + logprintf (LOG_NOTQUIET, _("BODY data file %s missing: %s\n"), + quote (opt.body_file), strerror (errno)); + return FILEBADFILE; + } } + request_set_header (req, "Content-Length", + xstrdup (number_to_static_string (body_data_size)), + rel_value); } - request_set_header (req, "Content-Length", - xstrdup (number_to_static_string (post_data_size)), - rel_value); } retry_with_auth: @@ -1772,11 +1955,19 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, #endif &host_lookup_failed)) { + int family = socket_family (pconn.socket, ENDPOINT_PEER); sock = pconn.socket; using_ssl = pconn.ssl; - logprintf (LOG_VERBOSE, _("Reusing existing connection to %s:%d.\n"), - quotearg_style (escape_quoting_style, pconn.host), - pconn.port); +#if ENABLE_IPV6 + if (family == AF_INET6) + logprintf (LOG_VERBOSE, _("Reusing existing connection to [%s]:%d.\n"), + quotearg_style (escape_quoting_style, pconn.host), + pconn.port); + else +#endif + logprintf (LOG_VERBOSE, _("Reusing existing connection to %s:%d.\n"), + quotearg_style (escape_quoting_style, pconn.host), + pconn.port); DEBUGP (("Reusing fd %d.\n", sock)); if (pconn.authorized) /* If the connection is already authorized, the "Basic" @@ -1792,6 +1983,10 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, exec_name, quote (relevant->host)); return HOSTERR; } + else if (sock != -1) + { + sock = -1; + } } if (sock < 0) @@ -1814,8 +2009,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, { /* When requesting SSL URLs through proxies, use the CONNECT method to request passthrough. */ - struct request *connreq = request_new (); - request_set_method (connreq, "CONNECT", + struct request *connreq = request_new ("CONNECT", aprintf ("%s:%d", u->host, u->port)); SET_USER_AGENT (connreq); if (proxyauth) @@ -1832,11 +2026,12 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, that the contents of Host would be exactly the same as the contents of CONNECT. */ - write_error = request_send (connreq, sock); + write_error = request_send (connreq, sock, 0); request_free (connreq); if (write_error < 0) { CLOSE_INVALIDATE (sock); + request_free (req); return WRITEFAILED; } @@ -1846,6 +2041,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, logprintf (LOG_VERBOSE, _("Failed reading proxy response: %s\n"), fd_errstr (sock)); CLOSE_INVALIDATE (sock); + request_free (req); return HERR; } message = NULL; @@ -1866,6 +2062,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, quotearg_style (escape_quoting_style, _("Malformed status line"))); xfree (head); + request_free (req); return HERR; } hs->message = xstrdup (message); @@ -1877,6 +2074,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, logprintf (LOG_NOTQUIET, _("Proxy tunneling failed: %s"), message ? quotearg_style (escape_quoting_style, message) : "?"); xfree_null (message); + request_free (req); return CONSSLERR; } xfree_null (message); @@ -1889,14 +2087,16 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, if (conn->scheme == SCHEME_HTTPS) { - if (!ssl_connect_wget (sock)) + if (!ssl_connect_wget (sock, u->host)) { fd_close (sock); + request_free (req); return CONSSLERR; } else if (!ssl_check_certificate (sock, u->host)) { fd_close (sock); + request_free (req); return VERIFCERTERR; } using_ssl = true; @@ -1904,25 +2104,66 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, #endif /* HAVE_SSL */ } + /* Open the temporary file where we will write the request. */ + if (warc_enabled) + { + warc_tmp = warc_tempfile (); + if (warc_tmp == NULL) + { + CLOSE_INVALIDATE (sock); + request_free (req); + return WARC_TMP_FOPENERR; + } + + if (! proxy) + { + warc_ip = (ip_address *) alloca (sizeof (ip_address)); + socket_ip_address (sock, warc_ip, ENDPOINT_PEER); + } + } + /* Send the request to server. */ - write_error = request_send (req, sock); + write_error = request_send (req, sock, warc_tmp); if (write_error >= 0) { - if (opt.post_data) + if (opt.body_data) + { + DEBUGP (("[BODY data: %s]\n", opt.body_data)); + write_error = fd_write (sock, opt.body_data, body_data_size, -1); + if (write_error >= 0 && warc_tmp != NULL) + { + /* Remember end of headers / start of payload. */ + warc_payload_offset = ftello (warc_tmp); + + /* Write a copy of the data to the WARC record. */ + int warc_tmp_written = fwrite (opt.body_data, 1, body_data_size, warc_tmp); + if (warc_tmp_written != body_data_size) + write_error = -2; + } + } + else if (opt.body_file && body_data_size != 0) { - DEBUGP (("[POST data: %s]\n", opt.post_data)); - write_error = fd_write (sock, opt.post_data, post_data_size, -1); + if (warc_tmp != NULL) + /* Remember end of headers / start of payload */ + warc_payload_offset = ftello (warc_tmp); + + write_error = body_file_send (sock, opt.body_file, body_data_size, warc_tmp); } - else if (opt.post_file_name && post_data_size != 0) - write_error = post_file (sock, opt.post_file_name, post_data_size); } if (write_error < 0) { CLOSE_INVALIDATE (sock); request_free (req); - return WRITEFAILED; + + if (warc_tmp != NULL) + fclose (warc_tmp); + + if (write_error == -2) + return WARC_TMP_FWRITEERR; + else + return WRITEFAILED; } logprintf (LOG_VERBOSE, _("%s request sent, awaiting response... "), proxy ? "Proxy" : "HTTP"); @@ -1930,6 +2171,29 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, contrange = 0; *dt &= ~RETROKF; + + if (warc_enabled) + { + bool warc_result; + /* Generate a timestamp and uuid for this request. */ + warc_timestamp (warc_timestamp_str); + warc_uuid_str (warc_request_uuid); + + /* Create a request record and store it in the WARC file. */ + warc_result = warc_write_request_record (u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, + warc_tmp, warc_payload_offset); + if (! warc_result) + { + CLOSE_INVALIDATE (sock); + request_free (req); + return WARC_ERR; + } + + /* warc_write_request_record has also closed warc_tmp. */ + } + + read_header: head = read_http_response_head (sock); if (!head) @@ -1965,13 +2229,17 @@ read_header: quotearg_style (escape_quoting_style, _("Malformed status line"))); CLOSE_INVALIDATE (sock); + resp_free (resp); request_free (req); + xfree (head); return HERR; } if (H_10X (statcode)) { DEBUGP (("Ignoring response\n")); + resp_free (resp); + xfree (head); goto read_header; } @@ -2020,8 +2288,9 @@ read_header: } } - resp_header_copy (resp, "Transfer-Encoding", hdrval, sizeof (hdrval)); - if (0 == strcasecmp (hdrval, "chunked")) + chunked_transfer_encoding = false; + if (resp_header_copy (resp, "Transfer-Encoding", hdrval, sizeof (hdrval)) + && 0 == strcasecmp (hdrval, "chunked")) chunked_transfer_encoding = true; /* Handle (possibly multiple instances of) the Set-Cookie header. */ @@ -2050,11 +2319,42 @@ read_header: if (statcode == HTTP_STATUS_UNAUTHORIZED) { /* Authorization is required. */ - if (keep_alive && !head_only - && skip_short_body (sock, contlen, chunked_transfer_encoding)) - CLOSE_FINISH (sock); + + /* Normally we are not interested in the response body. + But if we are writing a WARC file we are: we like to keep everyting. */ + if (warc_enabled) + { + int err; + type = resp_header_strdup (resp, "Content-Type"); + err = read_response_body (hs, sock, NULL, contlen, 0, + chunked_transfer_encoding, + u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, type, + statcode, head); + xfree_null (type); + + if (err != RETRFINISHED || hs->res < 0) + { + CLOSE_INVALIDATE (sock); + request_free (req); + xfree_null (message); + resp_free (resp); + xfree (head); + return err; + } + else + CLOSE_FINISH (sock); + } else - CLOSE_INVALIDATE (sock); + { + /* Since WARC is disabled, we are not interested in the response body. */ + if (keep_alive && !head_only + && skip_short_body (sock, contlen, chunked_transfer_encoding)) + CLOSE_FINISH (sock); + else + CLOSE_INVALIDATE (sock); + } + pconn.authorized = false; if (!auth_finished && (user && passwd)) { @@ -2158,16 +2458,9 @@ read_header: /* If opt.noclobber is turned on and file already exists, do not retrieve the file. But if the output_document was given, then this test was already done and the file didn't exist. Hence the !opt.output_document */ - logprintf (LOG_VERBOSE, _("\ -File %s already there; not retrieving.\n\n"), quote (hs->local_file)); - /* If the file is there, we suppose it's retrieved OK. */ - *dt |= RETROKF; - - /* #### Bogusness alert. */ - /* If its suffix is "html" or "htm" or similar, assume text/html. */ - if (has_html_suffix_p (hs->local_file)) - *dt |= TEXTHTML; - + get_file_flags (hs->local_file, dt); + request_free (req); + resp_free (resp); xfree (head); xfree_null (message); return RETRUNNEEDED; @@ -2311,13 +2604,76 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); _("Location: %s%s\n"), hs->newloc ? escnonprint_uri (hs->newloc) : _("unspecified"), hs->newloc ? _(" [following]") : ""); - if (keep_alive && !head_only - && skip_short_body (sock, contlen, chunked_transfer_encoding)) - CLOSE_FINISH (sock); + + /* In case the caller cares to look... */ + hs->len = 0; + hs->res = 0; + hs->restval = 0; + + /* Normally we are not interested in the response body of a redirect. + But if we are writing a WARC file we are: we like to keep everyting. */ + if (warc_enabled) + { + int err = read_response_body (hs, sock, NULL, contlen, 0, + chunked_transfer_encoding, + u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, type, + statcode, head); + + if (err != RETRFINISHED || hs->res < 0) + { + CLOSE_INVALIDATE (sock); + xfree_null (type); + xfree (head); + return err; + } + else + CLOSE_FINISH (sock); + } else - CLOSE_INVALIDATE (sock); + { + /* Since WARC is disabled, we are not interested in the response body. */ + if (keep_alive && !head_only + && skip_short_body (sock, contlen, chunked_transfer_encoding)) + CLOSE_FINISH (sock); + else + CLOSE_INVALIDATE (sock); + } + xfree_null (type); xfree (head); + /* From RFC2616: The status codes 303 and 307 have + been added for servers that wish to make unambiguously + clear which kind of reaction is expected of the client. + + A 307 should be redirected using the same method, + in other words, a POST should be preserved and not + converted to a GET in that case. + + With strict adherence to RFC2616, POST requests are not + converted to a GET request on 301 Permanent Redirect + or 302 Temporary Redirect. + + A switch may be provided later based on the HTTPbis draft + that allows clients to convert POST requests to GET + requests on 301 and 302 response codes. */ + switch (statcode) + { + case HTTP_STATUS_TEMPORARY_REDIRECT: + return NEWLOCATION_KEEP_POST; + break; + case HTTP_STATUS_MOVED_PERMANENTLY: + if (opt.method && strcasecmp (opt.method, "post") != 0) + return NEWLOCATION_KEEP_POST; + break; + case HTTP_STATUS_MOVED_TEMPORARILY: + if (opt.method && strcasecmp (opt.method, "post") != 0) + return NEWLOCATION_KEEP_POST; + break; + default: + return NEWLOCATION; + break; + } return NEWLOCATION; } } @@ -2424,30 +2780,55 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); logputs (LOG_VERBOSE, "\n"); } } - xfree_null (type); - type = NULL; /* We don't need it any more. */ /* Return if we have no intention of further downloading. */ - if (!(*dt & RETROKF) || head_only) + if ((!(*dt & RETROKF) && !opt.content_on_error) || head_only) { /* In case the caller cares to look... */ hs->len = 0; hs->res = 0; - xfree_null (type); - if (head_only) - /* Pre-1.10 Wget used CLOSE_INVALIDATE here. Now we trust the - servers not to send body in response to a HEAD request, and - those that do will likely be caught by test_socket_open. - If not, they can be worked around using - `--no-http-keep-alive'. */ - CLOSE_FINISH (sock); - else if (keep_alive - && skip_short_body (sock, contlen, chunked_transfer_encoding)) - /* Successfully skipped the body; also keep using the socket. */ - CLOSE_FINISH (sock); + hs->restval = 0; + + /* Normally we are not interested in the response body of a error responses. + But if we are writing a WARC file we are: we like to keep everyting. */ + if (warc_enabled) + { + int err = read_response_body (hs, sock, NULL, contlen, 0, + chunked_transfer_encoding, + u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, type, + statcode, head); + + if (err != RETRFINISHED || hs->res < 0) + { + CLOSE_INVALIDATE (sock); + xfree (head); + xfree_null (type); + return err; + } + else + CLOSE_FINISH (sock); + } else - CLOSE_INVALIDATE (sock); + { + /* Since WARC is disabled, we are not interested in the response body. */ + if (head_only) + /* Pre-1.10 Wget used CLOSE_INVALIDATE here. Now we trust the + servers not to send body in response to a HEAD request, and + those that do will likely be caught by test_socket_open. + If not, they can be worked around using + `--no-http-keep-alive'. */ + CLOSE_FINISH (sock); + else if (keep_alive + && skip_short_body (sock, contlen, chunked_transfer_encoding)) + /* Successfully skipped the body; also keep using the socket. */ + CLOSE_FINISH (sock); + else + CLOSE_INVALIDATE (sock); + } + xfree (head); + xfree_null (type); return RETRFINISHED; } @@ -2489,6 +2870,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); strerror (errno)); CLOSE_INVALIDATE (sock); xfree (head); + xfree_null (type); return UNLINKERR; } } @@ -2516,6 +2898,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); hs->local_file); CLOSE_INVALIDATE (sock); xfree (head); + xfree_null (type); return FOPEN_EXCL_ERR; } } @@ -2524,6 +2907,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); logprintf (LOG_NOTQUIET, "%s: %s\n", hs->local_file, strerror (errno)); CLOSE_INVALIDATE (sock); xfree (head); + xfree_null (type); return FOPENERR; } } @@ -2537,49 +2921,26 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); HYPHENP (hs->local_file) ? quote ("STDOUT") : quote (hs->local_file)); } - /* This confuses the timestamping code that checks for file size. - #### The timestamping code should be smarter about file size. */ - if (opt.save_headers && hs->restval == 0) - fwrite (head, 1, strlen (head), fp); + + err = read_response_body (hs, sock, fp, contlen, contrange, + chunked_transfer_encoding, + u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, type, + statcode, head); /* Now we no longer need to store the response header. */ xfree (head); - - /* Download the request body. */ - flags = 0; - if (contlen != -1) - /* If content-length is present, read that much; otherwise, read - until EOF. The HTTP spec doesn't require the server to - actually close the connection when it's done sending data. */ - flags |= rb_read_exactly; - if (hs->restval > 0 && contrange == 0) - /* If the server ignored our range request, instruct fd_read_body - to skip the first RESTVAL bytes of body. */ - flags |= rb_skip_startpos; - - if (chunked_transfer_encoding) - flags |= rb_chunked_transfer_encoding; - - hs->len = hs->restval; - hs->rd_size = 0; - hs->res = fd_read_body (sock, fp, contlen != -1 ? contlen : 0, - hs->restval, &hs->rd_size, &hs->len, &hs->dltime, - flags); + xfree_null (type); if (hs->res >= 0) CLOSE_FINISH (sock); else - { - if (hs->res < 0) - hs->rderrmsg = xstrdup (fd_errstr (sock)); - CLOSE_INVALIDATE (sock); - } + CLOSE_INVALIDATE (sock); if (!output_stream) fclose (fp); - if (hs->res == -2) - return FWRITEERR; - return RETRFINISHED; + + return err; } /* The genuine HTTP loop! This is the part where the retrieval is @@ -2603,6 +2964,12 @@ http_loop (struct url *u, struct url *original_url, char **newloc, char *file_name; bool force_full_retrieve = false; + + /* If we are writing to a WARC file: always retrieve the whole file. */ + if (opt.warc_filename != NULL) + force_full_retrieve = true; + + /* Assert that no value for *LOCAL_FILE was passed. */ assert (local_file == NULL || *local_file == NULL); @@ -2639,24 +3006,12 @@ http_loop (struct url *u, struct url *original_url, char **newloc, got_name = true; } - /* TODO: Ick! This code is now in both gethttp and http_loop, and is - * screaming for some refactoring. */ if (got_name && file_exists_p (hstat.local_file) && opt.noclobber && !opt.output_document) { /* If opt.noclobber is turned on and file already exists, do not retrieve the file. But if the output_document was given, then this test was already done and the file didn't exist. Hence the !opt.output_document */ - logprintf (LOG_VERBOSE, _("\ -File %s already there; not retrieving.\n\n"), - quote (hstat.local_file)); - /* If the file is there, we suppose it's retrieved OK. */ - *dt |= RETROKF; - - /* #### Bogusness alert. */ - /* If its suffix is "html" or "htm" or similar, assume text/html. */ - if (has_html_suffix_p (hstat.local_file)) - *dt |= TEXTHTML; - + get_file_flags (hstat.local_file, dt); ret = RETROK; goto exit; } @@ -2671,6 +3026,11 @@ File %s already there; not retrieving.\n\n"), if (!opt.spider) send_head_first = false; + /* Send preliminary HEAD request if --content-disposition and -c are used + together. */ + if (opt.content_disposition && opt.always_rest) + send_head_first = true; + /* Send preliminary HEAD request if -N is given and we have an existing * destination file. */ file_name = url_file_name (opt.trustservernames ? u : original_url, NULL); @@ -2781,9 +3141,22 @@ Spider mode enabled. Check if remote file exists.\n")); quote (hstat.local_file), strerror (errno)); case HOSTERR: case CONIMPOSSIBLE: case PROXERR: case AUTHFAILED: case SSLINITFAILED: case CONTNOTSUPPORTED: case VERIFCERTERR: + case FILEBADFILE: /* Fatal errors just return from the function. */ ret = err; goto exit; + case WARC_ERR: + /* A fatal WARC error. */ + logputs (LOG_VERBOSE, "\n"); + logprintf (LOG_NOTQUIET, _("Cannot write to WARC file.\n")); + ret = err; + goto exit; + case WARC_TMP_FOPENERR: case WARC_TMP_FWRITEERR: + /* A fatal WARC error. */ + logputs (LOG_VERBOSE, "\n"); + logprintf (LOG_NOTQUIET, _("Cannot write to temporary WARC file.\n")); + ret = err; + goto exit; case CONSSLERR: /* Another fatal error. */ logprintf (LOG_NOTQUIET, _("Unable to establish SSL connection.\n")); @@ -2797,6 +3170,7 @@ Spider mode enabled. Check if remote file exists.\n")); ret = err; goto exit; case NEWLOCATION: + case NEWLOCATION_KEEP_POST: /* Return the new location to the caller. */ if (!*newloc) { @@ -2807,7 +3181,7 @@ Spider mode enabled. Check if remote file exists.\n")); } else { - ret = NEWLOCATION; + ret = err; } goto exit; case RETRUNNEEDED: @@ -3311,19 +3685,26 @@ digest_authentication_encode (const char *au, const char *user, const char *passwd, const char *method, const char *path) { - static char *realm, *opaque, *nonce; + static char *realm, *opaque, *nonce, *qop, *algorithm; static struct { const char *name; char **variable; } options[] = { { "realm", &realm }, { "opaque", &opaque }, - { "nonce", &nonce } + { "nonce", &nonce }, + { "qop", &qop }, + { "algorithm", &algorithm } }; + char cnonce[16] = ""; char *res; + int res_len; + size_t res_size; param_token name, value; - realm = opaque = nonce = NULL; + + realm = opaque = nonce = qop = NULL; + algorithm = "MD5"; au += 6; /* skip over `Digest' */ while (extract_param (&au, &name, &value, ',')) @@ -3339,11 +3720,26 @@ digest_authentication_encode (const char *au, const char *user, break; } } + + if (qop != NULL && strcmp(qop,"auth")) + { + logprintf (LOG_NOTQUIET, _("Unsupported quality of protection '%s'.\n"), qop); + user = NULL; /* force freeing mem and return */ + } + + if (algorithm != NULL && strcmp (algorithm,"MD5") && strcmp (algorithm,"MD5-sess")) + { + logprintf (LOG_NOTQUIET, _("Unsupported algorithm '%s'.\n"), algorithm); + user = NULL; /* force freeing mem and return */ + } + if (!realm || !nonce || !user || !passwd || !path || !method) { xfree_null (realm); xfree_null (opaque); xfree_null (nonce); + xfree_null (qop); + xfree_null (algorithm); return NULL; } @@ -3362,8 +3758,26 @@ digest_authentication_encode (const char *au, const char *user, md5_process_bytes ((unsigned char *)":", 1, &ctx); md5_process_bytes ((unsigned char *)passwd, strlen (passwd), &ctx); md5_finish_ctx (&ctx, hash); + dump_hash (a1buf, hash); + if (! strcmp (algorithm, "MD5-sess")) + { + /* A1BUF = H( H(user ":" realm ":" password) ":" nonce ":" cnonce ) */ + snprintf (cnonce, sizeof (cnonce), "%08x", random_number(INT_MAX)); + + md5_init_ctx (&ctx); + // md5_process_bytes (hash, MD5_DIGEST_SIZE, &ctx); + md5_process_bytes (a1buf, MD5_DIGEST_SIZE * 2, &ctx); + md5_process_bytes ((unsigned char *)":", 1, &ctx); + md5_process_bytes ((unsigned char *)nonce, strlen (nonce), &ctx); + md5_process_bytes ((unsigned char *)":", 1, &ctx); + md5_process_bytes ((unsigned char *)cnonce, strlen (cnonce), &ctx); + md5_finish_ctx (&ctx, hash); + + dump_hash (a1buf, hash); + } + /* A2BUF = H(method ":" path) */ md5_init_ctx (&ctx); md5_process_bytes ((unsigned char *)method, strlen (method), &ctx); @@ -3372,33 +3786,79 @@ digest_authentication_encode (const char *au, const char *user, md5_finish_ctx (&ctx, hash); dump_hash (a2buf, hash); - /* RESPONSE_DIGEST = H(A1BUF ":" nonce ":" A2BUF) */ - md5_init_ctx (&ctx); - md5_process_bytes ((unsigned char *)a1buf, MD5_DIGEST_SIZE * 2, &ctx); - md5_process_bytes ((unsigned char *)":", 1, &ctx); - md5_process_bytes ((unsigned char *)nonce, strlen (nonce), &ctx); - md5_process_bytes ((unsigned char *)":", 1, &ctx); - md5_process_bytes ((unsigned char *)a2buf, MD5_DIGEST_SIZE * 2, &ctx); - md5_finish_ctx (&ctx, hash); + if (qop && (!strcmp(qop, "auth") || !strcmp (qop, "auth-int"))) + { + /* RFC 2617 Digest Access Authentication */ + /* generate random hex string */ + if (!*cnonce) + snprintf(cnonce, sizeof(cnonce), "%08x", random_number(INT_MAX)); + + /* RESPONSE_DIGEST = H(A1BUF ":" nonce ":" noncecount ":" clientnonce ":" qop ": " A2BUF) */ + md5_init_ctx (&ctx); + md5_process_bytes ((unsigned char *)a1buf, MD5_DIGEST_SIZE * 2, &ctx); + md5_process_bytes ((unsigned char *)":", 1, &ctx); + md5_process_bytes ((unsigned char *)nonce, strlen (nonce), &ctx); + md5_process_bytes ((unsigned char *)":", 1, &ctx); + md5_process_bytes ((unsigned char *)"00000001", 8, &ctx); /* TODO: keep track of server nonce values */ + md5_process_bytes ((unsigned char *)":", 1, &ctx); + md5_process_bytes ((unsigned char *)cnonce, strlen(cnonce), &ctx); + md5_process_bytes ((unsigned char *)":", 1, &ctx); + md5_process_bytes ((unsigned char *)qop, strlen(qop), &ctx); + md5_process_bytes ((unsigned char *)":", 1, &ctx); + md5_process_bytes ((unsigned char *)a2buf, MD5_DIGEST_SIZE * 2, &ctx); + md5_finish_ctx (&ctx, hash); + } + else + { + /* RFC 2069 Digest Access Authentication */ + /* RESPONSE_DIGEST = H(A1BUF ":" nonce ":" A2BUF) */ + md5_init_ctx (&ctx); + md5_process_bytes ((unsigned char *)a1buf, MD5_DIGEST_SIZE * 2, &ctx); + md5_process_bytes ((unsigned char *)":", 1, &ctx); + md5_process_bytes ((unsigned char *)nonce, strlen (nonce), &ctx); + md5_process_bytes ((unsigned char *)":", 1, &ctx); + md5_process_bytes ((unsigned char *)a2buf, MD5_DIGEST_SIZE * 2, &ctx); + md5_finish_ctx (&ctx, hash); + } + dump_hash (response_digest, hash); - res = xmalloc (strlen (user) - + strlen (user) - + strlen (realm) - + strlen (nonce) - + strlen (path) - + 2 * MD5_DIGEST_SIZE /*strlen (response_digest)*/ - + (opaque ? strlen (opaque) : 0) - + 128); - sprintf (res, "Digest \ -username=\"%s\", realm=\"%s\", nonce=\"%s\", uri=\"%s\", response=\"%s\"", - user, realm, nonce, path, response_digest); + res_size = strlen (user) + + strlen (realm) + + strlen (nonce) + + strlen (path) + + 2 * MD5_DIGEST_SIZE /*strlen (response_digest)*/ + + (opaque ? strlen (opaque) : 0) + + (algorithm ? strlen (algorithm) : 0) + + (qop ? 128: 0) + + strlen (cnonce) + + 128; + + res = xmalloc (res_size); + + if (qop && !strcmp (qop, "auth")) + { + res_len = snprintf (res, res_size, "Digest "\ + "username=\"%s\", realm=\"%s\", nonce=\"%s\", uri=\"%s\", response=\"%s\""\ + ", qop=auth, nc=00000001, cnonce=\"%s\"", + user, realm, nonce, path, response_digest, cnonce); + + } + else + { + res_len = snprintf (res, res_size, "Digest "\ + "username=\"%s\", realm=\"%s\", nonce=\"%s\", uri=\"%s\", response=\"%s\"", + user, realm, nonce, path, response_digest); + } + if (opaque) { - char *p = res + strlen (res); - strcat (p, ", opaque=\""); - strcat (p, opaque); - strcat (p, "\""); + res_len += snprintf(res + res_len, res_size - res_len, ", opaque=\"%s\"", opaque); + } + + if (algorithm) + { + snprintf(res + res_len, res_size - res_len, ", algorithm=\"%s\"", algorithm); } } return res; @@ -3508,7 +3968,7 @@ ensure_extension (struct http_stat *hs, const char *ext, int *dt) if (len == 5) { strncpy (shortext, ext, len - 1); - shortext[len - 2] = '\0'; + shortext[len - 1] = '\0'; } if (last_period_in_local_filename == NULL