X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Fhttp.c;h=cf901929435679d914345745f41c040e792dd600;hp=247ba6748096f56153d4922a23a4314552529bb0;hb=0fcd1bb235fd3a29df49a70683ac4156658b4e17;hpb=1d9bb1c7b05d877f7dc2f8b537feada3f2c373a3 diff --git a/src/http.c b/src/http.c index 247ba674..cf901929 100644 --- a/src/http.c +++ b/src/http.c @@ -1,6 +1,6 @@ /* HTTP support. Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, - 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, + 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. This file is part of GNU Wget. @@ -58,6 +58,7 @@ as that of the covered work. */ #include "md5.h" #include "convert.h" #include "spider.h" +#include "warc.h" #ifdef TESTING #include "test.h" @@ -320,10 +321,12 @@ request_remove_header (struct request *req, char *name) p += A_len; \ } while (0) -/* Construct the request and write it to FD using fd_write. */ +/* Construct the request and write it to FD using fd_write. + If warc_tmp is set to a file pointer, the request string will + also be written to that file. */ static int -request_send (const struct request *req, int fd) +request_send (const struct request *req, int fd, FILE *warc_tmp) { char *request_string, *p; int i, size, write_error; @@ -374,6 +377,13 @@ request_send (const struct request *req, int fd) if (write_error < 0) logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"), fd_errstr (fd)); + else if (warc_tmp != NULL) + { + /* Write a copy of the data to the WARC record. */ + int warc_tmp_written = fwrite (request_string, 1, size - 1, warc_tmp); + if (warc_tmp_written != size - 1) + return -2; + } return write_error; } @@ -444,10 +454,12 @@ register_basic_auth_host (const char *hostname) /* Send the contents of FILE_NAME to SOCK. Make sure that exactly PROMISED_SIZE bytes are sent over the wire -- if the file is - longer, read only that much; if the file is shorter, report an error. */ + longer, read only that much; if the file is shorter, report an error. + If warc_tmp is set to a file pointer, the post data will + also be written to that file. */ static int -post_file (int sock, const char *file_name, wgint promised_size) +post_file (int sock, const char *file_name, wgint promised_size, FILE *warc_tmp) { static char chunk[8192]; wgint written = 0; @@ -472,6 +484,16 @@ post_file (int sock, const char *file_name, wgint promised_size) fclose (fp); return -1; } + if (warc_tmp != NULL) + { + /* Write a copy of the data to the WARC record. */ + int warc_tmp_written = fwrite (chunk, 1, towrite, warc_tmp); + if (warc_tmp_written != towrite) + { + fclose (fp); + return -2; + } + } written += towrite; } fclose (fp); @@ -929,9 +951,12 @@ skip_short_body (int fd, wgint contlen, bool chunked) break; remaining_chunk_size = strtol (line, &endl, 16); + xfree (line); + if (remaining_chunk_size == 0) { - fd_read_line (fd); + line = fd_read_line (fd); + xfree_null (line); break; } } @@ -956,8 +981,13 @@ skip_short_body (int fd, wgint contlen, bool chunked) { remaining_chunk_size -= ret; if (remaining_chunk_size == 0) - if (fd_read_line (fd) == NULL) - return false; + { + char *line = fd_read_line (fd); + if (line == NULL) + return false; + else + xfree (line); + } } /* Safe even if %.*s bogusly expects terminating \0 because @@ -1462,6 +1492,135 @@ File %s already there; not retrieving.\n\n"), quote (filename)); *dt |= TEXTHTML; } +/* Download the response body from the socket and writes it to + an output file. The headers have already been read from the + socket. If WARC is enabled, the response body will also be + written to a WARC response record. + + hs, contlen, contrange, chunked_transfer_encoding and url are + parameters from the gethttp method. fp is a pointer to the + output file. + + url, warc_timestamp_str, warc_request_uuid, warc_ip, type + and statcode will be saved in the headers of the WARC record. + The head parameter contains the HTTP headers of the response. + + If fp is NULL and WARC is enabled, the response body will be + written only to the WARC file. If WARC is disabled and fp + is a file pointer, the data will be written to the file. + If fp is a file pointer and WARC is enabled, the body will + be written to both destinations. + + Returns the error code. */ +static int +read_response_body (struct http_stat *hs, int sock, FILE *fp, wgint contlen, + wgint contrange, bool chunked_transfer_encoding, + char *url, char *warc_timestamp_str, char *warc_request_uuid, + ip_address *warc_ip, char *type, int statcode, char *head) +{ + int warc_payload_offset = 0; + FILE *warc_tmp = NULL; + int warcerr = 0; + + if (opt.warc_filename != NULL) + { + /* Open a temporary file where we can write the response before we + add it to the WARC record. */ + warc_tmp = warc_tempfile (); + if (warc_tmp == NULL) + warcerr = WARC_TMP_FOPENERR; + + if (warcerr == 0) + { + /* We should keep the response headers for the WARC record. */ + int head_len = strlen (head); + int warc_tmp_written = fwrite (head, 1, head_len, warc_tmp); + if (warc_tmp_written != head_len) + warcerr = WARC_TMP_FWRITEERR; + warc_payload_offset = head_len; + } + + if (warcerr != 0) + { + if (warc_tmp != NULL) + fclose (warc_tmp); + return warcerr; + } + } + + if (fp != NULL) + { + /* This confuses the timestamping code that checks for file size. + #### The timestamping code should be smarter about file size. */ + if (opt.save_headers && hs->restval == 0) + fwrite (head, 1, strlen (head), fp); + } + + /* Read the response body. */ + int flags = 0; + if (contlen != -1) + /* If content-length is present, read that much; otherwise, read + until EOF. The HTTP spec doesn't require the server to + actually close the connection when it's done sending data. */ + flags |= rb_read_exactly; + if (fp != NULL && hs->restval > 0 && contrange == 0) + /* If the server ignored our range request, instruct fd_read_body + to skip the first RESTVAL bytes of body. */ + flags |= rb_skip_startpos; + if (chunked_transfer_encoding) + flags |= rb_chunked_transfer_encoding; + + hs->len = hs->restval; + hs->rd_size = 0; + /* Download the response body and write it to fp. + If we are working on a WARC file, we simultaneously write the + response body to warc_tmp. */ + hs->res = fd_read_body (sock, fp, contlen != -1 ? contlen : 0, + hs->restval, &hs->rd_size, &hs->len, &hs->dltime, + flags, warc_tmp); + if (hs->res >= 0) + { + if (warc_tmp != NULL) + { + /* Create a response record and write it to the WARC file. + Note: per the WARC standard, the request and response should share + the same date header. We re-use the timestamp of the request. + The response record should also refer to the uuid of the request. */ + bool r = warc_write_response_record (url, warc_timestamp_str, + warc_request_uuid, warc_ip, + warc_tmp, warc_payload_offset, + type, statcode, hs->newloc); + + /* warc_write_response_record has closed warc_tmp. */ + + if (! r) + return WARC_ERR; + } + + return RETRFINISHED; + } + + if (warc_tmp != NULL) + fclose (warc_tmp); + + if (hs->res == -2) + { + /* Error while writing to fd. */ + return FWRITEERR; + } + else if (hs->res == -3) + { + /* Error while writing to warc_tmp. */ + return WARC_TMP_FWRITEERR; + } + else + { + /* A read error! */ + hs->rderrmsg = xstrdup (fd_errstr (sock)); + return RETRFINISHED; + } +} + #define BEGINS_WITH(line, string_constant) \ (!strncasecmp (line, string_constant, sizeof (string_constant) - 1) \ && (c_isspace (line[sizeof (string_constant) - 1]) \ @@ -1519,9 +1678,9 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, wgint contlen, contrange; struct url *conn; FILE *fp; + int err; int sock = -1; - int flags; /* Set to 1 when the authorization has already been sent and should not be tried again. */ @@ -1547,6 +1706,14 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, char hdrval[256]; char *message; + /* Declare WARC variables. */ + bool warc_enabled = (opt.warc_filename != NULL); + FILE *warc_tmp = NULL; + char warc_timestamp_str [21]; + char warc_request_uuid [48]; + ip_address *warc_ip = NULL; + off_t warc_payload_offset = -1; + /* Whether this connection will be kept alive after the HTTP request is done. */ bool keep_alive; @@ -1626,7 +1793,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, /* ... but some HTTP/1.0 caches doesn't implement Cache-Control. */ request_set_header (req, "Pragma", "no-cache", rel_none); } - if (hs->restval && !opt.timestamping) + if (hs->restval) request_set_header (req, "Range", aprintf ("bytes=%s-", number_to_static_string (hs->restval)), @@ -1792,11 +1959,17 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, #endif &host_lookup_failed)) { + int family = socket_family (pconn.socket, ENDPOINT_PEER); sock = pconn.socket; using_ssl = pconn.ssl; - logprintf (LOG_VERBOSE, _("Reusing existing connection to %s:%d.\n"), - quotearg_style (escape_quoting_style, pconn.host), - pconn.port); + if (family == AF_INET6) + logprintf (LOG_VERBOSE, _("Reusing existing connection to [%s]:%d.\n"), + quotearg_style (escape_quoting_style, pconn.host), + pconn.port); + else + logprintf (LOG_VERBOSE, _("Reusing existing connection to %s:%d.\n"), + quotearg_style (escape_quoting_style, pconn.host), + pconn.port); DEBUGP (("Reusing fd %d.\n", sock)); if (pconn.authorized) /* If the connection is already authorized, the "Basic" @@ -1852,11 +2025,12 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, that the contents of Host would be exactly the same as the contents of CONNECT. */ - write_error = request_send (connreq, sock); + write_error = request_send (connreq, sock, 0); request_free (connreq); if (write_error < 0) { CLOSE_INVALIDATE (sock); + request_free (req); return WRITEFAILED; } @@ -1866,6 +2040,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, logprintf (LOG_VERBOSE, _("Failed reading proxy response: %s\n"), fd_errstr (sock)); CLOSE_INVALIDATE (sock); + request_free (req); return HERR; } message = NULL; @@ -1886,6 +2061,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, quotearg_style (escape_quoting_style, _("Malformed status line"))); xfree (head); + request_free (req); return HERR; } hs->message = xstrdup (message); @@ -1897,6 +2073,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, logprintf (LOG_NOTQUIET, _("Proxy tunneling failed: %s"), message ? quotearg_style (escape_quoting_style, message) : "?"); xfree_null (message); + request_free (req); return CONSSLERR; } xfree_null (message); @@ -1909,14 +2086,16 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, if (conn->scheme == SCHEME_HTTPS) { - if (!ssl_connect_wget (sock)) + if (!ssl_connect_wget (sock, u->host)) { fd_close (sock); + request_free (req); return CONSSLERR; } else if (!ssl_check_certificate (sock, u->host)) { fd_close (sock); + request_free (req); return VERIFCERTERR; } using_ssl = true; @@ -1924,8 +2103,26 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, #endif /* HAVE_SSL */ } + /* Open the temporary file where we will write the request. */ + if (warc_enabled) + { + warc_tmp = warc_tempfile (); + if (warc_tmp == NULL) + { + CLOSE_INVALIDATE (sock); + request_free (req); + return WARC_TMP_FOPENERR; + } + + if (! proxy) + { + warc_ip = (ip_address *) alloca (sizeof (ip_address)); + socket_ip_address (sock, warc_ip, ENDPOINT_PEER); + } + } + /* Send the request to server. */ - write_error = request_send (req, sock); + write_error = request_send (req, sock, warc_tmp); if (write_error >= 0) { @@ -1933,16 +2130,39 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, { DEBUGP (("[POST data: %s]\n", opt.post_data)); write_error = fd_write (sock, opt.post_data, post_data_size, -1); + if (write_error >= 0 && warc_tmp != NULL) + { + /* Remember end of headers / start of payload. */ + warc_payload_offset = ftello (warc_tmp); + + /* Write a copy of the data to the WARC record. */ + int warc_tmp_written = fwrite (opt.post_data, 1, post_data_size, warc_tmp); + if (warc_tmp_written != post_data_size) + write_error = -2; + } } else if (opt.post_file_name && post_data_size != 0) - write_error = post_file (sock, opt.post_file_name, post_data_size); + { + if (warc_tmp != NULL) + /* Remember end of headers / start of payload. */ + warc_payload_offset = ftello (warc_tmp); + + write_error = post_file (sock, opt.post_file_name, post_data_size, warc_tmp); + } } if (write_error < 0) { CLOSE_INVALIDATE (sock); request_free (req); - return WRITEFAILED; + + if (warc_tmp != NULL) + fclose (warc_tmp); + + if (write_error == -2) + return WARC_TMP_FWRITEERR; + else + return WRITEFAILED; } logprintf (LOG_VERBOSE, _("%s request sent, awaiting response... "), proxy ? "Proxy" : "HTTP"); @@ -1950,6 +2170,29 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, contrange = 0; *dt &= ~RETROKF; + + if (warc_enabled) + { + bool warc_result; + /* Generate a timestamp and uuid for this request. */ + warc_timestamp (warc_timestamp_str); + warc_uuid_str (warc_request_uuid); + + /* Create a request record and store it in the WARC file. */ + warc_result = warc_write_request_record (u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, + warc_tmp, warc_payload_offset); + if (! warc_result) + { + CLOSE_INVALIDATE (sock); + request_free (req); + return WARC_ERR; + } + + /* warc_write_request_record has also closed warc_tmp. */ + } + + read_header: head = read_http_response_head (sock); if (!head) @@ -1985,13 +2228,17 @@ read_header: quotearg_style (escape_quoting_style, _("Malformed status line"))); CLOSE_INVALIDATE (sock); + resp_free (resp); request_free (req); + xfree (head); return HERR; } if (H_10X (statcode)) { DEBUGP (("Ignoring response\n")); + resp_free (resp); + xfree (head); goto read_header; } @@ -2040,8 +2287,9 @@ read_header: } } - resp_header_copy (resp, "Transfer-Encoding", hdrval, sizeof (hdrval)); - if (0 == strcasecmp (hdrval, "chunked")) + chunked_transfer_encoding = false; + if (resp_header_copy (resp, "Transfer-Encoding", hdrval, sizeof (hdrval)) + && 0 == strcasecmp (hdrval, "chunked")) chunked_transfer_encoding = true; /* Handle (possibly multiple instances of) the Set-Cookie header. */ @@ -2070,11 +2318,42 @@ read_header: if (statcode == HTTP_STATUS_UNAUTHORIZED) { /* Authorization is required. */ - if (keep_alive && !head_only - && skip_short_body (sock, contlen, chunked_transfer_encoding)) - CLOSE_FINISH (sock); + + /* Normally we are not interested in the response body. + But if we are writing a WARC file we are: we like to keep everyting. */ + if (warc_enabled) + { + int err; + type = resp_header_strdup (resp, "Content-Type"); + err = read_response_body (hs, sock, NULL, contlen, 0, + chunked_transfer_encoding, + u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, type, + statcode, head); + xfree_null (type); + + if (err != RETRFINISHED || hs->res < 0) + { + CLOSE_INVALIDATE (sock); + request_free (req); + xfree_null (message); + resp_free (resp); + xfree (head); + return err; + } + else + CLOSE_FINISH (sock); + } else - CLOSE_INVALIDATE (sock); + { + /* Since WARC is disabled, we are not interested in the response body. */ + if (keep_alive && !head_only + && skip_short_body (sock, contlen, chunked_transfer_encoding)) + CLOSE_FINISH (sock); + else + CLOSE_INVALIDATE (sock); + } + pconn.authorized = false; if (!auth_finished && (user && passwd)) { @@ -2179,6 +2458,8 @@ read_header: retrieve the file. But if the output_document was given, then this test was already done and the file didn't exist. Hence the !opt.output_document */ get_file_flags (hs->local_file, dt); + request_free (req); + resp_free (resp); xfree (head); xfree_null (message); return RETRUNNEEDED; @@ -2322,11 +2603,42 @@ read_header: _("Location: %s%s\n"), hs->newloc ? escnonprint_uri (hs->newloc) : _("unspecified"), hs->newloc ? _(" [following]") : ""); - if (keep_alive && !head_only - && skip_short_body (sock, contlen, chunked_transfer_encoding)) - CLOSE_FINISH (sock); + + /* In case the caller cares to look... */ + hs->len = 0; + hs->res = 0; + hs->restval = 0; + + /* Normally we are not interested in the response body of a redirect. + But if we are writing a WARC file we are: we like to keep everyting. */ + if (warc_enabled) + { + int err = read_response_body (hs, sock, NULL, contlen, 0, + chunked_transfer_encoding, + u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, type, + statcode, head); + + if (err != RETRFINISHED || hs->res < 0) + { + CLOSE_INVALIDATE (sock); + xfree_null (type); + xfree (head); + return err; + } + else + CLOSE_FINISH (sock); + } else - CLOSE_INVALIDATE (sock); + { + /* Since WARC is disabled, we are not interested in the response body. */ + if (keep_alive && !head_only + && skip_short_body (sock, contlen, chunked_transfer_encoding)) + CLOSE_FINISH (sock); + else + CLOSE_INVALIDATE (sock); + } + xfree_null (type); xfree (head); /* From RFC2616: The status codes 303 and 307 have @@ -2444,30 +2756,55 @@ read_header: logputs (LOG_VERBOSE, "\n"); } } - xfree_null (type); - type = NULL; /* We don't need it any more. */ /* Return if we have no intention of further downloading. */ - if (!(*dt & RETROKF) || head_only) + if ((!(*dt & RETROKF) && !opt.content_on_error) || head_only) { /* In case the caller cares to look... */ hs->len = 0; hs->res = 0; - xfree_null (type); - if (head_only) - /* Pre-1.10 Wget used CLOSE_INVALIDATE here. Now we trust the - servers not to send body in response to a HEAD request, and - those that do will likely be caught by test_socket_open. - If not, they can be worked around using - `--no-http-keep-alive'. */ - CLOSE_FINISH (sock); - else if (keep_alive - && skip_short_body (sock, contlen, chunked_transfer_encoding)) - /* Successfully skipped the body; also keep using the socket. */ - CLOSE_FINISH (sock); + hs->restval = 0; + + /* Normally we are not interested in the response body of a error responses. + But if we are writing a WARC file we are: we like to keep everyting. */ + if (warc_enabled) + { + int err = read_response_body (hs, sock, NULL, contlen, 0, + chunked_transfer_encoding, + u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, type, + statcode, head); + + if (err != RETRFINISHED || hs->res < 0) + { + CLOSE_INVALIDATE (sock); + xfree (head); + xfree_null (type); + return err; + } + else + CLOSE_FINISH (sock); + } else - CLOSE_INVALIDATE (sock); + { + /* Since WARC is disabled, we are not interested in the response body. */ + if (head_only) + /* Pre-1.10 Wget used CLOSE_INVALIDATE here. Now we trust the + servers not to send body in response to a HEAD request, and + those that do will likely be caught by test_socket_open. + If not, they can be worked around using + `--no-http-keep-alive'. */ + CLOSE_FINISH (sock); + else if (keep_alive + && skip_short_body (sock, contlen, chunked_transfer_encoding)) + /* Successfully skipped the body; also keep using the socket. */ + CLOSE_FINISH (sock); + else + CLOSE_INVALIDATE (sock); + } + xfree (head); + xfree_null (type); return RETRFINISHED; } @@ -2509,6 +2846,7 @@ read_header: strerror (errno)); CLOSE_INVALIDATE (sock); xfree (head); + xfree_null (type); return UNLINKERR; } } @@ -2536,6 +2874,7 @@ read_header: hs->local_file); CLOSE_INVALIDATE (sock); xfree (head); + xfree_null (type); return FOPEN_EXCL_ERR; } } @@ -2544,6 +2883,7 @@ read_header: logprintf (LOG_NOTQUIET, "%s: %s\n", hs->local_file, strerror (errno)); CLOSE_INVALIDATE (sock); xfree (head); + xfree_null (type); return FOPENERR; } } @@ -2557,49 +2897,26 @@ read_header: HYPHENP (hs->local_file) ? quote ("STDOUT") : quote (hs->local_file)); } - /* This confuses the timestamping code that checks for file size. - #### The timestamping code should be smarter about file size. */ - if (opt.save_headers && hs->restval == 0) - fwrite (head, 1, strlen (head), fp); + + err = read_response_body (hs, sock, fp, contlen, contrange, + chunked_transfer_encoding, + u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, type, + statcode, head); /* Now we no longer need to store the response header. */ xfree (head); - - /* Download the request body. */ - flags = 0; - if (contlen != -1) - /* If content-length is present, read that much; otherwise, read - until EOF. The HTTP spec doesn't require the server to - actually close the connection when it's done sending data. */ - flags |= rb_read_exactly; - if (hs->restval > 0 && contrange == 0) - /* If the server ignored our range request, instruct fd_read_body - to skip the first RESTVAL bytes of body. */ - flags |= rb_skip_startpos; - - if (chunked_transfer_encoding) - flags |= rb_chunked_transfer_encoding; - - hs->len = hs->restval; - hs->rd_size = 0; - hs->res = fd_read_body (sock, fp, contlen != -1 ? contlen : 0, - hs->restval, &hs->rd_size, &hs->len, &hs->dltime, - flags); + xfree_null (type); if (hs->res >= 0) CLOSE_FINISH (sock); else - { - if (hs->res < 0) - hs->rderrmsg = xstrdup (fd_errstr (sock)); - CLOSE_INVALIDATE (sock); - } + CLOSE_INVALIDATE (sock); if (!output_stream) fclose (fp); - if (hs->res == -2) - return FWRITEERR; - return RETRFINISHED; + + return err; } /* The genuine HTTP loop! This is the part where the retrieval is @@ -2623,6 +2940,12 @@ http_loop (struct url *u, struct url *original_url, char **newloc, char *file_name; bool force_full_retrieve = false; + + /* If we are writing to a WARC file: always retrieve the whole file. */ + if (opt.warc_filename != NULL) + force_full_retrieve = true; + + /* Assert that no value for *LOCAL_FILE was passed. */ assert (local_file == NULL || *local_file == NULL); @@ -2792,6 +3115,18 @@ Spider mode enabled. Check if remote file exists.\n")); /* Fatal errors just return from the function. */ ret = err; goto exit; + case WARC_ERR: + /* A fatal WARC error. */ + logputs (LOG_VERBOSE, "\n"); + logprintf (LOG_NOTQUIET, _("Cannot write to WARC file..\n")); + ret = err; + goto exit; + case WARC_TMP_FOPENERR: case WARC_TMP_FWRITEERR: + /* A fatal WARC error. */ + logputs (LOG_VERBOSE, "\n"); + logprintf (LOG_NOTQUIET, _("Cannot write to temporary WARC file.\n")); + ret = err; + goto exit; case CONSSLERR: /* Another fatal error. */ logprintf (LOG_NOTQUIET, _("Unable to establish SSL connection.\n"));