X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Fhttp.c;h=588847486a5ea93dfe8adafed682663bbd59265f;hp=496e64a73dc5f976c3507a4c60eef6bcaac86fe9;hb=359dd167602071cfa62d6c586ca846ede5ed7c29;hpb=2107eb06abfb4154834e025018aa9f628e8010db diff --git a/src/http.c b/src/http.c index 496e64a7..58884748 100644 --- a/src/http.c +++ b/src/http.c @@ -1,6 +1,7 @@ /* HTTP support. Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, - 2005, 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc. + 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, + Inc. This file is part of GNU Wget. @@ -33,9 +34,7 @@ as that of the covered work. */ #include #include #include -#ifdef HAVE_UNISTD_H -# include -#endif +#include #include #include #include @@ -59,6 +58,7 @@ as that of the covered work. */ #include "md5.h" #include "convert.h" #include "spider.h" +#include "warc.h" #ifdef TESTING #include "test.h" @@ -231,7 +231,7 @@ release_header (struct request_header *hdr) */ static void -request_set_header (struct request *req, char *name, char *value, +request_set_header (struct request *req, const char *name, const char *value, enum rp release_policy) { struct request_header *hdr; @@ -242,7 +242,7 @@ request_set_header (struct request *req, char *name, char *value, /* A NULL value is a no-op; if freeing the name is requested, free it now to avoid leaks. */ if (release_policy == rel_name || release_policy == rel_both) - xfree (name); + xfree ((void *)name); return; } @@ -253,8 +253,8 @@ request_set_header (struct request *req, char *name, char *value, { /* Replace existing header. */ release_header (hdr); - hdr->name = name; - hdr->value = value; + hdr->name = (void *)name; + hdr->value = (void *)value; hdr->release_policy = release_policy; return; } @@ -268,8 +268,8 @@ request_set_header (struct request *req, char *name, char *value, req->headers = xrealloc (req->headers, req->hcapacity * sizeof (*hdr)); } hdr = &req->headers[req->hcount++]; - hdr->name = name; - hdr->value = value; + hdr->name = (void *)name; + hdr->value = (void *)value; hdr->release_policy = release_policy; } @@ -296,7 +296,7 @@ request_set_user_header (struct request *req, const char *header) the header was actually removed, false otherwise. */ static bool -request_remove_header (struct request *req, char *name) +request_remove_header (struct request *req, const char *name) { int i; for (i = 0; i < req->hcount; i++) @@ -321,10 +321,12 @@ request_remove_header (struct request *req, char *name) p += A_len; \ } while (0) -/* Construct the request and write it to FD using fd_write. */ +/* Construct the request and write it to FD using fd_write. + If warc_tmp is set to a file pointer, the request string will + also be written to that file. */ static int -request_send (const struct request *req, int fd) +request_send (const struct request *req, int fd, FILE *warc_tmp) { char *request_string, *p; int i, size, write_error; @@ -375,6 +377,13 @@ request_send (const struct request *req, int fd) if (write_error < 0) logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"), fd_errstr (fd)); + else if (warc_tmp != NULL) + { + /* Write a copy of the data to the WARC record. */ + int warc_tmp_written = fwrite (request_string, 1, size - 1, warc_tmp); + if (warc_tmp_written != size - 1) + return -2; + } return write_error; } @@ -445,10 +454,12 @@ register_basic_auth_host (const char *hostname) /* Send the contents of FILE_NAME to SOCK. Make sure that exactly PROMISED_SIZE bytes are sent over the wire -- if the file is - longer, read only that much; if the file is shorter, report an error. */ + longer, read only that much; if the file is shorter, report an error. + If warc_tmp is set to a file pointer, the post data will + also be written to that file. */ static int -post_file (int sock, const char *file_name, wgint promised_size) +post_file (int sock, const char *file_name, wgint promised_size, FILE *warc_tmp) { static char chunk[8192]; wgint written = 0; @@ -473,6 +484,16 @@ post_file (int sock, const char *file_name, wgint promised_size) fclose (fp); return -1; } + if (warc_tmp != NULL) + { + /* Write a copy of the data to the WARC record. */ + int warc_tmp_written = fwrite (chunk, 1, towrite, warc_tmp); + if (warc_tmp_written != towrite) + { + fclose (fp); + return -2; + } + } written += towrite; } fclose (fp); @@ -927,17 +948,15 @@ skip_short_body (int fd, wgint contlen, bool chunked) char *line = fd_read_line (fd); char *endl; if (line == NULL) - { - ret = -1; - break; - } + break; remaining_chunk_size = strtol (line, &endl, 16); + xfree (line); + if (remaining_chunk_size == 0) { - ret = 0; - if (fd_read_line (fd) == NULL) - ret = -1; + line = fd_read_line (fd); + xfree_null (line); break; } } @@ -962,8 +981,13 @@ skip_short_body (int fd, wgint contlen, bool chunked) { remaining_chunk_size -= ret; if (remaining_chunk_size == 0) - if (fd_read_line (fd) == NULL) - return false; + { + char *line = fd_read_line (fd); + if (line == NULL) + return false; + else + xfree (line); + } } /* Safe even if %.*s bogusly expects terminating \0 because @@ -1149,71 +1173,44 @@ append_value_to_filename (char **filename, param_token const * const value) false. The file name is stripped of directory components and must not be - empty. */ + empty. + + Historically, this function returned filename prefixed with opt.dir_prefix, + now that logic is handled by the caller, new code should pay attention, + changed by crq, Sep 2010. + +*/ static bool parse_content_disposition (const char *hdr, char **filename) { - *filename = NULL; param_token name, value; + *filename = NULL; while (extract_param (&hdr, &name, &value, ';')) { int isFilename = BOUNDED_EQUAL_NO_CASE ( name.b, name.e, "filename" ); if ( isFilename && value.b != NULL) - { - /* Make the file name begin at the last slash or backslash. */ - const char *last_slash = memrchr (value.b, '/', value.e - value.b); - const char *last_bs = memrchr (value.b, '\\', value.e - value.b); - if (last_slash && last_bs) - value.b = 1 + MAX (last_slash, last_bs); - else if (last_slash || last_bs) - value.b = 1 + (last_slash ? last_slash : last_bs); - if (value.b == value.e) - continue; - /* Start with the directory prefix, if specified. */ - if (opt.dir_prefix) - { - if (!(*filename)) - { - int prefix_length = strlen (opt.dir_prefix); - bool add_slash = (opt.dir_prefix[prefix_length - 1] != '/'); - int total_length; - - if (add_slash) - ++prefix_length; - total_length = prefix_length + (value.e - value.b); - *filename = xmalloc (total_length + 1); - strcpy (*filename, opt.dir_prefix); - if (add_slash) - (*filename)[prefix_length - 1] = '/'; - memcpy (*filename + prefix_length, value.b, (value.e - value.b)); - (*filename)[total_length] = '\0'; - } - else - { - append_value_to_filename (filename, &value); - } - } - else - { - if (*filename) - { - append_value_to_filename (filename, &value); - } - else - { - *filename = strdupdelim (value.b, value.e); - } - } - } + { + /* Make the file name begin at the last slash or backslash. */ + const char *last_slash = memrchr (value.b, '/', value.e - value.b); + const char *last_bs = memrchr (value.b, '\\', value.e - value.b); + if (last_slash && last_bs) + value.b = 1 + MAX (last_slash, last_bs); + else if (last_slash || last_bs) + value.b = 1 + (last_slash ? last_slash : last_bs); + if (value.b == value.e) + continue; + + if (*filename) + append_value_to_filename (filename, &value); + else + *filename = strdupdelim (value.b, value.e); + } } + if (*filename) - { - return true; - } + return true; else - { - return false; - } + return false; } @@ -1481,6 +1478,149 @@ free_hstat (struct http_stat *hs) hs->error = NULL; } +static void +get_file_flags (const char *filename, int *dt) +{ + logprintf (LOG_VERBOSE, _("\ +File %s already there; not retrieving.\n\n"), quote (filename)); + /* If the file is there, we suppose it's retrieved OK. */ + *dt |= RETROKF; + + /* #### Bogusness alert. */ + /* If its suffix is "html" or "htm" or similar, assume text/html. */ + if (has_html_suffix_p (filename)) + *dt |= TEXTHTML; +} + +/* Download the response body from the socket and writes it to + an output file. The headers have already been read from the + socket. If WARC is enabled, the response body will also be + written to a WARC response record. + + hs, contlen, contrange, chunked_transfer_encoding and url are + parameters from the gethttp method. fp is a pointer to the + output file. + + url, warc_timestamp_str, warc_request_uuid, warc_ip, type + and statcode will be saved in the headers of the WARC record. + The head parameter contains the HTTP headers of the response. + + If fp is NULL and WARC is enabled, the response body will be + written only to the WARC file. If WARC is disabled and fp + is a file pointer, the data will be written to the file. + If fp is a file pointer and WARC is enabled, the body will + be written to both destinations. + + Returns the error code. */ +static int +read_response_body (struct http_stat *hs, int sock, FILE *fp, wgint contlen, + wgint contrange, bool chunked_transfer_encoding, + char *url, char *warc_timestamp_str, char *warc_request_uuid, + ip_address *warc_ip, char *type, int statcode, char *head) +{ + int warc_payload_offset = 0; + FILE *warc_tmp = NULL; + int warcerr = 0; + + if (opt.warc_filename != NULL) + { + /* Open a temporary file where we can write the response before we + add it to the WARC record. */ + warc_tmp = warc_tempfile (); + if (warc_tmp == NULL) + warcerr = WARC_TMP_FOPENERR; + + if (warcerr == 0) + { + /* We should keep the response headers for the WARC record. */ + int head_len = strlen (head); + int warc_tmp_written = fwrite (head, 1, head_len, warc_tmp); + if (warc_tmp_written != head_len) + warcerr = WARC_TMP_FWRITEERR; + warc_payload_offset = head_len; + } + + if (warcerr != 0) + { + if (warc_tmp != NULL) + fclose (warc_tmp); + return warcerr; + } + } + + if (fp != NULL) + { + /* This confuses the timestamping code that checks for file size. + #### The timestamping code should be smarter about file size. */ + if (opt.save_headers && hs->restval == 0) + fwrite (head, 1, strlen (head), fp); + } + + /* Read the response body. */ + int flags = 0; + if (contlen != -1) + /* If content-length is present, read that much; otherwise, read + until EOF. The HTTP spec doesn't require the server to + actually close the connection when it's done sending data. */ + flags |= rb_read_exactly; + if (fp != NULL && hs->restval > 0 && contrange == 0) + /* If the server ignored our range request, instruct fd_read_body + to skip the first RESTVAL bytes of body. */ + flags |= rb_skip_startpos; + if (chunked_transfer_encoding) + flags |= rb_chunked_transfer_encoding; + + hs->len = hs->restval; + hs->rd_size = 0; + /* Download the response body and write it to fp. + If we are working on a WARC file, we simultaneously write the + response body to warc_tmp. */ + hs->res = fd_read_body (sock, fp, contlen != -1 ? contlen : 0, + hs->restval, &hs->rd_size, &hs->len, &hs->dltime, + flags, warc_tmp); + if (hs->res >= 0) + { + if (warc_tmp != NULL) + { + /* Create a response record and write it to the WARC file. + Note: per the WARC standard, the request and response should share + the same date header. We re-use the timestamp of the request. + The response record should also refer to the uuid of the request. */ + bool r = warc_write_response_record (url, warc_timestamp_str, + warc_request_uuid, warc_ip, + warc_tmp, warc_payload_offset, + type, statcode, hs->newloc); + + /* warc_write_response_record has closed warc_tmp. */ + + if (! r) + return WARC_ERR; + } + + return RETRFINISHED; + } + + if (warc_tmp != NULL) + fclose (warc_tmp); + + if (hs->res == -2) + { + /* Error while writing to fd. */ + return FWRITEERR; + } + else if (hs->res == -3) + { + /* Error while writing to warc_tmp. */ + return WARC_TMP_FWRITEERR; + } + else + { + /* A read error! */ + hs->rderrmsg = xstrdup (fd_errstr (sock)); + return RETRFINISHED; + } +} + #define BEGINS_WITH(line, string_constant) \ (!strncasecmp (line, string_constant, sizeof (string_constant) - 1) \ && (c_isspace (line[sizeof (string_constant) - 1]) \ @@ -1526,7 +1666,7 @@ free_hstat (struct http_stat *hs) server, and u->url will be requested. */ static uerr_t gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, - struct iri *iri) + struct iri *iri, int count) { struct request *req; @@ -1538,9 +1678,9 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, wgint contlen, contrange; struct url *conn; FILE *fp; + int err; int sock = -1; - int flags; /* Set to 1 when the authorization has already been sent and should not be tried again. */ @@ -1566,6 +1706,14 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, char hdrval[256]; char *message; + /* Declare WARC variables. */ + bool warc_enabled = (opt.warc_filename != NULL); + FILE *warc_tmp = NULL; + char warc_timestamp_str [21]; + char warc_request_uuid [48]; + ip_address *warc_ip = NULL; + off_t warc_payload_offset = -1; + /* Whether this connection will be kept alive after the HTTP request is done. */ bool keep_alive; @@ -1638,8 +1786,14 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, request_set_header (req, "Referer", (char *) hs->referer, rel_none); if (*dt & SEND_NOCACHE) - request_set_header (req, "Pragma", "no-cache", rel_none); - if (hs->restval && !opt.timestamping) + { + /* Cache-Control MUST be obeyed by all HTTP/1.1 caching mechanisms... */ + request_set_header (req, "Cache-Control", "no-cache, must-revalidate", rel_none); + + /* ... but some HTTP/1.0 caches doesn't implement Cache-Control. */ + request_set_header (req, "Pragma", "no-cache", rel_none); + } + if (hs->restval) request_set_header (req, "Range", aprintf ("bytes=%s-", number_to_static_string (hs->restval)), @@ -1805,11 +1959,19 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, #endif &host_lookup_failed)) { + int family = socket_family (pconn.socket, ENDPOINT_PEER); sock = pconn.socket; using_ssl = pconn.ssl; - logprintf (LOG_VERBOSE, _("Reusing existing connection to %s:%d.\n"), - quotearg_style (escape_quoting_style, pconn.host), - pconn.port); +#if ENABLE_IPV6 + if (family == AF_INET6) + logprintf (LOG_VERBOSE, _("Reusing existing connection to [%s]:%d.\n"), + quotearg_style (escape_quoting_style, pconn.host), + pconn.port); + else +#endif + logprintf (LOG_VERBOSE, _("Reusing existing connection to %s:%d.\n"), + quotearg_style (escape_quoting_style, pconn.host), + pconn.port); DEBUGP (("Reusing fd %d.\n", sock)); if (pconn.authorized) /* If the connection is already authorized, the "Basic" @@ -1865,11 +2027,12 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, that the contents of Host would be exactly the same as the contents of CONNECT. */ - write_error = request_send (connreq, sock); + write_error = request_send (connreq, sock, 0); request_free (connreq); if (write_error < 0) { CLOSE_INVALIDATE (sock); + request_free (req); return WRITEFAILED; } @@ -1879,6 +2042,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, logprintf (LOG_VERBOSE, _("Failed reading proxy response: %s\n"), fd_errstr (sock)); CLOSE_INVALIDATE (sock); + request_free (req); return HERR; } message = NULL; @@ -1899,6 +2063,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, quotearg_style (escape_quoting_style, _("Malformed status line"))); xfree (head); + request_free (req); return HERR; } hs->message = xstrdup (message); @@ -1910,6 +2075,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, logprintf (LOG_NOTQUIET, _("Proxy tunneling failed: %s"), message ? quotearg_style (escape_quoting_style, message) : "?"); xfree_null (message); + request_free (req); return CONSSLERR; } xfree_null (message); @@ -1922,14 +2088,16 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, if (conn->scheme == SCHEME_HTTPS) { - if (!ssl_connect_wget (sock)) + if (!ssl_connect_wget (sock, u->host)) { fd_close (sock); + request_free (req); return CONSSLERR; } else if (!ssl_check_certificate (sock, u->host)) { fd_close (sock); + request_free (req); return VERIFCERTERR; } using_ssl = true; @@ -1937,8 +2105,26 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, #endif /* HAVE_SSL */ } + /* Open the temporary file where we will write the request. */ + if (warc_enabled) + { + warc_tmp = warc_tempfile (); + if (warc_tmp == NULL) + { + CLOSE_INVALIDATE (sock); + request_free (req); + return WARC_TMP_FOPENERR; + } + + if (! proxy) + { + warc_ip = (ip_address *) alloca (sizeof (ip_address)); + socket_ip_address (sock, warc_ip, ENDPOINT_PEER); + } + } + /* Send the request to server. */ - write_error = request_send (req, sock); + write_error = request_send (req, sock, warc_tmp); if (write_error >= 0) { @@ -1946,16 +2132,39 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, { DEBUGP (("[POST data: %s]\n", opt.post_data)); write_error = fd_write (sock, opt.post_data, post_data_size, -1); + if (write_error >= 0 && warc_tmp != NULL) + { + /* Remember end of headers / start of payload. */ + warc_payload_offset = ftello (warc_tmp); + + /* Write a copy of the data to the WARC record. */ + int warc_tmp_written = fwrite (opt.post_data, 1, post_data_size, warc_tmp); + if (warc_tmp_written != post_data_size) + write_error = -2; + } } else if (opt.post_file_name && post_data_size != 0) - write_error = post_file (sock, opt.post_file_name, post_data_size); + { + if (warc_tmp != NULL) + /* Remember end of headers / start of payload. */ + warc_payload_offset = ftello (warc_tmp); + + write_error = post_file (sock, opt.post_file_name, post_data_size, warc_tmp); + } } if (write_error < 0) { CLOSE_INVALIDATE (sock); request_free (req); - return WRITEFAILED; + + if (warc_tmp != NULL) + fclose (warc_tmp); + + if (write_error == -2) + return WARC_TMP_FWRITEERR; + else + return WRITEFAILED; } logprintf (LOG_VERBOSE, _("%s request sent, awaiting response... "), proxy ? "Proxy" : "HTTP"); @@ -1963,6 +2172,29 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, contrange = 0; *dt &= ~RETROKF; + + if (warc_enabled) + { + bool warc_result; + /* Generate a timestamp and uuid for this request. */ + warc_timestamp (warc_timestamp_str); + warc_uuid_str (warc_request_uuid); + + /* Create a request record and store it in the WARC file. */ + warc_result = warc_write_request_record (u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, + warc_tmp, warc_payload_offset); + if (! warc_result) + { + CLOSE_INVALIDATE (sock); + request_free (req); + return WARC_ERR; + } + + /* warc_write_request_record has also closed warc_tmp. */ + } + + read_header: head = read_http_response_head (sock); if (!head) @@ -1998,13 +2230,17 @@ read_header: quotearg_style (escape_quoting_style, _("Malformed status line"))); CLOSE_INVALIDATE (sock); + resp_free (resp); request_free (req); + xfree (head); return HERR; } if (H_10X (statcode)) { DEBUGP (("Ignoring response\n")); + resp_free (resp); + xfree (head); goto read_header; } @@ -2053,8 +2289,9 @@ read_header: } } - resp_header_copy (resp, "Transfer-Encoding", hdrval, sizeof (hdrval)); - if (0 == strcasecmp (hdrval, "chunked")) + chunked_transfer_encoding = false; + if (resp_header_copy (resp, "Transfer-Encoding", hdrval, sizeof (hdrval)) + && 0 == strcasecmp (hdrval, "chunked")) chunked_transfer_encoding = true; /* Handle (possibly multiple instances of) the Set-Cookie header. */ @@ -2083,11 +2320,42 @@ read_header: if (statcode == HTTP_STATUS_UNAUTHORIZED) { /* Authorization is required. */ - if (keep_alive && !head_only - && skip_short_body (sock, contlen, chunked_transfer_encoding)) - CLOSE_FINISH (sock); + + /* Normally we are not interested in the response body. + But if we are writing a WARC file we are: we like to keep everyting. */ + if (warc_enabled) + { + int err; + type = resp_header_strdup (resp, "Content-Type"); + err = read_response_body (hs, sock, NULL, contlen, 0, + chunked_transfer_encoding, + u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, type, + statcode, head); + xfree_null (type); + + if (err != RETRFINISHED || hs->res < 0) + { + CLOSE_INVALIDATE (sock); + request_free (req); + xfree_null (message); + resp_free (resp); + xfree (head); + return err; + } + else + CLOSE_FINISH (sock); + } else - CLOSE_INVALIDATE (sock); + { + /* Since WARC is disabled, we are not interested in the response body. */ + if (keep_alive && !head_only + && skip_short_body (sock, contlen, chunked_transfer_encoding)) + CLOSE_FINISH (sock); + else + CLOSE_INVALIDATE (sock); + } + pconn.authorized = false; if (!auth_finished && (user && passwd)) { @@ -2163,15 +2431,23 @@ read_header: * hstat.local_file is set by http_loop to the argument of -O. */ if (!hs->local_file) { + char *local_file = NULL; + /* Honor Content-Disposition whether possible. */ if (!opt.content_disposition || !resp_header_copy (resp, "Content-Disposition", hdrval, sizeof (hdrval)) - || !parse_content_disposition (hdrval, &hs->local_file)) + || !parse_content_disposition (hdrval, &local_file)) { /* The Content-Disposition header is missing or broken. * Choose unique file name according to given URL. */ - hs->local_file = url_file_name (u); + hs->local_file = url_file_name (u, NULL); + } + else + { + DEBUGP (("Parsed filename from Content-Disposition: %s\n", + local_file)); + hs->local_file = url_file_name (u, local_file); } } @@ -2183,16 +2459,9 @@ read_header: /* If opt.noclobber is turned on and file already exists, do not retrieve the file. But if the output_document was given, then this test was already done and the file didn't exist. Hence the !opt.output_document */ - logprintf (LOG_VERBOSE, _("\ -File %s already there; not retrieving.\n\n"), quote (hs->local_file)); - /* If the file is there, we suppose it's retrieved OK. */ - *dt |= RETROKF; - - /* #### Bogusness alert. */ - /* If its suffix is "html" or "htm" or similar, assume text/html. */ - if (has_html_suffix_p (hs->local_file)) - *dt |= TEXTHTML; - + get_file_flags (hs->local_file, dt); + request_free (req); + resp_free (resp); xfree (head); xfree_null (message); return RETRUNNEEDED; @@ -2336,13 +2605,53 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); _("Location: %s%s\n"), hs->newloc ? escnonprint_uri (hs->newloc) : _("unspecified"), hs->newloc ? _(" [following]") : ""); - if (keep_alive && !head_only - && skip_short_body (sock, contlen, chunked_transfer_encoding)) - CLOSE_FINISH (sock); + + /* In case the caller cares to look... */ + hs->len = 0; + hs->res = 0; + hs->restval = 0; + + /* Normally we are not interested in the response body of a redirect. + But if we are writing a WARC file we are: we like to keep everyting. */ + if (warc_enabled) + { + int err = read_response_body (hs, sock, NULL, contlen, 0, + chunked_transfer_encoding, + u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, type, + statcode, head); + + if (err != RETRFINISHED || hs->res < 0) + { + CLOSE_INVALIDATE (sock); + xfree_null (type); + xfree (head); + return err; + } + else + CLOSE_FINISH (sock); + } else - CLOSE_INVALIDATE (sock); + { + /* Since WARC is disabled, we are not interested in the response body. */ + if (keep_alive && !head_only + && skip_short_body (sock, contlen, chunked_transfer_encoding)) + CLOSE_FINISH (sock); + else + CLOSE_INVALIDATE (sock); + } + xfree_null (type); xfree (head); + /* From RFC2616: The status codes 303 and 307 have + been added for servers that wish to make unambiguously + clear which kind of reaction is expected of the client. + + A 307 should be redirected using the same method, + in other words, a POST should be preserved and not + converted to a GET in that case. */ + if (statcode == HTTP_STATUS_TEMPORARY_REDIRECT) + return NEWLOCATION_KEEP_POST; return NEWLOCATION; } } @@ -2449,30 +2758,55 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); logputs (LOG_VERBOSE, "\n"); } } - xfree_null (type); - type = NULL; /* We don't need it any more. */ /* Return if we have no intention of further downloading. */ - if (!(*dt & RETROKF) || head_only) + if ((!(*dt & RETROKF) && !opt.content_on_error) || head_only) { /* In case the caller cares to look... */ hs->len = 0; hs->res = 0; - xfree_null (type); - if (head_only) - /* Pre-1.10 Wget used CLOSE_INVALIDATE here. Now we trust the - servers not to send body in response to a HEAD request, and - those that do will likely be caught by test_socket_open. - If not, they can be worked around using - `--no-http-keep-alive'. */ - CLOSE_FINISH (sock); - else if (keep_alive - && skip_short_body (sock, contlen, chunked_transfer_encoding)) - /* Successfully skipped the body; also keep using the socket. */ - CLOSE_FINISH (sock); + hs->restval = 0; + + /* Normally we are not interested in the response body of a error responses. + But if we are writing a WARC file we are: we like to keep everyting. */ + if (warc_enabled) + { + int err = read_response_body (hs, sock, NULL, contlen, 0, + chunked_transfer_encoding, + u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, type, + statcode, head); + + if (err != RETRFINISHED || hs->res < 0) + { + CLOSE_INVALIDATE (sock); + xfree (head); + xfree_null (type); + return err; + } + else + CLOSE_FINISH (sock); + } else - CLOSE_INVALIDATE (sock); + { + /* Since WARC is disabled, we are not interested in the response body. */ + if (head_only) + /* Pre-1.10 Wget used CLOSE_INVALIDATE here. Now we trust the + servers not to send body in response to a HEAD request, and + those that do will likely be caught by test_socket_open. + If not, they can be worked around using + `--no-http-keep-alive'. */ + CLOSE_FINISH (sock); + else if (keep_alive + && skip_short_body (sock, contlen, chunked_transfer_encoding)) + /* Successfully skipped the body; also keep using the socket. */ + CLOSE_FINISH (sock); + else + CLOSE_INVALIDATE (sock); + } + xfree (head); + xfree_null (type); return RETRFINISHED; } @@ -2503,8 +2837,22 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); fp = fopen (hs->local_file, "ab"); #endif /* def __VMS [else] */ } - else if (ALLOW_CLOBBER) + else if (ALLOW_CLOBBER || count > 0) { + if (opt.unlink && file_exists_p (hs->local_file)) + { + int res = unlink (hs->local_file); + if (res < 0) + { + logprintf (LOG_NOTQUIET, "%s: %s\n", hs->local_file, + strerror (errno)); + CLOSE_INVALIDATE (sock); + xfree (head); + xfree_null (type); + return UNLINKERR; + } + } + #ifdef __VMS int open_id; @@ -2528,6 +2876,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); hs->local_file); CLOSE_INVALIDATE (sock); xfree (head); + xfree_null (type); return FOPEN_EXCL_ERR; } } @@ -2536,6 +2885,7 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); logprintf (LOG_NOTQUIET, "%s: %s\n", hs->local_file, strerror (errno)); CLOSE_INVALIDATE (sock); xfree (head); + xfree_null (type); return FOPENERR; } } @@ -2549,49 +2899,26 @@ File %s already there; not retrieving.\n\n"), quote (hs->local_file)); HYPHENP (hs->local_file) ? quote ("STDOUT") : quote (hs->local_file)); } - /* This confuses the timestamping code that checks for file size. - #### The timestamping code should be smarter about file size. */ - if (opt.save_headers && hs->restval == 0) - fwrite (head, 1, strlen (head), fp); + + err = read_response_body (hs, sock, fp, contlen, contrange, + chunked_transfer_encoding, + u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, type, + statcode, head); /* Now we no longer need to store the response header. */ xfree (head); - - /* Download the request body. */ - flags = 0; - if (contlen != -1) - /* If content-length is present, read that much; otherwise, read - until EOF. The HTTP spec doesn't require the server to - actually close the connection when it's done sending data. */ - flags |= rb_read_exactly; - if (hs->restval > 0 && contrange == 0) - /* If the server ignored our range request, instruct fd_read_body - to skip the first RESTVAL bytes of body. */ - flags |= rb_skip_startpos; - - if (chunked_transfer_encoding) - flags |= rb_chunked_transfer_encoding; - - hs->len = hs->restval; - hs->rd_size = 0; - hs->res = fd_read_body (sock, fp, contlen != -1 ? contlen : 0, - hs->restval, &hs->rd_size, &hs->len, &hs->dltime, - flags); + xfree_null (type); if (hs->res >= 0) CLOSE_FINISH (sock); else - { - if (hs->res < 0) - hs->rderrmsg = xstrdup (fd_errstr (sock)); - CLOSE_INVALIDATE (sock); - } + CLOSE_INVALIDATE (sock); if (!output_stream) fclose (fp); - if (hs->res == -2) - return FWRITEERR; - return RETRFINISHED; + + return err; } /* The genuine HTTP loop! This is the part where the retrieval is @@ -2615,6 +2942,12 @@ http_loop (struct url *u, struct url *original_url, char **newloc, char *file_name; bool force_full_retrieve = false; + + /* If we are writing to a WARC file: always retrieve the whole file. */ + if (opt.warc_filename != NULL) + force_full_retrieve = true; + + /* Assert that no value for *LOCAL_FILE was passed. */ assert (local_file == NULL || *local_file == NULL); @@ -2647,28 +2980,16 @@ http_loop (struct url *u, struct url *original_url, char **newloc, else if (!opt.content_disposition) { hstat.local_file = - url_file_name (opt.trustservernames ? u : original_url); + url_file_name (opt.trustservernames ? u : original_url, NULL); got_name = true; } - /* TODO: Ick! This code is now in both gethttp and http_loop, and is - * screaming for some refactoring. */ if (got_name && file_exists_p (hstat.local_file) && opt.noclobber && !opt.output_document) { /* If opt.noclobber is turned on and file already exists, do not retrieve the file. But if the output_document was given, then this test was already done and the file didn't exist. Hence the !opt.output_document */ - logprintf (LOG_VERBOSE, _("\ -File %s already there; not retrieving.\n\n"), - quote (hstat.local_file)); - /* If the file is there, we suppose it's retrieved OK. */ - *dt |= RETROKF; - - /* #### Bogusness alert. */ - /* If its suffix is "html" or "htm" or similar, assume text/html. */ - if (has_html_suffix_p (hstat.local_file)) - *dt |= TEXTHTML; - + get_file_flags (hstat.local_file, dt); ret = RETROK; goto exit; } @@ -2683,9 +3004,14 @@ File %s already there; not retrieving.\n\n"), if (!opt.spider) send_head_first = false; + /* Send preliminary HEAD request if --content-disposition and -c are used + together. */ + if (opt.content_disposition && opt.always_rest) + send_head_first = true; + /* Send preliminary HEAD request if -N is given and we have an existing * destination file. */ - file_name = url_file_name (opt.trustservernames ? u : original_url); + file_name = url_file_name (opt.trustservernames ? u : original_url, NULL); if (opt.timestamping && (file_exists_p (file_name) || opt.content_disposition)) send_head_first = true; @@ -2767,7 +3093,7 @@ Spider mode enabled. Check if remote file exists.\n")); *dt &= ~SEND_NOCACHE; /* Try fetching the document, or at least its head. */ - err = gethttp (u, &hstat, dt, proxy, iri); + err = gethttp (u, &hstat, dt, proxy, iri, count); /* Time? */ tms = datetime_str (time (NULL)); @@ -2796,12 +3122,32 @@ Spider mode enabled. Check if remote file exists.\n")); /* Fatal errors just return from the function. */ ret = err; goto exit; + case WARC_ERR: + /* A fatal WARC error. */ + logputs (LOG_VERBOSE, "\n"); + logprintf (LOG_NOTQUIET, _("Cannot write to WARC file.\n")); + ret = err; + goto exit; + case WARC_TMP_FOPENERR: case WARC_TMP_FWRITEERR: + /* A fatal WARC error. */ + logputs (LOG_VERBOSE, "\n"); + logprintf (LOG_NOTQUIET, _("Cannot write to temporary WARC file.\n")); + ret = err; + goto exit; case CONSSLERR: /* Another fatal error. */ logprintf (LOG_NOTQUIET, _("Unable to establish SSL connection.\n")); ret = err; goto exit; + case UNLINKERR: + /* Another fatal error. */ + logputs (LOG_VERBOSE, "\n"); + logprintf (LOG_NOTQUIET, _("Cannot unlink %s (%s).\n"), + quote (hstat.local_file), strerror (errno)); + ret = err; + goto exit; case NEWLOCATION: + case NEWLOCATION_KEEP_POST: /* Return the new location to the caller. */ if (!*newloc) { @@ -2812,7 +3158,7 @@ Spider mode enabled. Check if remote file exists.\n")); } else { - ret = NEWLOCATION; + ret = err; } goto exit; case RETRUNNEEDED: @@ -3135,7 +3481,7 @@ Remote file exists.\n\n")); while (!opt.ntry || (count < opt.ntry)); exit: - if (ret == RETROK) + if (ret == RETROK && local_file) *local_file = xstrdup (hstat.local_file); free_hstat (&hstat); @@ -3316,19 +3662,23 @@ digest_authentication_encode (const char *au, const char *user, const char *passwd, const char *method, const char *path) { - static char *realm, *opaque, *nonce; + static char *realm, *opaque, *nonce, *qop; static struct { const char *name; char **variable; } options[] = { { "realm", &realm }, { "opaque", &opaque }, - { "nonce", &nonce } + { "nonce", &nonce }, + { "qop", &qop } }; + char cnonce[16] = ""; char *res; + size_t res_size; param_token name, value; - realm = opaque = nonce = NULL; + + realm = opaque = nonce = qop = NULL; au += 6; /* skip over `Digest' */ while (extract_param (&au, &name, &value, ',')) @@ -3344,11 +3694,19 @@ digest_authentication_encode (const char *au, const char *user, break; } } + + if (qop != NULL && strcmp(qop,"auth")) + { + logprintf (LOG_NOTQUIET, _("Unsupported quality of protection '%s'.\n"), qop); + user = NULL; /* force freeing mem and return */ + } + if (!realm || !nonce || !user || !passwd || !path || !method) { xfree_null (realm); xfree_null (opaque); xfree_null (nonce); + xfree_null (qop); return NULL; } @@ -3377,27 +3735,69 @@ digest_authentication_encode (const char *au, const char *user, md5_finish_ctx (&ctx, hash); dump_hash (a2buf, hash); - /* RESPONSE_DIGEST = H(A1BUF ":" nonce ":" A2BUF) */ - md5_init_ctx (&ctx); - md5_process_bytes ((unsigned char *)a1buf, MD5_DIGEST_SIZE * 2, &ctx); - md5_process_bytes ((unsigned char *)":", 1, &ctx); - md5_process_bytes ((unsigned char *)nonce, strlen (nonce), &ctx); - md5_process_bytes ((unsigned char *)":", 1, &ctx); - md5_process_bytes ((unsigned char *)a2buf, MD5_DIGEST_SIZE * 2, &ctx); - md5_finish_ctx (&ctx, hash); + if (!strcmp(qop,"auth")) + { + /* RFC 2617 Digest Access Authentication */ + /* generate random hex string */ + snprintf(cnonce, sizeof(cnonce), "%08x", random_number(INT_MAX)); + + /* RESPONSE_DIGEST = H(A1BUF ":" nonce ":" noncecount ":" clientnonce ":" qop ": " A2BUF) */ + md5_init_ctx (&ctx); + md5_process_bytes ((unsigned char *)a1buf, MD5_DIGEST_SIZE * 2, &ctx); + md5_process_bytes ((unsigned char *)":", 1, &ctx); + md5_process_bytes ((unsigned char *)nonce, strlen (nonce), &ctx); + md5_process_bytes ((unsigned char *)":", 1, &ctx); + md5_process_bytes ((unsigned char *)"00000001", 8, &ctx); /* TODO: keep track of server nonce values */ + md5_process_bytes ((unsigned char *)":", 1, &ctx); + md5_process_bytes ((unsigned char *)cnonce, strlen(cnonce), &ctx); + md5_process_bytes ((unsigned char *)":", 1, &ctx); + md5_process_bytes ((unsigned char *)qop, strlen(qop), &ctx); + md5_process_bytes ((unsigned char *)":", 1, &ctx); + md5_process_bytes ((unsigned char *)a2buf, MD5_DIGEST_SIZE * 2, &ctx); + md5_finish_ctx (&ctx, hash); + } + else + { + /* RFC 2069 Digest Access Authentication */ + /* RESPONSE_DIGEST = H(A1BUF ":" nonce ":" A2BUF) */ + md5_init_ctx (&ctx); + md5_process_bytes ((unsigned char *)a1buf, MD5_DIGEST_SIZE * 2, &ctx); + md5_process_bytes ((unsigned char *)":", 1, &ctx); + md5_process_bytes ((unsigned char *)nonce, strlen (nonce), &ctx); + md5_process_bytes ((unsigned char *)":", 1, &ctx); + md5_process_bytes ((unsigned char *)a2buf, MD5_DIGEST_SIZE * 2, &ctx); + md5_finish_ctx (&ctx, hash); + } + dump_hash (response_digest, hash); - res = xmalloc (strlen (user) - + strlen (user) - + strlen (realm) - + strlen (nonce) - + strlen (path) - + 2 * MD5_DIGEST_SIZE /*strlen (response_digest)*/ - + (opaque ? strlen (opaque) : 0) - + 128); - sprintf (res, "Digest \ -username=\"%s\", realm=\"%s\", nonce=\"%s\", uri=\"%s\", response=\"%s\"", - user, realm, nonce, path, response_digest); + res_size = strlen (user) + + strlen (user) + + strlen (realm) + + strlen (nonce) + + strlen (path) + + 2 * MD5_DIGEST_SIZE /*strlen (response_digest)*/ + + (opaque ? strlen (opaque) : 0) + + (qop ? 128: 0) + + 128; + + res = xmalloc (res_size); + + if (!strcmp(qop,"auth")) + { + snprintf (res, res_size, "Digest "\ + "username=\"%s\", realm=\"%s\", nonce=\"%s\", uri=\"%s\", response=\"%s\""\ + ", qop=auth, nc=00000001, cnonce=\"%s\"", + user, realm, nonce, path, response_digest, cnonce); + + } + else + { + snprintf (res, res_size, "Digest "\ + "username=\"%s\", realm=\"%s\", nonce=\"%s\", uri=\"%s\", response=\"%s\"", + user, realm, nonce, path, response_digest); + } + if (opaque) { char *p = res + strlen (res); @@ -3513,7 +3913,7 @@ ensure_extension (struct http_stat *hs, const char *ext, int *dt) if (len == 5) { strncpy (shortext, ext, len - 1); - shortext[len - 2] = '\0'; + shortext[len - 1] = '\0'; } if (last_period_in_local_filename == NULL @@ -3549,20 +3949,15 @@ test_parse_content_disposition() int i; struct { char *hdrval; - char *opt_dir_prefix; char *filename; bool result; } test_array[] = { - { "filename=\"file.ext\"", NULL, "file.ext", true }, - { "filename=\"file.ext\"", "somedir", "somedir/file.ext", true }, - { "attachment; filename=\"file.ext\"", NULL, "file.ext", true }, - { "attachment; filename=\"file.ext\"", "somedir", "somedir/file.ext", true }, - { "attachment; filename=\"file.ext\"; dummy", NULL, "file.ext", true }, - { "attachment; filename=\"file.ext\"; dummy", "somedir", "somedir/file.ext", true }, - { "attachment", NULL, NULL, false }, - { "attachment", "somedir", NULL, false }, - { "attachement; filename*=UTF-8'en-US'hello.txt", NULL, "hello.txt", true }, - { "attachement; filename*0=\"hello\"; filename*1=\"world.txt\"", NULL, "helloworld.txt", true }, + { "filename=\"file.ext\"", "file.ext", true }, + { "attachment; filename=\"file.ext\"", "file.ext", true }, + { "attachment; filename=\"file.ext\"; dummy", "file.ext", true }, + { "attachment", NULL, false }, + { "attachement; filename*=UTF-8'en-US'hello.txt", "hello.txt", true }, + { "attachement; filename*0=\"hello\"; filename*1=\"world.txt\"", "helloworld.txt", true }, }; for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i) @@ -3570,7 +3965,6 @@ test_parse_content_disposition() char *filename; bool res; - opt.dir_prefix = test_array[i].opt_dir_prefix; res = parse_content_disposition (test_array[i].hdrval, &filename); mu_assert ("test_parse_content_disposition: wrong result",