Add support for WARC files.

author Gijs van Tulder <gvtulder@gmail.com>

Fri, 4 Nov 2011 21:25:00 +0000 (22:25 +0100)

committer Giuseppe Scrivano <gscrivano@gnu.org>

Fri, 4 Nov 2011 21:25:00 +0000 (22:25 +0100)
author Gijs van Tulder <gvtulder@gmail.com>
Fri, 4 Nov 2011 21:25:00 +0000 (22:25 +0100)
committer Giuseppe Scrivano <gscrivano@gnu.org>
Fri, 4 Nov 2011 21:25:00 +0000 (22:25 +0100)
diff --git a/bootstrap.conf b/bootstrap.conf

index 77230dbb9c1f145905e3715f43a249e010589b2c..6473cbba1127b68ac33bab65054f127059d29fda 100644 (file)
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -28,6 +28,7 @@ gnulib_modules="
  accept
  alloca
  announce-gen
+base32
  bind
  c-ctype
  clock-time
@@ -49,6 +50,7 @@ maintainer-makefile
  mbtowc
  mkdir
  crypto/md5
+crypto/sha1
  pipe
  quote
  quotearg
@@ -63,6 +65,7 @@ socket
  stdbool
  strcasestr
  strerror_r-posix
+tmpdir
  unlocked-io
  update-copyright
  vasprintf
diff --git a/configure.ac b/configure.ac

index 76c6fa28a61a483871101e4ab64f78bb9ae6f95a..360f6c91aef5d14f44a4d7a569d81ce5a0aa1c22 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -511,7 +511,19 @@ if test "X$iri" != "Xno"; then
    fi
  fi
  
+dnl
+dnl Check for UUID
+dnl
+
+AC_CHECK_HEADER(uuid/uuid.h,
+                AC_CHECK_LIB(uuid, uuid_generate,
+                  [LIBS="${LIBS} -luuid"
+                   AC_DEFINE([HAVE_LIBUUID], 1,
+                             [Define if libuuid is available.])
+                  ])
+)
  
+ 
  dnl Needed by src/Makefile.am
  AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])
  
diff --git a/src/ChangeLog b/src/ChangeLog

index c2af118e347b1e06e05a58d7131d6c51b8ab187c..65c48072620b75120a44ce5d1edfd343fe188c3f 100644 (file)
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,6 @@
+2011-11-04  Giuseppe Scrivano  <gscrivano@gnu.org>
+
+
  2011-10-07  Steven Schweda  <address@hidden>
  
         * connect.c: Add HAVE_SYS_SELECT_H and HAVE_SYS_SOCKET_H conditions
@@ -21,7 +24,10 @@
         * openssl.c (ssl_init): Add type cast (SSL_METHOD *) to newly "const"
         "meth" argument to accommodate OpenSSL version 0.9.8, where that
         argument is not "const" in the OpenSSL function (SSL_CTX_new).
+       * test.c: Declare "program_argstring".
         * utils.c (fopen_excl): Comment typography.
+       * warc.h: New file.
+       * warc.c: New file.
  
  2011-10-02  Henrik Holst <henrik.holst@millistream.com> (tiny change)
         * http.c (gethttp): If 'contentonerror' is used then do not
diff --git a/src/Makefile.am b/src/Makefile.am

index 6b951988becff448f351cf8b2679c99042109817..8ef931a6ff3340f07111e0650dbfd0fb44b74de6 100644 (file)
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -46,13 +46,13 @@ wget_SOURCES = cmpt.c connect.c convert.c cookies.c ftp.c                     \
                css_.c css-url.c \
                ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \
                http.c init.c log.c main.c netrc.c progress.c ptimer.c     \
-              recur.c res.c retr.c spider.c url.c                        \
+              recur.c res.c retr.c spider.c url.c warc.c                         \
                utils.c exits.c build_info.c $(IRI_OBJ)                    \
                css-url.h css-tokens.h connect.h convert.h cookies.h       \
                ftp.h hash.h host.h html-parse.h html-url.h      \
                http.h http-ntlm.h init.h log.h mswindows.h netrc.h        \
                options.h progress.h ptimer.h recur.h res.h retr.h         \
-              spider.h ssl.h sysdep.h url.h utils.h wget.h iri.h         \
+              spider.h ssl.h sysdep.h url.h warc.h utils.h wget.h iri.h          \
                exits.h gettext.h
  nodist_wget_SOURCES = version.c
  EXTRA_wget_SOURCES = iri.c
diff --git a/src/ftp.c b/src/ftp.c

index f75397d09b181870d76737a0f8262152e1d69cf3..989a1ddab4d934f7e8c151fe99b527aedfcdaac9 100644 (file)
--- a/src/ftp.c
+++ b/src/ftp.c
@@ -49,6 +49,7 @@ as that of the covered work.  */
  #include "netrc.h"
  #include "convert.h"            /* for downloaded_file */
  #include "recur.h"              /* for INFINITE_RECURSION */
+#include "warc.h"
  
  #ifdef __VMS
  # include "vms.h"
@@ -237,10 +238,11 @@ static uerr_t ftp_get_listing (struct url *, ccon *, struct fileinfo **);
  
  /* Retrieves a file with denoted parameters through opening an FTP
     connection to the server.  It always closes the data connection,
-   and closes the control connection in case of error.  */
+   and closes the control connection in case of error.  If warc_tmp
+   is non-NULL, the downloaded data will be written there as well.  */
  static uerr_t
  getftp (struct url *u, wgint passed_expected_bytes, wgint *qtyread,
-        wgint restval, ccon *con, int count)
+        wgint restval, ccon *con, int count, FILE *warc_tmp)
  {
    int csock, dtsock, local_sock, res;
    uerr_t err = RETROK;          /* appease the compiler */
@@ -1155,7 +1157,7 @@ Error in server response, closing control connection.\n"));
  /* 2011-09-30 SMS.
     Added listing files to the set of non-"binary" (text, Stream_LF)
     files.  (Wget works either way, but other programs, like, say, text
-   editors, work better on listing files which have text attributes.) 
+   editors, work better on listing files which have text attributes.)
     Now we use "binary" attributes for a binary ("IMAGE") transfer,
     unless "--ftp-stmlf" was specified, and we always use non-"binary"
     (text, Stream_LF) attributes for a listing file, or for an ASCII
@@ -1194,7 +1196,7 @@ Error in server response, closing control connection.\n"));
          }
        else if (opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct
                 || opt.output_document || count > 0)
-        {        
+        {
           if (opt.unlink && file_exists_p (con->target))
             {
               int res = unlink (con->target);
@@ -1274,7 +1276,7 @@ Error in server response, closing control connection.\n"));
    rd_size = 0;
    res = fd_read_body (dtsock, fp,
                        expected_bytes ? expected_bytes - restval : 0,
-                      restval, &rd_size, qtyread, &con->dltime, flags);
+                      restval, &rd_size, qtyread, &con->dltime, flags, warc_tmp);
  
    tms = datetime_str (time (NULL));
    tmrate = retr_rate (rd_size, con->dltime);
@@ -1285,15 +1287,18 @@ Error in server response, closing control connection.\n"));
    if (!output_stream || con->cmd & DO_LIST)
      fclose (fp);
  
-  /* If fd_read_body couldn't write to fp, bail out.  */
-  if (res == -2)
+  /* If fd_read_body couldn't write to fp or warc_tmp, bail out.  */
+  if (res == -2 || (warc_tmp != NULL && res == -3))
      {
        logprintf (LOG_NOTQUIET, _("%s: %s, closing control connection.\n"),
                   con->target, strerror (errno));
        fd_close (csock);
        con->csock = -1;
        fd_close (dtsock);
-      return FWRITEERR;
+      if (res == -2)
+        return FWRITEERR;
+      else if (res == -3)
+        return WARC_TMP_FWRITEERR;
      }
    else if (res == -1)
      {
@@ -1409,6 +1414,11 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi
    uerr_t err;
    struct_stat st;
  
+  /* Declare WARC variables. */
+  bool warc_enabled = (opt.warc_filename != NULL);
+  FILE *warc_tmp = NULL;
+  ip_address *warc_ip = NULL;
+
    /* Get the target, and set the name for the message accordingly. */
    if ((f == NULL) && (con->target))
      {
@@ -1445,6 +1455,21 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi
  
    orig_lp = con->cmd & LEAVE_PENDING ? 1 : 0;
  
+  /* For file RETR requests, we can write a WARC record.
+     We record the file contents to a temporary file. */
+  if (warc_enabled && (con->cmd & DO_RETR))
+    {
+      warc_tmp = warc_tempfile ();
+      if (warc_tmp == NULL)
+        return WARC_TMP_FOPENERR;
+
+      if (!con->proxy && con->csock != -1)
+        {
+          warc_ip = (ip_address *) alloca (sizeof (ip_address));
+          socket_ip_address (con->csock, warc_ip, ENDPOINT_PEER);
+        }
+    }
+
    /* THE loop.  */
    do
      {
@@ -1509,7 +1534,10 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi
          len = f->size;
        else
          len = 0;
-      err = getftp (u, len, &qtyread, restval, con, count);
+
+      /* If we are working on a WARC record, getftp should also write
+         to the warc_tmp file. */
+      err = getftp (u, len, &qtyread, restval, con, count, warc_tmp);
  
        if (con->csock == -1)
          con->st &= ~DONE_CWD;
@@ -1520,8 +1548,10 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi
          {
          case HOSTERR: case CONIMPOSSIBLE: case FWRITEERR: case FOPENERR:
          case FTPNSFOD: case FTPLOGINC: case FTPNOPASV: case CONTNOTSUPPORTED:
-        case UNLINKERR:
+        case UNLINKERR: case WARC_TMP_FWRITEERR:
            /* Fatal errors, give up.  */
+          if (warc_tmp != NULL)
+            fclose (warc_tmp);
            return err;
          case CONSOCKERR: case CONERROR: case FTPSRVERR: case FTPRERR:
          case WRITEFAILED: case FTPUNKNOWNTYPE: case FTPSYSERR:
@@ -1589,6 +1619,19 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi
            xfree (hurl);
          }
  
+      if (warc_enabled && (con->cmd & DO_RETR))
+        {
+          /* Create and store a WARC resource record for the retrieved file. */
+          bool warc_res;
+
+          warc_res = warc_write_resource_record (NULL, u->url, NULL, NULL,
+                                                  warc_ip, NULL, warc_tmp, -1);
+          if (! warc_res)
+            return WARC_ERR;
+
+          /* warc_write_resource_record has also closed warc_tmp. */
+        }
+
        if ((con->cmd & DO_LIST))
          /* This is a directory listing file. */
          {
@@ -1928,7 +1971,9 @@ Already have correct symlink %s -> %s\n\n"),
        xfree (ofile);
  
        /* Break on fatals.  */
-      if (err == QUOTEXC || err == HOSTERR || err == FWRITEERR)
+      if (err == QUOTEXC || err == HOSTERR || err == FWRITEERR
+          || err == WARC_ERR || err == WARC_TMP_FOPENERR
+          || err == WARC_TMP_FWRITEERR)
          break;
        con->cmd &= ~ (DO_CWD | DO_LOGIN);
        f = f->next;
diff --git a/src/http.c b/src/http.c

index 7eef453f6ccca5cedb0bb6f59d7f9b81e75c4a97..6a2ffe86aa0b5c44d62aaf124d43978e8754d4cb 100644 (file)
--- a/src/http.c
+++ b/src/http.c
@@ -58,6 +58,7 @@ as that of the covered work.  */
  #include "md5.h"
  #include "convert.h"
  #include "spider.h"
+#include "warc.h"
  
  #ifdef TESTING
  #include "test.h"
@@ -320,10 +321,12 @@ request_remove_header (struct request *req, char *name)
    p += A_len;                                   \
  } while (0)
  
-/* Construct the request and write it to FD using fd_write.  */
+/* Construct the request and write it to FD using fd_write.
+   If warc_tmp is set to a file pointer, the request string will
+   also be written to that file. */
  
  static int
-request_send (const struct request *req, int fd)
+request_send (const struct request *req, int fd, FILE *warc_tmp)
  {
    char *request_string, *p;
    int i, size, write_error;
@@ -374,6 +377,13 @@ request_send (const struct request *req, int fd)
    if (write_error < 0)
      logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"),
                 fd_errstr (fd));
+  else if (warc_tmp != NULL)
+    {
+      /* Write a copy of the data to the WARC record. */
+      int warc_tmp_written = fwrite (request_string, 1, size - 1, warc_tmp);
+      if (warc_tmp_written != size - 1)
+        return -2;
+    }
    return write_error;
  }
  
@@ -444,10 +454,12 @@ register_basic_auth_host (const char *hostname)
  
  /* Send the contents of FILE_NAME to SOCK.  Make sure that exactly
     PROMISED_SIZE bytes are sent over the wire -- if the file is
-   longer, read only that much; if the file is shorter, report an error.  */
+   longer, read only that much; if the file is shorter, report an error.
+   If warc_tmp is set to a file pointer, the post data will
+   also be written to that file.  */
  
  static int
-post_file (int sock, const char *file_name, wgint promised_size)
+post_file (int sock, const char *file_name, wgint promised_size, FILE *warc_tmp)
  {
    static char chunk[8192];
    wgint written = 0;
@@ -472,6 +484,16 @@ post_file (int sock, const char *file_name, wgint promised_size)
            fclose (fp);
            return -1;
          }
+      if (warc_tmp != NULL)
+        {
+          /* Write a copy of the data to the WARC record. */
+          int warc_tmp_written = fwrite (chunk, 1, towrite, warc_tmp);
+          if (warc_tmp_written != towrite)
+            {
+              fclose (fp);
+              return -2;
+            }
+        }
        written += towrite;
      }
    fclose (fp);
@@ -1462,6 +1484,135 @@ File %s already there; not retrieving.\n\n"), quote (filename));
      *dt |= TEXTHTML;
  }
  
+/* Downloads the response body from the socket and writes it to
+   an output file.  The headers have already been read from the
+   socket.  If WARC is enabled, the response body will also be
+   written to a WARC response record.
+
+   hs, contlen, contrange, chunked_transfer_encoding and url are
+   parameters from the gethttp method.  fp is a pointer to the
+   output file.
+
+   url, warc_timestamp_str, warc_request_uuid, warc_ip, type
+   and statcode will be saved in the headers of the WARC record.
+   The head parameter contains the HTTP headers of the response.
+ 
+   If fp is NULL and WARC is enabled, the response body will be
+   written only to the WARC file.  If WARC is disabled and fp
+   is a file pointer, the data will be written to the file.
+   If fp is a file pointer and WARC is enabled, the body will
+   be written to both destinations.
+   
+   Returns the error code.   */
+static int
+read_response_body (struct http_stat *hs, int sock, FILE *fp, wgint contlen,
+                    wgint contrange, bool chunked_transfer_encoding,
+                    char *url, char *warc_timestamp_str, char *warc_request_uuid,
+                    ip_address *warc_ip, char *type, int statcode, char *head)
+{
+  int warc_payload_offset = 0;
+  FILE *warc_tmp = NULL;
+  int warcerr = 0;
+
+  if (opt.warc_filename != NULL)
+    {
+      /* Open a temporary file where we can write the response before we
+         add it to the WARC record.  */
+      warc_tmp = warc_tempfile ();
+      if (warc_tmp == NULL)
+        warcerr = WARC_TMP_FOPENERR;
+
+      if (warcerr == 0)
+        {
+          /* We should keep the response headers for the WARC record.  */
+          int head_len = strlen (head);
+          int warc_tmp_written = fwrite (head, 1, head_len, warc_tmp);
+          if (warc_tmp_written != head_len)
+            warcerr = WARC_TMP_FWRITEERR;
+          warc_payload_offset = head_len;
+        }
+
+      if (warcerr != 0)
+        {
+          if (warc_tmp != NULL)
+            fclose (warc_tmp);
+          return warcerr;
+        }
+    }
+
+  if (fp != NULL)
+    {
+      /* This confuses the timestamping code that checks for file size.
+         #### The timestamping code should be smarter about file size.  */
+      if (opt.save_headers && hs->restval == 0)
+        fwrite (head, 1, strlen (head), fp);
+    }
+
+  /* Read the response body.  */
+  int flags = 0;
+  if (contlen != -1)
+    /* If content-length is present, read that much; otherwise, read
+       until EOF.  The HTTP spec doesn't require the server to
+       actually close the connection when it's done sending data. */
+    flags |= rb_read_exactly;
+  if (fp != NULL && hs->restval > 0 && contrange == 0)
+    /* If the server ignored our range request, instruct fd_read_body
+       to skip the first RESTVAL bytes of body.  */
+    flags |= rb_skip_startpos;
+  if (chunked_transfer_encoding)
+    flags |= rb_chunked_transfer_encoding;
+
+  hs->len = hs->restval;
+  hs->rd_size = 0;
+  /* Download the response body and write it to fp.
+     If we are working on a WARC file, we simultaneously write the
+     response body to warc_tmp.  */
+  hs->res = fd_read_body (sock, fp, contlen != -1 ? contlen : 0,
+                          hs->restval, &hs->rd_size, &hs->len, &hs->dltime,
+                          flags, warc_tmp);
+  if (hs->res >= 0)
+    {
+      if (warc_tmp != NULL)
+        {
+          /* Create a response record and write it to the WARC file.
+             Note: per the WARC standard, the request and response should share
+             the same date header.  We re-use the timestamp of the request.
+             The response record should also refer to the uuid of the request.  */
+          bool r = warc_write_response_record (url, warc_timestamp_str,
+                                               warc_request_uuid, warc_ip,
+                                               warc_tmp, warc_payload_offset,
+                                               type, statcode, hs->newloc);
+
+          /* warc_write_response_record has closed warc_tmp. */
+
+          if (! r)
+            return WARC_ERR;
+        }
+
+      return RETRFINISHED;
+    }
+  
+  if (warc_tmp != NULL)
+    fclose (warc_tmp);
+
+  if (hs->res == -2)
+    {
+      /* Error while writing to fp (the output file). */
+      return FWRITEERR;
+    }
+  else if (hs->res == -3)
+    {
+      /* Error while writing to warc_tmp. */
+      return WARC_TMP_FWRITEERR;
+    }
+  else
+    {
+      /* A read error! */
+      hs->rderrmsg = xstrdup (fd_errstr (sock));
+      return RETRFINISHED;
+    }
+}
+
  #define BEGINS_WITH(line, string_constant)                               \
    (!strncasecmp (line, string_constant, sizeof (string_constant) - 1)    \
     && (c_isspace (line[sizeof (string_constant) - 1])                      \
@@ -1519,9 +1670,9 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
    wgint contlen, contrange;
    struct url *conn;
    FILE *fp;
+  int err;
  
    int sock = -1;
-  int flags;
  
    /* Set to 1 when the authorization has already been sent and should
       not be tried again. */
@@ -1547,6 +1698,14 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
    char hdrval[256];
    char *message;
  
+  /* Declare WARC variables. */
+  bool warc_enabled = (opt.warc_filename != NULL);
+  FILE *warc_tmp = NULL;
+  char warc_timestamp_str [21];
+  char warc_request_uuid [48];
+  ip_address *warc_ip = NULL;
+  long int warc_payload_offset = -1;
+
    /* Whether this connection will be kept alive after the HTTP request
       is done. */
    bool keep_alive;
@@ -1852,7 +2011,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
               that the contents of Host would be exactly the same as
               the contents of CONNECT.  */
  
-          write_error = request_send (connreq, sock);
+          write_error = request_send (connreq, sock, 0);
            request_free (connreq);
            if (write_error < 0)
              {
@@ -1924,8 +2083,26 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
  #endif /* HAVE_SSL */
      }
  
+  /* Open the temporary file where we will write the request. */
+  if (warc_enabled)
+    {
+      warc_tmp = warc_tempfile ();
+      if (warc_tmp == NULL)
+        {
+          CLOSE_INVALIDATE (sock);
+          request_free (req);
+          return WARC_TMP_FOPENERR;
+        }
+
+      if (! proxy)
+        {
+          warc_ip = (ip_address *) alloca (sizeof (ip_address));
+          socket_ip_address (sock, warc_ip, ENDPOINT_PEER);
+        }
+    }
+
    /* Send the request to server.  */
-  write_error = request_send (req, sock);
+  write_error = request_send (req, sock, warc_tmp);
  
    if (write_error >= 0)
      {
@@ -1933,16 +2110,39 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
          {
            DEBUGP (("[POST data: %s]\n", opt.post_data));
            write_error = fd_write (sock, opt.post_data, post_data_size, -1);
+          if (write_error >= 0 && warc_tmp != NULL)
+            {
+              /* Remember end of headers / start of payload. */
+              warc_payload_offset = ftell (warc_tmp);
+
+              /* Write a copy of the data to the WARC record. */
+              int warc_tmp_written = fwrite (opt.post_data, 1, post_data_size, warc_tmp);
+              if (warc_tmp_written != post_data_size)
+                write_error = -2;
+            }
          }
        else if (opt.post_file_name && post_data_size != 0)
-        write_error = post_file (sock, opt.post_file_name, post_data_size);
+        {
+          if (warc_tmp != NULL)
+            /* Remember end of headers / start of payload. */
+            warc_payload_offset = ftell (warc_tmp);
+
+          write_error = post_file (sock, opt.post_file_name, post_data_size, warc_tmp);
+        }
      }
  
    if (write_error < 0)
      {
        CLOSE_INVALIDATE (sock);
        request_free (req);
-      return WRITEFAILED;
+
+      if (warc_tmp != NULL)
+        fclose (warc_tmp);
+
+      if (write_error == -2)
+        return WARC_TMP_FWRITEERR;
+      else
+        return WRITEFAILED;
      }
    logprintf (LOG_VERBOSE, _("%s request sent, awaiting response... "),
               proxy ? "Proxy" : "HTTP");
@@ -1950,6 +2150,29 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
    contrange = 0;
    *dt &= ~RETROKF;
  
+
+  if (warc_enabled)
+    {
+      bool warc_result;
+      /* Generate a timestamp and uuid for this request. */
+      warc_timestamp (warc_timestamp_str);
+      warc_uuid_str (warc_request_uuid);
+
+      /* Create a request record and store it in the WARC file. */
+      warc_result = warc_write_request_record (u->url, warc_timestamp_str,
+                                               warc_request_uuid, warc_ip,
+                                               warc_tmp, warc_payload_offset);
+      if (! warc_result)
+        {
+          CLOSE_INVALIDATE (sock);
+          request_free (req);
+          return WARC_ERR;
+        }
+
+      /* warc_write_request_record has also closed warc_tmp. */
+    }
+
+
  read_header:
    head = read_http_response_head (sock);
    if (!head)
@@ -2073,11 +2296,42 @@ read_header:
    if (statcode == HTTP_STATUS_UNAUTHORIZED)
      {
        /* Authorization is required.  */
-      if (keep_alive && !head_only
-          && skip_short_body (sock, contlen, chunked_transfer_encoding))
-        CLOSE_FINISH (sock);
+
+      /* Normally we are not interested in the response body.
+         But if we are writing a WARC file we are: we like to keep everything.  */
+      if (warc_enabled)
+        {
+          int err;
+          type = resp_header_strdup (resp, "Content-Type");
+          err = read_response_body (hs, sock, NULL, contlen, 0,
+                                    chunked_transfer_encoding,
+                                    u->url, warc_timestamp_str,
+                                    warc_request_uuid, warc_ip, type,
+                                    statcode, head);
+          xfree_null (type);
+
+          if (err != RETRFINISHED || hs->res < 0)
+            {
+              CLOSE_INVALIDATE (sock);
+              request_free (req);
+              xfree_null (message);
+              resp_free (resp);
+              xfree (head);
+              return err;
+            }
+          else
+            CLOSE_FINISH (sock);
+        }
        else
-        CLOSE_INVALIDATE (sock);
+        {
+          /* Since WARC is disabled, we are not interested in the response body.  */
+          if (keep_alive && !head_only
+              && skip_short_body (sock, contlen, chunked_transfer_encoding))
+            CLOSE_FINISH (sock);
+          else
+            CLOSE_INVALIDATE (sock);
+        }
+
        pconn.authorized = false;
        if (!auth_finished && (user && passwd))
          {
@@ -2325,11 +2579,42 @@ read_header:
                       _("Location: %s%s\n"),
                       hs->newloc ? escnonprint_uri (hs->newloc) : _("unspecified"),
                       hs->newloc ? _(" [following]") : "");
-          if (keep_alive && !head_only
-              && skip_short_body (sock, contlen, chunked_transfer_encoding))
-            CLOSE_FINISH (sock);
+ 
+          /* In case the caller cares to look...  */
+          hs->len = 0;
+          hs->res = 0;
+          hs->restval = 0;
+
+          /* Normally we are not interested in the response body of a redirect.
+             But if we are writing a WARC file we are: we like to keep everything.  */
+          if (warc_enabled)
+            {
+              int err = read_response_body (hs, sock, NULL, contlen, 0,
+                                            chunked_transfer_encoding,
+                                            u->url, warc_timestamp_str,
+                                            warc_request_uuid, warc_ip, type,
+                                            statcode, head);
+
+              if (err != RETRFINISHED || hs->res < 0)
+                {
+                  CLOSE_INVALIDATE (sock);
+                  xfree_null (type);
+                  xfree (head);
+                  return err;
+                }
+              else
+                CLOSE_FINISH (sock);
+            }
            else
-            CLOSE_INVALIDATE (sock);
+            {
+              /* Since WARC is disabled, we are not interested in the response body.  */
+              if (keep_alive && !head_only
+                  && skip_short_body (sock, contlen, chunked_transfer_encoding))
+                CLOSE_FINISH (sock);
+              else
+                CLOSE_INVALIDATE (sock);
+            }
+
            xfree_null (type);
            xfree (head);
            /* From RFC2616: The status codes 303 and 307 have
@@ -2447,8 +2732,6 @@ read_header:
              logputs (LOG_VERBOSE, "\n");
          }
      }
-  xfree_null (type);
-  type = NULL;                        /* We don't need it any more.  */
  
    /* Return if we have no intention of further downloading.  */
    if ((!(*dt & RETROKF) && !opt.content_on_error) || head_only)
@@ -2456,21 +2739,48 @@ read_header:
        /* In case the caller cares to look...  */
        hs->len = 0;
        hs->res = 0;
-      xfree_null (type);
-      if (head_only)
-        /* Pre-1.10 Wget used CLOSE_INVALIDATE here.  Now we trust the
-           servers not to send body in response to a HEAD request, and
-           those that do will likely be caught by test_socket_open.
-           If not, they can be worked around using
-           `--no-http-keep-alive'.  */
-        CLOSE_FINISH (sock);
-      else if (keep_alive
-               && skip_short_body (sock, contlen, chunked_transfer_encoding))
-        /* Successfully skipped the body; also keep using the socket. */
-        CLOSE_FINISH (sock);
+      hs->restval = 0;
+
+      /* Normally we are not interested in the response body of an error response.
+         But if we are writing a WARC file we are: we like to keep everything.  */
+      if (warc_enabled)
+        {
+          int err = read_response_body (hs, sock, NULL, contlen, 0,
+                                        chunked_transfer_encoding,
+                                        u->url, warc_timestamp_str,
+                                        warc_request_uuid, warc_ip, type,
+                                        statcode, head);
+
+          if (err != RETRFINISHED || hs->res < 0)
+            {
+              CLOSE_INVALIDATE (sock);
+              xfree (head);
+              xfree_null (type);
+              return err;
+            }
+          else
+            CLOSE_FINISH (sock);
+        }
        else
-        CLOSE_INVALIDATE (sock);
+        {
+          /* Since WARC is disabled, we are not interested in the response body.  */
+          if (head_only)
+            /* Pre-1.10 Wget used CLOSE_INVALIDATE here.  Now we trust the
+               servers not to send body in response to a HEAD request, and
+               those that do will likely be caught by test_socket_open.
+               If not, they can be worked around using
+               `--no-http-keep-alive'.  */
+            CLOSE_FINISH (sock);
+          else if (keep_alive
+                   && skip_short_body (sock, contlen, chunked_transfer_encoding))
+            /* Successfully skipped the body; also keep using the socket. */
+            CLOSE_FINISH (sock);
+          else
+            CLOSE_INVALIDATE (sock);
+        }
+
        xfree (head);
+      xfree_null (type);
        return RETRFINISHED;
      }
  
@@ -2512,6 +2822,7 @@ read_header:
                              strerror (errno));
                   CLOSE_INVALIDATE (sock);
                   xfree (head);
+      xfree_null (type);
                   return UNLINKERR;
                 }
             }
@@ -2539,6 +2850,7 @@ read_header:
                           hs->local_file);
                CLOSE_INVALIDATE (sock);
                xfree (head);
+              xfree_null (type);
                return FOPEN_EXCL_ERR;
              }
          }
@@ -2547,6 +2859,7 @@ read_header:
            logprintf (LOG_NOTQUIET, "%s: %s\n", hs->local_file, strerror (errno));
            CLOSE_INVALIDATE (sock);
            xfree (head);
+          xfree_null (type);
            return FOPENERR;
          }
      }
@@ -2560,49 +2873,26 @@ read_header:
                   HYPHENP (hs->local_file) ? quote ("STDOUT") : quote (hs->local_file));
      }
  
-  /* This confuses the timestamping code that checks for file size.
-     #### The timestamping code should be smarter about file size.  */
-  if (opt.save_headers && hs->restval == 0)
-    fwrite (head, 1, strlen (head), fp);
+
+  err = read_response_body (hs, sock, fp, contlen, contrange,
+                            chunked_transfer_encoding,
+                            u->url, warc_timestamp_str,
+                            warc_request_uuid, warc_ip, type,
+                            statcode, head);
  
    /* Now we no longer need to store the response header. */
    xfree (head);
-
-  /* Download the request body.  */
-  flags = 0;
-  if (contlen != -1)
-    /* If content-length is present, read that much; otherwise, read
-       until EOF.  The HTTP spec doesn't require the server to
-       actually close the connection when it's done sending data. */
-    flags |= rb_read_exactly;
-  if (hs->restval > 0 && contrange == 0)
-    /* If the server ignored our range request, instruct fd_read_body
-       to skip the first RESTVAL bytes of body.  */
-    flags |= rb_skip_startpos;
-
-  if (chunked_transfer_encoding)
-    flags |= rb_chunked_transfer_encoding;
-
-  hs->len = hs->restval;
-  hs->rd_size = 0;
-  hs->res = fd_read_body (sock, fp, contlen != -1 ? contlen : 0,
-                          hs->restval, &hs->rd_size, &hs->len, &hs->dltime,
-                          flags);
+  xfree_null (type);
  
    if (hs->res >= 0)
      CLOSE_FINISH (sock);
    else
-    {
-      if (hs->res < 0)
-        hs->rderrmsg = xstrdup (fd_errstr (sock));
-      CLOSE_INVALIDATE (sock);
-    }
+    CLOSE_INVALIDATE (sock);
  
    if (!output_stream)
      fclose (fp);
-  if (hs->res == -2)
-    return FWRITEERR;
-  return RETRFINISHED;
+
+  return err;
  }
  
  /* The genuine HTTP loop!  This is the part where the retrieval is
@@ -2626,6 +2916,12 @@ http_loop (struct url *u, struct url *original_url, char **newloc,
    char *file_name;
    bool force_full_retrieve = false;
  
+
+  /* If we are writing to a WARC file: always retrieve the whole file. */
+  if (opt.warc_filename != NULL)
+    force_full_retrieve = true;
+
+
    /* Assert that no value for *LOCAL_FILE was passed. */
    assert (local_file == NULL || *local_file == NULL);
  
@@ -2795,6 +3091,18 @@ Spider mode enabled. Check if remote file exists.\n"));
            /* Fatal errors just return from the function.  */
            ret = err;
            goto exit;
+        case WARC_ERR:
+          /* A fatal WARC error. */
+          logputs (LOG_VERBOSE, "\n");
+          logprintf (LOG_NOTQUIET, _("Cannot write to WARC file..\n"));
+          ret = err;
+          goto exit;
+        case WARC_TMP_FOPENERR: case WARC_TMP_FWRITEERR:
+          /* A fatal WARC error. */
+          logputs (LOG_VERBOSE, "\n");
+          logprintf (LOG_NOTQUIET, _("Cannot write to temporary WARC file.\n"));
+          ret = err;
+          goto exit;
          case CONSSLERR:
            /* Another fatal error.  */
            logprintf (LOG_NOTQUIET, _("Unable to establish SSL connection.\n"));
diff --git a/src/init.c b/src/init.c

index eae355235486cf6db064f627f997aed0f7df2f62..47fdea06c5f584bf8efb73c10330ad1b51599318 100644 (file)
--- a/src/init.c
+++ b/src/init.c
@@ -88,6 +88,7 @@ CMD_DECLARE (cmd_vector);
  
  CMD_DECLARE (cmd_spec_dirstruct);
  CMD_DECLARE (cmd_spec_header);
+CMD_DECLARE (cmd_spec_warc_header);
  CMD_DECLARE (cmd_spec_htmlify);
  CMD_DECLARE (cmd_spec_mirror);
  CMD_DECLARE (cmd_spec_prefer_family);
@@ -264,6 +265,15 @@ static const struct {
    { "verbose",          NULL,                   cmd_spec_verbose },
    { "wait",             &opt.wait,              cmd_time },
    { "waitretry",        &opt.waitretry,         cmd_time },
+  { "warccdx",          &opt.warc_cdx_enabled,  cmd_boolean },
+  { "warccdxdedup",     &opt.warc_cdx_dedup_filename,  cmd_file },
+  { "warccompression",  &opt.warc_compression_enabled, cmd_boolean },
+  { "warcdigests",      &opt.warc_digests_enabled, cmd_boolean },
+  { "warcfile",         &opt.warc_filename,     cmd_file },
+  { "warcheader",       NULL,                   cmd_spec_warc_header },
+  { "warckeeplog",      &opt.warc_keep_log,     cmd_boolean },
+  { "warcmaxsize",      &opt.warc_maxsize,      cmd_bytes },
+  { "warctempdir",      &opt.warc_tempdir,      cmd_directory },
  #ifdef USE_WATT32
    { "wdebug",           &opt.wdebug,            cmd_boolean },
  #endif
@@ -362,6 +372,14 @@ defaults (void)
  
    opt.useservertimestamps = true;
    opt.show_all_dns_entries = false;
+
+  opt.warc_maxsize = 0; /* 1024 * 1024 * 1024; */
+  opt.warc_compression_enabled = true;
+  opt.warc_digests_enabled = true;
+  opt.warc_cdx_enabled = false;
+  opt.warc_cdx_dedup_filename = NULL;
+  opt.warc_tempdir = NULL;
+  opt.warc_keep_log = true;
  }
  \f
  /* Return the user's home directory (strdup-ed), or NULL if none is
@@ -1235,6 +1253,27 @@ cmd_spec_header (const char *com, const char *val, void *place_ignored)
    return true;
  }
  
+/* Handler for the "warcheader" command.  An empty VAL discards all
+   WARC headers collected so far.  Any other VAL is checked with
+   check_user_specified_header and, when accepted, appended to
+   opt.warc_user_headers.  Returns false (after printing a message to
+   stderr) if VAL is rejected, true otherwise.  */
+static bool
+cmd_spec_warc_header (const char *com, const char *val, void *place_ignored)
+{
+  if (val[0] != '\0')
+    {
+      if (check_user_specified_header (val))
+        {
+          opt.warc_user_headers = vec_append (opt.warc_user_headers, val);
+          return true;
+        }
+
+      fprintf (stderr, _("%s: %s: Invalid WARC header %s.\n"),
+               exec_name, com, quote (val));
+      return false;
+    }
+
+  /* Empty value: start over with an empty list of headers. */
+  free_vec (opt.warc_user_headers);
+  opt.warc_user_headers = NULL;
+  return true;
+}
+
  static bool
  cmd_spec_htmlify (const char *com, const char *val, void *place_ignored)
  {
@@ -1639,6 +1678,7 @@ cleanup (void)
    xfree_null (opt.http_user);
    xfree_null (opt.http_passwd);
    free_vec (opt.user_headers);
+  free_vec (opt.warc_user_headers);
  # ifdef HAVE_SSL
    xfree_null (opt.cert_file);
    xfree_null (opt.private_key);
diff --git a/src/log.c b/src/log.c

index e6875f6bd722768a1391114d3dc5d6cb1d6c8454..0185df19ccbc3663b88d2963f5b0ac832101f0b2 100644 (file)
--- a/src/log.c
+++ b/src/log.c
@@ -79,6 +79,10 @@ as that of the covered work.  */
     logging is inhibited, logfp is set back to NULL. */
  static FILE *logfp;
  
+/* A second file descriptor pointing to the temporary log file for the
+   WARC writer.  If WARC writing is disabled, this is NULL.  */
+static FILE *warclogfp;
+
  /* If true, it means logging is inhibited, i.e. nothing is printed or
     stored.  */
  static bool inhibit_logging;
@@ -304,6 +308,31 @@ get_log_fp (void)
      return logfp;
    return stderr;
  }
+
+/* Returns the stream of the secondary (WARC) log file, i.e. WARCLOGFP.
+   Unlike get_log_fp there is no fallback to stderr: if no WARC log
+   stream has been installed with log_set_warc_log_fp, or if logging
+   is inhibited, NULL is returned and the caller is expected to skip
+   the secondary write.  */
+
+static FILE *
+get_warc_log_fp (void)
+{
+  return inhibit_logging ? NULL : warclogfp;
+}
+
+/* Installs FP as the stream of the secondary (WARC) log file.  The
+   pointer is stored as-is in WARCLOGFP; this function neither opens
+   nor closes a stream.  */
+
+void
+log_set_warc_log_fp (FILE *fp)
+{
+  warclogfp = fp;
+}
  \f
  /* Log a literal string S.  The string is logged as-is, without a
     newline appended.  */
@@ -312,13 +341,17 @@ void
  logputs (enum log_options o, const char *s)
  {
    FILE *fp;
+  FILE *warcfp;
  
    check_redirect_output ();
    if ((fp = get_log_fp ()) == NULL)
      return;
+  warcfp = get_warc_log_fp ();
    CHECK_VERBOSE (o);
  
    FPUTS (s, fp);
+  if (warcfp != NULL)
+    FPUTS (s, warcfp);
    if (save_context_p)
      saved_append (s);
    if (flush_log_p)
@@ -356,8 +389,9 @@ log_vprintf_internal (struct logvprintf_state *state, const char *fmt,
    int available_size = sizeof (smallmsg);
    int numwritten;
    FILE *fp = get_log_fp ();
+  FILE *warcfp = get_warc_log_fp ();
  
-  if (!save_context_p)
+  if (!save_context_p && warcfp == NULL)
      {
        /* In the simple case just call vfprintf(), to avoid needless
           allocation and games with vsnprintf(). */
@@ -407,8 +441,11 @@ log_vprintf_internal (struct logvprintf_state *state, const char *fmt,
      }
  
    /* Writing succeeded. */
-  saved_append (write_ptr);
+  if (save_context_p)
+    saved_append (write_ptr);
    FPUTS (write_ptr, fp);
+  if (warcfp != NULL)
+    FPUTS (write_ptr, warcfp);
    if (state->bigmsg)
      xfree (state->bigmsg);
  
@@ -426,6 +463,7 @@ void
  logflush (void)
  {
    FILE *fp = get_log_fp ();
+  FILE *warcfp = get_warc_log_fp ();
    if (fp)
      {
  /* 2005-10-25 SMS.
@@ -440,6 +478,10 @@ logflush (void)
        fflush (fp);
  #endif /* def __VMS [else] */
      }
+
+  if (warcfp != NULL)
+    fflush (warcfp);
+
    needs_flushing = false;
  }
  
@@ -598,6 +640,7 @@ log_dump_context (void)
  {
    int num = log_line_current;
    FILE *fp = get_log_fp ();
+  FILE *warcfp = get_warc_log_fp ();
    if (!fp)
      return;
  
@@ -609,14 +652,23 @@ log_dump_context (void)
      {
        struct log_ln *ln = log_lines + num;
        if (ln->content)
-        FPUTS (ln->content, fp);
+        {
+          FPUTS (ln->content, fp);
+          if (warcfp != NULL)
+            FPUTS (ln->content, warcfp);
+        }
        ROT_ADVANCE (num);
      }
    while (num != log_line_current);
    if (trailing_line)
      if (log_lines[log_line_current].content)
-      FPUTS (log_lines[log_line_current].content, fp);
+      {
+        FPUTS (log_lines[log_line_current].content, fp);
+        if (warcfp != NULL)
+          FPUTS (log_lines[log_line_current].content, warcfp);
+      }
    fflush (fp);
+  fflush (warcfp);
  }
  \f
  /* String escape functions. */
diff --git a/src/log.h b/src/log.h

index 48c2f1b1518c715ba7eea06b0f09523788c3272d..d74ca53de7d11ffd5720a5126b142a3a7d6d3ac9 100644 (file)
--- a/src/log.h
+++ b/src/log.h
@@ -34,8 +34,12 @@ as that of the covered work.  */
  /* The log file to which Wget writes to after HUP.  */
  #define DEFAULT_LOGFILE "wget-log"
  
+#include <stdio.h>
+
  enum log_options { LOG_VERBOSE, LOG_NOTQUIET, LOG_NONVERBOSE, LOG_ALWAYS };
  
+void log_set_warc_log_fp (FILE *);
+
  void logprintf (enum log_options, const char *, ...)
       GCC_FORMAT_ATTR (2, 3);
  void debug_logprintf (const char *, ...) GCC_FORMAT_ATTR (1, 2);
diff --git a/src/main.c b/src/main.c

index 05ad0e76576c73c2e6ace66838513287621e8b31..2846735900da5bccadd9b7818fd28fbdcd270fb4 100644 (file)
--- a/src/main.c
+++ b/src/main.c
@@ -55,6 +55,7 @@ as that of the covered work.  */
  #include "spider.h"
  #include "http.h"               /* for save_cookies */
  #include "ptimer.h"
+#include "warc.h"
  
  #include <getopt.h>
  #include <getpass.h>
@@ -287,6 +288,15 @@ static struct cmdline_option option_data[] =
      { "version", 'V', OPT_FUNCALL, (void *) print_version, no_argument },
      { "wait", 'w', OPT_VALUE, "wait", -1 },
      { "waitretry", 0, OPT_VALUE, "waitretry", -1 },
+    { "warc-cdx", 0, OPT_BOOLEAN, "warccdx", -1 },
+    { "warc-compression", 0, OPT_BOOLEAN, "warccompression", -1 },
+    { "warc-dedup", 0, OPT_VALUE, "warccdxdedup", -1 },
+    { "warc-digests", 0, OPT_BOOLEAN, "warcdigests", -1 },
+    { "warc-file", 0, OPT_VALUE, "warcfile", -1 },
+    { "warc-header", 0, OPT_VALUE, "warcheader", -1 },
+    { "warc-keep-log", 0, OPT_BOOLEAN, "warckeeplog", -1 },
+    { "warc-max-size", 0, OPT_VALUE, "warcmaxsize", -1 },
+    { "warc-tempdir", 0, OPT_VALUE, "warctempdir", -1 },
  #ifdef USE_WATT32
      { "wdebug", 0, OPT_BOOLEAN, "wdebug", -1 },
  #endif
@@ -652,6 +662,29 @@ FTP options:\n"),
         --retr-symlinks         when recursing, get linked-to files (not dir).\n"),
      "\n",
  
+    N_("\
+WARC options:\n"),
+    N_("\
+       --warc-file=FILENAME      save request/response data to a .warc.gz file.\n"),
+    N_("\
+       --warc-header=STRING      insert STRING into the warcinfo record.\n"),
+    N_("\
+       --warc-max-size=NUMBER    set maximum size of WARC files to NUMBER.\n"),
+    N_("\
+       --warc-cdx                write CDX index files.\n"),
+    N_("\
+       --warc-dedup=FILENAME     do not store records listed in this CDX file.\n"),
+    N_("\
+       --no-warc-compression     do not compress WARC files with GZIP.\n"),
+    N_("\
+       --no-warc-digests         do not calculate SHA1 digests.\n"),
+    N_("\
+       --no-warc-keep-log        do not store the log file in a WARC record.\n"),
+    N_("\
+       --warc-tempdir=DIRECTORY  location for temporary files created by the\n\
+                                 WARC writer.\n"),
+    "\n",
+
      N_("\
  Recursive download:\n"),
      N_("\
@@ -910,6 +943,7 @@ There is NO WARRANTY, to the extent permitted by law.\n"), stdout) < 0)
  }
  
  char *program_name; /* Needed by lib/error.c. */
+char *program_argstring; /* Needed by warc.c. */
  
  int
  main (int argc, char **argv)
@@ -945,6 +979,22 @@ main (int argc, char **argv)
    windows_main ((char **) &exec_name);
  #endif
  
+  /* Construct the arguments string. */
+  int argstring_length = 1;
+  for (i = 1; i < argc; i++)
+    argstring_length += strlen (argv[i]) + 2 + 1;
+  char *p = program_argstring = malloc (argstring_length * sizeof (char));
+  for (i = 1; i < argc; i++)
+  {
+    *p++ = '"';
+    int arglen = strlen (argv[i]);
+    memcpy (p, argv[i], arglen);
+    p += arglen;
+    *p++ = '"';
+    *p++ = ' ';
+  }
+  *p = '\0';
+
    /* Load the hard-coded defaults.  */
    defaults ();
  
@@ -1194,6 +1244,47 @@ for details.\n\n"));
             }
      }
  
+  if (opt.warc_filename != 0)
+    {
+      if (opt.noclobber)
+        {
+          fprintf (stderr,
+                   _("WARC output does not work with --no-clobber, "
+                     "--no-clobber will be disabled.\n"));
+          opt.noclobber = false;
+        }
+      if (opt.timestamping)
+        {
+          fprintf (stderr,
+                   _("WARC output does not work with timestamping, "
+                     "timestamping will be disabled.\n"));
+          opt.timestamping = false;
+        }
+      if (opt.spider)
+        {
+          fprintf (stderr,
+                   _("WARC output does not work with --spider.\n"));
+          exit (1);
+        }
+      if (opt.always_rest)
+        {
+          fprintf (stderr,
+                   _("WARC output does not work with --continue, "
+                     "--continue will be disabled.\n"));
+          opt.always_rest = false;
+        }
+      if (opt.warc_cdx_dedup_filename != 0 && !opt.warc_digests_enabled)
+        {
+          fprintf (stderr,
+                   _("Digests are disabled; WARC deduplication will "
+                     "not find duplicate records.\n"));
+        }
+      if (opt.warc_keep_log)
+        {
+          opt.progress_type = "dot";
+        }
+    }
+
    if (opt.ask_passwd && opt.passwd)
      {
        fprintf (stderr,
@@ -1273,6 +1364,10 @@ for details.\n\n"));
    /* Initialize logging.  */
    log_init (opt.lfilename, append_to_log);
  
+  /* Open WARC file. */
+  if (opt.warc_filename != 0)
+    warc_init ();
+
    DEBUGP (("DEBUG output created by Wget %s on %s.\n\n",
             version_string, OS_TYPE));
  
@@ -1472,7 +1567,12 @@ outputting to a regular file.\n"));
    if (opt.convert_links && !opt.delete_after)
      convert_all_links ();
  
+  /* Close WARC file. */
+  if (opt.warc_filename != 0)
+    warc_close ();
+
    log_close ();
+
    for (i = 0; i < nurl; i++)
      xfree (url[i]);
    cleanup ();
diff --git a/src/options.h b/src/options.h

index 5e7c1eb6222cce9c418e3c7b01433ad261b9da00..0be66814209254c403ad834c601abec1b85c9181 100644 (file)
--- a/src/options.h
+++ b/src/options.h
@@ -87,6 +87,15 @@ struct options
                                    FTP. */
    char *output_document;       /* The output file to which the
                                    documents will be printed.  */
+  char *warc_filename;         /* WARC output filename */
+  char *warc_tempdir;  /* WARC temp dir */
+  char *warc_cdx_dedup_filename;       /* CDX file to be used for deduplication. */
+  wgint warc_maxsize;           /* WARC max archive size */
+  bool warc_compression_enabled;  /* For GZIP compression. */
+  bool warc_digests_enabled;  /* For SHA1 digests. */
+  bool warc_cdx_enabled;      /* Create CDX files? */
+  bool warc_keep_log;         /* Store the log file in a WARC record. */
+  char **warc_user_headers;            /* User-defined WARC header(s). */
  
    char *user;                  /* Generic username */
    char *passwd;                        /* Generic password */
diff --git a/src/retr.c b/src/retr.c

index 73947658c2d73b10b975ce589d39d1a2c2bb7401..3df582b8f500d3278d0230d69746e216f51cabef 100644 (file)
--- a/src/retr.c
+++ b/src/retr.c
@@ -139,13 +139,16 @@ limit_bandwidth (wgint bytes, struct ptimer *timer)
  
  /* Write data in BUF to OUT.  However, if *SKIP is non-zero, skip that
     amount of data and decrease SKIP.  Increment *TOTAL by the amount
-   of data written.  */
+   of data written.  If OUT2 is not NULL, also write BUF to OUT2.
+   Returns 0 on success, -1 on error writing to OUT and -2 on error
+   writing to OUT2.  Returns 1 when nothing is written at all, for
+   example when both OUT and OUT2 are NULL.  */
  
  static int
-write_data (FILE *out, const char *buf, int bufsize, wgint *skip,
-            wgint *written)
+write_data (FILE *out, FILE *out2, const char *buf, int bufsize,
+            wgint *skip, wgint *written)
  {
-  if (!out)
+  if (out == NULL && out2 == NULL)
      return 1;
    if (*skip > bufsize)
      {
@@ -161,7 +164,10 @@ write_data (FILE *out, const char *buf, int bufsize, wgint *skip,
          return 1;
      }
  
-  fwrite (buf, 1, bufsize, out);
+  if (out != NULL)
+    fwrite (buf, 1, bufsize, out);
+  if (out2 != NULL)
+    fwrite (buf, 1, bufsize, out2);
    *written += bufsize;
  
    /* Immediately flush the downloaded data.  This should not hinder
@@ -178,9 +184,17 @@ write_data (FILE *out, const char *buf, int bufsize, wgint *skip,
       actual justification.  (Also, why 16K?  Anyone test other values?)
    */
  #ifndef __VMS
-  fflush (out);
+  if (out != NULL)
+    fflush (out);
+  if (out2 != NULL)
+    fflush (out2);
  #endif /* ndef __VMS */
-  return !ferror (out);
+  if (out != NULL && ferror (out))
+    return -1;
+  else if (out2 != NULL && ferror (out2))
+    return -2;
+  else
+    return 0;
  }
  
  /* Read the contents of file descriptor FD until it the connection
@@ -198,13 +212,17 @@ write_data (FILE *out, const char *buf, int bufsize, wgint *skip,
     the amount of data written to disk.  The time it took to download
     the data is stored to ELAPSED.
  
+   If OUT2 is non-NULL, the contents is also written to OUT2.
+
     The function exits and returns the amount of data read.  In case of
     error while reading data, -1 is returned.  In case of error while
-   writing data, -2 is returned.  */
+   writing data to OUT, -2 is returned.  In case of error while writing
+   data to OUT2, -3 is returned.  */
  
  int
  fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
-              wgint *qtyread, wgint *qtywritten, double *elapsed, int flags)
+              wgint *qtyread, wgint *qtywritten, double *elapsed, int flags,
+              FILE *out2)
  {
    int ret = 0;
  #undef max
@@ -343,9 +361,10 @@ fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
        if (ret > 0)
          {
            sum_read += ret;
-          if (!write_data (out, dlbuf, ret, &skip, &sum_written))
+          int write_res = write_data (out, out2, dlbuf, ret, &skip, &sum_written);
+          if (write_res != 0)
              {
-              ret = -2;
+              ret = (write_res == -3) ? -3 : -2;
                goto out;
              }
            if (chunked)
diff --git a/src/retr.h b/src/retr.h

index 7329b0375b74b8a7717cdf08670f71370f67e2fd..22ab9ecd8a5ec39919f67eefcb7415f64ec25253 100644 (file)
--- a/src/retr.h
+++ b/src/retr.h
@@ -50,7 +50,7 @@ enum {
    rb_chunked_transfer_encoding = 4
  };
  
-int fd_read_body (int, FILE *, wgint, wgint, wgint *, wgint *, double *, int);
+int fd_read_body (int, FILE *, wgint, wgint, wgint *, wgint *, double *, int, FILE *);
  
  typedef const char *(*hunk_terminator_t) (const char *, const char *, int);
  
diff --git a/src/test.c b/src/test.c

index e7ce54cf74340dd3384a5844435476e69b4b0862..80abafff9fb5cb234a735fd8b6a0820793a84954 100644 (file)
--- a/src/test.c
+++ b/src/test.c
@@ -46,6 +46,8 @@ const char *test_append_uri_pathel();
  const char *test_are_urls_equal();
  const char *test_is_robots_txt_url();
  
+const char *program_argstring = "TEST";
+
  int tests_run;
  
  static const char *
diff --git a/src/warc.c b/src/warc.c

new file mode 100644 (file)

index 0000000..77ef369
--- /dev/null
+++ b/src/warc.c
@@ -0,0 +1,1332 @@
+/* Utility functions for writing WARC files. */
+#define _GNU_SOURCE
+
+#include "wget.h"
+#include "hash.h"
+#include "utils.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <time.h>
+#include <tmpdir.h>
+#include <sha1.h>
+#include <base32.h>
+#include <unistd.h>
+#include <zlib.h>
+#ifdef HAVE_LIBUUID
+#include <uuid/uuid.h>
+#endif
+
+#include "warc.h"
+
+extern char *version_string;
+
+/* Set by main in main.c */
+extern char *program_argstring;
+
+
+/* The log file (a temporary file that contains a copy
+   of the wget log). */
+static FILE *warc_log_fp;
+
+/* The manifest file (a temporary file that contains the
+   warcinfo uuid of every file in this crawl). */
+static FILE *warc_manifest_fp;
+
+/* The current WARC file (or NULL, if WARC is disabled). */
+static FILE *warc_current_file;
+
+/* The gzip stream for the current WARC file
+   (or NULL, if WARC or gzip is disabled). */
+static gzFile *warc_current_gzfile;
+
+/* The offset of the current gzip record in the WARC file. */
+static size_t warc_current_gzfile_offset;
+
+/* The uncompressed size (so far) of the current record. */
+static size_t warc_current_gzfile_uncompressed_size;
+
+/* This is true until a warc_write_* method fails. */
+static bool warc_write_ok;
+
+/* The current CDX file (or NULL, if CDX is disabled). */
+static FILE *warc_current_cdx_file;
+
+/* The record id of the warcinfo record of the current WARC file.  */
+static char *warc_current_warcinfo_uuid_str;
+
+/* The file name of the current WARC file. */
+static char *warc_current_filename;
+
+/* The serial number of the current WARC file.  This number is
+   incremented each time a new file is opened and is used in the
+   WARC file's filename. */
+static int warc_current_file_number;
+
+/* The table of CDX records, if deduplication is enabled. */
+struct hash_table * warc_cdx_dedup_table;
+
+static bool warc_start_new_file (bool meta);
+
+
+struct warc_cdx_record
+{
+  char *url;
+  char *uuid;
+  char digest[SHA1_DIGEST_SIZE];
+};
+
+/* Hash function over SHA1 digests.  KEY points to a digest; its
+   leading sizeof (unsigned long) bytes are copied verbatim and used
+   as the hash value.  (Presumably installed as the hash function of
+   warc_cdx_dedup_table -- verify against the table's creation.)  */
+static unsigned long
+warc_hash_sha1_digest (const void *key)
+{
+  unsigned long hash = 0;
+
+  memcpy (&hash, key, sizeof hash);
+  return hash;
+}
+
+/* Equality test for two SHA1 digests.  Returns 1 if the first
+   SHA1_DIGEST_SIZE bytes at DIGEST1 and DIGEST2 are identical and 0
+   otherwise.  Note that this is not a three-way comparison.  */
+static int
+warc_cmp_sha1_digest (const void *digest1, const void *digest2)
+{
+  return memcmp (digest1, digest2, SHA1_DIGEST_SIZE) == 0;
+}
+
+
+
+/* Appends SIZE bytes starting at BUFFER to the current WARC file.
+   While a gzip stream is open the data goes through gzwrite and SIZE
+   is added to the running uncompressed-size counter; otherwise the
+   bytes are written directly with fwrite.
+   Returns the number of (uncompressed) bytes written, as reported by
+   gzwrite or fwrite.  */
+static size_t
+warc_write_buffer (const char *buffer, size_t size)
+{
+  if (!warc_current_gzfile)
+    return fwrite (buffer, 1, size, warc_current_file);
+
+  warc_current_gzfile_uncompressed_size += size;
+  return gzwrite (warc_current_gzfile, buffer, size);
+}
+
+/* Writes the NUL-terminated string STR (without the terminator) to
+   the current WARC file.  Nothing is written once warc_write_ok has
+   become false.  A short write clears warc_write_ok.
+   Returns the resulting value of warc_write_ok.  */
+static bool
+warc_write_string (const char *str)
+{
+  if (warc_write_ok)
+    {
+      size_t len = strlen (str);
+      size_t written = warc_write_buffer (str, len);
+
+      if (written != len)
+        warc_write_ok = false;
+    }
+
+  return warc_write_ok;
+}
+
+
+#define EXTRA_GZIP_HEADER_SIZE 12
+#define GZIP_STATIC_HEADER_SIZE  10
+#define FLG_FEXTRA          0x04
+#define OFF_FLG             3
+
+/* Starts a new WARC record and writes the "WARC/1.0" version line.
+
+   If opt.warc_maxsize is set and the current file has reached that
+   size, a new WARC file is opened first.
+
+   If compression is enabled, placeholder bytes for the extra GZIP
+   header are written (warc_write_end_record overwrites them later)
+   and a new gzip stream is opened on a duplicate of the descriptor
+   of the current WARC file.
+
+   Returns false and sets warc_write_ok to false if there
+   is an error.  */
+static bool
+warc_write_start_record (void)
+{
+  if (!warc_write_ok)
+    return false;
+
+  fflush (warc_current_file);
+  /* NOTE(review): the result of warc_start_new_file is not checked
+     here -- confirm that it clears warc_write_ok when it fails.  */
+  if (opt.warc_maxsize > 0 && ftell (warc_current_file) >= opt.warc_maxsize)
+    warc_start_new_file (false);
+
+  /* Start a GZIP stream, if required. */
+  if (opt.warc_compression_enabled)
+    {
+      int dup_fd;
+
+      /* Record the starting offset of the new record. */
+      warc_current_gzfile_offset = ftell (warc_current_file);
+
+      /* Reserve space for the extra GZIP header field.
+         In warc_write_end_record we will fill this space
+         with information about the uncompressed and
+         compressed size of the record. */
+      fprintf (warc_current_file, "XXXXXXXXXXXX");
+      fflush (warc_current_file);
+
+      /* zlib gets its own descriptor, so that closing the gzip stream
+         at the end of the record leaves warc_current_file usable.  */
+      dup_fd = dup (fileno (warc_current_file));
+      if (dup_fd < 0)
+        {
+          logprintf (LOG_NOTQUIET,
+                     _("Error duplicating WARC file file descriptor.\n"));
+          warc_write_ok = false;
+          return false;
+        }
+
+      /* Start a new GZIP stream: write mode, compression level 9.
+         The mode must not contain '+': zlib cannot read and write the
+         same stream and current versions fail the open if it is
+         present.  */
+      warc_current_gzfile = gzdopen (dup_fd, "wb9");
+      warc_current_gzfile_uncompressed_size = 0;
+
+      if (warc_current_gzfile == NULL)
+        {
+          logprintf (LOG_NOTQUIET, _("Error opening GZIP stream to WARC file.\n"));
+          warc_write_ok = false;
+          /* gzdopen did not take ownership of the descriptor.  */
+          close (dup_fd);
+          return false;
+        }
+    }
+
+  warc_write_string ("WARC/1.0\r\n");
+  return warc_write_ok;
+}
+
+/* Writes the header line "NAME: VALUE\r\n" to the current WARC
+   record.  A NULL VALUE means the header is omitted altogether.
+   This method may be run after warc_write_start_record and
+   before warc_write_block_from_file.
+   Returns the value of warc_write_ok after the writes.  */
+static bool
+warc_write_header (const char *name, const char *value)
+{
+  if (value == NULL)
+    return warc_write_ok;
+
+  warc_write_string (name);
+  warc_write_string (": ");
+  warc_write_string (value);
+  warc_write_string ("\r\n");
+
+  return warc_write_ok;
+}
+
+/* Copies the contents of DATA_IN to the WARC record.
+   Adds a Content-Length header to the WARC record, holding the size
+   of DATA_IN as found by seeking to its end, then ends the header
+   section and copies DATA_IN from its beginning.
+   Run this method after warc_write_header,
+   then run warc_write_end_record.
+
+   Returns false and sets warc_write_ok to false if the size of
+   DATA_IN cannot be determined, if reading DATA_IN fails or if
+   writing to the WARC file fails.  */
+static bool
+warc_write_block_from_file (FILE *data_in)
+{
+  char content_length[32];
+  char buffer[BUFSIZ];
+  long data_size;
+  size_t s;
+
+  /* Determine the block size for the Content-Length header.  A
+     failing fseek or ftell must not end up as a bogus length.  */
+  if (fseek (data_in, 0L, SEEK_END) != 0
+      || (data_size = ftell (data_in)) < 0)
+    {
+      warc_write_ok = false;
+      return false;
+    }
+
+  /* Add the Content-Length header. */
+  snprintf (content_length, sizeof content_length, "%ld", data_size);
+  warc_write_header ("Content-Length", content_length);
+
+  /* End of the WARC header section. */
+  warc_write_string ("\r\n");
+
+  if (fseek (data_in, 0L, SEEK_SET) != 0)
+    warc_write_ok = false;
+
+  /* Copy the data in the file to the WARC record. */
+  while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0)
+    {
+      if (warc_write_buffer (buffer, s) < s)
+        warc_write_ok = false;
+    }
+
+  /* fread returns 0 for end-of-file and for a read error alike; only
+     the former leaves a complete record behind.  */
+  if (ferror (data_in))
+    warc_write_ok = false;
+
+  return warc_write_ok;
+}
+
+/* Run this method to close the current WARC record.  It writes the
+   "\r\n\r\n" that terminates the record.
+
+   If a GZIP stream is open for the record, this method closes the
+   stream and fills the extra GZIP header with the lengths of the
+   record.  The stream is closed even if an earlier write failed, so
+   that it is never used again afterwards.
+
+   Returns false and sets warc_write_ok to false if there
+   is an error.  */
+static bool
+warc_write_end_record (void)
+{
+  /* Do not write anything more once an earlier write has failed.  */
+  if (warc_write_ok && warc_write_buffer ("\r\n\r\n", 4) != 4)
+    warc_write_ok = false;
+
+  /* We start a new gzip stream for each record.  */
+  if (warc_current_gzfile)
+    {
+      char static_header[GZIP_STATIC_HEADER_SIZE];
+      char extra_header[EXTRA_GZIP_HEADER_SIZE];
+      long end_offset;
+      size_t stored_size;   /* Bytes the record occupies in the file.  */
+      size_t data_size;     /* Bytes that were handed to gzwrite.  */
+      int close_status;
+
+      /* gzclose releases the stream whatever it returns, so forget
+         the pointer right away.  */
+      close_status = gzclose (warc_current_gzfile);
+      warc_current_gzfile = NULL;
+      if (close_status != Z_OK)
+        warc_write_ok = false;
+      if (!warc_write_ok)
+        return false;
+
+      fflush (warc_current_file);
+      if (fseek (warc_current_file, 0, SEEK_END) != 0
+          || (end_offset = ftell (warc_current_file)) < 0)
+        {
+          warc_write_ok = false;
+          return false;
+        }
+
+      /* The WARC standard suggests that we add 'skip length' data in the
+         extra header field of the GZIP stream.
+
+         In warc_write_start_record we reserved space for this extra header.
+         This extra space starts at warc_current_gzfile_offset and fills
+         EXTRA_GZIP_HEADER_SIZE bytes.  The static GZIP header starts at
+         warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE.
+
+         We need to do three things:
+         1. Move the static GZIP header to warc_current_gzfile_offset;
+         2. Set the FEXTRA flag in the GZIP header;
+         3. Write the extra GZIP header after the static header, that is,
+            starting at warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE.
+      */
+
+      /* The two lengths that go into the extra header. */
+      stored_size = (size_t) end_offset - warc_current_gzfile_offset;
+      data_size = warc_current_gzfile_uncompressed_size;
+
+      /* Go back to the static GZIP header and read it. */
+      if (fseek (warc_current_file,
+                 warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE,
+                 SEEK_SET) != 0
+          || fread (static_header, 1, GZIP_STATIC_HEADER_SIZE,
+                    warc_current_file) != GZIP_STATIC_HEADER_SIZE)
+        {
+          warc_write_ok = false;
+          return false;
+        }
+
+      /* Set the FEXTRA flag in the flags byte of the header. */
+      static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
+
+      /* Prepare the extra GZIP header.
+         NOTE(review): RFC 1952 subfields carry a two-byte LEN after
+         the two identifier bytes; none is written here.  Adding it
+         means growing EXTRA_GZIP_HEADER_SIZE and the placeholder
+         written by warc_write_start_record at the same time.  */
+      /* XLEN, the length of the extra header fields.  */
+      extra_header[0]  = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
+      extra_header[1]  = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
+      /* The extra header field identifier for the WARC skip length. */
+      extra_header[2]  = 's';
+      extra_header[3]  = 'l';
+      /* The size of the record as stored in the file, i.e. compressed
+         and including the GZIP headers.
+         NOTE(review): this field and the next used to be labelled the
+         other way round; the bytes written are unchanged -- confirm
+         the field order against the WARC 'sl' definition.  */
+      extra_header[4]  = (stored_size & 255);
+      extra_header[5]  = (stored_size >> 8) & 255;
+      extra_header[6]  = (stored_size >> 16) & 255;
+      extra_header[7]  = (stored_size >> 24) & 255;
+      /* The number of uncompressed bytes in the record.  */
+      extra_header[8]  = (data_size & 255);
+      extra_header[9]  = (data_size >> 8) & 255;
+      extra_header[10] = (data_size >> 16) & 255;
+      extra_header[11] = (data_size >> 24) & 255;
+
+      /* Write the static header at the start of the record and the
+         extra header directly after it. */
+      if (fseek (warc_current_file, warc_current_gzfile_offset, SEEK_SET) != 0
+          || fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE,
+                     warc_current_file) != GZIP_STATIC_HEADER_SIZE
+          || fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE,
+                     warc_current_file) != EXTRA_GZIP_HEADER_SIZE)
+        {
+          warc_write_ok = false;
+          return false;
+        }
+
+      /* Done, move back to the end of the file. */
+      if (fflush (warc_current_file) != 0
+          || fseek (warc_current_file, 0, SEEK_END) != 0)
+        {
+          warc_write_ok = false;
+          return false;
+        }
+    }
+
+  return warc_write_ok;
+}
+
+
+/* Writes the WARC-Date header for the given timestamp to
+   the current WARC record.
+   If timestamp is NULL, a timestamp obtained from warc_timestamp
+   (the current time) will be used.
+   Returns the result of warc_write_header.  */
+static bool
+warc_write_date_header (char *timestamp)
+{
+  /* Declared at function scope on purpose: TIMESTAMP may be pointed
+     at this buffer, which therefore has to stay alive until
+     warc_write_header has returned.  */
+  char current_timestamp[21];
+
+  if (timestamp == NULL)
+    {
+      warc_timestamp (current_timestamp);
+      timestamp = current_timestamp;
+    }
+  return warc_write_header ("WARC-Date", timestamp);
+}
+
+/* Adds a WARC-IP-Address header for IP, formatted with
+   print_address, to the current WARC record.  A NULL IP writes
+   nothing and simply reports the current value of warc_write_ok.  */
+static bool
+warc_write_ip_header (ip_address *ip)
+{
+  if (ip == NULL)
+    return warc_write_ok;
+
+  return warc_write_header ("WARC-IP-Address", print_address (ip));
+}
+
+
+/* warc_sha1_stream_with_payload is a modified copy of sha1_stream
+   from gnulib/sha1.c.  This version calculates two digests in one go.
+
+   Compute SHA1 message digests for bytes read from STREAM.  Reading
+   starts at the current position of STREAM, which counts as offset 0.
+   The digest of everything read will be written into the
+   SHA1_DIGEST_SIZE bytes beginning at RES_BLOCK.
+
+   If payload_offset >= 0, a second digest will be calculated of the
+   portion of the data starting at payload_offset and continuing to
+   the end of the file.  That digest will be written into the
+   SHA1_DIGEST_SIZE bytes beginning at RES_PAYLOAD.  If payload_offset
+   is negative, RES_PAYLOAD is not written.
+
+   Returns 0 on success and 1 if the read buffer cannot be allocated
+   or if reading from STREAM fails.  */
+static int
+warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, long int payload_offset)
+{
+#define BLOCKSIZE 32768
+
+  struct sha1_ctx ctx_block;
+  struct sha1_ctx ctx_payload;
+  /* POS is the total number of bytes read from STREAM so far; SUM is
+     the number of bytes currently held in BUFFER.  */
+  long int pos;
+  size_t sum;
+
+  char *buffer = malloc (BLOCKSIZE + 72);
+  if (!buffer)
+    return 1;
+
+  /* Initialize the computation context.  */
+  sha1_init_ctx (&ctx_block);
+  if (payload_offset >= 0)
+    sha1_init_ctx (&ctx_payload);
+
+  pos = 0;
+
+  /* Iterate over full file contents.  */
+  while (1)
+    {
+      /* We read the file in blocks of BLOCKSIZE bytes.  One call of the
+         computation function processes the whole buffer so that with the
+         next round of the loop another block can be read.  */
+      size_t n;
+      sum = 0;
+
+      /* Read block.  Take care for partial reads.  */
+      while (1)
+        {
+          n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);
+
+          sum += n;
+          pos += n;
+
+          if (sum == BLOCKSIZE)
+            break;
+
+          if (n == 0)
+            {
+              /* Check for the error flag IFF N == 0, so that we don't
+                 exit the loop after a partial read due to e.g., EAGAIN
+                 or EWOULDBLOCK.  */
+              if (ferror (stream))
+                {
+                  free (buffer);
+                  return 1;
+                }
+              goto process_partial_block;
+            }
+
+          /* We've read at least one byte, so ignore errors.  But always
+             check for EOF, since feof may be true even though N > 0.
+             Otherwise, we could end up calling fread after EOF.  */
+          if (feof (stream))
+            goto process_partial_block;
+        }
+
+      /* Process buffer with BLOCKSIZE bytes.  Note that
+                        BLOCKSIZE % 64 == 0
+       */
+      sha1_process_block (buffer, BLOCKSIZE, &ctx_block);
+      if (payload_offset >= 0 && payload_offset < pos)
+        {
+          /* At least part of the buffer contains data from payload.
+             The buffer holds the bytes [pos - BLOCKSIZE, pos), so this
+             is the index within the buffer where the payload starts. */
+          int start_of_payload = payload_offset - (pos - BLOCKSIZE);
+          if (start_of_payload <= 0)
+            /* All bytes in the buffer belong to the payload. */
+            start_of_payload = 0;
+
+          /* Process the payload part of the buffer.
+             Note: we can't use  sha1_process_block  here even if we
+             process the complete buffer.  Because the payload doesn't
+             have to start with a full block, there may still be some
+             bytes left from the previous buffer.  Therefore, we need
+             to continue with  sha1_process_bytes.  */
+          sha1_process_bytes (buffer + start_of_payload, BLOCKSIZE - start_of_payload, &ctx_payload);
+        }
+    }
+
+ process_partial_block:;
+
+  /* Process any remaining bytes.  */
+  if (sum > 0)
+    {
+      sha1_process_bytes (buffer, sum, &ctx_block);
+      if (payload_offset >= 0 && payload_offset < pos)
+        {
+          /* At least part of the buffer contains data from payload.
+             Here the buffer holds the bytes [pos - sum, pos). */
+          int start_of_payload = payload_offset - (pos - sum);
+          if (start_of_payload <= 0)
+            /* All bytes in the buffer belong to the payload. */
+            start_of_payload = 0;
+
+          /* Process the payload part of the buffer. */
+          sha1_process_bytes (buffer + start_of_payload, sum - start_of_payload, &ctx_payload);
+        }
+    }
+
+  /* Construct result in desired memory.  */
+  sha1_finish_ctx (&ctx_block,   res_block);
+  if (payload_offset >= 0)
+    sha1_finish_ctx (&ctx_payload, res_payload);
+  free (buffer);
+  return 0;
+
+#undef BLOCKSIZE
+}
+
+/* Converts the SHA1 digest SHA1_DIGEST (SHA1_DIGEST_SIZE bytes) to a
+   newly allocated, NUL-terminated string of the form "sha1:DIGEST",
+   where DIGEST is the base32 encoding of the digest.
+   The caller must free the result.  Returns NULL if the string cannot
+   be allocated.  */
+static char *
+warc_base32_sha1_digest (char *sha1_digest)
+{
+  static const char prefix[] = "sha1:";
+  const size_t prefix_len = sizeof prefix - 1;
+  const size_t encoded_len = BASE32_LENGTH (SHA1_DIGEST_SIZE);
+
+  /* Room for the prefix, the encoded digest and the terminating NUL.  */
+  char *sha1_base32 = malloc (prefix_len + encoded_len + 1);
+  if (sha1_base32 == NULL)
+    return NULL;
+
+  memcpy (sha1_base32, prefix, prefix_len);
+  base32_encode (sha1_digest, SHA1_DIGEST_SIZE,
+                 sha1_base32 + prefix_len, encoded_len + 1);
+  sha1_base32[prefix_len + encoded_len] = '\0';
+  return sha1_base32;
+}
+
+
+/* Emits the digest headers for the record whose block is stored in FILE.
+   Nothing is written unless opt.warc_digests_enabled is set.  FILE is
+   rewound and read to compute the WARC-Block-Digest; when
+   payload_offset >= 0 a WARC-Payload-Digest, covering the data from
+   that offset onwards, is written as well.  */
+static void
+warc_write_digest_headers (FILE *file, long payload_offset)
+{
+  char block_sha1[SHA1_DIGEST_SIZE];
+  char payload_sha1[SHA1_DIGEST_SIZE];
+  char *encoded;
+
+  if (! opt.warc_digests_enabled)
+    return;
+
+  /* Hash from the start of the file.  A non-zero result means the
+     digests could not be computed, in which case no header is written.  */
+  rewind (file);
+  if (warc_sha1_stream_with_payload (file, block_sha1, payload_sha1, payload_offset) != 0)
+    return;
+
+  encoded = warc_base32_sha1_digest (block_sha1);
+  warc_write_header ("WARC-Block-Digest", encoded);
+  free (encoded);
+
+  /* The payload digest only exists when a payload offset was given.  */
+  if (payload_offset < 0)
+    return;
+
+  encoded = warc_base32_sha1_digest (payload_sha1);
+  warc_write_header ("WARC-Payload-Digest", encoded);
+  free (encoded);
+}
+
+
+/* Fills TIMESTAMP with the current UTC date and time, formatted
+   following ISO 8601 ("YYYY-MM-DDTHH:MM:SSZ"), as required for use in
+   the WARC-Date header.
+   The string is 20 characters long, so TIMESTAMP must have room for at
+   least 21 bytes including the terminating NUL.  If the time cannot be
+   converted or formatted, TIMESTAMP is set to the empty string.  */
+void
+warc_timestamp (char *timestamp)
+{
+  time_t rawtime = time (NULL);
+  struct tm *timeinfo = gmtime (&rawtime);
+
+  /* gmtime returns NULL when the time cannot be represented, and
+     strftime returns 0 (leaving the buffer contents unspecified) when
+     the result does not fit; never hand back an unterminated buffer.  */
+  if (timeinfo == NULL
+      || strftime (timestamp, 21, "%Y-%m-%dT%H:%M:%SZ", timeinfo) == 0)
+    timestamp[0] = '\0';
+}
+
+/* Fallback UUID generator: writes a UUID built from random numbers
+   (RFC 4122, UUID version 4) to UUID_STR in its usual textual form.
+   The methods provided by libuuid are the preferred source of UUIDs;
+   this function is only meant for builds without it.
+
+   UUID_STR receives 36 characters plus the terminating NUL.  */
+static void
+warc_uuid_random (char *uuid_str)
+{
+  unsigned char bytes[16];
+  int n = 0;
+
+  /* Start from 16 random octets.
+     NOTE(review): whether random_number (255) can ever yield 255
+     depends on whether its bound is inclusive -- confirm against its
+     definition.  */
+  while (n < 16)
+    bytes[n++] = random_number (255);
+
+  /* time_hi_and_version: replace the four most significant bits with
+     the version number, 4.  */
+  bytes[6] &= 0x0F;
+  bytes[6] |= 0x40;
+
+  /* clock_seq_hi_and_reserved: clear bit 6 and set bit 7.  */
+  bytes[8] &= 0xBF;
+  bytes[8] |= 0x80;
+
+  sprintf (uuid_str,
+    "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+    bytes[0], bytes[1], bytes[2], bytes[3],
+    bytes[4], bytes[5],
+    bytes[6], bytes[7],
+    bytes[8], bytes[9],
+    bytes[10], bytes[11], bytes[12], bytes[13], bytes[14], bytes[15]);
+}
+
+/* Writes a record identifier of the form "<urn:uuid:UUID>", as
+   required for the WARC-Record-Id header, to URN_STR.  The UUID comes
+   from libuuid when HAVE_LIBUUID is defined and from warc_uuid_random
+   otherwise.
+   URN_STR receives 47 characters plus the terminating NUL.  */
+void
+warc_uuid_str (char *urn_str)
+{
+  /* 36 characters of UUID text plus the terminating NUL.  */
+  char uuid_text[37];
+
+# ifdef HAVE_LIBUUID
+  uuid_t generated;
+  uuid_generate (generated);
+  uuid_unparse (generated, uuid_text);
+# else
+  warc_uuid_random (uuid_text);
+# endif
+
+  sprintf (urn_str, "<urn:uuid:%s>", uuid_text);
+}
+
+/* Writes a warcinfo record describing this Wget run to the current
+   WARC file.  FILENAME is the name of that file; only its basename is
+   recorded in the WARC-Filename header.
+   Stores the id of the new record in warc_current_warcinfo_uuid_str.
+   Returns true on success; false if memory or the temporary file for
+   the record body cannot be obtained, or if writing to the WARC file
+   failed.  */
+bool
+warc_write_warcinfo_record (char *filename)
+{
+  /* Write warc-info record as the first record of the file. */
+  /* We add the record id of this info record to the other records in the file. */
+  warc_current_warcinfo_uuid_str = malloc (48);
+  if (warc_current_warcinfo_uuid_str == NULL)
+    return false;
+  warc_uuid_str (warc_current_warcinfo_uuid_str);
+
+  char timestamp[22];
+  warc_timestamp (timestamp);
+
+  /* basename is given a private copy rather than the caller's string.  */
+  char *filename_copy = strdup (filename);
+  if (filename_copy == NULL)
+    return false;
+  char *filename_basename = basename (filename_copy);
+
+  /* Get the temporary file for the record body before any header is
+     written, so that a failure here does not leave a half-written
+     record in the WARC file.  */
+  FILE *warc_tmp = warc_tempfile ();
+  if (warc_tmp == NULL)
+    {
+      free (filename_copy);
+      return false;
+    }
+
+  warc_write_start_record ();
+  warc_write_header ("WARC-Type", "warcinfo");
+  warc_write_header ("Content-Type", "application/warc-fields");
+  warc_write_header ("WARC-Date", timestamp);
+  warc_write_header ("WARC-Record-ID", warc_current_warcinfo_uuid_str);
+  warc_write_header ("WARC-Filename", filename_basename);
+
+  /* Create content.  */
+  fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE);
+  fprintf (warc_tmp, "format: WARC File Format 1.0\r\n");
+  fprintf (warc_tmp, "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n");
+  fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off"));
+  fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring);
+  /* Add the user headers, if any. */
+  if (opt.warc_user_headers)
+    {
+      int i;
+      for (i = 0; opt.warc_user_headers[i]; i++)
+        fprintf (warc_tmp, "%s\r\n", opt.warc_user_headers[i]);
+    }
+  fprintf(warc_tmp, "\r\n");
+
+  /* No payload in a warcinfo record, hence the offset of -1.  */
+  warc_write_digest_headers (warc_tmp, -1);
+  warc_write_block_from_file (warc_tmp);
+  warc_write_end_record ();
+
+  if (! warc_write_ok)
+    {
+      logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
+    }
+
+  free (filename_copy);
+  fclose (warc_tmp);
+  return warc_write_ok;
+}
+
+/* Opens a new WARC file.
+   If META is true, generates a filename ending with 'meta.warc' or
+   'meta.warc.gz', depending on opt.warc_compression_enabled.
+
+   This method will:
+   1. close the current WARC file (if there is one);
+   2. increment warc_current_file_number;
+   3. open a new WARC file;
+   4. write the initial warcinfo record.
+
+   Returns true on success, false otherwise.  After a failure the
+   globals describing the previous file have been released and reset;
+   they never point at freed memory or a closed stream.
+   */
+static bool
+warc_start_new_file (bool meta)
+{
+  if (opt.warc_filename == NULL)
+    return false;
+
+  /* Release the state of the previous file and clear the globals right
+     away, so that an early return below cannot leave them dangling.  */
+  if (warc_current_file != NULL)
+    {
+      fclose (warc_current_file);
+      warc_current_file = NULL;
+    }
+  free (warc_current_warcinfo_uuid_str);
+  warc_current_warcinfo_uuid_str = NULL;
+  free (warc_current_filename);
+  warc_current_filename = NULL;
+
+  warc_current_file_number++;
+
+  const char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc");
+
+  /* Longest filename format:
+       base + "-" + serial number + "." + extension + NUL
+     The serial number is printed with at least 5 digits; the space
+     reserved for it (3 characters per byte of int, plus a sign) is
+     enough for any int value and also covers the "meta" variant.  */
+  size_t filename_size = strlen (opt.warc_filename)
+                         + 1 + (sizeof (int) * 3 + 1)
+                         + 1 + strlen (extension) + 1;
+  char *new_filename = malloc (filename_size);
+  if (new_filename == NULL)
+    return false;
+  warc_current_filename = new_filename;
+
+  /* If max size is enabled, we add a serial number to the file names. */
+  if (meta)
+    snprintf (new_filename, filename_size, "%s-meta.%s", opt.warc_filename, extension);
+  else if (opt.warc_maxsize > 0)
+    snprintf (new_filename, filename_size, "%s-%05d.%s", opt.warc_filename, warc_current_file_number, extension);
+  else
+    snprintf (new_filename, filename_size, "%s.%s", opt.warc_filename, extension);
+
+  logprintf (LOG_VERBOSE, _("Opening WARC file %s.\n\n"), quote (new_filename));
+
+  /* Open the WARC file. */
+  warc_current_file = fopen (new_filename, "wb+");
+  if (warc_current_file == NULL)
+    {
+      logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"), quote (new_filename));
+      return false;
+    }
+
+  if (! warc_write_warcinfo_record (new_filename))
+    return false;
+
+  /* Add warcinfo uuid to manifest. */
+  if (warc_manifest_fp)
+    fprintf (warc_manifest_fp, "%s\n", warc_current_warcinfo_uuid_str);
+
+  return true;
+}
+
+/* Opens the CDX output file -- opt.warc_filename with ".cdx" appended --
+   in append mode, stores the stream in warc_current_cdx_file and writes
+   the CDX header line to it.
+   Returns false if the file cannot be opened, true otherwise.  */
+static bool
+warc_start_cdx_file ()
+{
+  int base_len = strlen (opt.warc_filename);
+  char *path = alloca (base_len + 4 + 1);
+
+  /* path = opt.warc_filename + ".cdx"; the second copy brings the
+     terminating NUL along.  */
+  memcpy (path, opt.warc_filename, base_len);
+  memcpy (path + base_len, ".cdx", 5);
+
+  warc_current_cdx_file = fopen (path, "a+");
+  if (! warc_current_cdx_file)
+    return false;
+
+  /* The header names the columns of every following line:
+   *
+   * a - original url
+   * b - date
+   * m - mime type
+   * s - response code
+   * k - new style checksum
+   * r - redirect
+   * M - meta tags
+   * V - compressed arc file offset
+   * g - file name
+   * u - record-id
+   */
+  fputs (" CDX a b a m s k r M V g u\n", warc_current_cdx_file);
+  fflush (warc_current_cdx_file);
+
+  return true;
+}
+
+#define CDX_FIELDSEP " \t\r\n"
+
+/* Parses the CDX header line in LINEPTR (which is split in place) and
+   stores the column numbers of the original url ('a'), checksum ('k')
+   and record id ('u') fields through the three output pointers.  A
+   column that is not listed is reported as -1; if a letter is listed
+   more than once, the last occurrence wins.  Only the first character
+   of each field indicator is examined.
+   Returns true if all three columns were found.  */
+static bool
+warc_parse_cdx_header (char *lineptr, int *field_num_original_url, int *field_num_checksum, int *field_num_record_id)
+{
+  char *state;
+  char *word;
+  int column;
+
+  *field_num_original_url = -1;
+  *field_num_checksum = -1;
+  *field_num_record_id = -1;
+
+  /* The line has to start with the literal word "CDX"; the field
+     indicators that follow are numbered from 0.  */
+  word = strtok_r (lineptr, CDX_FIELDSEP, &state);
+  if (word != NULL && strcmp (word, "CDX") == 0)
+    {
+      for (column = 0;
+           (word = strtok_r (NULL, CDX_FIELDSEP, &state)) != NULL;
+           column++)
+        {
+          if (word[0] == 'a')
+            *field_num_original_url = column;
+          else if (word[0] == 'k')
+            *field_num_checksum = column;
+          else if (word[0] == 'u')
+            *field_num_record_id = column;
+        }
+    }
+
+  return !(*field_num_original_url == -1
+           || *field_num_checksum == -1
+           || *field_num_record_id == -1);
+}
+
+/* Parses the CDX record in LINEPTR and, if it is valid, adds it to the
+   warc_cdx_dedup_table hash table, keyed by the decoded SHA1 digest.
+   LINEPTR is split into fields in place.  The three FIELD_NUM_*
+   arguments give the column numbers of the original url, the checksum
+   and the record id.
+   A line is ignored if one of these columns is missing, if the
+   checksum does not decode to exactly SHA1_DIGEST_SIZE bytes, or if
+   memory for the record cannot be allocated.  */
+static void
+warc_process_cdx_line (char *lineptr, int field_num_original_url, int field_num_checksum, int field_num_record_id)
+{
+  /* These point into LINEPTR; copies are only made for a record that
+     is actually stored, so a rejected line cannot leak anything.  */
+  char *original_url = NULL;
+  char *checksum = NULL;
+  char *record_id = NULL;
+
+  char *token;
+  char *save_ptr;
+  int field_num = 0;
+
+  /* Read this line to get the fields we need. */
+  for (token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
+       token != NULL;
+       token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr))
+    {
+      if (field_num == field_num_original_url)
+        original_url = token;
+      else if (field_num == field_num_checksum)
+        checksum = token;
+      else if (field_num == field_num_record_id)
+        record_id = token;
+
+      field_num++;
+    }
+
+  if (original_url == NULL || checksum == NULL || record_id == NULL)
+    return;
+
+  /* For some extra efficiency, we decode the base32 encoded
+     checksum value.  This should produce exactly SHA1_DIGEST_SIZE
+     bytes.  */
+  size_t checksum_l = 0;
+  char *checksum_v = NULL;
+  base32_decode_alloc (checksum, strlen (checksum), &checksum_v, &checksum_l);
+
+  if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
+    {
+      /* This is a valid line with a valid checksum. */
+      struct warc_cdx_record *rec = malloc (sizeof *rec);
+      char *url_copy = strdup (original_url);
+      char *uuid_copy = strdup (record_id);
+
+      if (rec != NULL && url_copy != NULL && uuid_copy != NULL)
+        {
+          /* The table entry takes ownership of the record and of both
+             string copies.  */
+          rec->url = url_copy;
+          rec->uuid = uuid_copy;
+          memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
+          hash_table_put (warc_cdx_dedup_table, rec->digest, rec);
+        }
+      else
+        {
+          /* Out of memory: drop the line instead of storing a partial
+             record.  */
+          free (rec);
+          free (url_copy);
+          free (uuid_copy);
+        }
+    }
+
+  free (checksum_v);
+}
+
+/* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
+   the warc_cdx_dedup_table.
+   Returns false only if the file cannot be opened.  A file whose
+   header lacks one of the required columns is reported in the log and
+   leaves the table unset, but still yields true.  */
+bool
+warc_load_cdx_dedup_file ()
+{
+  FILE *f = fopen (opt.warc_cdx_dedup_filename, "r");
+  if (f == NULL)
+    return false;
+
+  int field_num_original_url = -1;
+  int field_num_checksum = -1;
+  int field_num_record_id = -1;
+
+  /* Line buffer managed by getline; released before returning.  */
+  char *lineptr = NULL;
+  size_t n = 0;
+  /* getline reports end of file and errors as -1, so its result has to
+     be kept in a signed type.  */
+  ssize_t line_length;
+
+  /* The first line should contain the CDX header.
+     Format:  " CDX x x x x x"
+     where x are field type indicators.  For our purposes, we only
+     need 'a' (the original url), 'k' (the SHA1 checksum) and
+     'u' (the WARC record id). */
+  line_length = getline (&lineptr, &n, f);
+  if (line_length != -1)
+    warc_parse_cdx_header (lineptr, &field_num_original_url, &field_num_checksum, &field_num_record_id);
+
+  /* If the file contains all three fields, read the complete file. */
+  if (field_num_original_url == -1
+      || field_num_checksum == -1
+      || field_num_record_id == -1)
+    {
+      if (field_num_original_url == -1)
+        logprintf (LOG_NOTQUIET, _("CDX file does not list original urls. (Missing column 'a'.)\n"));
+      if (field_num_checksum == -1)
+        logprintf (LOG_NOTQUIET, _("CDX file does not list checksums. (Missing column 'k'.)\n"));
+      if (field_num_record_id == -1)
+        logprintf (LOG_NOTQUIET, _("CDX file does not list record ids. (Missing column 'u'.)\n"));
+    }
+  else
+    {
+      /* Initialize the table. */
+      warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest, warc_cmp_sha1_digest);
+
+      while ((line_length = getline (&lineptr, &n, f)) != -1)
+        warc_process_cdx_line (lineptr, field_num_original_url, field_num_checksum, field_num_record_id);
+
+      /* Print results. */
+      int nrecords = hash_table_count (warc_cdx_dedup_table);
+      logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
+                                        "Loaded %d records from CDX.\n\n", nrecords),
+                              nrecords);
+    }
+
+  free (lineptr);
+  fclose (f);
+
+  return true;
+}
+#undef CDX_FIELDSEP
+
+/* Returns the existing duplicate CDX record for the given url and payload
+   digest.  Returns NULL if the url is not found or if the payload digest
+   does not match, or if CDX deduplication is disabled.
+   SHA1_DIGEST_PAYLOAD is the raw digest used as the lookup key of
+   warc_cdx_dedup_table.  */
+static struct warc_cdx_record *
+warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload)
+{
+  if (warc_cdx_dedup_table == NULL)
+    return NULL;
+
+  /* Start from NULL: nothing visible here guarantees that a failed
+     lookup stores anything through the output pointers, and the result
+     is tested right below.  */
+  char *key = NULL;
+  struct warc_cdx_record *rec_existing = NULL;
+  hash_table_get_pair (warc_cdx_dedup_table, sha1_digest_payload, &key, &rec_existing);
+
+  /* A matching digest only counts as a duplicate for the same url.  */
+  if (rec_existing != NULL && strcmp (rec_existing->url, url) == 0)
+    return rec_existing;
+  else
+    return NULL;
+}
+
+/* Sets up the WARC writer; call this before any WARC record is written.
+   warc_write_ok is always reset to true.  Everything else happens only
+   if opt.warc_filename is set: the CDX deduplication file is loaded (if
+   configured), the temporary manifest and (optionally) log files are
+   created, the first WARC file is started and, if enabled, the CDX
+   output file is opened.  Any failure is logged and ends the program
+   with exit status 1.  */
+void
+warc_init ()
+{
+  warc_write_ok = true;
+
+  /* Nothing more to do unless WARC output was requested.  */
+  if (opt.warc_filename == NULL)
+    return;
+
+  if (opt.warc_cdx_dedup_filename != NULL && ! warc_load_cdx_dedup_file ())
+    {
+      logprintf (LOG_NOTQUIET,
+                 _("Could not read CDX file %s for deduplication.\n"),
+                 quote (opt.warc_cdx_dedup_filename));
+      exit (1);
+    }
+
+  warc_manifest_fp = warc_tempfile ();
+  if (! warc_manifest_fp)
+    {
+      logprintf (LOG_NOTQUIET, _("Could not open temporary WARC manifest file.\n"));
+      exit (1);
+    }
+
+  if (opt.warc_keep_log)
+    {
+      warc_log_fp = warc_tempfile ();
+      if (! warc_log_fp)
+        {
+          logprintf (LOG_NOTQUIET, _("Could not open temporary WARC log file.\n"));
+          exit (1);
+        }
+      log_set_warc_log_fp (warc_log_fp);
+    }
+
+  /* warc_start_new_file increments the counter before using it.  */
+  warc_current_file_number = -1;
+  if (! warc_start_new_file (false))
+    {
+      logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
+      exit (1);
+    }
+
+  if (opt.warc_cdx_enabled && ! warc_start_cdx_file ())
+    {
+      logprintf (LOG_NOTQUIET, _("Could not open CDX file for output.\n"));
+      exit (1);
+    }
+}
+
+/* Writes metadata (manifest, configuration, log file) to the WARC file.
+   Three resource records are written: the manifest collected in
+   warc_manifest_fp, the Wget command-line arguments and, if
+   warc_log_fp is set, the log.  The arguments and log records refer to
+   the manifest record through WARC-Concurrent-To.
+   The temporary files are closed by writing them; the globals that
+   pointed at them are reset to NULL.  Failure to open the separate
+   metadata file or a temporary file ends the program with status 1.  */
+void
+warc_write_metadata ()
+{
+  /* If there are multiple WARC files, the metadata should be written to a separate file. */
+  if (opt.warc_maxsize > 0 && ! warc_start_new_file (true))
+    {
+      /* Without an open WARC file there is nowhere to write to.  */
+      logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
+      exit (1);
+    }
+
+  char manifest_uuid [48];
+  warc_uuid_str (manifest_uuid);
+
+  fflush (warc_manifest_fp);
+  warc_write_resource_record (manifest_uuid,
+                              "metadata://gnu.org/software/wget/warc/MANIFEST.txt",
+                              NULL, NULL, NULL, "text/plain",
+                              warc_manifest_fp, -1);
+  /* warc_write_resource_record has closed warc_manifest_fp. */
+  warc_manifest_fp = NULL;
+
+  FILE * warc_tmp_fp = warc_tempfile ();
+  if (warc_tmp_fp == NULL)
+    {
+      logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n"));
+      exit(1);
+    }
+  fprintf (warc_tmp_fp, "%s\n", program_argstring);
+  fflush (warc_tmp_fp);
+
+  /* Like the log record below, this record gets an id of its own and
+     names the manifest as its concurrent record; reusing manifest_uuid
+     as the record id would give two records the same WARC-Record-ID.  */
+  warc_write_resource_record (NULL,
+                              "metadata://gnu.org/software/wget/warc/wget_arguments.txt",
+                              NULL, manifest_uuid, NULL, "text/plain",
+                              warc_tmp_fp, -1);
+  /* warc_write_resource_record has closed warc_tmp_fp. */
+
+  if (warc_log_fp != NULL)
+    {
+      warc_write_resource_record (NULL,
+                                  "metadata://gnu.org/software/wget/warc/wget.log",
+                                  NULL, manifest_uuid, NULL, "text/plain",
+                                  warc_log_fp, -1);
+      /* warc_write_resource_record has closed warc_log_fp. */
+
+      warc_log_fp = NULL;
+      log_set_warc_log_fp (NULL);
+    }
+}
+
+/* Finishes the WARC writing.
+   This should be called at the end of the program.
+   If a WARC file is open, the metadata records are written to it
+   first.  Every stream closed and every buffer freed here has its
+   global reset to NULL, so nothing is left pointing at released
+   resources.  */
+void
+warc_close ()
+{
+  if (warc_current_file != NULL)
+    {
+      warc_write_metadata ();
+
+      free (warc_current_warcinfo_uuid_str);
+      warc_current_warcinfo_uuid_str = NULL;
+
+      /* warc_write_metadata may have replaced the current file by a
+         separate metadata file, so look at the pointer again.  */
+      if (warc_current_file != NULL)
+        {
+          fclose (warc_current_file);
+          warc_current_file = NULL;
+        }
+    }
+  if (warc_current_cdx_file != NULL)
+    {
+      fclose (warc_current_cdx_file);
+      warc_current_cdx_file = NULL;
+    }
+  if (warc_log_fp != NULL)
+    {
+      fclose (warc_log_fp);
+      warc_log_fp = NULL;
+      log_set_warc_log_fp (NULL);
+    }
+}
+
+/* Creates a temporary file for writing WARC output.
+   The temporary file will be created in opt.warc_tempdir.
+   The file is unlinked as soon as it has been created, so it is only
+   reachable through the returned stream.
+   Returns the pointer to the temporary file, or NULL.  No file
+   descriptor is left open when NULL is returned.  */
+FILE *
+warc_tempfile ()
+{
+  char filename[100];
+  if (path_search (filename, sizeof filename, opt.warc_tempdir, "wget", true) == -1)
+    return NULL;
+
+  int fd = mkstemp (filename);
+  if (fd < 0)
+    return NULL;
+
+  if (unlink (filename) < 0)
+    {
+      close (fd);
+      return NULL;
+    }
+
+  /* On failure fdopen leaves the descriptor open; close it ourselves.  */
+  FILE *fp = fdopen (fd, "wb+");
+  if (fp == NULL)
+    close (fd);
+  return fp;
+}
+
+
+/* Writes a request record to the WARC file.
+   url  is the target uri of the request,
+   timestamp_str  is the timestamp of the request (generated with warc_timestamp),
+   record_uuid  is the uuid of the request (generated with warc_uuid_str),
+   ip  is the ip address of the server (or NULL),
+   body  is a pointer to a file containing the request headers and body,
+   payload_offset  is the offset of the payload within body; it is passed
+                   on to warc_write_digest_headers, which only writes a
+                   payload digest when it is >= 0.
+   The record refers to the current warcinfo record through the
+   WARC-Warcinfo-ID header.
+   Calling this function will close body.
+   Returns true on success, false on error (the value of warc_write_ok). */
+bool
+warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, ip_address *ip, FILE *body, long int payload_offset)
+{
+  warc_write_start_record ();
+  warc_write_header ("WARC-Type", "request");
+  warc_write_header ("WARC-Target-URI", url);
+  warc_write_header ("Content-Type", "application/http;msgtype=request");
+  warc_write_date_header (timestamp_str);
+  warc_write_header ("WARC-Record-ID", record_uuid);
+  warc_write_ip_header (ip);
+  warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
+  warc_write_digest_headers (body, payload_offset);
+  warc_write_block_from_file (body);
+  warc_write_end_record ();
+  
+  fclose (body);
+
+  return warc_write_ok;
+}
+
+/* Writes a response record to the CDX file.
+   url  is the target uri of the request/response,
+   timestamp_str  is the timestamp of the request that generated this response,
+                  (generated with warc_timestamp),
+   mime_type  is the mime type of the response body (will be printed to CDX),
+   response_code  is the HTTP response code (will be printed to CDX),
+   payload_digest  is the sha1 digest of the payload in "sha1:DIGEST"
+                   form, or NULL,
+   redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
+   offset  is the position of the WARC record in the WARC file,
+   warc_filename  is the filename of the WARC,
+   response_uuid  is the uuid of the response.
+   Missing or empty mime_type, payload_digest and redirect_location
+   values are printed as "-".
+   Returns true on success, false if writing to the CDX file failed. */
+static bool
+warc_write_cdx_record (char *url, char *timestamp_str, char *mime_type, int response_code, char *payload_digest, char *redirect_location, size_t offset, char *warc_filename, char *response_uuid)
+{
+  /* Transform the timestamp from "YYYY-mm-ddTHH:MM:SSZ" to
+     "YYYYmmddHHMMSS" by copying the digit groups and skipping the
+     separator that follows each of them.  */
+  char timestamp_str_cdx [15];
+  memcpy (timestamp_str_cdx     , timestamp_str     , 4); /* "YYYY" "-" */
+  memcpy (timestamp_str_cdx +  4, timestamp_str +  5, 2); /* "mm"   "-" */
+  memcpy (timestamp_str_cdx +  6, timestamp_str +  8, 2); /* "dd"   "T" */
+  memcpy (timestamp_str_cdx +  8, timestamp_str + 11, 2); /* "HH"   ":" */
+  memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM"   ":" */
+  memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS"   "Z" */
+  timestamp_str_cdx[14] = '\0';
+
+  /* Rewrite the checksum. */
+  char *checksum;
+  if (payload_digest != NULL)
+    checksum = payload_digest + 5; /* Skip the "sha1:" */
+  else
+    checksum = "-";
+
+  if (mime_type == NULL || mime_type[0] == '\0')
+    mime_type = "-";
+  if (redirect_location == NULL || redirect_location[0] == '\0')
+    redirect_location = "-";
+
+  /* Print the CDX line.  OFFSET is a size_t, so it is converted
+     explicitly to the type that %ld expects.  */
+  if (fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %ld %s %s\n",
+               url, timestamp_str_cdx, url, mime_type, response_code,
+               checksum, redirect_location, (long int) offset,
+               warc_filename, response_uuid) < 0)
+    return false;
+  if (fflush (warc_current_cdx_file) != 0)
+    return false;
+
+  return true;
+}
+
+/* Writes a revisit record to the WARC file.
+   url  is the target uri of the request/response,
+   timestamp_str  is the timestamp of the request that generated this response
+                  (generated with warc_timestamp),
+   concurrent_to_uuid  is the uuid of the request that generated this response
+                 (generated with warc_uuid_str),
+   payload_digest  is the sha1 digest of the payload,
+   refers_to  is the uuid of the original response
+                 (generated with warc_uuid_str),
+   ip  is the ip address of the server (or NULL),
+   body  is a pointer to a file containing the response headers (without payload).
+   The block digest is computed over the whole of body.
+   Calling this function will close body.
+   Returns true on success, false on error. */
+static bool
+warc_write_revisit_record (char *url, char *timestamp_str, char *concurrent_to_uuid, char *payload_digest, char *refers_to, ip_address *ip, FILE *body)
+{
+  char revisit_uuid [48];
+  warc_uuid_str (revisit_uuid);
+
+  /* The caller has already read BODY to its end while computing the
+     payload digest, so hashing has to start over from the beginning;
+     otherwise the block digest would cover no data at all.  If hashing
+     fails, the header is passed a NULL value, as the response record
+     does when digests are disabled.  */
+  char *block_digest = NULL;
+  char sha1_res_block[SHA1_DIGEST_SIZE];
+  rewind (body);
+  if (sha1_stream (body, sha1_res_block) == 0)
+    block_digest = warc_base32_sha1_digest (sha1_res_block);
+
+  warc_write_start_record ();
+  warc_write_header ("WARC-Type", "revisit");
+  warc_write_header ("WARC-Record-ID", revisit_uuid);
+  warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
+  warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
+  warc_write_header ("WARC-Refers-To", refers_to);
+  warc_write_header ("WARC-Profile", "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest");
+  warc_write_header ("WARC-Truncated", "length");
+  warc_write_header ("WARC-Target-URI", url);
+  warc_write_date_header (timestamp_str);
+  warc_write_ip_header (ip);
+  warc_write_header ("Content-Type", "application/http;msgtype=response");
+  warc_write_header ("WARC-Block-Digest", block_digest);
+  warc_write_header ("WARC-Payload-Digest", payload_digest);
+  warc_write_block_from_file (body);
+  warc_write_end_record ();
+
+  fclose (body);
+  free (block_digest);
+
+  return warc_write_ok;
+}
+
+/* Writes a response record to the WARC file.
+   url  is the target uri of the request/response,
+   timestamp_str  is the timestamp of the request that generated this response
+                  (generated with warc_timestamp),
+   concurrent_to_uuid  is the uuid of the request that generated this response
+                 (generated with warc_uuid_str),
+   ip  is the ip address of the server (or NULL),
+   body  is a pointer to a file containing the response headers and body.
+   payload_offset  is the offset of the payload within body, or a
+                   negative value if there is no payload,
+   mime_type  is the mime type of the response body (will be printed to CDX),
+   response_code  is the HTTP response code (will be printed to CDX),
+   redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
+   If digests are enabled and the CDX deduplication table already holds
+   a record with the same url and payload digest, a revisit record
+   without the payload is written instead.
+   Calling this function will close body, also when it fails.
+   Returns true on success, false on error. */
+bool
+warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, long int payload_offset, char *mime_type, int response_code, char *redirect_location)
+{
+  char *block_digest = NULL;
+  char *payload_digest = NULL;
+  char sha1_res_block[SHA1_DIGEST_SIZE];
+  char sha1_res_payload[SHA1_DIGEST_SIZE];
+
+  if (opt.warc_digests_enabled)
+    {
+      /* Calculate the block and payload digests. */
+      rewind (body);
+      if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload, payload_offset) == 0)
+        {
+          /* sha1_res_payload is only filled in when a payload offset
+             was given; it must not be read otherwise.  */
+          bool have_payload = (payload_offset >= 0);
+
+          /* Decide (based on url + payload digest) if we have seen this
+             data before. */
+          struct warc_cdx_record *rec_existing = NULL;
+          if (have_payload)
+            rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload);
+          if (rec_existing != NULL)
+            {
+              /* Found an existing record. */
+              logprintf (LOG_VERBOSE, _("Found exact match in CDX file. Saving revisit record to WARC.\n"));
+
+              /* Remove the payload from the file. */
+              if (payload_offset > 0)
+                {
+                  if (ftruncate (fileno (body), payload_offset) == -1)
+                    {
+                      /* Keep the promise that body gets closed.  */
+                      fclose (body);
+                      return false;
+                    }
+                }
+
+              /* Send the original payload digest. */
+              payload_digest = warc_base32_sha1_digest (sha1_res_payload);
+              bool result = warc_write_revisit_record (url, timestamp_str, concurrent_to_uuid, payload_digest, rec_existing->uuid, ip, body);
+              free (payload_digest);
+
+              return result;
+            }
+
+          block_digest = warc_base32_sha1_digest (sha1_res_block);
+          if (have_payload)
+            payload_digest = warc_base32_sha1_digest (sha1_res_payload);
+        }
+    }
+
+  /* Not a revisit, just store the record. */
+
+  char response_uuid [48];
+  warc_uuid_str (response_uuid);
+
+  /* Remember where this record starts; the CDX line points there.  */
+  fseek (warc_current_file, 0L, SEEK_END);
+  size_t offset = ftell (warc_current_file);
+
+  warc_write_start_record ();
+  warc_write_header ("WARC-Type", "response");
+  warc_write_header ("WARC-Record-ID", response_uuid);
+  warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
+  warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
+  warc_write_header ("WARC-Target-URI", url);
+  warc_write_date_header (timestamp_str);
+  warc_write_ip_header (ip);
+  warc_write_header ("WARC-Block-Digest", block_digest);
+  warc_write_header ("WARC-Payload-Digest", payload_digest);
+  warc_write_header ("Content-Type", "application/http;msgtype=response");
+  warc_write_block_from_file (body);
+  warc_write_end_record ();
+
+  fclose (body);
+
+  if (warc_write_ok && opt.warc_cdx_enabled)
+    {
+      /* Add this record to the CDX. */
+      warc_write_cdx_record (url, timestamp_str, mime_type, response_code, payload_digest, redirect_location, offset, warc_current_filename, response_uuid);
+    }
+
+  free (block_digest);
+  free (payload_digest);
+
+  return warc_write_ok;
+}
+
+/* Writes a resource record to the WARC file.
+   resource_uuid  is the uuid of the resource; if NULL, a new one is
+                  generated with warc_uuid_str,
+   url  is the target uri of the resource,
+   timestamp_str  is the timestamp (generated with warc_timestamp),
+   concurrent_to_uuid  is the uuid of the request that generated this
+                 resource (generated with warc_uuid_str) or NULL,
+   ip  is the ip address of the server (or NULL),
+   content_type  is the mime type of the body; if NULL,
+                 "application/octet-stream" is used,
+   body  is a pointer to a file containing the resource data,
+   payload_offset  is handed to warc_write_digest_headers together with
+                   body.
+   Calling this function will close body.
+   Returns true on success, false on error. */
+bool
+warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, long int payload_offset)
+{
+  /* Backing storage for a generated record id; it has to live until
+     the function returns.  */
+  char generated_uuid[48];
+
+  if (! resource_uuid)
+    {
+      warc_uuid_str (generated_uuid);
+      resource_uuid = generated_uuid;
+    }
+
+  if (! content_type)
+    content_type = "application/octet-stream";
+
+  warc_write_start_record ();
+  warc_write_header ("WARC-Type", "resource");
+  warc_write_header ("WARC-Record-ID", resource_uuid);
+  warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
+  warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
+  warc_write_header ("WARC-Target-URI", url);
+  warc_write_date_header (timestamp_str);
+  warc_write_ip_header (ip);
+  warc_write_digest_headers (body, payload_offset);
+  warc_write_header ("Content-Type", content_type);
+  warc_write_block_from_file (body);
+  warc_write_end_record ();
+
+  fclose (body);
+
+  return warc_write_ok;
+}
+
diff --git a/src/warc.h b/src/warc.h

new file mode 100644 (file)

index 0000000..2ade2a8
--- /dev/null
+++ b/src/warc.h
@@ -0,0 +1,19 @@
+/* Declarations of WARC helper methods. */
+#ifndef WARC_H
+#define WARC_H
+
+#include "host.h"
+
+void warc_init ();
+void warc_close ();
+void warc_timestamp (char *timestamp);
+void warc_uuid_str (char *id_str);
+
+FILE * warc_tempfile ();
+
+bool warc_write_request_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, long int payload_offset);
+bool warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, long int payload_offset, char *mime_type, int response_code, char *redirect_location);
+bool warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, long int payload_offset);
+
+#endif /* WARC_H */
+
diff --git a/src/wget.h b/src/wget.h

index c7c5e2cb1dd0ebd4c09419569fe174c5c199c81d..ee315b6f81aab54c806250040f0218073cc0f40e 100644 (file)
--- a/src/wget.h
+++ b/src/wget.h
@@ -353,7 +353,9 @@ typedef enum
    PROXERR,
    /* 50  */
    AUTHFAILED, QUOTEXC, WRITEFAILED, SSLINITFAILED, VERIFCERTERR,
-  UNLINKERR, NEWLOCATION_KEEP_POST
+  UNLINKERR, NEWLOCATION_KEEP_POST,
+
+  WARC_ERR, WARC_TMP_FOPENERR, WARC_TMP_FWRITEERR
  } uerr_t;
  
  /* 2005-02-19 SMS.
author	Gijs van Tulder <gvtulder@gmail.com>
	Fri, 4 Nov 2011 21:25:00 +0000 (22:25 +0100)
committer	Giuseppe Scrivano <gscrivano@gnu.org>
	Fri, 4 Nov 2011 21:25:00 +0000 (22:25 +0100)
bootstrap.conf		patch \| blob \| history
configure.ac		patch \| blob \| history
src/ChangeLog		patch \| blob \| history
src/Makefile.am		patch \| blob \| history
src/ftp.c		patch \| blob \| history
src/http.c		patch \| blob \| history
src/init.c		patch \| blob \| history
src/log.c		patch \| blob \| history
src/log.h		patch \| blob \| history
src/main.c		patch \| blob \| history
src/options.h		patch \| blob \| history
src/retr.c		patch \| blob \| history
src/retr.h		patch \| blob \| history
src/test.c		patch \| blob \| history
src/warc.c	[new file with mode: 0644]	patch \| blob
src/warc.h	[new file with mode: 0644]	patch \| blob
src/wget.h		patch \| blob \| history