warc: support large files.

[wget] / src / warc.c
diff --git a/src/warc.c b/src/warc.c

index 77ef3692cd358757b0bf9d50600d3324a57eeb67..1c1e0797e0724d3aa13dd5ab2fc2bc6852f543ef 100644 (file)
--- a/src/warc.c
+++ b/src/warc.c
@@ -1,4 +1,32 @@
-/* Utility functions for writing WARC files. */
+/* Utility functions for writing WARC files.
+   Copyright (C) 2011, 2012 Free Software Foundation, Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at
+your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget.  If not, see <http://www.gnu.org/licenses/>.
+
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work.  */
+
  #define _GNU_SOURCE
  
  #include "wget.h"
@@ -14,11 +42,17 @@
  #include <sha1.h>
  #include <base32.h>
  #include <unistd.h>
+#ifdef HAVE_LIBZ
  #include <zlib.h>
+#endif
  #ifdef HAVE_LIBUUID
  #include <uuid/uuid.h>
  #endif
  
+#ifndef WINDOWS
+#include <libgen.h>
+#endif
+
  #include "warc.h"
  
  extern char *version_string;
@@ -38,15 +72,17 @@ static FILE *warc_manifest_fp;
  /* The current WARC file (or NULL, if WARC is disabled). */
  static FILE *warc_current_file;
  
+#ifdef HAVE_LIBZ
  /* The gzip stream for the current WARC file
     (or NULL, if WARC or gzip is disabled). */
  static gzFile *warc_current_gzfile;
  
  /* The offset of the current gzip record in the WARC file. */
-static size_t warc_current_gzfile_offset;
+static off_t warc_current_gzfile_offset;
  
  /* The uncompressed size (so far) of the current record. */
-static size_t warc_current_gzfile_uncompressed_size;
+static off_t warc_current_gzfile_uncompressed_size;
+# endif
  
  /* This is true until a warc_write_* method fails. */
  static bool warc_write_ok;
@@ -101,12 +137,14 @@ warc_cmp_sha1_digest (const void *digest1, const void *digest2)
  static size_t
  warc_write_buffer (const char *buffer, size_t size)
  {
+#ifdef HAVE_LIBZ
    if (warc_current_gzfile)
      {
        warc_current_gzfile_uncompressed_size += size;
        return gzwrite (warc_current_gzfile, buffer, size);
      }
    else
+#endif
      return fwrite (buffer, 1, size, warc_current_file);
  }
  
@@ -148,14 +186,15 @@ warc_write_start_record ()
      return false;
  
    fflush (warc_current_file);
-  if (opt.warc_maxsize > 0 && ftell (warc_current_file) >= opt.warc_maxsize)
+  if (opt.warc_maxsize > 0 && ftello (warc_current_file) >= opt.warc_maxsize)
      warc_start_new_file (false);
  
+#ifdef HAVE_LIBZ
    /* Start a GZIP stream, if required. */
    if (opt.warc_compression_enabled)
      {
        /* Record the starting offset of the new record. */
-      warc_current_gzfile_offset = ftell (warc_current_file);
+      warc_current_gzfile_offset = ftello (warc_current_file);
  
        /* Reserve space for the extra GZIP header field.
           In warc_write_end_record we will fill this space
@@ -165,7 +204,7 @@ warc_write_start_record ()
        fflush (warc_current_file);
  
        /* Start a new GZIP stream. */
-      warc_current_gzfile = gzdopen (dup (fileno (warc_current_file)), "wb+9");
+      warc_current_gzfile = gzdopen (dup (fileno (warc_current_file)), "wb9");
        warc_current_gzfile_uncompressed_size = 0;
  
        if (warc_current_gzfile == NULL)
@@ -175,6 +214,7 @@ warc_write_start_record ()
            return false;
          }
      }
+#endif
  
    warc_write_string ("WARC/1.0\r\n");
    return warc_write_ok;
@@ -205,8 +245,8 @@ warc_write_block_from_file (FILE *data_in)
  {
    /* Add the Content-Length header. */
    char *content_length;
-  fseek (data_in, 0L, SEEK_END);
-  if (! asprintf (&content_length, "%ld", ftell (data_in)))
+  fseeko (data_in, 0L, SEEK_END);
+  if (! asprintf (&content_length, "%ld", ftello (data_in)))
      {
        warc_write_ok = false;
        return false;
@@ -217,7 +257,7 @@ warc_write_block_from_file (FILE *data_in)
    /* End of the WARC header section. */
    warc_write_string ("\r\n");
  
-  if (fseek (data_in, 0L, SEEK_SET) != 0)
+  if (fseeko (data_in, 0L, SEEK_SET) != 0)
      warc_write_ok = false;
  
    /* Copy the data in the file to the WARC record. */
@@ -243,6 +283,7 @@ warc_write_end_record ()
  {
    warc_write_buffer ("\r\n\r\n", 4);
  
+#ifdef HAVE_LIBZ
    /* We start a new gzip stream for each record.  */
    if (warc_write_ok && warc_current_gzfile)
      {
@@ -253,7 +294,7 @@ warc_write_end_record ()
          }
  
        fflush (warc_current_file);
-      fseek (warc_current_file, 0, SEEK_END);
+      fseeko (warc_current_file, 0, SEEK_END);
  
        /* The WARC standard suggests that we add 'skip length' data in the
           extra header field of the GZIP stream.
@@ -271,12 +312,12 @@ warc_write_end_record ()
        */
  
        /* Calculate the uncompressed and compressed sizes. */
-      size_t current_offset = ftell (warc_current_file);
-      size_t uncompressed_size = current_offset - warc_current_gzfile_offset;
-      size_t compressed_size = warc_current_gzfile_uncompressed_size;
+      off_t current_offset = ftello (warc_current_file);
+      off_t uncompressed_size = current_offset - warc_current_gzfile_offset;
+      off_t compressed_size = warc_current_gzfile_uncompressed_size;
  
        /* Go back to the static GZIP header. */
-      fseek (warc_current_file, warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
+      fseeko (warc_current_file, warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
  
        /* Read the header. */
        char static_header[GZIP_STATIC_HEADER_SIZE];
@@ -291,7 +332,7 @@ warc_write_end_record ()
        static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
  
        /* Write the header back to the file, but starting at warc_current_gzfile_offset. */
-      fseek (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
+      fseeko (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
        fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
  
        /* Prepare the extra GZIP header. */
@@ -314,13 +355,14 @@ warc_write_end_record ()
        extra_header[11] = (compressed_size >> 24) & 255;
  
        /* Write the extra header after the static header. */
-      fseek (warc_current_file, warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
+      fseeko (warc_current_file, warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
        fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
  
        /* Done, move back to the end of the file. */
        fflush (warc_current_file);
-      fseek (warc_current_file, 0, SEEK_END);
+      fseeko (warc_current_file, 0, SEEK_END);
      }
+#endif /* HAVE_LIBZ */
  
    return warc_write_ok;
  }
@@ -366,14 +408,14 @@ warc_write_ip_header (ip_address *ip)
     the end of the file.  The digest number will be written into the
     16 bytes beginning at RES_PAYLOAD.  */
  static int
-warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, long int payload_offset)
+warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, off_t payload_offset)
  {
  #define BLOCKSIZE 32768
  
    struct sha1_ctx ctx_block;
    struct sha1_ctx ctx_payload;
-  long int pos;
-  size_t sum;
+  off_t pos;
+  off_t sum;
  
    char *buffer = malloc (BLOCKSIZE + 72);
    if (!buffer)
@@ -392,7 +434,7 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
        /* We read the file in blocks of BLOCKSIZE bytes.  One call of the
           computation function processes the whole buffer so that with the
           next round of the loop another block can be read.  */
-      size_t n;
+      off_t n;
        sum = 0;
  
        /* Read block.  Take care for partial reads.  */
@@ -433,7 +475,7 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
        if (payload_offset >= 0 && payload_offset < pos)
          {
            /* At least part of the buffer contains data from payload. */
-          int start_of_payload = payload_offset - (pos - BLOCKSIZE);
+          off_t start_of_payload = payload_offset - (pos - BLOCKSIZE);
            if (start_of_payload <= 0)
              /* All bytes in the buffer belong to the payload. */
              start_of_payload = 0;
@@ -457,7 +499,7 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
        if (payload_offset >= 0 && payload_offset < pos)
          {
            /* At least part of the buffer contains data from payload. */
-          int start_of_payload = payload_offset - (pos - sum);
+          off_t start_of_payload = payload_offset - (pos - sum);
            if (start_of_payload <= 0)
              /* All bytes in the buffer belong to the payload. */
              start_of_payload = 0;
@@ -605,7 +647,7 @@ warc_write_warcinfo_record (char *filename)
  
    char *filename_copy, *filename_basename;
    filename_copy = strdup (filename);
-  filename_basename = basename (filename_copy);
+  filename_basename = strdup (basename (filename_copy));
  
    warc_write_start_record ();
    warc_write_header ("WARC-Type", "warcinfo");
@@ -619,6 +661,7 @@ warc_write_warcinfo_record (char *filename)
    if (warc_tmp == NULL)
      {
        free (filename_copy);
+      free (filename_basename);
        return false;
      }
  
@@ -646,6 +689,7 @@ warc_write_warcinfo_record (char *filename)
      }
  
    free (filename_copy);
+  free (filename_basename);
    fclose (warc_tmp);
    return warc_write_ok;
  }
@@ -681,7 +725,11 @@ warc_start_new_file (bool meta)
    char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
    warc_current_filename = new_filename;
  
+#ifdef HAVE_LIBZ
    char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc");
+#else
+  char *extension = "warc";
+#endif
  
    /* If max size is enabled, we add a serial number to the file names. */
    if (meta)
@@ -1086,7 +1134,7 @@ warc_tempfile ()
     Calling this function will close body.
     Returns true on success, false on error. */
  bool
-warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, ip_address *ip, FILE *body, long int payload_offset)
+warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, ip_address *ip, FILE *body, off_t payload_offset)
  {
    warc_write_start_record ();
    warc_write_header ("WARC-Type", "request");
@@ -1118,7 +1166,7 @@ warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, ip
     response_uuid  is the uuid of the response.
     Returns true on success, false on error. */
  static bool
-warc_write_cdx_record (char *url, char *timestamp_str, char *mime_type, int response_code, char *payload_digest, char *redirect_location, size_t offset, char *warc_filename, char *response_uuid)
+warc_write_cdx_record (char *url, char *timestamp_str, char *mime_type, int response_code, char *payload_digest, char *redirect_location, off_t offset, char *warc_filename, char *response_uuid)
  {
    /* Transform the timestamp. */
    char timestamp_str_cdx [15];
@@ -1210,7 +1258,7 @@ warc_write_revisit_record (char *url, char *timestamp_str, char *concurrent_to_u
     Calling this function will close body.
     Returns true on success, false on error. */
  bool
-warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, long int payload_offset, char *mime_type, int response_code, char *redirect_location)
+warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, off_t payload_offset, char *mime_type, int response_code, char *redirect_location)
  {
    char *block_digest = NULL;
    char *payload_digest = NULL;
@@ -1256,8 +1304,8 @@ warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_
    char response_uuid [48];
    warc_uuid_str (response_uuid);
  
-  fseek (warc_current_file, 0L, SEEK_END);
-  size_t offset = ftell (warc_current_file);
+  fseeko (warc_current_file, 0L, SEEK_END);
+  off_t offset = ftello (warc_current_file);
  
    warc_write_start_record ();
    warc_write_header ("WARC-Type", "response");
@@ -1301,7 +1349,7 @@ warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_
     Calling this function will close body.
     Returns true on success, false on error. */
  bool
-warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, long int payload_offset)
+warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, off_t payload_offset)
  {
    if (resource_uuid == NULL)
      {