sjero.net Git - wget/blob - src/warc.c

   1 /* Utility functions for writing WARC files.
   2    Copyright (C) 2011, 2012 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  18
  19 Additional permission under GNU GPL version 3 section 7
  20
  21 If you modify this program, or any covered work, by linking or
  22 combining it with the OpenSSL project's OpenSSL library (or a
  23 modified version of that library), containing parts covered by the
  24 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  25 grants you additional permission to convey the resulting work.
  26 Corresponding Source for a non-source form of such a combination
  27 shall include the source code for the parts of OpenSSL used as well
  28 as that of the covered work.  */
  29
  30 #define _GNU_SOURCE
  31
  32 #include "wget.h"
  33 #include "hash.h"
  34 #include "utils.h"
  35
  36 #include <stdio.h>
  37 #include <stdlib.h>
  38 #include <string.h>
  39 #include <strings.h>
  40 #include <time.h>
  41 #include <tmpdir.h>
  42 #include <sha1.h>
  43 #include <base32.h>
  44 #include <unistd.h>
  45 #ifdef HAVE_LIBZ
  46 #include <zlib.h>
  47 #endif
  48 #ifdef HAVE_LIBUUID
  49 #include <uuid/uuid.h>
  50 #endif
  51
  52 #ifndef WINDOWS
  53 #include <libgen.h>
  54 #endif
  55
  56 #include "warc.h"
  57
  58 extern char *version_string;
  59
  60 /* Set by main in main.c */
  61 extern char *program_argstring;
  62
  63
  64 /* The log file (a temporary file that contains a copy
  65    of the wget log). */
  66 static FILE *warc_log_fp;
  67
  68 /* The manifest file (a temporary file that contains the
  69    warcinfo uuid of every file in this crawl). */
  70 static FILE *warc_manifest_fp;
  71
  72 /* The current WARC file (or NULL, if WARC is disabled). */
  73 static FILE *warc_current_file;
  74
  75 #ifdef HAVE_LIBZ
  76 /* The gzip stream for the current WARC file
  77    (or NULL, if WARC or gzip is disabled). */
  78 static gzFile warc_current_gzfile;
  79
  80 /* The offset of the current gzip record in the WARC file. */
  81 static off_t warc_current_gzfile_offset;
  82
  83 /* The uncompressed size (so far) of the current record. */
  84 static off_t warc_current_gzfile_uncompressed_size;
  85 # endif
  86
  87 /* This is true until a warc_write_* method fails. */
  88 static bool warc_write_ok;
  89
  90 /* The current CDX file (or NULL, if CDX is disabled). */
  91 static FILE *warc_current_cdx_file;
  92
  93 /* The record id of the warcinfo record of the current WARC file.  */
  94 static char *warc_current_warcinfo_uuid_str;
  95
  96 /* The file name of the current WARC file. */
  97 static char *warc_current_filename;
  98
  99 /* The serial number of the current WARC file.  This number is
 100    incremented each time a new file is opened and is used in the
 101    WARC file's filename. */
 102 static int warc_current_file_number;
 103
 104 /* The table of CDX records, if deduplication is enabled. */
 105 struct hash_table * warc_cdx_dedup_table;
 106
 107 static bool warc_start_new_file (bool meta);
 108
 109
/* One entry of the CDX deduplication table (warc_cdx_dedup_table).
   Entries are created by warc_process_cdx_line.  */
struct warc_cdx_record
{
  /* Heap-allocated copy of the original-url field of the CDX line.  */
  char *url;
  /* Heap-allocated copy of the record-id field of the CDX line.  */
  char *uuid;
  /* The base32-decoded checksum field; also used as the hash table key.  */
  char digest[SHA1_DIGEST_SIZE];
};
 116
 117 static unsigned long
 118 warc_hash_sha1_digest (const void *key)
 119 {
 120   /* We just use some of the first bytes of the digest. */
 121   unsigned long v = 0;
 122   memcpy (&v, key, sizeof (unsigned long));
 123   return v;
 124 }
 125
 126 static int
 127 warc_cmp_sha1_digest (const void *digest1, const void *digest2)
 128 {
 129   return !memcmp (digest1, digest2, SHA1_DIGEST_SIZE);
 130 }
 131
 132
 133
 134 /* Writes SIZE bytes from BUFFER to the current WARC file,
 135    through gzwrite if compression is enabled.
 136    Returns the number of uncompressed bytes written.  */
 137 static size_t
 138 warc_write_buffer (const char *buffer, size_t size)
 139 {
 140 #ifdef HAVE_LIBZ
 141   if (warc_current_gzfile)
 142     {
 143       warc_current_gzfile_uncompressed_size += size;
 144       return gzwrite (warc_current_gzfile, buffer, size);
 145     }
 146   else
 147 #endif
 148     return fwrite (buffer, 1, size, warc_current_file);
 149 }
 150
 151 /* Writes STR to the current WARC file.
 152    Returns false and set warc_write_ok to false if there
 153    is an error.  */
 154 static bool
 155 warc_write_string (const char *str)
 156 {
 157   if (!warc_write_ok)
 158     return false;
 159
 160   size_t n = strlen (str);
 161   if (n != warc_write_buffer (str, n))
 162     warc_write_ok = false;
 163
 164   return warc_write_ok;
 165 }
 166
 167
 168 #define EXTRA_GZIP_HEADER_SIZE 12
 169 #define GZIP_STATIC_HEADER_SIZE  10
 170 #define FLG_FEXTRA          0x04
 171 #define OFF_FLG             3
 172
 173 /* Starts a new WARC record.  Writes the version header.
 174    If opt.warc_maxsize is set and the current file is becoming
 175    too large, this will open a new WARC file.
 176
 177    If compression is enabled, this will start a new
 178    gzip stream in the current WARC file.
 179
 180    Returns false and set warc_write_ok to false if there
 181    is an error.  */
 182 static bool
 183 warc_write_start_record (void)
 184 {
 185   if (!warc_write_ok)
 186     return false;
 187
 188   fflush (warc_current_file);
 189   if (opt.warc_maxsize > 0 && ftello (warc_current_file) >= opt.warc_maxsize)
 190     warc_start_new_file (false);
 191
 192 #ifdef HAVE_LIBZ
 193   /* Start a GZIP stream, if required. */
 194   if (opt.warc_compression_enabled)
 195     {
 196       /* Record the starting offset of the new record. */
 197       warc_current_gzfile_offset = ftello (warc_current_file);
 198
 199       /* Reserve space for the extra GZIP header field.
 200          In warc_write_end_record we will fill this space
 201          with information about the uncompressed and
 202          compressed size of the record. */
 203       fprintf (warc_current_file, "XXXXXXXXXXXX");
 204       fflush (warc_current_file);
 205
 206       /* Start a new GZIP stream. */
 207       warc_current_gzfile = gzdopen (dup (fileno (warc_current_file)), "wb9");
 208       warc_current_gzfile_uncompressed_size = 0;
 209
 210       if (warc_current_gzfile == NULL)
 211         {
 212           logprintf (LOG_NOTQUIET,
 213 _("Error opening GZIP stream to WARC file.\n"));
 214           warc_write_ok = false;
 215           return false;
 216         }
 217     }
 218 #endif
 219
 220   warc_write_string ("WARC/1.0\r\n");
 221   return warc_write_ok;
 222 }
 223
 224 /* Writes a WARC header to the current WARC record.
 225    This method may be run after warc_write_start_record and
 226    before warc_write_block_from_file.  */
 227 static bool
 228 warc_write_header (const char *name, const char *value)
 229 {
 230   if (value)
 231     {
 232       warc_write_string (name);
 233       warc_write_string (": ");
 234       warc_write_string (value);
 235       warc_write_string ("\r\n");
 236     }
 237   return warc_write_ok;
 238 }
 239
 240 /* Copies the contents of DATA_IN to the WARC record.
 241    Adds a Content-Length header to the WARC record.
 242    Run this method after warc_write_header,
 243    then run warc_write_end_record. */
 244 static bool
 245 warc_write_block_from_file (FILE *data_in)
 246 {
 247   /* Add the Content-Length header. */
 248   char content_length[22];
 249   fseeko (data_in, 0L, SEEK_END);
 250   number_to_string (content_length, ftello (data_in));
 251   warc_write_header ("Content-Length", content_length);
 252
 253   /* End of the WARC header section. */
 254   warc_write_string ("\r\n");
 255
 256   if (fseeko (data_in, 0L, SEEK_SET) != 0)
 257     warc_write_ok = false;
 258
 259   /* Copy the data in the file to the WARC record. */
 260   char buffer[BUFSIZ];
 261   size_t s;
 262   while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0)
 263     {
 264       if (warc_write_buffer (buffer, s) < s)
 265         warc_write_ok = false;
 266     }
 267
 268   return warc_write_ok;
 269 }
 270
 271 /* Run this method to close the current WARC record.
 272
 273    If compression is enabled, this method closes the
 274    current GZIP stream and fills the extra GZIP header
 275    with the uncompressed and compressed length of the
 276    record. */
 277 static bool
 278 warc_write_end_record (void)
 279 {
 280   warc_write_buffer ("\r\n\r\n", 4);
 281
 282 #ifdef HAVE_LIBZ
 283   /* We start a new gzip stream for each record.  */
 284   if (warc_write_ok && warc_current_gzfile)
 285     {
 286       if (gzclose (warc_current_gzfile) != Z_OK)
 287         {
 288           warc_write_ok = false;
 289           return false;
 290         }
 291
 292       fflush (warc_current_file);
 293       fseeko (warc_current_file, 0, SEEK_END);
 294
 295       /* The WARC standard suggests that we add 'skip length' data in the
 296          extra header field of the GZIP stream.
 297
 298          In warc_write_start_record we reserved space for this extra header.
 299          This extra space starts at warc_current_gzfile_offset and fills
 300          EXTRA_GZIP_HEADER_SIZE bytes.  The static GZIP header starts at
 301          warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE.
 302
 303          We need to do three things:
 304          1. Move the static GZIP header to warc_current_gzfile_offset;
 305          2. Set the FEXTRA flag in the GZIP header;
 306          3. Write the extra GZIP header after the static header, that is,
 307             starting at warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE.
 308       */
 309
 310       /* Calculate the uncompressed and compressed sizes. */
 311       off_t current_offset = ftello (warc_current_file);
 312       off_t uncompressed_size = current_offset - warc_current_gzfile_offset;
 313       off_t compressed_size = warc_current_gzfile_uncompressed_size;
 314
 315       /* Go back to the static GZIP header. */
 316       fseeko (warc_current_file, warc_current_gzfile_offset
 317               + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
 318
 319       /* Read the header. */
 320       char static_header[GZIP_STATIC_HEADER_SIZE];
 321       size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE,
 322                              warc_current_file);
 323       if (result != GZIP_STATIC_HEADER_SIZE)
 324         {
 325           warc_write_ok = false;
 326           return false;
 327         }
 328
 329       /* Set the FEXTRA flag in the flags byte of the header. */
 330       static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
 331
 332       /* Write the header back to the file, but starting at
 333          warc_current_gzfile_offset. */
 334       fseeko (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
 335       fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
 336
 337       /* Prepare the extra GZIP header. */
 338       char extra_header[EXTRA_GZIP_HEADER_SIZE];
 339       /* XLEN, the length of the extra header fields.  */
 340       extra_header[0]  = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
 341       extra_header[1]  = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
 342       /* The extra header field identifier for the WARC skip length. */
 343       extra_header[2]  = 's';
 344       extra_header[3]  = 'l';
 345       /* The size of the uncompressed record.  */
 346       extra_header[4]  = (uncompressed_size & 255);
 347       extra_header[5]  = (uncompressed_size >> 8) & 255;
 348       extra_header[6]  = (uncompressed_size >> 16) & 255;
 349       extra_header[7]  = (uncompressed_size >> 24) & 255;
 350       /* The size of the compressed record.  */
 351       extra_header[8]  = (compressed_size & 255);
 352       extra_header[9]  = (compressed_size >> 8) & 255;
 353       extra_header[10] = (compressed_size >> 16) & 255;
 354       extra_header[11] = (compressed_size >> 24) & 255;
 355
 356       /* Write the extra header after the static header. */
 357       fseeko (warc_current_file, warc_current_gzfile_offset
 358               + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
 359       fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
 360
 361       /* Done, move back to the end of the file. */
 362       fflush (warc_current_file);
 363       fseeko (warc_current_file, 0, SEEK_END);
 364     }
 365 #endif /* HAVE_LIBZ */
 366
 367   return warc_write_ok;
 368 }
 369
 370
 371 /* Writes the WARC-Date header for the given timestamp to
 372    the current WARC record.
 373    If timestamp is NULL, the current time will be used.  */
 374 static bool
 375 warc_write_date_header (const char *timestamp)
 376 {
 377   if (timestamp == NULL)
 378     {
 379       char current_timestamp[21];
 380       warc_timestamp (current_timestamp);
 381       timestamp = current_timestamp;
 382     }
 383   return warc_write_header ("WARC-Date", timestamp);
 384 }
 385
 386 /* Writes the WARC-IP-Address header for the given IP to
 387    the current WARC record.  If IP is NULL, no header will
 388    be written.  */
 389 static bool
 390 warc_write_ip_header (ip_address *ip)
 391 {
 392   if (ip != NULL)
 393     return warc_write_header ("WARC-IP-Address", print_address (ip));
 394   else
 395     return warc_write_ok;
 396 }
 397
 398
 399 /* warc_sha1_stream_with_payload is a modified copy of sha1_stream
 400    from gnulib/sha1.c.  This version calculates two digests in one go.
 401
 402    Compute SHA1 message digests for bytes read from STREAM.  The
 403    digest of the complete file will be written into the 16 bytes
 404    beginning at RES_BLOCK.
 405
 406    If payload_offset >= 0, a second digest will be calculated of the
 407    portion of the file starting at payload_offset and continuing to
 408    the end of the file.  The digest number will be written into the
 409    16 bytes beginning ad RES_PAYLOAD.  */
 410 static int
 411 warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
 412                                off_t payload_offset)
 413 {
 414 #define BLOCKSIZE 32768
 415
 416   struct sha1_ctx ctx_block;
 417   struct sha1_ctx ctx_payload;
 418   off_t pos;
 419   off_t sum;
 420
 421   char *buffer = malloc (BLOCKSIZE + 72);
 422   if (!buffer)
 423     return 1;
 424
 425   /* Initialize the computation context.  */
 426   sha1_init_ctx (&ctx_block);
 427   if (payload_offset >= 0)
 428     sha1_init_ctx (&ctx_payload);
 429
 430   pos = 0;
 431
 432   /* Iterate over full file contents.  */
 433   while (1)
 434     {
 435       /* We read the file in blocks of BLOCKSIZE bytes.  One call of the
 436          computation function processes the whole buffer so that with the
 437          next round of the loop another block can be read.  */
 438       off_t n;
 439       sum = 0;
 440
 441       /* Read block.  Take care for partial reads.  */
 442       while (1)
 443         {
 444           n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);
 445
 446           sum += n;
 447           pos += n;
 448
 449           if (sum == BLOCKSIZE)
 450             break;
 451
 452           if (n == 0)
 453             {
 454               /* Check for the error flag IFF N == 0, so that we don't
 455                  exit the loop after a partial read due to e.g., EAGAIN
 456                  or EWOULDBLOCK.  */
 457               if (ferror (stream))
 458                 {
 459                   free (buffer);
 460                   return 1;
 461                 }
 462               goto process_partial_block;
 463             }
 464
 465           /* We've read at least one byte, so ignore errors.  But always
 466              check for EOF, since feof may be true even though N > 0.
 467              Otherwise, we could end up calling fread after EOF.  */
 468           if (feof (stream))
 469             goto process_partial_block;
 470         }
 471
 472       /* Process buffer with BLOCKSIZE bytes.  Note that
 473                         BLOCKSIZE % 64 == 0
 474        */
 475       sha1_process_block (buffer, BLOCKSIZE, &ctx_block);
 476       if (payload_offset >= 0 && payload_offset < pos)
 477         {
 478           /* At least part of the buffer contains data from payload. */
 479           off_t start_of_payload = payload_offset - (pos - BLOCKSIZE);
 480           if (start_of_payload <= 0)
 481             /* All bytes in the buffer belong to the payload. */
 482             start_of_payload = 0;
 483
 484           /* Process the payload part of the buffer.
 485              Note: we can't use  sha1_process_block  here even if we
 486              process the complete buffer.  Because the payload doesn't
 487              have to start with a full block, there may still be some
 488              bytes left from the previous buffer.  Therefore, we need
 489              to continue with  sha1_process_bytes.  */
 490           sha1_process_bytes (buffer + start_of_payload,
 491                               BLOCKSIZE - start_of_payload, &ctx_payload);
 492         }
 493     }
 494
 495  process_partial_block:;
 496
 497   /* Process any remaining bytes.  */
 498   if (sum > 0)
 499     {
 500       sha1_process_bytes (buffer, sum, &ctx_block);
 501       if (payload_offset >= 0 && payload_offset < pos)
 502         {
 503           /* At least part of the buffer contains data from payload. */
 504           off_t start_of_payload = payload_offset - (pos - sum);
 505           if (start_of_payload <= 0)
 506             /* All bytes in the buffer belong to the payload. */
 507             start_of_payload = 0;
 508
 509           /* Process the payload part of the buffer. */
 510           sha1_process_bytes (buffer + start_of_payload,
 511                               sum - start_of_payload, &ctx_payload);
 512         }
 513     }
 514
 515   /* Construct result in desired memory.  */
 516   sha1_finish_ctx (&ctx_block,   res_block);
 517   if (payload_offset >= 0)
 518     sha1_finish_ctx (&ctx_payload, res_payload);
 519   free (buffer);
 520   return 0;
 521
 522 #undef BLOCKSIZE
 523 }
 524
 525 /* Converts the SHA1 digest to a base32-encoded string.
 526    "sha1:DIGEST\0"  (Allocates a new string for the response.)  */
 527 static char *
 528 warc_base32_sha1_digest (char *sha1_digest)
 529 {
 530   /* length: "sha1:" + digest + "\0" */
 531   char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 );
 532   base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5,
 533                  BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1);
 534   memcpy (sha1_base32, "sha1:", 5);
 535   sha1_base32[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5] = '\0';
 536   return sha1_base32;
 537 }
 538
 539
 540 /* Sets the digest headers of the record.
 541    This method will calculate the block digest and, if payload_offset >= 0,
 542    will also calculate the payload digest of the payload starting at the
 543    provided offset.  */
 544 static void
 545 warc_write_digest_headers (FILE *file, long payload_offset)
 546 {
 547   if (opt.warc_digests_enabled)
 548     {
 549       /* Calculate the block and payload digests. */
 550       char sha1_res_block[SHA1_DIGEST_SIZE];
 551       char sha1_res_payload[SHA1_DIGEST_SIZE];
 552
 553       rewind (file);
 554       if (warc_sha1_stream_with_payload (file, sha1_res_block,
 555           sha1_res_payload, payload_offset) == 0)
 556         {
 557           char *digest;
 558
 559           digest = warc_base32_sha1_digest (sha1_res_block);
 560           warc_write_header ("WARC-Block-Digest", digest);
 561           free (digest);
 562
 563           if (payload_offset >= 0)
 564             {
 565               digest = warc_base32_sha1_digest (sha1_res_payload);
 566               warc_write_header ("WARC-Payload-Digest", digest);
 567               free (digest);
 568             }
 569         }
 570     }
 571 }
 572
 573
 574 /* Fills timestamp with the current time and date.
 575    The UTC time is formatted following ISO 8601, as required
 576    for use in the WARC-Date header.
 577    The timestamp will be 21 characters long. */
 578 void
 579 warc_timestamp (char *timestamp)
 580 {
 581   time_t rawtime;
 582   struct tm * timeinfo;
 583   time ( &rawtime );
 584   timeinfo = gmtime (&rawtime);
 585   strftime (timestamp, 21, "%Y-%m-%dT%H:%M:%SZ", timeinfo);
 586 }
 587
 588 #ifdef HAVE_LIBUUID
 589 /* Fills urn_str with a UUID in the format required
 590    for the WARC-Record-Id header.
 591    The string will be 47 characters long. */
 592 void
 593 warc_uuid_str (char *urn_str)
 594 {
 595   char uuid_str[37];
 596
 597   uuid_t record_id;
 598   uuid_generate (record_id);
 599   uuid_unparse (record_id, uuid_str);
 600
 601   sprintf (urn_str, "<urn:uuid:%s>", uuid_str);
 602 }
 603 #else
 604 /* Fills urn_str with a UUID based on random numbers in the format
 605    required for the WARC-Record-Id header.
 606    (See RFC 4122, UUID version 4.)
 607
 608    Note: this is a fallback method, it is much better to use the
 609    methods provided by libuuid.
 610
 611    The string will be 47 characters long. */
 612 void
 613 warc_uuid_str (char *urn_str)
 614 {
 615   // RFC 4122, a version 4 UUID with only random numbers
 616
 617   unsigned char uuid_data[16];
 618   int i;
 619   for (i=0; i<16; i++)
 620     uuid_data[i] = random_number (255);
 621
 622   // Set the four most significant bits (bits 12 through 15) of the
 623   // time_hi_and_version field to the 4-bit version number
 624   uuid_data[6] = (uuid_data[6] & 0x0F) | 0x40;
 625
 626   // Set the two most significant bits (bits 6 and 7) of the
 627   // clock_seq_hi_and_reserved to zero and one, respectively.
 628   uuid_data[8] = (uuid_data[8] & 0xBF) | 0x80;
 629
 630   sprintf (urn_str,
 631     "<urn:uuid:%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x>",
 632     uuid_data[0], uuid_data[1], uuid_data[2], uuid_data[3], uuid_data[4],
 633     uuid_data[5], uuid_data[6], uuid_data[7], uuid_data[8], uuid_data[9],
 634     uuid_data[10], uuid_data[11], uuid_data[12], uuid_data[13], uuid_data[14],
 635     uuid_data[15]);
 636 }
 637 #endif
 638
 639 /* Write a warcinfo record to the current file.
 640    Updates warc_current_warcinfo_uuid_str. */
 641 static bool
 642 warc_write_warcinfo_record (char *filename)
 643 {
 644   /* Write warc-info record as the first record of the file. */
 645   /* We add the record id of this info record to the other records in the
 646      file. */
 647   warc_current_warcinfo_uuid_str = (char *) malloc (48);
 648   warc_uuid_str (warc_current_warcinfo_uuid_str);
 649
 650   char timestamp[22];
 651   warc_timestamp (timestamp);
 652
 653   char *filename_copy, *filename_basename;
 654   filename_copy = strdup (filename);
 655   filename_basename = strdup (basename (filename_copy));
 656
 657   warc_write_start_record ();
 658   warc_write_header ("WARC-Type", "warcinfo");
 659   warc_write_header ("Content-Type", "application/warc-fields");
 660   warc_write_header ("WARC-Date", timestamp);
 661   warc_write_header ("WARC-Record-ID", warc_current_warcinfo_uuid_str);
 662   warc_write_header ("WARC-Filename", filename_basename);
 663
 664   /* Create content.  */
 665   FILE *warc_tmp = warc_tempfile ();
 666   if (warc_tmp == NULL)
 667     {
 668       free (filename_copy);
 669       free (filename_basename);
 670       return false;
 671     }
 672
 673   fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE);
 674   fprintf (warc_tmp, "format: WARC File Format 1.0\r\n");
 675   fprintf (warc_tmp,
 676 "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n");
 677   fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off"));
 678   fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring);
 679   /* Add the user headers, if any. */
 680   if (opt.warc_user_headers)
 681     {
 682       int i;
 683       for (i = 0; opt.warc_user_headers[i]; i++)
 684         fprintf (warc_tmp, "%s\r\n", opt.warc_user_headers[i]);
 685     }
 686   fprintf(warc_tmp, "\r\n");
 687
 688   warc_write_digest_headers (warc_tmp, -1);
 689   warc_write_block_from_file (warc_tmp);
 690   warc_write_end_record ();
 691
 692   if (! warc_write_ok)
 693     logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
 694
 695   free (filename_copy);
 696   free (filename_basename);
 697   fclose (warc_tmp);
 698   return warc_write_ok;
 699 }
 700
 701 /* Opens a new WARC file.
 702    If META is true, generates a filename ending with 'meta.warc.gz'.
 703
 704    This method will:
 705    1. close the current WARC file (if there is one);
 706    2. increment warc_current_file_number;
 707    3. open a new WARC file;
 708    4. write the initial warcinfo record.
 709
 710    Returns true on success, false otherwise.
 711    */
 712 static bool
 713 warc_start_new_file (bool meta)
 714 {
 715   if (opt.warc_filename == NULL)
 716     return false;
 717
 718   if (warc_current_file != NULL)
 719     fclose (warc_current_file);
 720   if (warc_current_warcinfo_uuid_str)
 721     free (warc_current_warcinfo_uuid_str);
 722   if (warc_current_filename)
 723     free (warc_current_filename);
 724
 725   warc_current_file_number++;
 726
 727   int base_filename_length = strlen (opt.warc_filename);
 728   /* filename format:  base + "-" + 5 digit serial number + ".warc.gz" */
 729   char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
 730   warc_current_filename = new_filename;
 731
 732 #ifdef HAVE_LIBZ
 733   const char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc");
 734 #else
 735   const char *extension = "warc";
 736 #endif
 737
 738   /* If max size is enabled, we add a serial number to the file names. */
 739   if (meta)
 740     sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension);
 741   else if (opt.warc_maxsize > 0)
 742     {
 743       sprintf (new_filename, "%s-%05d.%s", opt.warc_filename,
 744                warc_current_file_number, extension);
 745     }
 746   else
 747     sprintf (new_filename, "%s.%s", opt.warc_filename, extension);
 748
 749   logprintf (LOG_VERBOSE, _("Opening WARC file %s.\n\n"), quote (new_filename));
 750
 751   /* Open the WARC file. */
 752   warc_current_file = fopen (new_filename, "wb+");
 753   if (warc_current_file == NULL)
 754     {
 755       logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"),
 756                  quote (new_filename));
 757       return false;
 758     }
 759
 760   if (! warc_write_warcinfo_record (new_filename))
 761     return false;
 762
 763   /* Add warcinfo uuid to manifest. */
 764   if (warc_manifest_fp)
 765     fprintf (warc_manifest_fp, "%s\n", warc_current_warcinfo_uuid_str);
 766
 767   return true;
 768 }
 769
 770 /* Opens the CDX file for output. */
 771 static bool
 772 warc_start_cdx_file (void)
 773 {
 774   int filename_length = strlen (opt.warc_filename);
 775   char *cdx_filename = alloca (filename_length + 4 + 1);
 776   memcpy (cdx_filename, opt.warc_filename, filename_length);
 777   memcpy (cdx_filename + filename_length, ".cdx", 5);
 778   warc_current_cdx_file = fopen (cdx_filename, "a+");
 779   if (warc_current_cdx_file == NULL)
 780     return false;
 781
 782   /* Print the CDX header.
 783    *
 784    * a - original url
 785    * b - date
 786    * m - mime type
 787    * s - response code
 788    * k - new style checksum
 789    * r - redirect
 790    * M - meta tags
 791    * V - compressed arc file offset
 792    * g - file name
 793    * u - record-id
 794    */
 795   fprintf (warc_current_cdx_file, " CDX a b a m s k r M V g u\n");
 796   fflush (warc_current_cdx_file);
 797
 798   return true;
 799 }
 800
 801 #define CDX_FIELDSEP " \t\r\n"
 802
 803 /* Parse the CDX header and find the field numbers of the original url,
 804    checksum and record ID fields. */
 805 static bool
 806 warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
 807                        int *field_num_checksum, int *field_num_record_id)
 808 {
 809   *field_num_original_url = -1;
 810   *field_num_checksum = -1;
 811   *field_num_record_id = -1;
 812
 813   char *token;
 814   char *save_ptr;
 815   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
 816
 817   if (token != NULL && strcmp (token, "CDX") == 0)
 818     {
 819       int field_num = 0;
 820       while (token != NULL)
 821         {
 822           token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
 823           if (token != NULL)
 824             {
 825               switch (token[0])
 826                 {
 827                 case 'a':
 828                   *field_num_original_url = field_num;
 829                   break;
 830                 case 'k':
 831                   *field_num_checksum = field_num;
 832                   break;
 833                 case 'u':
 834                   *field_num_record_id = field_num;
 835                   break;
 836                 }
 837             }
 838           field_num++;
 839         }
 840     }
 841
 842   return *field_num_original_url != -1
 843          && *field_num_checksum != -1
 844          && *field_num_record_id != -1;
 845 }
 846
 847 /* Parse the CDX record and add it to the warc_cdx_dedup_table hash table. */
 848 static void
 849 warc_process_cdx_line (char *lineptr, int field_num_original_url,
 850                        int field_num_checksum, int field_num_record_id)
 851 {
 852   char *original_url = NULL;
 853   char *checksum = NULL;
 854   char *record_id = NULL;
 855
 856   char *token;
 857   char *save_ptr;
 858   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
 859
 860   /* Read this line to get the fields we need. */
 861   int field_num = 0;
 862   while (token != NULL)
 863     {
 864       char **val;
 865       if (field_num == field_num_original_url)
 866         val = &original_url;
 867       else if (field_num == field_num_checksum)
 868         val = &checksum;
 869       else if (field_num == field_num_record_id)
 870         val = &record_id;
 871       else
 872         val = NULL;
 873
 874       if (val != NULL)
 875         *val = strdup (token);
 876
 877       token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
 878       field_num++;
 879     }
 880
 881   if (original_url != NULL && checksum != NULL && record_id != NULL)
 882     {
 883       /* For some extra efficiency, we decode the base32 encoded
 884          checksum value.  This should produce exactly SHA1_DIGEST_SIZE
 885          bytes.  */
 886       size_t checksum_l;
 887       char * checksum_v;
 888       base32_decode_alloc (checksum, strlen (checksum), &checksum_v,
 889                            &checksum_l);
 890       free (checksum);
 891
 892       if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
 893         {
 894           /* This is a valid line with a valid checksum. */
 895           struct warc_cdx_record *rec;
 896           rec = malloc (sizeof (struct warc_cdx_record));
 897           rec->url = original_url;
 898           rec->uuid = record_id;
 899           memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
 900           hash_table_put (warc_cdx_dedup_table, rec->digest, rec);
 901           free (checksum_v);
 902         }
 903       else
 904         {
 905           free (original_url);
 906           if (checksum_v != NULL)
 907             free (checksum_v);
 908           free (record_id);
 909         }
 910     }
 911   else
 912     {
 913       xfree_null(checksum);
 914       xfree_null(original_url);
 915       xfree_null(record_id);
 916     }
 917 }
 918
 919 /* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
 920    the warc_cdx_dedup_table. */
 921 static bool
 922 warc_load_cdx_dedup_file (void)
 923 {
 924   FILE *f = fopen (opt.warc_cdx_dedup_filename, "r");
 925   if (f == NULL)
 926     return false;
 927
 928   int field_num_original_url = -1;
 929   int field_num_checksum = -1;
 930   int field_num_record_id = -1;
 931
 932   char *lineptr = NULL;
 933   size_t n = 0;
 934   ssize_t line_length;
 935
 936   /* The first line should contain the CDX header.
 937      Format:  " CDX x x x x x"
 938      where x are field type indicators.  For our purposes, we only
 939      need 'a' (the original url), 'k' (the SHA1 checksum) and
 940      'u' (the WARC record id). */
 941   line_length = getline (&lineptr, &n, f);
 942   if (line_length != -1)
 943     warc_parse_cdx_header (lineptr, &field_num_original_url,
 944                            &field_num_checksum, &field_num_record_id);
 945
 946   /* If the file contains all three fields, read the complete file. */
 947   if (field_num_original_url == -1
 948       || field_num_checksum == -1
 949       || field_num_record_id == -1)
 950     {
 951       if (field_num_original_url == -1)
 952         logprintf (LOG_NOTQUIET,
 953 _("CDX file does not list original urls. (Missing column 'a'.)\n"));
 954       if (field_num_checksum == -1)
 955         logprintf (LOG_NOTQUIET,
 956 _("CDX file does not list checksums. (Missing column 'k'.)\n"));
 957       if (field_num_record_id == -1)
 958         logprintf (LOG_NOTQUIET,
 959 _("CDX file does not list record ids. (Missing column 'u'.)\n"));
 960     }
 961   else
 962     {
 963       /* Initialize the table. */
 964       warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest,
 965                                              warc_cmp_sha1_digest);
 966
 967       do
 968         {
 969           line_length = getline (&lineptr, &n, f);
 970           if (line_length != -1)
 971             {
 972               warc_process_cdx_line (lineptr, field_num_original_url,
 973                             field_num_checksum, field_num_record_id);
 974             }
 975
 976         }
 977       while (line_length != -1);
 978
 979       /* Print results. */
 980       int nrecords = hash_table_count (warc_cdx_dedup_table);
 981       logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
 982                                         "Loaded %d records from CDX.\n\n",
 983                                          nrecords),
 984                               nrecords);
 985     }
 986
 987   free (lineptr);
 988   fclose (f);
 989
 990   return true;
 991 }
 992 #undef CDX_FIELDSEP
 993
 994 /* Returns the existing duplicate CDX record for the given url and payload
 995    digest.  Returns NULL if the url is not found or if the payload digest
 996    does not match, or if CDX deduplication is disabled. */
 997 static struct warc_cdx_record *
 998 warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload)
 999 {
1000   if (warc_cdx_dedup_table == NULL)
1001     return NULL;
1002
1003   struct warc_cdx_record *rec_existing
1004     = hash_table_get (warc_cdx_dedup_table, sha1_digest_payload);
1005
1006   if (rec_existing && strcmp (rec_existing->url, url) == 0)
1007     return rec_existing;
1008   else
1009     return NULL;
1010 }
1011
1012 /* Initializes the WARC writer (if opt.warc_filename is set).
1013    This should be called before any WARC record is written. */
1014 void
1015 warc_init (void)
1016 {
1017   warc_write_ok = true;
1018
1019   if (opt.warc_filename != NULL)
1020     {
1021       if (opt.warc_cdx_dedup_filename != NULL)
1022         {
1023           if (! warc_load_cdx_dedup_file ())
1024             {
1025               logprintf (LOG_NOTQUIET,
1026                          _("Could not read CDX file %s for deduplication.\n"),
1027                          quote (opt.warc_cdx_dedup_filename));
1028               exit(1);
1029             }
1030         }
1031
1032       warc_manifest_fp = warc_tempfile ();
1033       if (warc_manifest_fp == NULL)
1034         {
1035           logprintf (LOG_NOTQUIET,
1036                      _("Could not open temporary WARC manifest file.\n"));
1037           exit(1);
1038         }
1039
1040       if (opt.warc_keep_log)
1041         {
1042           warc_log_fp = warc_tempfile ();
1043           if (warc_log_fp == NULL)
1044             {
1045               logprintf (LOG_NOTQUIET,
1046                          _("Could not open temporary WARC log file.\n"));
1047               exit(1);
1048             }
1049           log_set_warc_log_fp (warc_log_fp);
1050         }
1051
1052       warc_current_file_number = -1;
1053       if (! warc_start_new_file (false))
1054         {
1055           logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
1056           exit(1);
1057         }
1058
1059       if (opt.warc_cdx_enabled)
1060         {
1061           if (! warc_start_cdx_file ())
1062             {
1063               logprintf (LOG_NOTQUIET,
1064                          _("Could not open CDX file for output.\n"));
1065               exit(1);
1066             }
1067         }
1068     }
1069 }
1070
1071 /* Writes metadata (manifest, configuration, log file) to the WARC file. */
1072 static void
1073 warc_write_metadata (void)
1074 {
1075   /* If there are multiple WARC files, the metadata should be written to a separate file. */
1076   if (opt.warc_maxsize > 0)
1077     warc_start_new_file (true);
1078
1079   char manifest_uuid [48];
1080   warc_uuid_str (manifest_uuid);
1081
1082   fflush (warc_manifest_fp);
1083   warc_write_resource_record (manifest_uuid,
1084                               "metadata://gnu.org/software/wget/warc/MANIFEST.txt",
1085                               NULL, NULL, NULL, "text/plain",
1086                               warc_manifest_fp, -1);
1087   /* warc_write_resource_record has closed warc_manifest_fp. */
1088
1089   FILE * warc_tmp_fp = warc_tempfile ();
1090   if (warc_tmp_fp == NULL)
1091     {
1092       logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n"));
1093       exit(1);
1094     }
1095   fflush (warc_tmp_fp);
1096   fprintf (warc_tmp_fp, "%s\n", program_argstring);
1097
1098   warc_write_resource_record (manifest_uuid,
1099                    "metadata://gnu.org/software/wget/warc/wget_arguments.txt",
1100                               NULL, NULL, NULL, "text/plain",
1101                               warc_tmp_fp, -1);
1102   /* warc_write_resource_record has closed warc_tmp_fp. */
1103
1104   if (warc_log_fp != NULL)
1105     {
1106       warc_write_resource_record (NULL,
1107                               "metadata://gnu.org/software/wget/warc/wget.log",
1108                                   NULL, manifest_uuid, NULL, "text/plain",
1109                                   warc_log_fp, -1);
1110       /* warc_write_resource_record has closed warc_log_fp. */
1111
1112       warc_log_fp = NULL;
1113       log_set_warc_log_fp (NULL);
1114     }
1115 }
1116
1117 /* Finishes the WARC writing.
1118    This should be called at the end of the program. */
1119 void
1120 warc_close (void)
1121 {
1122   if (warc_current_file != NULL)
1123     {
1124       warc_write_metadata ();
1125       free (warc_current_warcinfo_uuid_str);
1126       fclose (warc_current_file);
1127     }
1128   if (warc_current_cdx_file != NULL)
1129     fclose (warc_current_cdx_file);
1130   if (warc_log_fp != NULL)
1131     {
1132       fclose (warc_log_fp);
1133       log_set_warc_log_fp (NULL);
1134     }
1135 }
1136
1137 /* Creates a temporary file for writing WARC output.
1138    The temporary file will be created in opt.warc_tempdir.
1139    Returns the pointer to the temporary file, or NULL. */
1140 FILE *
1141 warc_tempfile (void)
1142 {
1143   char filename[100];
1144   if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1)
1145     return NULL;
1146
1147   int fd = mkstemp (filename);
1148   if (fd < 0)
1149     return NULL;
1150
1151   if (unlink (filename) < 0)
1152     return NULL;
1153
1154   return fdopen (fd, "wb+");
1155 }
1156
1157
1158 /* Writes a request record to the WARC file.
1159    url  is the target uri of the request,
1160    timestamp_str  is the timestamp of the request (generated with warc_timestamp),
1161    record_uuid  is the uuid of the request (generated with warc_uuid_str),
1162    body  is a pointer to a file containing the request headers and body.
1163    ip  is the ip address of the server (or NULL),
1164    Calling this function will close body.
1165    Returns true on success, false on error. */
1166 bool
1167 warc_write_request_record (char *url, char *timestamp_str, char *record_uuid,
1168                            ip_address *ip, FILE *body, off_t payload_offset)
1169 {
1170   warc_write_start_record ();
1171   warc_write_header ("WARC-Type", "request");
1172   warc_write_header ("WARC-Target-URI", url);
1173   warc_write_header ("Content-Type", "application/http;msgtype=request");
1174   warc_write_date_header (timestamp_str);
1175   warc_write_header ("WARC-Record-ID", record_uuid);
1176   warc_write_ip_header (ip);
1177   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1178   warc_write_digest_headers (body, payload_offset);
1179   warc_write_block_from_file (body);
1180   warc_write_end_record ();
1181
1182   fclose (body);
1183
1184   return warc_write_ok;
1185 }
1186
1187 /* Writes a response record to the CDX file.
1188    url  is the target uri of the request/response,
1189    timestamp_str  is the timestamp of the request that generated this response,
1190                   (generated with warc_timestamp),
1191    mime_type  is the mime type of the response body (will be printed to CDX),
1192    response_code  is the HTTP response code (will be printed to CDX),
1193    payload_digest  is the sha1 digest of the payload,
1194    redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
1195    offset  is the position of the WARC record in the WARC file,
1196    warc_filename  is the filename of the WARC,
1197    response_uuid  is the uuid of the response.
1198    Returns true on success, false on error. */
1199 static bool
1200 warc_write_cdx_record (const char *url, const char *timestamp_str,
1201                        const char *mime_type, int response_code,
1202                        const char *payload_digest, const char *redirect_location,
1203                        off_t offset, const char *warc_filename,
1204                        const char *response_uuid)
1205 {
1206   /* Transform the timestamp. */
1207   char timestamp_str_cdx [15];
1208   memcpy (timestamp_str_cdx     , timestamp_str     , 4); /* "YYYY" "-" */
1209   memcpy (timestamp_str_cdx +  4, timestamp_str +  5, 2); /* "mm"   "-" */
1210   memcpy (timestamp_str_cdx +  6, timestamp_str +  8, 2); /* "dd"   "T" */
1211   memcpy (timestamp_str_cdx +  8, timestamp_str + 11, 2); /* "HH"   ":" */
1212   memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM"   ":" */
1213   memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS"   "Z" */
1214   timestamp_str_cdx[14] = '\0';
1215
1216   /* Rewrite the checksum. */
1217   const char *checksum;
1218   if (payload_digest != NULL)
1219     checksum = payload_digest + 5; /* Skip the "sha1:" */
1220   else
1221     checksum = "-";
1222
1223   if (mime_type == NULL || strlen(mime_type) == 0)
1224     mime_type = "-";
1225   if (redirect_location == NULL || strlen(redirect_location) == 0)
1226     redirect_location = "-";
1227
1228   char offset_string[22];
1229   number_to_string (offset_string, offset);
1230
1231   /* Print the CDX line. */
1232   fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %s %s %s\n", url,
1233            timestamp_str_cdx, url, mime_type, response_code, checksum,
1234            redirect_location, offset_string, warc_current_filename,
1235            response_uuid);
1236   fflush (warc_current_cdx_file);
1237
1238   return true;
1239 }
1240
1241 /* Writes a revisit record to the WARC file.
1242    url  is the target uri of the request/response,
1243    timestamp_str  is the timestamp of the request that generated this response
1244                   (generated with warc_timestamp),
1245    concurrent_to_uuid  is the uuid of the request for that generated this response
1246                  (generated with warc_uuid_str),
1247    refers_to_uuid  is the uuid of the original response
1248                  (generated with warc_uuid_str),
1249    payload_digest  is the sha1 digest of the payload,
1250    ip  is the ip address of the server (or NULL),
1251    body  is a pointer to a file containing the response headers (without payload).
1252    Calling this function will close body.
1253    Returns true on success, false on error. */
1254 static bool
1255 warc_write_revisit_record (char *url, char *timestamp_str,
1256                            char *concurrent_to_uuid, char *payload_digest,
1257                            char *refers_to, ip_address *ip, FILE *body)
1258 {
1259   char revisit_uuid [48];
1260   warc_uuid_str (revisit_uuid);
1261
1262   char *block_digest = NULL;
1263   char sha1_res_block[SHA1_DIGEST_SIZE];
1264   sha1_stream (body, sha1_res_block);
1265   block_digest = warc_base32_sha1_digest (sha1_res_block);
1266
1267   warc_write_start_record ();
1268   warc_write_header ("WARC-Type", "revisit");
1269   warc_write_header ("WARC-Record-ID", revisit_uuid);
1270   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1271   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1272   warc_write_header ("WARC-Refers-To", refers_to);
1273   warc_write_header ("WARC-Profile", "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest");
1274   warc_write_header ("WARC-Truncated", "length");
1275   warc_write_header ("WARC-Target-URI", url);
1276   warc_write_date_header (timestamp_str);
1277   warc_write_ip_header (ip);
1278   warc_write_header ("Content-Type", "application/http;msgtype=response");
1279   warc_write_header ("WARC-Block-Digest", block_digest);
1280   warc_write_header ("WARC-Payload-Digest", payload_digest);
1281   warc_write_block_from_file (body);
1282   warc_write_end_record ();
1283
1284   fclose (body);
1285   free (block_digest);
1286
1287   return warc_write_ok;
1288 }
1289
1290 /* Writes a response record to the WARC file.
1291    url  is the target uri of the request/response,
1292    timestamp_str  is the timestamp of the request that generated this response
1293                   (generated with warc_timestamp),
1294    concurrent_to_uuid  is the uuid of the request for that generated this response
1295                  (generated with warc_uuid_str),
1296    ip  is the ip address of the server (or NULL),
1297    body  is a pointer to a file containing the response headers and body.
1298    mime_type  is the mime type of the response body (will be printed to CDX),
1299    response_code  is the HTTP response code (will be printed to CDX),
1300    redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
1301    Calling this function will close body.
1302    Returns true on success, false on error. */
1303 bool
1304 warc_write_response_record (char *url, char *timestamp_str,
1305                             char *concurrent_to_uuid, ip_address *ip,
1306                             FILE *body, off_t payload_offset, char *mime_type,
1307                             int response_code, char *redirect_location)
1308 {
1309   char *block_digest = NULL;
1310   char *payload_digest = NULL;
1311   char sha1_res_block[SHA1_DIGEST_SIZE];
1312   char sha1_res_payload[SHA1_DIGEST_SIZE];
1313
1314   if (opt.warc_digests_enabled)
1315     {
1316       /* Calculate the block and payload digests. */
1317       rewind (body);
1318       if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload,
1319           payload_offset) == 0)
1320         {
1321           /* Decide (based on url + payload digest) if we have seen this
1322              data before. */
1323           struct warc_cdx_record *rec_existing;
1324           rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload);
1325           if (rec_existing != NULL)
1326             {
1327               bool result;
1328
1329               /* Found an existing record. */
1330               logprintf (LOG_VERBOSE,
1331           _("Found exact match in CDX file. Saving revisit record to WARC.\n"));
1332
1333               /* Remove the payload from the file. */
1334               if (payload_offset > 0)
1335                 {
1336                   if (ftruncate (fileno (body), payload_offset) == -1)
1337                     return false;
1338                 }
1339
1340               /* Send the original payload digest. */
1341               payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1342               result = warc_write_revisit_record (url, timestamp_str,
1343                          concurrent_to_uuid, payload_digest, rec_existing->uuid,
1344                          ip, body);
1345               free (payload_digest);
1346
1347               return result;
1348             }
1349
1350           block_digest = warc_base32_sha1_digest (sha1_res_block);
1351           payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1352         }
1353     }
1354
1355   /* Not a revisit, just store the record. */
1356
1357   char response_uuid [48];
1358   warc_uuid_str (response_uuid);
1359
1360   fseeko (warc_current_file, 0L, SEEK_END);
1361   off_t offset = ftello (warc_current_file);
1362
1363   warc_write_start_record ();
1364   warc_write_header ("WARC-Type", "response");
1365   warc_write_header ("WARC-Record-ID", response_uuid);
1366   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1367   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1368   warc_write_header ("WARC-Target-URI", url);
1369   warc_write_date_header (timestamp_str);
1370   warc_write_ip_header (ip);
1371   warc_write_header ("WARC-Block-Digest", block_digest);
1372   warc_write_header ("WARC-Payload-Digest", payload_digest);
1373   warc_write_header ("Content-Type", "application/http;msgtype=response");
1374   warc_write_block_from_file (body);
1375   warc_write_end_record ();
1376
1377   fclose (body);
1378
1379   if (warc_write_ok && opt.warc_cdx_enabled)
1380     {
1381       /* Add this record to the CDX. */
1382       warc_write_cdx_record (url, timestamp_str, mime_type, response_code,
1383       payload_digest, redirect_location, offset, warc_current_filename,
1384       response_uuid);
1385     }
1386
1387   if (block_digest)
1388     free (block_digest);
1389   if (payload_digest)
1390     free (payload_digest);
1391
1392   return warc_write_ok;
1393 }
1394
1395 /* Writes a resource record to the WARC file.
1396    resource_uuid  is the uuid of the resource (or NULL),
1397    url  is the target uri of the resource,
1398    timestamp_str  is the timestamp (generated with warc_timestamp),
1399    concurrent_to_uuid  is the uuid of the request for that generated this
1400    resource (generated with warc_uuid_str) or NULL,
1401    ip  is the ip address of the server (or NULL),
1402    content_type  is the mime type of the body (or NULL),
1403    body  is a pointer to a file containing the resource data.
1404    Calling this function will close body.
1405    Returns true on success, false on error. */
1406 bool
1407 warc_write_resource_record (char *resource_uuid, const char *url,
1408                  const char *timestamp_str, const char *concurrent_to_uuid,
1409                  ip_address *ip, const char *content_type, FILE *body,
1410                  off_t payload_offset)
1411 {
1412   if (resource_uuid == NULL)
1413     {
1414       resource_uuid = alloca (48);
1415       warc_uuid_str (resource_uuid);
1416     }
1417
1418   if (content_type == NULL)
1419     content_type = "application/octet-stream";
1420
1421   warc_write_start_record ();
1422   warc_write_header ("WARC-Type", "resource");
1423   warc_write_header ("WARC-Record-ID", resource_uuid);
1424   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1425   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1426   warc_write_header ("WARC-Target-URI", url);
1427   warc_write_date_header (timestamp_str);
1428   warc_write_ip_header (ip);
1429   warc_write_digest_headers (body, payload_offset);
1430   warc_write_header ("Content-Type", content_type);
1431   warc_write_block_from_file (body);
1432   warc_write_end_record ();
1433
1434   fclose (body);
1435
1436   return warc_write_ok;
1437 }