sjero.net Git - wget/blob - src/warc.c

   1 /* Utility functions for writing WARC files.
   2    Copyright (C) 2011, 2012 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  18
  19 Additional permission under GNU GPL version 3 section 7
  20
  21 If you modify this program, or any covered work, by linking or
  22 combining it with the OpenSSL project's OpenSSL library (or a
  23 modified version of that library), containing parts covered by the
  24 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  25 grants you additional permission to convey the resulting work.
  26 Corresponding Source for a non-source form of such a combination
  27 shall include the source code for the parts of OpenSSL used as well
  28 as that of the covered work.  */
  29
  30 #define _GNU_SOURCE
  31
  32 #include "wget.h"
  33 #include "hash.h"
  34 #include "utils.h"
  35
  36 #include <stdio.h>
  37 #include <stdlib.h>
  38 #include <string.h>
  39 #include <strings.h>
  40 #include <time.h>
  41 #include <tmpdir.h>
  42 #include <sha1.h>
  43 #include <base32.h>
  44 #include <unistd.h>
  45 #ifdef HAVE_LIBZ
  46 #include <zlib.h>
  47 #endif
  48 #ifdef HAVE_LIBUUID
  49 #include <uuid/uuid.h>
  50 #endif
  51
  52 #ifndef WINDOWS
  53 #include <libgen.h>
  54 #endif
  55
  56 #include "warc.h"
  57
  58 extern char *version_string;
  59
  60 /* Set by main in main.c */
  61 extern char *program_argstring;
  62
  63
  64 /* The log file (a temporary file that contains a copy
  65    of the wget log). */
  66 static FILE *warc_log_fp;
  67
  68 /* The manifest file (a temporary file that contains the
  69    warcinfo uuid of every file in this crawl). */
  70 static FILE *warc_manifest_fp;
  71
  72 /* The current WARC file (or NULL, if WARC is disabled). */
  73 static FILE *warc_current_file;
  74
  75 #ifdef HAVE_LIBZ
  76 /* The gzip stream for the current WARC file
  77    (or NULL, if WARC or gzip is disabled). */
  78 static gzFile *warc_current_gzfile;
  79
  80 /* The offset of the current gzip record in the WARC file. */
  81 static off_t warc_current_gzfile_offset;
  82
  83 /* The uncompressed size (so far) of the current record. */
  84 static off_t warc_current_gzfile_uncompressed_size;
  85 # endif
  86
  87 /* This is true until a warc_write_* method fails. */
  88 static bool warc_write_ok;
  89
  90 /* The current CDX file (or NULL, if CDX is disabled). */
  91 static FILE *warc_current_cdx_file;
  92
  93 /* The record id of the warcinfo record of the current WARC file.  */
  94 static char *warc_current_warcinfo_uuid_str;
  95
  96 /* The file name of the current WARC file. */
  97 static char *warc_current_filename;
  98
  99 /* The serial number of the current WARC file.  This number is
 100    incremented each time a new file is opened and is used in the
 101    WARC file's filename. */
 102 static int warc_current_file_number;
 103
 104 /* The table of CDX records, if deduplication is enabled. */
 105 struct hash_table * warc_cdx_dedup_table;
 106
 107 static bool warc_start_new_file (bool meta);
 108
 109
 110 struct warc_cdx_record
 111 {
 112   char *url;
 113   char *uuid;
 114   char digest[SHA1_DIGEST_SIZE];
 115 };
 116
 117 static unsigned long
 118 warc_hash_sha1_digest (const void *key)
 119 {
 120   /* We just use some of the first bytes of the digest. */
 121   unsigned long v = 0;
 122   memcpy (&v, key, sizeof (unsigned long));
 123   return v;
 124 }
 125
 126 static int
 127 warc_cmp_sha1_digest (const void *digest1, const void *digest2)
 128 {
 129   return !memcmp (digest1, digest2, SHA1_DIGEST_SIZE);
 130 }
 131
 132
 133
 134 /* Writes SIZE bytes from BUFFER to the current WARC file,
 135    through gzwrite if compression is enabled.
 136    Returns the number of uncompressed bytes written.  */
 137 static size_t
 138 warc_write_buffer (const char *buffer, size_t size)
 139 {
 140 #ifdef HAVE_LIBZ
 141   if (warc_current_gzfile)
 142     {
 143       warc_current_gzfile_uncompressed_size += size;
 144       return gzwrite (warc_current_gzfile, buffer, size);
 145     }
 146   else
 147 #endif
 148     return fwrite (buffer, 1, size, warc_current_file);
 149 }
 150
 151 /* Writes STR to the current WARC file.
 152    Returns false and set warc_write_ok to false if there
 153    is an error.  */
 154 static bool
 155 warc_write_string (const char *str)
 156 {
 157   if (!warc_write_ok)
 158     return false;
 159
 160   size_t n = strlen (str);
 161   if (n != warc_write_buffer (str, n))
 162     warc_write_ok = false;
 163
 164   return warc_write_ok;
 165 }
 166
 167
 168 #define EXTRA_GZIP_HEADER_SIZE 12
 169 #define GZIP_STATIC_HEADER_SIZE  10
 170 #define FLG_FEXTRA          0x04
 171 #define OFF_FLG             3
 172
 173 /* Starts a new WARC record.  Writes the version header.
 174    If opt.warc_maxsize is set and the current file is becoming
 175    too large, this will open a new WARC file.
 176
 177    If compression is enabled, this will start a new
 178    gzip stream in the current WARC file.
 179
 180    Returns false and set warc_write_ok to false if there
 181    is an error.  */
 182 static bool
 183 warc_write_start_record ()
 184 {
 185   if (!warc_write_ok)
 186     return false;
 187
 188   fflush (warc_current_file);
 189   if (opt.warc_maxsize > 0 && ftello (warc_current_file) >= opt.warc_maxsize)
 190     warc_start_new_file (false);
 191
 192 #ifdef HAVE_LIBZ
 193   /* Start a GZIP stream, if required. */
 194   if (opt.warc_compression_enabled)
 195     {
 196       /* Record the starting offset of the new record. */
 197       warc_current_gzfile_offset = ftello (warc_current_file);
 198
 199       /* Reserve space for the extra GZIP header field.
 200          In warc_write_end_record we will fill this space
 201          with information about the uncompressed and
 202          compressed size of the record. */
 203       fprintf (warc_current_file, "XXXXXXXXXXXX");
 204       fflush (warc_current_file);
 205
 206       /* Start a new GZIP stream. */
 207       warc_current_gzfile = gzdopen (dup (fileno (warc_current_file)), "wb9");
 208       warc_current_gzfile_uncompressed_size = 0;
 209
 210       if (warc_current_gzfile == NULL)
 211         {
 212           logprintf (LOG_NOTQUIET, _("Error opening GZIP stream to WARC file.\n"));
 213           warc_write_ok = false;
 214           return false;
 215         }
 216     }
 217 #endif
 218
 219   warc_write_string ("WARC/1.0\r\n");
 220   return warc_write_ok;
 221 }
 222
 223 /* Writes a WARC header to the current WARC record.
 224    This method may be run after warc_write_start_record and
 225    before warc_write_block_from_file.  */
 226 static bool
 227 warc_write_header (const char *name, const char *value)
 228 {
 229   if (value)
 230     {
 231       warc_write_string (name);
 232       warc_write_string (": ");
 233       warc_write_string (value);
 234       warc_write_string ("\r\n");
 235     }
 236   return warc_write_ok;
 237 }
 238
 239 /* Copies the contents of DATA_IN to the WARC record.
 240    Adds a Content-Length header to the WARC record.
 241    Run this method after warc_write_header,
 242    then run warc_write_end_record. */
 243 static bool
 244 warc_write_block_from_file (FILE *data_in)
 245 {
 246   /* Add the Content-Length header. */
 247   char *content_length;
 248   fseeko (data_in, 0L, SEEK_END);
 249   if (! asprintf (&content_length, "%ld", ftello (data_in)))
 250     {
 251       warc_write_ok = false;
 252       return false;
 253     }
 254   warc_write_header ("Content-Length", content_length);
 255   free (content_length);
 256
 257   /* End of the WARC header section. */
 258   warc_write_string ("\r\n");
 259
 260   if (fseeko (data_in, 0L, SEEK_SET) != 0)
 261     warc_write_ok = false;
 262
 263   /* Copy the data in the file to the WARC record. */
 264   char buffer[BUFSIZ];
 265   size_t s;
 266   while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0)
 267     {
 268       if (warc_write_buffer (buffer, s) < s)
 269         warc_write_ok = false;
 270     }
 271
 272   return warc_write_ok;
 273 }
 274
 275 /* Run this method to close the current WARC record.
 276
 277    If compression is enabled, this method closes the
 278    current GZIP stream and fills the extra GZIP header
 279    with the uncompressed and compressed length of the
 280    record. */
 281 static bool
 282 warc_write_end_record ()
 283 {
 284   warc_write_buffer ("\r\n\r\n", 4);
 285
 286 #ifdef HAVE_LIBZ
 287   /* We start a new gzip stream for each record.  */
 288   if (warc_write_ok && warc_current_gzfile)
 289     {
 290       if (gzclose (warc_current_gzfile) != Z_OK)
 291         {
 292           warc_write_ok = false;
 293           return false;
 294         }
 295
 296       fflush (warc_current_file);
 297       fseeko (warc_current_file, 0, SEEK_END);
 298
 299       /* The WARC standard suggests that we add 'skip length' data in the
 300          extra header field of the GZIP stream.
 301
 302          In warc_write_start_record we reserved space for this extra header.
 303          This extra space starts at warc_current_gzfile_offset and fills
 304          EXTRA_GZIP_HEADER_SIZE bytes.  The static GZIP header starts at
 305          warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE.
 306
 307          We need to do three things:
 308          1. Move the static GZIP header to warc_current_gzfile_offset;
 309          2. Set the FEXTRA flag in the GZIP header;
 310          3. Write the extra GZIP header after the static header, that is,
 311             starting at warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE.
 312       */
 313
 314       /* Calculate the uncompressed and compressed sizes. */
 315       off_t current_offset = ftello (warc_current_file);
 316       off_t uncompressed_size = current_offset - warc_current_gzfile_offset;
 317       off_t compressed_size = warc_current_gzfile_uncompressed_size;
 318
 319       /* Go back to the static GZIP header. */
 320       fseeko (warc_current_file, warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
 321
 322       /* Read the header. */
 323       char static_header[GZIP_STATIC_HEADER_SIZE];
 324       size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
 325       if (result != GZIP_STATIC_HEADER_SIZE)
 326         {
 327           warc_write_ok = false;
 328           return false;
 329         }
 330
 331       /* Set the FEXTRA flag in the flags byte of the header. */
 332       static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
 333
 334       /* Write the header back to the file, but starting at warc_current_gzfile_offset. */
 335       fseeko (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
 336       fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
 337
 338       /* Prepare the extra GZIP header. */
 339       char extra_header[EXTRA_GZIP_HEADER_SIZE];
 340       /* XLEN, the length of the extra header fields.  */
 341       extra_header[0]  = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
 342       extra_header[1]  = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
 343       /* The extra header field identifier for the WARC skip length. */
 344       extra_header[2]  = 's';
 345       extra_header[3]  = 'l';
 346       /* The size of the uncompressed record.  */
 347       extra_header[4]  = (uncompressed_size & 255);
 348       extra_header[5]  = (uncompressed_size >> 8) & 255;
 349       extra_header[6]  = (uncompressed_size >> 16) & 255;
 350       extra_header[7]  = (uncompressed_size >> 24) & 255;
 351       /* The size of the compressed record.  */
 352       extra_header[8]  = (compressed_size & 255);
 353       extra_header[9]  = (compressed_size >> 8) & 255;
 354       extra_header[10] = (compressed_size >> 16) & 255;
 355       extra_header[11] = (compressed_size >> 24) & 255;
 356
 357       /* Write the extra header after the static header. */
 358       fseeko (warc_current_file, warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
 359       fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
 360
 361       /* Done, move back to the end of the file. */
 362       fflush (warc_current_file);
 363       fseeko (warc_current_file, 0, SEEK_END);
 364     }
 365 #endif /* HAVE_LIBZ */
 366
 367   return warc_write_ok;
 368 }
 369
 370
 371 /* Writes the WARC-Date header for the given timestamp to
 372    the current WARC record.
 373    If timestamp is NULL, the current time will be used.  */
 374 static bool
 375 warc_write_date_header (char *timestamp)
 376 {
 377   if (timestamp == NULL)
 378     {
 379       char current_timestamp[21];
 380       warc_timestamp (current_timestamp);
 381       timestamp = current_timestamp;
 382     }
 383   return warc_write_header ("WARC-Date", timestamp);
 384 }
 385
 386 /* Writes the WARC-IP-Address header for the given IP to
 387    the current WARC record.  If IP is NULL, no header will
 388    be written.  */
 389 static bool
 390 warc_write_ip_header (ip_address *ip)
 391 {
 392   if (ip != NULL)
 393     return warc_write_header ("WARC-IP-Address", print_address (ip));
 394   else
 395     return warc_write_ok;
 396 }
 397
 398
 399 /* warc_sha1_stream_with_payload is a modified copy of sha1_stream
 400    from gnulib/sha1.c.  This version calculates two digests in one go.
 401
 402    Compute SHA1 message digests for bytes read from STREAM.  The
 403    digest of the complete file will be written into the 16 bytes
 404    beginning at RES_BLOCK.
 405
 406    If payload_offset >= 0, a second digest will be calculated of the
 407    portion of the file starting at payload_offset and continuing to
 408    the end of the file.  The digest number will be written into the
 409    16 bytes beginning ad RES_PAYLOAD.  */
 410 static int
 411 warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, off_t payload_offset)
 412 {
 413 #define BLOCKSIZE 32768
 414
 415   struct sha1_ctx ctx_block;
 416   struct sha1_ctx ctx_payload;
 417   off_t pos;
 418   off_t sum;
 419
 420   char *buffer = malloc (BLOCKSIZE + 72);
 421   if (!buffer)
 422     return 1;
 423
 424   /* Initialize the computation context.  */
 425   sha1_init_ctx (&ctx_block);
 426   if (payload_offset >= 0)
 427     sha1_init_ctx (&ctx_payload);
 428
 429   pos = 0;
 430
 431   /* Iterate over full file contents.  */
 432   while (1)
 433     {
 434       /* We read the file in blocks of BLOCKSIZE bytes.  One call of the
 435          computation function processes the whole buffer so that with the
 436          next round of the loop another block can be read.  */
 437       off_t n;
 438       sum = 0;
 439
 440       /* Read block.  Take care for partial reads.  */
 441       while (1)
 442         {
 443           n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);
 444
 445           sum += n;
 446           pos += n;
 447
 448           if (sum == BLOCKSIZE)
 449             break;
 450
 451           if (n == 0)
 452             {
 453               /* Check for the error flag IFF N == 0, so that we don't
 454                  exit the loop after a partial read due to e.g., EAGAIN
 455                  or EWOULDBLOCK.  */
 456               if (ferror (stream))
 457                 {
 458                   free (buffer);
 459                   return 1;
 460                 }
 461               goto process_partial_block;
 462             }
 463
 464           /* We've read at least one byte, so ignore errors.  But always
 465              check for EOF, since feof may be true even though N > 0.
 466              Otherwise, we could end up calling fread after EOF.  */
 467           if (feof (stream))
 468             goto process_partial_block;
 469         }
 470
 471       /* Process buffer with BLOCKSIZE bytes.  Note that
 472                         BLOCKSIZE % 64 == 0
 473        */
 474       sha1_process_block (buffer, BLOCKSIZE, &ctx_block);
 475       if (payload_offset >= 0 && payload_offset < pos)
 476         {
 477           /* At least part of the buffer contains data from payload. */
 478           off_t start_of_payload = payload_offset - (pos - BLOCKSIZE);
 479           if (start_of_payload <= 0)
 480             /* All bytes in the buffer belong to the payload. */
 481             start_of_payload = 0;
 482
 483           /* Process the payload part of the buffer.
 484              Note: we can't use  sha1_process_block  here even if we
 485              process the complete buffer.  Because the payload doesn't
 486              have to start with a full block, there may still be some
 487              bytes left from the previous buffer.  Therefore, we need
 488              to continue with  sha1_process_bytes.  */
 489           sha1_process_bytes (buffer + start_of_payload, BLOCKSIZE - start_of_payload, &ctx_payload);
 490         }
 491     }
 492
 493  process_partial_block:;
 494
 495   /* Process any remaining bytes.  */
 496   if (sum > 0)
 497     {
 498       sha1_process_bytes (buffer, sum, &ctx_block);
 499       if (payload_offset >= 0 && payload_offset < pos)
 500         {
 501           /* At least part of the buffer contains data from payload. */
 502           off_t start_of_payload = payload_offset - (pos - sum);
 503           if (start_of_payload <= 0)
 504             /* All bytes in the buffer belong to the payload. */
 505             start_of_payload = 0;
 506
 507           /* Process the payload part of the buffer. */
 508           sha1_process_bytes (buffer + start_of_payload, sum - start_of_payload, &ctx_payload);
 509         }
 510     }
 511
 512   /* Construct result in desired memory.  */
 513   sha1_finish_ctx (&ctx_block,   res_block);
 514   if (payload_offset >= 0)
 515     sha1_finish_ctx (&ctx_payload, res_payload);
 516   free (buffer);
 517   return 0;
 518
 519 #undef BLOCKSIZE
 520 }
 521
 522 /* Converts the SHA1 digest to a base32-encoded string.
 523    "sha1:DIGEST\0"  (Allocates a new string for the response.)  */
 524 static char *
 525 warc_base32_sha1_digest (char *sha1_digest)
 526 {
 527   // length: "sha1:" + digest + "\0"
 528   char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 );
 529   base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5, BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1);
 530   memcpy (sha1_base32, "sha1:", 5);
 531   sha1_base32[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5] = '\0';
 532   return sha1_base32;
 533 }
 534
 535
 536 /* Sets the digest headers of the record.
 537    This method will calculate the block digest and, if payload_offset >= 0,
 538    will also calculate the payload digest of the payload starting at the
 539    provided offset.  */
 540 static void
 541 warc_write_digest_headers (FILE *file, long payload_offset)
 542 {
 543   if (opt.warc_digests_enabled)
 544     {
 545       /* Calculate the block and payload digests. */
 546       char sha1_res_block[SHA1_DIGEST_SIZE];
 547       char sha1_res_payload[SHA1_DIGEST_SIZE];
 548
 549       rewind (file);
 550       if (warc_sha1_stream_with_payload (file, sha1_res_block, sha1_res_payload, payload_offset) == 0)
 551         {
 552           char *digest;
 553
 554           digest = warc_base32_sha1_digest (sha1_res_block);
 555           warc_write_header ("WARC-Block-Digest", digest);
 556           free (digest);
 557
 558           if (payload_offset >= 0)
 559             {
 560               digest = warc_base32_sha1_digest (sha1_res_payload);
 561               warc_write_header ("WARC-Payload-Digest", digest);
 562               free (digest);
 563             }
 564         }
 565     }
 566 }
 567
 568
 569 /* Fills timestamp with the current time and date.
 570    The UTC time is formatted following ISO 8601, as required
 571    for use in the WARC-Date header.
 572    The timestamp will be 21 characters long. */
 573 void
 574 warc_timestamp (char *timestamp)
 575 {
 576   time_t rawtime;
 577   struct tm * timeinfo;
 578   time ( &rawtime );
 579   timeinfo = gmtime (&rawtime);
 580   strftime (timestamp, 21, "%Y-%m-%dT%H:%M:%SZ", timeinfo);
 581 }
 582
 583 #ifdef HAVE_LIBUUID
 584 /* Fills urn_str with a UUID in the format required
 585    for the WARC-Record-Id header.
 586    The string will be 47 characters long. */
 587 void
 588 warc_uuid_str (char *urn_str)
 589 {
 590   char uuid_str[37];
 591
 592   uuid_t record_id;
 593   uuid_generate (record_id);
 594   uuid_unparse (record_id, uuid_str);
 595
 596   sprintf (urn_str, "<urn:uuid:%s>", uuid_str);
 597 }
 598 #else
 599 /* Fills urn_str with a UUID based on random numbers in the format
 600    required for the WARC-Record-Id header.
 601    (See RFC 4122, UUID version 4.)
 602
 603    Note: this is a fallback method, it is much better to use the
 604    methods provided by libuuid.
 605
 606    The string will be 47 characters long. */
 607 void
 608 warc_uuid_str (char *urn_str)
 609 {
 610   // RFC 4122, a version 4 UUID with only random numbers
 611
 612   unsigned char uuid_data[16];
 613   int i;
 614   for (i=0; i<16; i++)
 615     uuid_data[i] = random_number (255);
 616
 617   // Set the four most significant bits (bits 12 through 15) of the
 618   // time_hi_and_version field to the 4-bit version number
 619   uuid_data[6] = (uuid_data[6] & 0x0F) | 0x40;
 620
 621   // Set the two most significant bits (bits 6 and 7) of the
 622   // clock_seq_hi_and_reserved to zero and one, respectively.
 623   uuid_data[8] = (uuid_data[8] & 0xBF) | 0x80;
 624
 625   sprintf (urn_str,
 626     "<urn:uuid:%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x>",
 627     uuid_data[0], uuid_data[1], uuid_data[2], uuid_data[3], uuid_data[4],
 628     uuid_data[5], uuid_data[6], uuid_data[7], uuid_data[8], uuid_data[9],
 629     uuid_data[10], uuid_data[11], uuid_data[12], uuid_data[13], uuid_data[14],
 630     uuid_data[15]);
 631 }
 632 #endif
 633
 634 /* Write a warcinfo record to the current file.
 635    Updates warc_current_warcinfo_uuid_str. */
 636 bool
 637 warc_write_warcinfo_record (char *filename)
 638 {
 639   /* Write warc-info record as the first record of the file. */
 640   /* We add the record id of this info record to the other records in the file. */
 641   warc_current_warcinfo_uuid_str = (char *) malloc (48);
 642   warc_uuid_str (warc_current_warcinfo_uuid_str);
 643
 644   char timestamp[22];
 645   warc_timestamp (timestamp);
 646
 647   char *filename_copy, *filename_basename;
 648   filename_copy = strdup (filename);
 649   filename_basename = strdup (basename (filename_copy));
 650
 651   warc_write_start_record ();
 652   warc_write_header ("WARC-Type", "warcinfo");
 653   warc_write_header ("Content-Type", "application/warc-fields");
 654   warc_write_header ("WARC-Date", timestamp);
 655   warc_write_header ("WARC-Record-ID", warc_current_warcinfo_uuid_str);
 656   warc_write_header ("WARC-Filename", filename_basename);
 657
 658   /* Create content.  */
 659   FILE *warc_tmp = warc_tempfile ();
 660   if (warc_tmp == NULL)
 661     {
 662       free (filename_copy);
 663       free (filename_basename);
 664       return false;
 665     }
 666
 667   fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE);
 668   fprintf (warc_tmp, "format: WARC File Format 1.0\r\n");
 669   fprintf (warc_tmp, "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n");
 670   fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off"));
 671   fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring);
 672   /* Add the user headers, if any. */
 673   if (opt.warc_user_headers)
 674     {
 675       int i;
 676       for (i = 0; opt.warc_user_headers[i]; i++)
 677         fprintf (warc_tmp, "%s\r\n", opt.warc_user_headers[i]);
 678     }
 679   fprintf(warc_tmp, "\r\n");
 680
 681   warc_write_digest_headers (warc_tmp, -1);
 682   warc_write_block_from_file (warc_tmp);
 683   warc_write_end_record ();
 684
 685   if (! warc_write_ok)
 686     {
 687       logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
 688     }
 689
 690   free (filename_copy);
 691   free (filename_basename);
 692   fclose (warc_tmp);
 693   return warc_write_ok;
 694 }
 695
 696 /* Opens a new WARC file.
 697    If META is true, generates a filename ending with 'meta.warc.gz'.
 698
 699    This method will:
 700    1. close the current WARC file (if there is one);
 701    2. increment warc_current_file_number;
 702    3. open a new WARC file;
 703    4. write the initial warcinfo record.
 704
 705    Returns true on success, false otherwise.
 706    */
 707 static bool
 708 warc_start_new_file (bool meta)
 709 {
 710   if (opt.warc_filename == NULL)
 711     return false;
 712
 713   if (warc_current_file != NULL)
 714     fclose (warc_current_file);
 715   if (warc_current_warcinfo_uuid_str)
 716     free (warc_current_warcinfo_uuid_str);
 717   if (warc_current_filename)
 718     free (warc_current_filename);
 719
 720   warc_current_file_number++;
 721
 722   int base_filename_length = strlen (opt.warc_filename);
 723   /* filename format:  base + "-" + 5 digit serial number + ".warc.gz" */
 724   char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
 725   warc_current_filename = new_filename;
 726
 727 #ifdef HAVE_LIBZ
 728   char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc");
 729 #else
 730   char *extension = "warc";
 731 #endif
 732
 733   /* If max size is enabled, we add a serial number to the file names. */
 734   if (meta)
 735     sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension);
 736   else if (opt.warc_maxsize > 0)
 737     sprintf (new_filename, "%s-%05d.%s", opt.warc_filename, warc_current_file_number, extension);
 738   else
 739     sprintf (new_filename, "%s.%s", opt.warc_filename, extension);
 740
 741   logprintf (LOG_VERBOSE, _("Opening WARC file %s.\n\n"), quote (new_filename));
 742
 743   /* Open the WARC file. */
 744   warc_current_file = fopen (new_filename, "wb+");
 745   if (warc_current_file == NULL)
 746     {
 747       logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"), quote (new_filename));
 748       return false;
 749     }
 750
 751   if (! warc_write_warcinfo_record (new_filename))
 752     return false;
 753
 754   /* Add warcinfo uuid to manifest. */
 755   if (warc_manifest_fp)
 756     fprintf (warc_manifest_fp, "%s\n", warc_current_warcinfo_uuid_str);
 757
 758   return true;
 759 }
 760
 761 /* Opens the CDX file for output. */
 762 static bool
 763 warc_start_cdx_file ()
 764 {
 765   int filename_length = strlen (opt.warc_filename);
 766   char *cdx_filename = alloca (filename_length + 4 + 1);
 767   memcpy (cdx_filename, opt.warc_filename, filename_length);
 768   memcpy (cdx_filename + filename_length, ".cdx", 5);
 769   warc_current_cdx_file = fopen (cdx_filename, "a+");
 770   if (warc_current_cdx_file == NULL)
 771     return false;
 772
 773   /* Print the CDX header.
 774    *
 775    * a - original url
 776    * b - date
 777    * m - mime type
 778    * s - response code
 779    * k - new style checksum
 780    * r - redirect
 781    * M - meta tags
 782    * V - compressed arc file offset
 783    * g - file name
 784    * u - record-id
 785    */
 786   fprintf (warc_current_cdx_file, " CDX a b a m s k r M V g u\n");
 787   fflush (warc_current_cdx_file);
 788
 789   return true;
 790 }
 791
 792 #define CDX_FIELDSEP " \t\r\n"
 793
 794 /* Parse the CDX header and find the field numbers of the original url,
 795    checksum and record ID fields. */
 796 static bool
 797 warc_parse_cdx_header (char *lineptr, int *field_num_original_url, int *field_num_checksum, int *field_num_record_id)
 798 {
 799   *field_num_original_url = -1;
 800   *field_num_checksum = -1;
 801   *field_num_record_id = -1;
 802
 803   char *token;
 804   char *save_ptr;
 805   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
 806
 807   if (token != NULL && strcmp (token, "CDX") == 0)
 808     {
 809       int field_num = 0;
 810       while (token != NULL)
 811         {
 812           token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
 813           if (token != NULL)
 814             {
 815               switch (token[0])
 816                 {
 817                 case 'a':
 818                   *field_num_original_url = field_num;
 819                   break;
 820                 case 'k':
 821                   *field_num_checksum = field_num;
 822                   break;
 823                 case 'u':
 824                   *field_num_record_id = field_num;
 825                   break;
 826                 }
 827             }
 828           field_num++;
 829         }
 830     }
 831
 832   return *field_num_original_url != -1
 833          && *field_num_checksum != -1
 834          && *field_num_record_id != -1;
 835 }
 836
 837 /* Parse the CDX record and add it to the warc_cdx_dedup_table hash table. */
 838 static void
 839 warc_process_cdx_line (char *lineptr, int field_num_original_url, int field_num_checksum, int field_num_record_id)
 840 {
 841   char *original_url = NULL;
 842   char *checksum = NULL;
 843   char *record_id = NULL;
 844
 845   char *token;
 846   char *save_ptr;
 847   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
 848
 849   /* Read this line to get the fields we need. */
 850   int field_num = 0;
 851   while (token != NULL)
 852     {
 853       char **val;
 854       if (field_num == field_num_original_url)
 855         val = &original_url;
 856       else if (field_num == field_num_checksum)
 857         val = &checksum;
 858       else if (field_num == field_num_record_id)
 859         val = &record_id;
 860       else
 861         val = NULL;
 862
 863       if (val != NULL)
 864         *val = strdup (token);
 865
 866       token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
 867       field_num++;
 868     }
 869
 870   if (original_url != NULL && checksum != NULL && record_id != NULL)
 871     {
 872       /* For some extra efficiency, we decode the base32 encoded
 873          checksum value.  This should produce exactly SHA1_DIGEST_SIZE
 874          bytes.  */
 875       size_t checksum_l;
 876       char * checksum_v;
 877       base32_decode_alloc (checksum, strlen (checksum), &checksum_v, &checksum_l);
 878       free (checksum);
 879
 880       if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
 881         {
 882           /* This is a valid line with a valid checksum. */
 883           struct warc_cdx_record * rec = malloc (sizeof (struct warc_cdx_record));
 884           rec->url = original_url;
 885           rec->uuid = record_id;
 886           memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
 887           hash_table_put (warc_cdx_dedup_table, rec->digest, rec);
 888           free (checksum_v);
 889         }
 890       else
 891         {
 892           free (original_url);
 893           if (checksum_v != NULL)
 894             free (checksum_v);
 895           free (record_id);
 896         }
 897     }
 898 }
 899
 900 /* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
 901    the warc_cdx_dedup_table. */
 902 bool
 903 warc_load_cdx_dedup_file ()
 904 {
 905   FILE *f = fopen (opt.warc_cdx_dedup_filename, "r");
 906   if (f == NULL)
 907     return false;
 908
 909   int field_num_original_url = -1;
 910   int field_num_checksum = -1;
 911   int field_num_record_id = -1;
 912
 913   char *lineptr = NULL;
 914   size_t n = 0;
 915   size_t line_length;
 916
 917   /* The first line should contain the CDX header.
 918      Format:  " CDX x x x x x"
 919      where x are field type indicators.  For our purposes, we only
 920      need 'a' (the original url), 'k' (the SHA1 checksum) and
 921      'u' (the WARC record id). */
 922   line_length = getline (&lineptr, &n, f);
 923   if (line_length != -1)
 924     warc_parse_cdx_header (lineptr, &field_num_original_url, &field_num_checksum, &field_num_record_id);
 925
 926   /* If the file contains all three fields, read the complete file. */
 927   if (field_num_original_url == -1
 928       || field_num_checksum == -1
 929       || field_num_record_id == -1)
 930     {
 931       if (field_num_original_url == -1)
 932         logprintf (LOG_NOTQUIET, _("CDX file does not list original urls. (Missing column 'a'.)\n"));
 933       if (field_num_checksum == -1)
 934         logprintf (LOG_NOTQUIET, _("CDX file does not list checksums. (Missing column 'k'.)\n"));
 935       if (field_num_record_id == -1)
 936         logprintf (LOG_NOTQUIET, _("CDX file does not list record ids. (Missing column 'u'.)\n"));
 937     }
 938   else
 939     {
 940       /* Initialize the table. */
 941       warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest, warc_cmp_sha1_digest);
 942
 943       do
 944         {
 945           line_length = getline (&lineptr, &n, f);
 946           if (line_length != -1)
 947             warc_process_cdx_line (lineptr, field_num_original_url, field_num_checksum, field_num_record_id);
 948
 949         }
 950       while (line_length != -1);
 951
 952       /* Print results. */
 953       int nrecords = hash_table_count (warc_cdx_dedup_table);
 954       logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
 955                                         "Loaded %d records from CDX.\n\n", nrecords),
 956                               nrecords);
 957     }
 958
 959   fclose (f);
 960
 961   return true;
 962 }
 963 #undef CDX_FIELDSEP
 964
 965 /* Returns the existing duplicate CDX record for the given url and payload
 966    digest.  Returns NULL if the url is not found or if the payload digest
 967    does not match, or if CDX deduplication is disabled. */
 968 static struct warc_cdx_record *
 969 warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload)
 970 {
 971   if (warc_cdx_dedup_table == NULL)
 972     return NULL;
 973
 974   char *key;
 975   struct warc_cdx_record *rec_existing;
 976   hash_table_get_pair (warc_cdx_dedup_table, sha1_digest_payload, &key, &rec_existing);
 977
 978   if (rec_existing != NULL && strcmp (rec_existing->url, url) == 0)
 979     return rec_existing;
 980   else
 981     return NULL;
 982 }
 983
 984 /* Initializes the WARC writer (if opt.warc_filename is set).
 985    This should be called before any WARC record is written. */
 986 void
 987 warc_init ()
 988 {
 989   warc_write_ok = true;
 990
 991   if (opt.warc_filename != NULL)
 992     {
 993       if (opt.warc_cdx_dedup_filename != NULL)
 994         {
 995           if (! warc_load_cdx_dedup_file ())
 996             {
 997               logprintf (LOG_NOTQUIET,
 998                          _("Could not read CDX file %s for deduplication.\n"),
 999                          quote (opt.warc_cdx_dedup_filename));
1000               exit(1);
1001             }
1002         }
1003
1004       warc_manifest_fp = warc_tempfile ();
1005       if (warc_manifest_fp == NULL)
1006         {
1007           logprintf (LOG_NOTQUIET, _("Could not open temporary WARC manifest file.\n"));
1008           exit(1);
1009         }
1010
1011       if (opt.warc_keep_log)
1012         {
1013           warc_log_fp = warc_tempfile ();
1014           if (warc_log_fp == NULL)
1015             {
1016               logprintf (LOG_NOTQUIET, _("Could not open temporary WARC log file.\n"));
1017               exit(1);
1018             }
1019           log_set_warc_log_fp (warc_log_fp);
1020         }
1021
1022       warc_current_file_number = -1;
1023       if (! warc_start_new_file (false))
1024         {
1025           logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
1026           exit(1);
1027         }
1028
1029       if (opt.warc_cdx_enabled)
1030         {
1031           if (! warc_start_cdx_file ())
1032             {
1033               logprintf (LOG_NOTQUIET, _("Could not open CDX file for output.\n"));
1034               exit(1);
1035             }
1036         }
1037     }
1038 }
1039
1040 /* Writes metadata (manifest, configuration, log file) to the WARC file. */
1041 void
1042 warc_write_metadata ()
1043 {
1044   /* If there are multiple WARC files, the metadata should be written to a separate file. */
1045   if (opt.warc_maxsize > 0)
1046     warc_start_new_file (true);
1047
1048   char manifest_uuid [48];
1049   warc_uuid_str (manifest_uuid);
1050
1051   fflush (warc_manifest_fp);
1052   warc_write_resource_record (manifest_uuid,
1053                               "metadata://gnu.org/software/wget/warc/MANIFEST.txt",
1054                               NULL, NULL, NULL, "text/plain",
1055                               warc_manifest_fp, -1);
1056   /* warc_write_resource_record has closed warc_manifest_fp. */
1057
1058   FILE * warc_tmp_fp = warc_tempfile ();
1059   if (warc_tmp_fp == NULL)
1060     {
1061       logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n"));
1062       exit(1);
1063     }
1064   fflush (warc_tmp_fp);
1065   fprintf (warc_tmp_fp, "%s\n", program_argstring);
1066
1067   warc_write_resource_record (manifest_uuid,
1068                               "metadata://gnu.org/software/wget/warc/wget_arguments.txt",
1069                               NULL, NULL, NULL, "text/plain",
1070                               warc_tmp_fp, -1);
1071   /* warc_write_resource_record has closed warc_tmp_fp. */
1072
1073   if (warc_log_fp != NULL)
1074     {
1075       warc_write_resource_record (NULL,
1076                                   "metadata://gnu.org/software/wget/warc/wget.log",
1077                                   NULL, manifest_uuid, NULL, "text/plain",
1078                                   warc_log_fp, -1);
1079       /* warc_write_resource_record has closed warc_log_fp. */
1080
1081       warc_log_fp = NULL;
1082       log_set_warc_log_fp (NULL);
1083     }
1084 }
1085
1086 /* Finishes the WARC writing.
1087    This should be called at the end of the program. */
1088 void
1089 warc_close ()
1090 {
1091   if (warc_current_file != NULL)
1092     {
1093       warc_write_metadata ();
1094       free (warc_current_warcinfo_uuid_str);
1095       fclose (warc_current_file);
1096     }
1097   if (warc_current_cdx_file != NULL)
1098     fclose (warc_current_cdx_file);
1099   if (warc_log_fp != NULL)
1100     {
1101       fclose (warc_log_fp);
1102       log_set_warc_log_fp (NULL);
1103     }
1104 }
1105
1106 /* Creates a temporary file for writing WARC output.
1107    The temporary file will be created in opt.warc_tempdir.
1108    Returns the pointer to the temporary file, or NULL. */
1109 FILE *
1110 warc_tempfile ()
1111 {
1112   char filename[100];
1113   if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1)
1114     return NULL;
1115
1116   int fd = mkstemp (filename);
1117   if (fd < 0)
1118     return NULL;
1119
1120   if (unlink (filename) < 0)
1121     return NULL;
1122
1123   return fdopen (fd, "wb+");
1124 }
1125
1126
1127 /* Writes a request record to the WARC file.
1128    url  is the target uri of the request,
1129    timestamp_str  is the timestamp of the request (generated with warc_timestamp),
1130    record_uuid  is the uuid of the request (generated with warc_uuid_str),
1131    body  is a pointer to a file containing the request headers and body.
1132    ip  is the ip address of the server (or NULL),
1133    Calling this function will close body.
1134    Returns true on success, false on error. */
1135 bool
1136 warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, ip_address *ip, FILE *body, off_t payload_offset)
1137 {
1138   warc_write_start_record ();
1139   warc_write_header ("WARC-Type", "request");
1140   warc_write_header ("WARC-Target-URI", url);
1141   warc_write_header ("Content-Type", "application/http;msgtype=request");
1142   warc_write_date_header (timestamp_str);
1143   warc_write_header ("WARC-Record-ID", record_uuid);
1144   warc_write_ip_header (ip);
1145   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1146   warc_write_digest_headers (body, payload_offset);
1147   warc_write_block_from_file (body);
1148   warc_write_end_record ();
1149
1150   fclose (body);
1151
1152   return warc_write_ok;
1153 }
1154
1155 /* Writes a response record to the CDX file.
1156    url  is the target uri of the request/response,
1157    timestamp_str  is the timestamp of the request that generated this response,
1158                   (generated with warc_timestamp),
1159    mime_type  is the mime type of the response body (will be printed to CDX),
1160    response_code  is the HTTP response code (will be printed to CDX),
1161    payload_digest  is the sha1 digest of the payload,
1162    redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
1163    offset  is the position of the WARC record in the WARC file,
1164    warc_filename  is the filename of the WARC,
1165    response_uuid  is the uuid of the response.
1166    Returns true on success, false on error. */
1167 static bool
1168 warc_write_cdx_record (char *url, char *timestamp_str, char *mime_type, int response_code, char *payload_digest, char *redirect_location, off_t offset, char *warc_filename, char *response_uuid)
1169 {
1170   /* Transform the timestamp. */
1171   char timestamp_str_cdx [15];
1172   memcpy (timestamp_str_cdx     , timestamp_str     , 4); /* "YYYY" "-" */
1173   memcpy (timestamp_str_cdx +  4, timestamp_str +  5, 2); /* "mm"   "-" */
1174   memcpy (timestamp_str_cdx +  6, timestamp_str +  8, 2); /* "dd"   "T" */
1175   memcpy (timestamp_str_cdx +  8, timestamp_str + 11, 2); /* "HH"   ":" */
1176   memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM"   ":" */
1177   memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS"   "Z" */
1178   timestamp_str_cdx[14] = '\0';
1179
1180   /* Rewrite the checksum. */
1181   char *checksum;
1182   if (payload_digest != NULL)
1183     checksum = payload_digest + 5; /* Skip the "sha1:" */
1184   else
1185     checksum = "-";
1186
1187   if (mime_type == NULL || strlen(mime_type) == 0)
1188     mime_type = "-";
1189   if (redirect_location == NULL || strlen(redirect_location) == 0)
1190     redirect_location = "-";
1191
1192   /* Print the CDX line. */
1193   fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %ld %s %s\n", url, timestamp_str_cdx, url, mime_type, response_code, checksum, redirect_location, offset, warc_current_filename, response_uuid);
1194   fflush (warc_current_cdx_file);
1195
1196   return true;
1197 }
1198
1199 /* Writes a revisit record to the WARC file.
1200    url  is the target uri of the request/response,
1201    timestamp_str  is the timestamp of the request that generated this response
1202                   (generated with warc_timestamp),
1203    concurrent_to_uuid  is the uuid of the request for that generated this response
1204                  (generated with warc_uuid_str),
1205    refers_to_uuid  is the uuid of the original response
1206                  (generated with warc_uuid_str),
1207    payload_digest  is the sha1 digest of the payload,
1208    ip  is the ip address of the server (or NULL),
1209    body  is a pointer to a file containing the response headers (without payload).
1210    Calling this function will close body.
1211    Returns true on success, false on error. */
1212 static bool
1213 warc_write_revisit_record (char *url, char *timestamp_str, char *concurrent_to_uuid, char *payload_digest, char *refers_to, ip_address *ip, FILE *body)
1214 {
1215   char revisit_uuid [48];
1216   warc_uuid_str (revisit_uuid);
1217
1218   char *block_digest = NULL;
1219   char sha1_res_block[SHA1_DIGEST_SIZE];
1220   sha1_stream (body, sha1_res_block);
1221   block_digest = warc_base32_sha1_digest (sha1_res_block);
1222
1223   warc_write_start_record ();
1224   warc_write_header ("WARC-Type", "revisit");
1225   warc_write_header ("WARC-Record-ID", revisit_uuid);
1226   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1227   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1228   warc_write_header ("WARC-Refers-To", refers_to);
1229   warc_write_header ("WARC-Profile", "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest");
1230   warc_write_header ("WARC-Truncated", "length");
1231   warc_write_header ("WARC-Target-URI", url);
1232   warc_write_date_header (timestamp_str);
1233   warc_write_ip_header (ip);
1234   warc_write_header ("Content-Type", "application/http;msgtype=response");
1235   warc_write_header ("WARC-Block-Digest", block_digest);
1236   warc_write_header ("WARC-Payload-Digest", payload_digest);
1237   warc_write_block_from_file (body);
1238   warc_write_end_record ();
1239
1240   fclose (body);
1241   free (block_digest);
1242
1243   return warc_write_ok;
1244 }
1245
1246 /* Writes a response record to the WARC file.
1247    url  is the target uri of the request/response,
1248    timestamp_str  is the timestamp of the request that generated this response
1249                   (generated with warc_timestamp),
1250    concurrent_to_uuid  is the uuid of the request for that generated this response
1251                  (generated with warc_uuid_str),
1252    ip  is the ip address of the server (or NULL),
1253    body  is a pointer to a file containing the response headers and body.
1254    mime_type  is the mime type of the response body (will be printed to CDX),
1255    response_code  is the HTTP response code (will be printed to CDX),
1256    redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
1257    Calling this function will close body.
1258    Returns true on success, false on error. */
1259 bool
1260 warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, off_t payload_offset, char *mime_type, int response_code, char *redirect_location)
1261 {
1262   char *block_digest = NULL;
1263   char *payload_digest = NULL;
1264   char sha1_res_block[SHA1_DIGEST_SIZE];
1265   char sha1_res_payload[SHA1_DIGEST_SIZE];
1266
1267   if (opt.warc_digests_enabled)
1268     {
1269       /* Calculate the block and payload digests. */
1270       rewind (body);
1271       if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload, payload_offset) == 0)
1272         {
1273           /* Decide (based on url + payload digest) if we have seen this
1274              data before. */
1275           struct warc_cdx_record *rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload);
1276           if (rec_existing != NULL)
1277             {
1278               /* Found an existing record. */
1279               logprintf (LOG_VERBOSE, _("Found exact match in CDX file. Saving revisit record to WARC.\n"));
1280
1281               /* Remove the payload from the file. */
1282               if (payload_offset > 0)
1283                 {
1284                   if (ftruncate (fileno (body), payload_offset) == -1)
1285                     return false;
1286                 }
1287
1288               /* Send the original payload digest. */
1289               payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1290               bool result = warc_write_revisit_record (url, timestamp_str, concurrent_to_uuid, payload_digest, rec_existing->uuid, ip, body);
1291               free (payload_digest);
1292
1293               return result;
1294             }
1295
1296           block_digest = warc_base32_sha1_digest (sha1_res_block);
1297           payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1298         }
1299     }
1300
1301   /* Not a revisit, just store the record. */
1302
1303   char response_uuid [48];
1304   warc_uuid_str (response_uuid);
1305
1306   fseeko (warc_current_file, 0L, SEEK_END);
1307   off_t offset = ftello (warc_current_file);
1308
1309   warc_write_start_record ();
1310   warc_write_header ("WARC-Type", "response");
1311   warc_write_header ("WARC-Record-ID", response_uuid);
1312   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1313   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1314   warc_write_header ("WARC-Target-URI", url);
1315   warc_write_date_header (timestamp_str);
1316   warc_write_ip_header (ip);
1317   warc_write_header ("WARC-Block-Digest", block_digest);
1318   warc_write_header ("WARC-Payload-Digest", payload_digest);
1319   warc_write_header ("Content-Type", "application/http;msgtype=response");
1320   warc_write_block_from_file (body);
1321   warc_write_end_record ();
1322
1323   fclose (body);
1324
1325   if (warc_write_ok && opt.warc_cdx_enabled)
1326     {
1327       /* Add this record to the CDX. */
1328       warc_write_cdx_record (url, timestamp_str, mime_type, response_code, payload_digest, redirect_location, offset, warc_current_filename, response_uuid);
1329     }
1330
1331   if (block_digest)
1332     free (block_digest);
1333   if (payload_digest)
1334     free (payload_digest);
1335
1336   return warc_write_ok;
1337 }
1338
1339 /* Writes a resource record to the WARC file.
1340    resource_uuid  is the uuid of the resource (or NULL),
1341    url  is the target uri of the resource,
1342    timestamp_str  is the timestamp (generated with warc_timestamp),
1343    concurrent_to_uuid  is the uuid of the request for that generated this resource
1344                  (generated with warc_uuid_str) or NULL,
1345    ip  is the ip address of the server (or NULL),
1346    content_type  is the mime type of the body (or NULL),
1347    body  is a pointer to a file containing the resource data.
1348    Calling this function will close body.
1349    Returns true on success, false on error. */
1350 bool
1351 warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, off_t payload_offset)
1352 {
1353   if (resource_uuid == NULL)
1354     {
1355       resource_uuid = alloca (48);
1356       warc_uuid_str (resource_uuid);
1357     }
1358
1359   if (content_type == NULL)
1360     content_type = "application/octet-stream";
1361
1362   warc_write_start_record ();
1363   warc_write_header ("WARC-Type", "resource");
1364   warc_write_header ("WARC-Record-ID", resource_uuid);
1365   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1366   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1367   warc_write_header ("WARC-Target-URI", url);
1368   warc_write_date_header (timestamp_str);
1369   warc_write_ip_header (ip);
1370   warc_write_digest_headers (body, payload_offset);
1371   warc_write_header ("Content-Type", content_type);
1372   warc_write_block_from_file (body);
1373   warc_write_end_record ();
1374
1375   fclose (body);
1376
1377   return warc_write_ok;
1378 }
1379