sjero.net Git - wget/blob - src/warc.c

   1 /* Utility functions for writing WARC files.
   2    Copyright (C) 2011, 2012 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  18
  19 Additional permission under GNU GPL version 3 section 7
  20
  21 If you modify this program, or any covered work, by linking or
  22 combining it with the OpenSSL project's OpenSSL library (or a
  23 modified version of that library), containing parts covered by the
  24 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  25 grants you additional permission to convey the resulting work.
  26 Corresponding Source for a non-source form of such a combination
  27 shall include the source code for the parts of OpenSSL used as well
  28 as that of the covered work.  */
  29
  30 #define _GNU_SOURCE
  31
  32 #include "wget.h"
  33 #include "hash.h"
  34 #include "utils.h"
  35
  36 #include <stdio.h>
  37 #include <stdlib.h>
  38 #include <string.h>
  39 #include <strings.h>
  40 #include <time.h>
  41 #include <tmpdir.h>
  42 #include <sha1.h>
  43 #include <base32.h>
  44 #include <unistd.h>
  45 #ifdef HAVE_LIBZ
  46 #include <zlib.h>
  47 #endif
  48 #ifdef HAVE_LIBUUID
  49 #include <uuid/uuid.h>
  50 #endif
  51
  52 #ifndef WINDOWS
  53 #include <libgen.h>
  54 #endif
  55
  56 #include "warc.h"
  57
  58 extern char *version_string;
  59
  60 /* Set by main in main.c */
  61 extern char *program_argstring;
  62
  63
  64 /* The log file (a temporary file that contains a copy
  65    of the wget log). */
  66 static FILE *warc_log_fp;
  67
  68 /* The manifest file (a temporary file that contains the
  69    warcinfo uuid of every file in this crawl). */
  70 static FILE *warc_manifest_fp;
  71
  72 /* The current WARC file (or NULL, if WARC is disabled). */
  73 static FILE *warc_current_file;
  74
  75 #ifdef HAVE_LIBZ
  76 /* The gzip stream for the current WARC file
  77    (or NULL, if WARC or gzip is disabled). */
  78 static gzFile *warc_current_gzfile;
  79
  80 /* The offset of the current gzip record in the WARC file. */
  81 static off_t warc_current_gzfile_offset;
  82
  83 /* The uncompressed size (so far) of the current record. */
  84 static off_t warc_current_gzfile_uncompressed_size;
  85 # endif
  86
  87 /* This is true until a warc_write_* method fails. */
  88 static bool warc_write_ok;
  89
  90 /* The current CDX file (or NULL, if CDX is disabled). */
  91 static FILE *warc_current_cdx_file;
  92
  93 /* The record id of the warcinfo record of the current WARC file.  */
  94 static char *warc_current_warcinfo_uuid_str;
  95
  96 /* The file name of the current WARC file. */
  97 static char *warc_current_filename;
  98
  99 /* The serial number of the current WARC file.  This number is
 100    incremented each time a new file is opened and is used in the
 101    WARC file's filename. */
 102 static int warc_current_file_number;
 103
 104 /* The table of CDX records, if deduplication is enabled. */
 105 struct hash_table * warc_cdx_dedup_table;
 106
 107 static bool warc_start_new_file (bool meta);
 108
 109
/* One entry of the CDX deduplication table (warc_cdx_dedup_table).
   Entries are created from the lines of a CDX file.  */
struct warc_cdx_record
{
  /* Copy of the original-url field of the CDX line.  */
  char *url;
  /* Copy of the record-id field of the CDX line.  */
  char *uuid;
  /* The base32-decoded checksum field; this member is also used
     as the key under which the entry is stored in the table.  */
  char digest[SHA1_DIGEST_SIZE];
};
 116
 117 static unsigned long
 118 warc_hash_sha1_digest (const void *key)
 119 {
 120   /* We just use some of the first bytes of the digest. */
 121   unsigned long v = 0;
 122   memcpy (&v, key, sizeof (unsigned long));
 123   return v;
 124 }
 125
 126 static int
 127 warc_cmp_sha1_digest (const void *digest1, const void *digest2)
 128 {
 129   return !memcmp (digest1, digest2, SHA1_DIGEST_SIZE);
 130 }
 131
 132
 133
 134 /* Writes SIZE bytes from BUFFER to the current WARC file,
 135    through gzwrite if compression is enabled.
 136    Returns the number of uncompressed bytes written.  */
 137 static size_t
 138 warc_write_buffer (const char *buffer, size_t size)
 139 {
 140 #ifdef HAVE_LIBZ
 141   if (warc_current_gzfile)
 142     {
 143       warc_current_gzfile_uncompressed_size += size;
 144       return gzwrite (warc_current_gzfile, buffer, size);
 145     }
 146   else
 147 #endif
 148     return fwrite (buffer, 1, size, warc_current_file);
 149 }
 150
 151 /* Writes STR to the current WARC file.
 152    Returns false and set warc_write_ok to false if there
 153    is an error.  */
 154 static bool
 155 warc_write_string (const char *str)
 156 {
 157   if (!warc_write_ok)
 158     return false;
 159
 160   size_t n = strlen (str);
 161   if (n != warc_write_buffer (str, n))
 162     warc_write_ok = false;
 163
 164   return warc_write_ok;
 165 }
 166
 167
/* Number of bytes reserved in front of every gzip record for the
   extra header field: 2 length bytes, the 2-byte 'sl' identifier
   and two 4-byte size values.  */
#define EXTRA_GZIP_HEADER_SIZE 12
/* Number of bytes of the gzip header that are read back and moved
   when the extra field is inserted.  */
#define GZIP_STATIC_HEADER_SIZE  10
/* Bit that is set in the gzip flags byte to announce an extra field.  */
#define FLG_FEXTRA          0x04
/* Index of the flags byte within the static gzip header.  */
#define OFF_FLG             3
 172
 173 /* Starts a new WARC record.  Writes the version header.
 174    If opt.warc_maxsize is set and the current file is becoming
 175    too large, this will open a new WARC file.
 176
 177    If compression is enabled, this will start a new
 178    gzip stream in the current WARC file.
 179
 180    Returns false and set warc_write_ok to false if there
 181    is an error.  */
 182 static bool
 183 warc_write_start_record ()
 184 {
 185   if (!warc_write_ok)
 186     return false;
 187
 188   fflush (warc_current_file);
 189   if (opt.warc_maxsize > 0 && ftello (warc_current_file) >= opt.warc_maxsize)
 190     warc_start_new_file (false);
 191
 192 #ifdef HAVE_LIBZ
 193   /* Start a GZIP stream, if required. */
 194   if (opt.warc_compression_enabled)
 195     {
 196       /* Record the starting offset of the new record. */
 197       warc_current_gzfile_offset = ftello (warc_current_file);
 198
 199       /* Reserve space for the extra GZIP header field.
 200          In warc_write_end_record we will fill this space
 201          with information about the uncompressed and
 202          compressed size of the record. */
 203       fprintf (warc_current_file, "XXXXXXXXXXXX");
 204       fflush (warc_current_file);
 205
 206       /* Start a new GZIP stream. */
 207       warc_current_gzfile = gzdopen (dup (fileno (warc_current_file)), "wb9");
 208       warc_current_gzfile_uncompressed_size = 0;
 209
 210       if (warc_current_gzfile == NULL)
 211         {
 212           logprintf (LOG_NOTQUIET, _("Error opening GZIP stream to WARC file.\n"));
 213           warc_write_ok = false;
 214           return false;
 215         }
 216     }
 217 #endif
 218
 219   warc_write_string ("WARC/1.0\r\n");
 220   return warc_write_ok;
 221 }
 222
 223 /* Writes a WARC header to the current WARC record.
 224    This method may be run after warc_write_start_record and
 225    before warc_write_block_from_file.  */
 226 static bool
 227 warc_write_header (const char *name, const char *value)
 228 {
 229   if (value)
 230     {
 231       warc_write_string (name);
 232       warc_write_string (": ");
 233       warc_write_string (value);
 234       warc_write_string ("\r\n");
 235     }
 236   return warc_write_ok;
 237 }
 238
 239 /* Copies the contents of DATA_IN to the WARC record.
 240    Adds a Content-Length header to the WARC record.
 241    Run this method after warc_write_header,
 242    then run warc_write_end_record. */
 243 static bool
 244 warc_write_block_from_file (FILE *data_in)
 245 {
 246   /* Add the Content-Length header. */
 247   char *content_length;
 248   fseeko (data_in, 0L, SEEK_END);
 249   if (! asprintf (&content_length, "%ld", ftello (data_in)))
 250     {
 251       warc_write_ok = false;
 252       return false;
 253     }
 254   warc_write_header ("Content-Length", content_length);
 255   free (content_length);
 256
 257   /* End of the WARC header section. */
 258   warc_write_string ("\r\n");
 259
 260   if (fseeko (data_in, 0L, SEEK_SET) != 0)
 261     warc_write_ok = false;
 262
 263   /* Copy the data in the file to the WARC record. */
 264   char buffer[BUFSIZ];
 265   size_t s;
 266   while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0)
 267     {
 268       if (warc_write_buffer (buffer, s) < s)
 269         warc_write_ok = false;
 270     }
 271
 272   return warc_write_ok;
 273 }
 274
 275 /* Run this method to close the current WARC record.
 276
 277    If compression is enabled, this method closes the
 278    current GZIP stream and fills the extra GZIP header
 279    with the uncompressed and compressed length of the
 280    record. */
 281 static bool
 282 warc_write_end_record ()
 283 {
 284   warc_write_buffer ("\r\n\r\n", 4);
 285
 286 #ifdef HAVE_LIBZ
 287   /* We start a new gzip stream for each record.  */
 288   if (warc_write_ok && warc_current_gzfile)
 289     {
 290       if (gzclose (warc_current_gzfile) != Z_OK)
 291         {
 292           warc_write_ok = false;
 293           return false;
 294         }
 295
 296       fflush (warc_current_file);
 297       fseeko (warc_current_file, 0, SEEK_END);
 298
 299       /* The WARC standard suggests that we add 'skip length' data in the
 300          extra header field of the GZIP stream.
 301
 302          In warc_write_start_record we reserved space for this extra header.
 303          This extra space starts at warc_current_gzfile_offset and fills
 304          EXTRA_GZIP_HEADER_SIZE bytes.  The static GZIP header starts at
 305          warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE.
 306
 307          We need to do three things:
 308          1. Move the static GZIP header to warc_current_gzfile_offset;
 309          2. Set the FEXTRA flag in the GZIP header;
 310          3. Write the extra GZIP header after the static header, that is,
 311             starting at warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE.
 312       */
 313
 314       /* Calculate the uncompressed and compressed sizes. */
 315       off_t current_offset = ftello (warc_current_file);
 316       off_t uncompressed_size = current_offset - warc_current_gzfile_offset;
 317       off_t compressed_size = warc_current_gzfile_uncompressed_size;
 318
 319       /* Go back to the static GZIP header. */
 320       fseeko (warc_current_file, warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
 321
 322       /* Read the header. */
 323       char static_header[GZIP_STATIC_HEADER_SIZE];
 324       size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
 325       if (result != GZIP_STATIC_HEADER_SIZE)
 326         {
 327           warc_write_ok = false;
 328           return false;
 329         }
 330
 331       /* Set the FEXTRA flag in the flags byte of the header. */
 332       static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
 333
 334       /* Write the header back to the file, but starting at warc_current_gzfile_offset. */
 335       fseeko (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
 336       fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
 337
 338       /* Prepare the extra GZIP header. */
 339       char extra_header[EXTRA_GZIP_HEADER_SIZE];
 340       /* XLEN, the length of the extra header fields.  */
 341       extra_header[0]  = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
 342       extra_header[1]  = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
 343       /* The extra header field identifier for the WARC skip length. */
 344       extra_header[2]  = 's';
 345       extra_header[3]  = 'l';
 346       /* The size of the uncompressed record.  */
 347       extra_header[4]  = (uncompressed_size & 255);
 348       extra_header[5]  = (uncompressed_size >> 8) & 255;
 349       extra_header[6]  = (uncompressed_size >> 16) & 255;
 350       extra_header[7]  = (uncompressed_size >> 24) & 255;
 351       /* The size of the compressed record.  */
 352       extra_header[8]  = (compressed_size & 255);
 353       extra_header[9]  = (compressed_size >> 8) & 255;
 354       extra_header[10] = (compressed_size >> 16) & 255;
 355       extra_header[11] = (compressed_size >> 24) & 255;
 356
 357       /* Write the extra header after the static header. */
 358       fseeko (warc_current_file, warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
 359       fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
 360
 361       /* Done, move back to the end of the file. */
 362       fflush (warc_current_file);
 363       fseeko (warc_current_file, 0, SEEK_END);
 364     }
 365 #endif /* HAVE_LIBZ */
 366
 367   return warc_write_ok;
 368 }
 369
 370
 371 /* Writes the WARC-Date header for the given timestamp to
 372    the current WARC record.
 373    If timestamp is NULL, the current time will be used.  */
 374 static bool
 375 warc_write_date_header (char *timestamp)
 376 {
 377   if (timestamp == NULL)
 378     {
 379       char current_timestamp[21];
 380       warc_timestamp (current_timestamp);
 381       timestamp = current_timestamp;
 382     }
 383   return warc_write_header ("WARC-Date", timestamp);
 384 }
 385
 386 /* Writes the WARC-IP-Address header for the given IP to
 387    the current WARC record.  If IP is NULL, no header will
 388    be written.  */
 389 static bool
 390 warc_write_ip_header (ip_address *ip)
 391 {
 392   if (ip != NULL)
 393     return warc_write_header ("WARC-IP-Address", print_address (ip));
 394   else
 395     return warc_write_ok;
 396 }
 397
 398
 399 /* warc_sha1_stream_with_payload is a modified copy of sha1_stream
 400    from gnulib/sha1.c.  This version calculates two digests in one go.
 401
 402    Compute SHA1 message digests for bytes read from STREAM.  The
 403    digest of the complete file will be written into the 16 bytes
 404    beginning at RES_BLOCK.
 405
 406    If payload_offset >= 0, a second digest will be calculated of the
 407    portion of the file starting at payload_offset and continuing to
 408    the end of the file.  The digest number will be written into the
 409    16 bytes beginning ad RES_PAYLOAD.  */
 410 static int
 411 warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, off_t payload_offset)
 412 {
 413 #define BLOCKSIZE 32768
 414
 415   struct sha1_ctx ctx_block;
 416   struct sha1_ctx ctx_payload;
 417   off_t pos;
 418   off_t sum;
 419
 420   char *buffer = malloc (BLOCKSIZE + 72);
 421   if (!buffer)
 422     return 1;
 423
 424   /* Initialize the computation context.  */
 425   sha1_init_ctx (&ctx_block);
 426   if (payload_offset >= 0)
 427     sha1_init_ctx (&ctx_payload);
 428
 429   pos = 0;
 430
 431   /* Iterate over full file contents.  */
 432   while (1)
 433     {
 434       /* We read the file in blocks of BLOCKSIZE bytes.  One call of the
 435          computation function processes the whole buffer so that with the
 436          next round of the loop another block can be read.  */
 437       off_t n;
 438       sum = 0;
 439
 440       /* Read block.  Take care for partial reads.  */
 441       while (1)
 442         {
 443           n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);
 444
 445           sum += n;
 446           pos += n;
 447
 448           if (sum == BLOCKSIZE)
 449             break;
 450
 451           if (n == 0)
 452             {
 453               /* Check for the error flag IFF N == 0, so that we don't
 454                  exit the loop after a partial read due to e.g., EAGAIN
 455                  or EWOULDBLOCK.  */
 456               if (ferror (stream))
 457                 {
 458                   free (buffer);
 459                   return 1;
 460                 }
 461               goto process_partial_block;
 462             }
 463
 464           /* We've read at least one byte, so ignore errors.  But always
 465              check for EOF, since feof may be true even though N > 0.
 466              Otherwise, we could end up calling fread after EOF.  */
 467           if (feof (stream))
 468             goto process_partial_block;
 469         }
 470
 471       /* Process buffer with BLOCKSIZE bytes.  Note that
 472                         BLOCKSIZE % 64 == 0
 473        */
 474       sha1_process_block (buffer, BLOCKSIZE, &ctx_block);
 475       if (payload_offset >= 0 && payload_offset < pos)
 476         {
 477           /* At least part of the buffer contains data from payload. */
 478           off_t start_of_payload = payload_offset - (pos - BLOCKSIZE);
 479           if (start_of_payload <= 0)
 480             /* All bytes in the buffer belong to the payload. */
 481             start_of_payload = 0;
 482
 483           /* Process the payload part of the buffer.
 484              Note: we can't use  sha1_process_block  here even if we
 485              process the complete buffer.  Because the payload doesn't
 486              have to start with a full block, there may still be some
 487              bytes left from the previous buffer.  Therefore, we need
 488              to continue with  sha1_process_bytes.  */
 489           sha1_process_bytes (buffer + start_of_payload, BLOCKSIZE - start_of_payload, &ctx_payload);
 490         }
 491     }
 492
 493  process_partial_block:;
 494
 495   /* Process any remaining bytes.  */
 496   if (sum > 0)
 497     {
 498       sha1_process_bytes (buffer, sum, &ctx_block);
 499       if (payload_offset >= 0 && payload_offset < pos)
 500         {
 501           /* At least part of the buffer contains data from payload. */
 502           off_t start_of_payload = payload_offset - (pos - sum);
 503           if (start_of_payload <= 0)
 504             /* All bytes in the buffer belong to the payload. */
 505             start_of_payload = 0;
 506
 507           /* Process the payload part of the buffer. */
 508           sha1_process_bytes (buffer + start_of_payload, sum - start_of_payload, &ctx_payload);
 509         }
 510     }
 511
 512   /* Construct result in desired memory.  */
 513   sha1_finish_ctx (&ctx_block,   res_block);
 514   if (payload_offset >= 0)
 515     sha1_finish_ctx (&ctx_payload, res_payload);
 516   free (buffer);
 517   return 0;
 518
 519 #undef BLOCKSIZE
 520 }
 521
 522 /* Converts the SHA1 digest to a base32-encoded string.
 523    "sha1:DIGEST\0"  (Allocates a new string for the response.)  */
 524 static char *
 525 warc_base32_sha1_digest (char *sha1_digest)
 526 {
 527   // length: "sha1:" + digest + "\0"
 528   char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 );
 529   base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5, BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1);
 530   memcpy (sha1_base32, "sha1:", 5);
 531   sha1_base32[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5] = '\0';
 532   return sha1_base32;
 533 }
 534
 535
 536 /* Sets the digest headers of the record.
 537    This method will calculate the block digest and, if payload_offset >= 0,
 538    will also calculate the payload digest of the payload starting at the
 539    provided offset.  */
 540 static void
 541 warc_write_digest_headers (FILE *file, long payload_offset)
 542 {
 543   if (opt.warc_digests_enabled)
 544     {
 545       /* Calculate the block and payload digests. */
 546       char sha1_res_block[SHA1_DIGEST_SIZE];
 547       char sha1_res_payload[SHA1_DIGEST_SIZE];
 548
 549       rewind (file);
 550       if (warc_sha1_stream_with_payload (file, sha1_res_block, sha1_res_payload, payload_offset) == 0)
 551         {
 552           char *digest;
 553
 554           digest = warc_base32_sha1_digest (sha1_res_block);
 555           warc_write_header ("WARC-Block-Digest", digest);
 556           free (digest);
 557
 558           if (payload_offset >= 0)
 559             {
 560               digest = warc_base32_sha1_digest (sha1_res_payload);
 561               warc_write_header ("WARC-Payload-Digest", digest);
 562               free (digest);
 563             }
 564         }
 565     }
 566 }
 567
 568
 569 /* Fills timestamp with the current time and date.
 570    The UTC time is formatted following ISO 8601, as required
 571    for use in the WARC-Date header.
 572    The timestamp will be 21 characters long. */
 573 void
 574 warc_timestamp (char *timestamp)
 575 {
 576   time_t rawtime;
 577   struct tm * timeinfo;
 578   time ( &rawtime );
 579   timeinfo = gmtime (&rawtime);
 580   strftime (timestamp, 21, "%Y-%m-%dT%H:%M:%SZ", timeinfo);
 581 }
 582
 583 /* Fills uuid_str with a UUID based on random numbers.
 584    (See RFC 4122, UUID version 4.)
 585
 586    Note: this is a fallback method, it is much better to use the
 587    methods provided by libuuid.
 588
 589    The uuid_str will be 36 characters long. */
 590 static void
 591 warc_uuid_random (char *uuid_str)
 592 {
 593   // RFC 4122, a version 4 UUID with only random numbers
 594
 595   unsigned char uuid_data[16];
 596   int i;
 597   for (i=0; i<16; i++)
 598     uuid_data[i] = random_number (255);
 599
 600   // Set the four most significant bits (bits 12 through 15) of the
 601   // time_hi_and_version field to the 4-bit version number
 602   uuid_data[6] = (uuid_data[6] & 0x0F) | 0x40;
 603
 604   // Set the two most significant bits (bits 6 and 7) of the
 605   // clock_seq_hi_and_reserved to zero and one, respectively.
 606   uuid_data[8] = (uuid_data[8] & 0xBF) | 0x80;
 607
 608   sprintf (uuid_str,
 609     "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
 610     uuid_data[0], uuid_data[1], uuid_data[2], uuid_data[3], uuid_data[4],
 611     uuid_data[5], uuid_data[6], uuid_data[7], uuid_data[8], uuid_data[9],
 612     uuid_data[10], uuid_data[11], uuid_data[12], uuid_data[13], uuid_data[14],
 613     uuid_data[15]);
 614 }
 615
 616 /* Fills urn_str with a UUID in the format required
 617    for the WARC-Record-Id header.
 618    The string will be 47 characters long. */
 619 void
 620 warc_uuid_str (char *urn_str)
 621 {
 622   char uuid_str[37];
 623
 624 # ifdef HAVE_LIBUUID
 625   uuid_t record_id;
 626   uuid_generate (record_id);
 627   uuid_unparse (record_id, uuid_str);
 628 # else
 629   warc_uuid_random (uuid_str);
 630 # endif
 631
 632   sprintf (urn_str, "<urn:uuid:%s>", uuid_str);
 633 }
 634
 635 /* Write a warcinfo record to the current file.
 636    Updates warc_current_warcinfo_uuid_str. */
 637 bool
 638 warc_write_warcinfo_record (char *filename)
 639 {
 640   /* Write warc-info record as the first record of the file. */
 641   /* We add the record id of this info record to the other records in the file. */
 642   warc_current_warcinfo_uuid_str = (char *) malloc (48);
 643   warc_uuid_str (warc_current_warcinfo_uuid_str);
 644
 645   char timestamp[22];
 646   warc_timestamp (timestamp);
 647
 648   char *filename_copy, *filename_basename;
 649   filename_copy = strdup (filename);
 650   filename_basename = strdup (basename (filename_copy));
 651
 652   warc_write_start_record ();
 653   warc_write_header ("WARC-Type", "warcinfo");
 654   warc_write_header ("Content-Type", "application/warc-fields");
 655   warc_write_header ("WARC-Date", timestamp);
 656   warc_write_header ("WARC-Record-ID", warc_current_warcinfo_uuid_str);
 657   warc_write_header ("WARC-Filename", filename_basename);
 658
 659   /* Create content.  */
 660   FILE *warc_tmp = warc_tempfile ();
 661   if (warc_tmp == NULL)
 662     {
 663       free (filename_copy);
 664       free (filename_basename);
 665       return false;
 666     }
 667
 668   fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE);
 669   fprintf (warc_tmp, "format: WARC File Format 1.0\r\n");
 670   fprintf (warc_tmp, "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n");
 671   fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off"));
 672   fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring);
 673   /* Add the user headers, if any. */
 674   if (opt.warc_user_headers)
 675     {
 676       int i;
 677       for (i = 0; opt.warc_user_headers[i]; i++)
 678         fprintf (warc_tmp, "%s\r\n", opt.warc_user_headers[i]);
 679     }
 680   fprintf(warc_tmp, "\r\n");
 681
 682   warc_write_digest_headers (warc_tmp, -1);
 683   warc_write_block_from_file (warc_tmp);
 684   warc_write_end_record ();
 685
 686   if (! warc_write_ok)
 687     {
 688       logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
 689     }
 690
 691   free (filename_copy);
 692   free (filename_basename);
 693   fclose (warc_tmp);
 694   return warc_write_ok;
 695 }
 696
 697 /* Opens a new WARC file.
 698    If META is true, generates a filename ending with 'meta.warc.gz'.
 699
 700    This method will:
 701    1. close the current WARC file (if there is one);
 702    2. increment warc_current_file_number;
 703    3. open a new WARC file;
 704    4. write the initial warcinfo record.
 705
 706    Returns true on success, false otherwise.
 707    */
 708 static bool
 709 warc_start_new_file (bool meta)
 710 {
 711   if (opt.warc_filename == NULL)
 712     return false;
 713
 714   if (warc_current_file != NULL)
 715     fclose (warc_current_file);
 716   if (warc_current_warcinfo_uuid_str)
 717     free (warc_current_warcinfo_uuid_str);
 718   if (warc_current_filename)
 719     free (warc_current_filename);
 720
 721   warc_current_file_number++;
 722
 723   int base_filename_length = strlen (opt.warc_filename);
 724   /* filename format:  base + "-" + 5 digit serial number + ".warc.gz" */
 725   char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
 726   warc_current_filename = new_filename;
 727
 728 #ifdef HAVE_LIBZ
 729   char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc");
 730 #else
 731   char *extension = "warc";
 732 #endif
 733
 734   /* If max size is enabled, we add a serial number to the file names. */
 735   if (meta)
 736     sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension);
 737   else if (opt.warc_maxsize > 0)
 738     sprintf (new_filename, "%s-%05d.%s", opt.warc_filename, warc_current_file_number, extension);
 739   else
 740     sprintf (new_filename, "%s.%s", opt.warc_filename, extension);
 741
 742   logprintf (LOG_VERBOSE, _("Opening WARC file %s.\n\n"), quote (new_filename));
 743
 744   /* Open the WARC file. */
 745   warc_current_file = fopen (new_filename, "wb+");
 746   if (warc_current_file == NULL)
 747     {
 748       logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"), quote (new_filename));
 749       return false;
 750     }
 751
 752   if (! warc_write_warcinfo_record (new_filename))
 753     return false;
 754
 755   /* Add warcinfo uuid to manifest. */
 756   if (warc_manifest_fp)
 757     fprintf (warc_manifest_fp, "%s\n", warc_current_warcinfo_uuid_str);
 758
 759   return true;
 760 }
 761
 762 /* Opens the CDX file for output. */
 763 static bool
 764 warc_start_cdx_file ()
 765 {
 766   int filename_length = strlen (opt.warc_filename);
 767   char *cdx_filename = alloca (filename_length + 4 + 1);
 768   memcpy (cdx_filename, opt.warc_filename, filename_length);
 769   memcpy (cdx_filename + filename_length, ".cdx", 5);
 770   warc_current_cdx_file = fopen (cdx_filename, "a+");
 771   if (warc_current_cdx_file == NULL)
 772     return false;
 773
 774   /* Print the CDX header.
 775    *
 776    * a - original url
 777    * b - date
 778    * m - mime type
 779    * s - response code
 780    * k - new style checksum
 781    * r - redirect
 782    * M - meta tags
 783    * V - compressed arc file offset
 784    * g - file name
 785    * u - record-id
 786    */
 787   fprintf (warc_current_cdx_file, " CDX a b a m s k r M V g u\n");
 788   fflush (warc_current_cdx_file);
 789
 790   return true;
 791 }
 792
/* The characters that separate fields when a CDX line is split
   into tokens with strtok_r.  */
#define CDX_FIELDSEP " \t\r\n"
 794
 795 /* Parse the CDX header and find the field numbers of the original url,
 796    checksum and record ID fields. */
 797 static bool
 798 warc_parse_cdx_header (char *lineptr, int *field_num_original_url, int *field_num_checksum, int *field_num_record_id)
 799 {
 800   *field_num_original_url = -1;
 801   *field_num_checksum = -1;
 802   *field_num_record_id = -1;
 803
 804   char *token;
 805   char *save_ptr;
 806   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
 807
 808   if (token != NULL && strcmp (token, "CDX") == 0)
 809     {
 810       int field_num = 0;
 811       while (token != NULL)
 812         {
 813           token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
 814           if (token != NULL)
 815             {
 816               switch (token[0])
 817                 {
 818                 case 'a':
 819                   *field_num_original_url = field_num;
 820                   break;
 821                 case 'k':
 822                   *field_num_checksum = field_num;
 823                   break;
 824                 case 'u':
 825                   *field_num_record_id = field_num;
 826                   break;
 827                 }
 828             }
 829           field_num++;
 830         }
 831     }
 832
 833   return *field_num_original_url != -1
 834          && *field_num_checksum != -1
 835          && *field_num_record_id != -1;
 836 }
 837
 838 /* Parse the CDX record and add it to the warc_cdx_dedup_table hash table. */
 839 static void
 840 warc_process_cdx_line (char *lineptr, int field_num_original_url, int field_num_checksum, int field_num_record_id)
 841 {
 842   char *original_url = NULL;
 843   char *checksum = NULL;
 844   char *record_id = NULL;
 845
 846   char *token;
 847   char *save_ptr;
 848   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
 849
 850   /* Read this line to get the fields we need. */
 851   int field_num = 0;
 852   while (token != NULL)
 853     {
 854       char **val;
 855       if (field_num == field_num_original_url)
 856         val = &original_url;
 857       else if (field_num == field_num_checksum)
 858         val = &checksum;
 859       else if (field_num == field_num_record_id)
 860         val = &record_id;
 861       else
 862         val = NULL;
 863
 864       if (val != NULL)
 865         *val = strdup (token);
 866
 867       token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
 868       field_num++;
 869     }
 870
 871   if (original_url != NULL && checksum != NULL && record_id != NULL)
 872     {
 873       /* For some extra efficiency, we decode the base32 encoded
 874          checksum value.  This should produce exactly SHA1_DIGEST_SIZE
 875          bytes.  */
 876       size_t checksum_l;
 877       char * checksum_v;
 878       base32_decode_alloc (checksum, strlen (checksum), &checksum_v, &checksum_l);
 879       free (checksum);
 880
 881       if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
 882         {
 883           /* This is a valid line with a valid checksum. */
 884           struct warc_cdx_record * rec = malloc (sizeof (struct warc_cdx_record));
 885           rec->url = original_url;
 886           rec->uuid = record_id;
 887           memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
 888           hash_table_put (warc_cdx_dedup_table, rec->digest, rec);
 889           free (checksum_v);
 890         }
 891       else
 892         {
 893           free (original_url);
 894           if (checksum_v != NULL)
 895             free (checksum_v);
 896           free (record_id);
 897         }
 898     }
 899 }
 900
 901 /* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
 902    the warc_cdx_dedup_table. */
 903 bool
 904 warc_load_cdx_dedup_file ()
 905 {
 906   FILE *f = fopen (opt.warc_cdx_dedup_filename, "r");
 907   if (f == NULL)
 908     return false;
 909
 910   int field_num_original_url = -1;
 911   int field_num_checksum = -1;
 912   int field_num_record_id = -1;
 913
 914   char *lineptr = NULL;
 915   size_t n = 0;
 916   size_t line_length;
 917
 918   /* The first line should contain the CDX header.
 919      Format:  " CDX x x x x x"
 920      where x are field type indicators.  For our purposes, we only
 921      need 'a' (the original url), 'k' (the SHA1 checksum) and
 922      'u' (the WARC record id). */
 923   line_length = getline (&lineptr, &n, f);
 924   if (line_length != -1)
 925     warc_parse_cdx_header (lineptr, &field_num_original_url, &field_num_checksum, &field_num_record_id);
 926
 927   /* If the file contains all three fields, read the complete file. */
 928   if (field_num_original_url == -1
 929       || field_num_checksum == -1
 930       || field_num_record_id == -1)
 931     {
 932       if (field_num_original_url == -1)
 933         logprintf (LOG_NOTQUIET, _("CDX file does not list original urls. (Missing column 'a'.)\n"));
 934       if (field_num_checksum == -1)
 935         logprintf (LOG_NOTQUIET, _("CDX file does not list checksums. (Missing column 'k'.)\n"));
 936       if (field_num_record_id == -1)
 937         logprintf (LOG_NOTQUIET, _("CDX file does not list record ids. (Missing column 'u'.)\n"));
 938     }
 939   else
 940     {
 941       /* Initialize the table. */
 942       warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest, warc_cmp_sha1_digest);
 943
 944       do
 945         {
 946           line_length = getline (&lineptr, &n, f);
 947           if (line_length != -1)
 948             warc_process_cdx_line (lineptr, field_num_original_url, field_num_checksum, field_num_record_id);
 949
 950         }
 951       while (line_length != -1);
 952
 953       /* Print results. */
 954       int nrecords = hash_table_count (warc_cdx_dedup_table);
 955       logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
 956                                         "Loaded %d records from CDX.\n\n", nrecords),
 957                               nrecords);
 958     }
 959
 960   fclose (f);
 961
 962   return true;
 963 }
 964 #undef CDX_FIELDSEP
 965
 966 /* Returns the existing duplicate CDX record for the given url and payload
 967    digest.  Returns NULL if the url is not found or if the payload digest
 968    does not match, or if CDX deduplication is disabled. */
 969 static struct warc_cdx_record *
 970 warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload)
 971 {
 972   if (warc_cdx_dedup_table == NULL)
 973     return NULL;
 974
 975   char *key;
 976   struct warc_cdx_record *rec_existing;
 977   hash_table_get_pair (warc_cdx_dedup_table, sha1_digest_payload, &key, &rec_existing);
 978
 979   if (rec_existing != NULL && strcmp (rec_existing->url, url) == 0)
 980     return rec_existing;
 981   else
 982     return NULL;
 983 }
 984
 985 /* Initializes the WARC writer (if opt.warc_filename is set).
 986    This should be called before any WARC record is written. */
 987 void
 988 warc_init ()
 989 {
 990   warc_write_ok = true;
 991
 992   if (opt.warc_filename != NULL)
 993     {
 994       if (opt.warc_cdx_dedup_filename != NULL)
 995         {
 996           if (! warc_load_cdx_dedup_file ())
 997             {
 998               logprintf (LOG_NOTQUIET,
 999                          _("Could not read CDX file %s for deduplication.\n"),
1000                          quote (opt.warc_cdx_dedup_filename));
1001               exit(1);
1002             }
1003         }
1004
1005       warc_manifest_fp = warc_tempfile ();
1006       if (warc_manifest_fp == NULL)
1007         {
1008           logprintf (LOG_NOTQUIET, _("Could not open temporary WARC manifest file.\n"));
1009           exit(1);
1010         }
1011
1012       if (opt.warc_keep_log)
1013         {
1014           warc_log_fp = warc_tempfile ();
1015           if (warc_log_fp == NULL)
1016             {
1017               logprintf (LOG_NOTQUIET, _("Could not open temporary WARC log file.\n"));
1018               exit(1);
1019             }
1020           log_set_warc_log_fp (warc_log_fp);
1021         }
1022
1023       warc_current_file_number = -1;
1024       if (! warc_start_new_file (false))
1025         {
1026           logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
1027           exit(1);
1028         }
1029
1030       if (opt.warc_cdx_enabled)
1031         {
1032           if (! warc_start_cdx_file ())
1033             {
1034               logprintf (LOG_NOTQUIET, _("Could not open CDX file for output.\n"));
1035               exit(1);
1036             }
1037         }
1038     }
1039 }
1040
1041 /* Writes metadata (manifest, configuration, log file) to the WARC file. */
1042 void
1043 warc_write_metadata ()
1044 {
1045   /* If there are multiple WARC files, the metadata should be written to a separate file. */
1046   if (opt.warc_maxsize > 0)
1047     warc_start_new_file (true);
1048
1049   char manifest_uuid [48];
1050   warc_uuid_str (manifest_uuid);
1051
1052   fflush (warc_manifest_fp);
1053   warc_write_resource_record (manifest_uuid,
1054                               "metadata://gnu.org/software/wget/warc/MANIFEST.txt",
1055                               NULL, NULL, NULL, "text/plain",
1056                               warc_manifest_fp, -1);
1057   /* warc_write_resource_record has closed warc_manifest_fp. */
1058
1059   FILE * warc_tmp_fp = warc_tempfile ();
1060   if (warc_tmp_fp == NULL)
1061     {
1062       logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n"));
1063       exit(1);
1064     }
1065   fflush (warc_tmp_fp);
1066   fprintf (warc_tmp_fp, "%s\n", program_argstring);
1067
1068   warc_write_resource_record (manifest_uuid,
1069                               "metadata://gnu.org/software/wget/warc/wget_arguments.txt",
1070                               NULL, NULL, NULL, "text/plain",
1071                               warc_tmp_fp, -1);
1072   /* warc_write_resource_record has closed warc_tmp_fp. */
1073
1074   if (warc_log_fp != NULL)
1075     {
1076       warc_write_resource_record (NULL,
1077                                   "metadata://gnu.org/software/wget/warc/wget.log",
1078                                   NULL, manifest_uuid, NULL, "text/plain",
1079                                   warc_log_fp, -1);
1080       /* warc_write_resource_record has closed warc_log_fp. */
1081
1082       warc_log_fp = NULL;
1083       log_set_warc_log_fp (NULL);
1084     }
1085 }
1086
1087 /* Finishes the WARC writing.
1088    This should be called at the end of the program. */
1089 void
1090 warc_close ()
1091 {
1092   if (warc_current_file != NULL)
1093     {
1094       warc_write_metadata ();
1095       free (warc_current_warcinfo_uuid_str);
1096       fclose (warc_current_file);
1097     }
1098   if (warc_current_cdx_file != NULL)
1099     fclose (warc_current_cdx_file);
1100   if (warc_log_fp != NULL)
1101     {
1102       fclose (warc_log_fp);
1103       log_set_warc_log_fp (NULL);
1104     }
1105 }
1106
1107 /* Creates a temporary file for writing WARC output.
1108    The temporary file will be created in opt.warc_tempdir.
1109    Returns the pointer to the temporary file, or NULL. */
1110 FILE *
1111 warc_tempfile ()
1112 {
1113   char filename[100];
1114   if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1)
1115     return NULL;
1116
1117   int fd = mkstemp (filename);
1118   if (fd < 0)
1119     return NULL;
1120
1121   if (unlink (filename) < 0)
1122     return NULL;
1123
1124   return fdopen (fd, "wb+");
1125 }
1126
1127
1128 /* Writes a request record to the WARC file.
1129    url  is the target uri of the request,
1130    timestamp_str  is the timestamp of the request (generated with warc_timestamp),
1131    record_uuid  is the uuid of the request (generated with warc_uuid_str),
1132    body  is a pointer to a file containing the request headers and body.
1133    ip  is the ip address of the server (or NULL),
1134    Calling this function will close body.
1135    Returns true on success, false on error. */
1136 bool
1137 warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, ip_address *ip, FILE *body, off_t payload_offset)
1138 {
1139   warc_write_start_record ();
1140   warc_write_header ("WARC-Type", "request");
1141   warc_write_header ("WARC-Target-URI", url);
1142   warc_write_header ("Content-Type", "application/http;msgtype=request");
1143   warc_write_date_header (timestamp_str);
1144   warc_write_header ("WARC-Record-ID", record_uuid);
1145   warc_write_ip_header (ip);
1146   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1147   warc_write_digest_headers (body, payload_offset);
1148   warc_write_block_from_file (body);
1149   warc_write_end_record ();
1150
1151   fclose (body);
1152
1153   return warc_write_ok;
1154 }
1155
1156 /* Writes a response record to the CDX file.
1157    url  is the target uri of the request/response,
1158    timestamp_str  is the timestamp of the request that generated this response,
1159                   (generated with warc_timestamp),
1160    mime_type  is the mime type of the response body (will be printed to CDX),
1161    response_code  is the HTTP response code (will be printed to CDX),
1162    payload_digest  is the sha1 digest of the payload,
1163    redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
1164    offset  is the position of the WARC record in the WARC file,
1165    warc_filename  is the filename of the WARC,
1166    response_uuid  is the uuid of the response.
1167    Returns true on success, false on error. */
1168 static bool
1169 warc_write_cdx_record (char *url, char *timestamp_str, char *mime_type, int response_code, char *payload_digest, char *redirect_location, off_t offset, char *warc_filename, char *response_uuid)
1170 {
1171   /* Transform the timestamp. */
1172   char timestamp_str_cdx [15];
1173   memcpy (timestamp_str_cdx     , timestamp_str     , 4); /* "YYYY" "-" */
1174   memcpy (timestamp_str_cdx +  4, timestamp_str +  5, 2); /* "mm"   "-" */
1175   memcpy (timestamp_str_cdx +  6, timestamp_str +  8, 2); /* "dd"   "T" */
1176   memcpy (timestamp_str_cdx +  8, timestamp_str + 11, 2); /* "HH"   ":" */
1177   memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM"   ":" */
1178   memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS"   "Z" */
1179   timestamp_str_cdx[14] = '\0';
1180
1181   /* Rewrite the checksum. */
1182   char *checksum;
1183   if (payload_digest != NULL)
1184     checksum = payload_digest + 5; /* Skip the "sha1:" */
1185   else
1186     checksum = "-";
1187
1188   if (mime_type == NULL || strlen(mime_type) == 0)
1189     mime_type = "-";
1190   if (redirect_location == NULL || strlen(redirect_location) == 0)
1191     redirect_location = "-";
1192
1193   /* Print the CDX line. */
1194   fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %ld %s %s\n", url, timestamp_str_cdx, url, mime_type, response_code, checksum, redirect_location, offset, warc_current_filename, response_uuid);
1195   fflush (warc_current_cdx_file);
1196
1197   return true;
1198 }
1199
1200 /* Writes a revisit record to the WARC file.
1201    url  is the target uri of the request/response,
1202    timestamp_str  is the timestamp of the request that generated this response
1203                   (generated with warc_timestamp),
1204    concurrent_to_uuid  is the uuid of the request for that generated this response
1205                  (generated with warc_uuid_str),
1206    refers_to_uuid  is the uuid of the original response
1207                  (generated with warc_uuid_str),
1208    payload_digest  is the sha1 digest of the payload,
1209    ip  is the ip address of the server (or NULL),
1210    body  is a pointer to a file containing the response headers (without payload).
1211    Calling this function will close body.
1212    Returns true on success, false on error. */
1213 static bool
1214 warc_write_revisit_record (char *url, char *timestamp_str, char *concurrent_to_uuid, char *payload_digest, char *refers_to, ip_address *ip, FILE *body)
1215 {
1216   char revisit_uuid [48];
1217   warc_uuid_str (revisit_uuid);
1218
1219   char *block_digest = NULL;
1220   char sha1_res_block[SHA1_DIGEST_SIZE];
1221   sha1_stream (body, sha1_res_block);
1222   block_digest = warc_base32_sha1_digest (sha1_res_block);
1223
1224   warc_write_start_record ();
1225   warc_write_header ("WARC-Type", "revisit");
1226   warc_write_header ("WARC-Record-ID", revisit_uuid);
1227   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1228   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1229   warc_write_header ("WARC-Refers-To", refers_to);
1230   warc_write_header ("WARC-Profile", "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest");
1231   warc_write_header ("WARC-Truncated", "length");
1232   warc_write_header ("WARC-Target-URI", url);
1233   warc_write_date_header (timestamp_str);
1234   warc_write_ip_header (ip);
1235   warc_write_header ("Content-Type", "application/http;msgtype=response");
1236   warc_write_header ("WARC-Block-Digest", block_digest);
1237   warc_write_header ("WARC-Payload-Digest", payload_digest);
1238   warc_write_block_from_file (body);
1239   warc_write_end_record ();
1240
1241   fclose (body);
1242   free (block_digest);
1243
1244   return warc_write_ok;
1245 }
1246
1247 /* Writes a response record to the WARC file.
1248    url  is the target uri of the request/response,
1249    timestamp_str  is the timestamp of the request that generated this response
1250                   (generated with warc_timestamp),
1251    concurrent_to_uuid  is the uuid of the request for that generated this response
1252                  (generated with warc_uuid_str),
1253    ip  is the ip address of the server (or NULL),
1254    body  is a pointer to a file containing the response headers and body.
1255    mime_type  is the mime type of the response body (will be printed to CDX),
1256    response_code  is the HTTP response code (will be printed to CDX),
1257    redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
1258    Calling this function will close body.
1259    Returns true on success, false on error. */
1260 bool
1261 warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, off_t payload_offset, char *mime_type, int response_code, char *redirect_location)
1262 {
1263   char *block_digest = NULL;
1264   char *payload_digest = NULL;
1265   char sha1_res_block[SHA1_DIGEST_SIZE];
1266   char sha1_res_payload[SHA1_DIGEST_SIZE];
1267
1268   if (opt.warc_digests_enabled)
1269     {
1270       /* Calculate the block and payload digests. */
1271       rewind (body);
1272       if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload, payload_offset) == 0)
1273         {
1274           /* Decide (based on url + payload digest) if we have seen this
1275              data before. */
1276           struct warc_cdx_record *rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload);
1277           if (rec_existing != NULL)
1278             {
1279               /* Found an existing record. */
1280               logprintf (LOG_VERBOSE, _("Found exact match in CDX file. Saving revisit record to WARC.\n"));
1281
1282               /* Remove the payload from the file. */
1283               if (payload_offset > 0)
1284                 {
1285                   if (ftruncate (fileno (body), payload_offset) == -1)
1286                     return false;
1287                 }
1288
1289               /* Send the original payload digest. */
1290               payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1291               bool result = warc_write_revisit_record (url, timestamp_str, concurrent_to_uuid, payload_digest, rec_existing->uuid, ip, body);
1292               free (payload_digest);
1293
1294               return result;
1295             }
1296
1297           block_digest = warc_base32_sha1_digest (sha1_res_block);
1298           payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1299         }
1300     }
1301
1302   /* Not a revisit, just store the record. */
1303
1304   char response_uuid [48];
1305   warc_uuid_str (response_uuid);
1306
1307   fseeko (warc_current_file, 0L, SEEK_END);
1308   off_t offset = ftello (warc_current_file);
1309
1310   warc_write_start_record ();
1311   warc_write_header ("WARC-Type", "response");
1312   warc_write_header ("WARC-Record-ID", response_uuid);
1313   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1314   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1315   warc_write_header ("WARC-Target-URI", url);
1316   warc_write_date_header (timestamp_str);
1317   warc_write_ip_header (ip);
1318   warc_write_header ("WARC-Block-Digest", block_digest);
1319   warc_write_header ("WARC-Payload-Digest", payload_digest);
1320   warc_write_header ("Content-Type", "application/http;msgtype=response");
1321   warc_write_block_from_file (body);
1322   warc_write_end_record ();
1323
1324   fclose (body);
1325
1326   if (warc_write_ok && opt.warc_cdx_enabled)
1327     {
1328       /* Add this record to the CDX. */
1329       warc_write_cdx_record (url, timestamp_str, mime_type, response_code, payload_digest, redirect_location, offset, warc_current_filename, response_uuid);
1330     }
1331
1332   if (block_digest)
1333     free (block_digest);
1334   if (payload_digest)
1335     free (payload_digest);
1336
1337   return warc_write_ok;
1338 }
1339
1340 /* Writes a resource record to the WARC file.
1341    resource_uuid  is the uuid of the resource (or NULL),
1342    url  is the target uri of the resource,
1343    timestamp_str  is the timestamp (generated with warc_timestamp),
1344    concurrent_to_uuid  is the uuid of the request for that generated this resource
1345                  (generated with warc_uuid_str) or NULL,
1346    ip  is the ip address of the server (or NULL),
1347    content_type  is the mime type of the body (or NULL),
1348    body  is a pointer to a file containing the resource data.
1349    Calling this function will close body.
1350    Returns true on success, false on error. */
1351 bool
1352 warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, off_t payload_offset)
1353 {
1354   if (resource_uuid == NULL)
1355     {
1356       resource_uuid = alloca (48);
1357       warc_uuid_str (resource_uuid);
1358     }
1359
1360   if (content_type == NULL)
1361     content_type = "application/octet-stream";
1362
1363   warc_write_start_record ();
1364   warc_write_header ("WARC-Type", "resource");
1365   warc_write_header ("WARC-Record-ID", resource_uuid);
1366   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1367   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1368   warc_write_header ("WARC-Target-URI", url);
1369   warc_write_date_header (timestamp_str);
1370   warc_write_ip_header (ip);
1371   warc_write_digest_headers (body, payload_offset);
1372   warc_write_header ("Content-Type", content_type);
1373   warc_write_block_from_file (body);
1374   warc_write_end_record ();
1375
1376   fclose (body);
1377
1378   return warc_write_ok;
1379 }
1380