sjero.net Git - wget/blob - src/warc.c

   1 /* Utility functions for writing WARC files.
   2    Copyright (C) 2011, 2012 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  18
  19 Additional permission under GNU GPL version 3 section 7
  20
  21 If you modify this program, or any covered work, by linking or
  22 combining it with the OpenSSL project's OpenSSL library (or a
  23 modified version of that library), containing parts covered by the
  24 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  25 grants you additional permission to convey the resulting work.
  26 Corresponding Source for a non-source form of such a combination
  27 shall include the source code for the parts of OpenSSL used as well
  28 as that of the covered work.  */
  29
  30 #define _GNU_SOURCE
  31
  32 #include "wget.h"
  33 #include "hash.h"
  34 #include "utils.h"
  35
  36 #include <stdio.h>
  37 #include <stdlib.h>
  38 #include <string.h>
  39 #include <strings.h>
  40 #include <time.h>
  41 #include <tmpdir.h>
  42 #include <sha1.h>
  43 #include <base32.h>
  44 #include <unistd.h>
  45 #ifdef HAVE_LIBZ
  46 #include <zlib.h>
  47 #endif
  48 #ifdef HAVE_LIBUUID
  49 #include <uuid/uuid.h>
  50 #endif
  51
  52 #ifndef WINDOWS
  53 #include <libgen.h>
  54 #endif
  55
  56 #include "warc.h"
  57
/* Defined outside this file; written into the "software" field of
   the warcinfo record. */
extern char *version_string;

/* Set by main in main.c.  Written into the "wget-arguments" field of
   the warcinfo record. */
extern char *program_argstring;


/* The log file (a temporary file that contains a copy
   of the wget log). */
static FILE *warc_log_fp;

/* The manifest file (a temporary file that contains the
   warcinfo uuid of every file in this crawl).  When non-NULL,
   warc_start_new_file appends one uuid line per opened WARC file. */
static FILE *warc_manifest_fp;

/* The current WARC file (or NULL, if WARC is disabled). */
static FILE *warc_current_file;

#ifdef HAVE_LIBZ
/* The gzip stream for the current WARC file
   (or NULL, if WARC or gzip is disabled). */
static gzFile warc_current_gzfile;

/* The offset of the current gzip record in the WARC file. */
static off_t warc_current_gzfile_offset;

/* The uncompressed size (so far) of the current record.
   Incremented by warc_write_buffer for every buffer handed to gzwrite. */
static off_t warc_current_gzfile_uncompressed_size;
# endif

/* This is true until a warc_write_* method fails.  While it is false,
   warc_write_string and warc_write_start_record return immediately
   without writing anything. */
static bool warc_write_ok;

/* The current CDX file (or NULL, if CDX is disabled). */
static FILE *warc_current_cdx_file;

/* The record id of the warcinfo record of the current WARC file.
   Heap-allocated by warc_write_warcinfo_record. */
static char *warc_current_warcinfo_uuid_str;

/* The file name of the current WARC file.
   Heap-allocated by warc_start_new_file. */
static char *warc_current_filename;

/* The serial number of the current WARC file.  This number is
   incremented each time a new file is opened and is used in the
   WARC file's filename. */
static int warc_current_file_number;

/* The table of CDX records, if deduplication is enabled.
   warc_process_cdx_line inserts struct warc_cdx_record values keyed
   by the raw SHA1 digest stored inside each record. */
struct hash_table * warc_cdx_dedup_table;

/* Forward declaration: warc_write_start_record calls this before its
   definition appears. */
static bool warc_start_new_file (bool meta);
 108
 109
/* One entry of the CDX deduplication table, built from a single CDX
   line by warc_process_cdx_line. */
struct warc_cdx_record
{
  /* Heap copy of the "original url" field of the CDX line. */
  char *url;
  /* Heap copy of the "record id" field of the CDX line. */
  char *uuid;
  /* The base32-decoded SHA1 checksum; also used as the hash key. */
  char digest[SHA1_DIGEST_SIZE];
};
 116
 117 static unsigned long
 118 warc_hash_sha1_digest (const void *key)
 119 {
 120   /* We just use some of the first bytes of the digest. */
 121   unsigned long v = 0;
 122   memcpy (&v, key, sizeof (unsigned long));
 123   return v;
 124 }
 125
 126 static int
 127 warc_cmp_sha1_digest (const void *digest1, const void *digest2)
 128 {
 129   return !memcmp (digest1, digest2, SHA1_DIGEST_SIZE);
 130 }
 131
 132
 133
 134 /* Writes SIZE bytes from BUFFER to the current WARC file,
 135    through gzwrite if compression is enabled.
 136    Returns the number of uncompressed bytes written.  */
 137 static size_t
 138 warc_write_buffer (const char *buffer, size_t size)
 139 {
 140 #ifdef HAVE_LIBZ
 141   if (warc_current_gzfile)
 142     {
 143       warc_current_gzfile_uncompressed_size += size;
 144       return gzwrite (warc_current_gzfile, buffer, size);
 145     }
 146   else
 147 #endif
 148     return fwrite (buffer, 1, size, warc_current_file);
 149 }
 150
 151 /* Writes STR to the current WARC file.
 152    Returns false and set warc_write_ok to false if there
 153    is an error.  */
 154 static bool
 155 warc_write_string (const char *str)
 156 {
 157   if (!warc_write_ok)
 158     return false;
 159
 160   size_t n = strlen (str);
 161   if (n != warc_write_buffer (str, n))
 162     warc_write_ok = false;
 163
 164   return warc_write_ok;
 165 }
 166
 167
 168 #define EXTRA_GZIP_HEADER_SIZE 12
 169 #define GZIP_STATIC_HEADER_SIZE  10
 170 #define FLG_FEXTRA          0x04
 171 #define OFF_FLG             3
 172
 173 /* Starts a new WARC record.  Writes the version header.
 174    If opt.warc_maxsize is set and the current file is becoming
 175    too large, this will open a new WARC file.
 176
 177    If compression is enabled, this will start a new
 178    gzip stream in the current WARC file.
 179
 180    Returns false and set warc_write_ok to false if there
 181    is an error.  */
 182 static bool
 183 warc_write_start_record (void)
 184 {
 185   if (!warc_write_ok)
 186     return false;
 187
 188   fflush (warc_current_file);
 189   if (opt.warc_maxsize > 0 && ftello (warc_current_file) >= opt.warc_maxsize)
 190     warc_start_new_file (false);
 191
 192 #ifdef HAVE_LIBZ
 193   /* Start a GZIP stream, if required. */
 194   if (opt.warc_compression_enabled)
 195     {
 196       /* Record the starting offset of the new record. */
 197       warc_current_gzfile_offset = ftello (warc_current_file);
 198
 199       /* Reserve space for the extra GZIP header field.
 200          In warc_write_end_record we will fill this space
 201          with information about the uncompressed and
 202          compressed size of the record. */
 203       fprintf (warc_current_file, "XXXXXXXXXXXX");
 204       fflush (warc_current_file);
 205
 206       /* Start a new GZIP stream. */
 207       warc_current_gzfile = gzdopen (dup (fileno (warc_current_file)), "wb9");
 208       warc_current_gzfile_uncompressed_size = 0;
 209
 210       if (warc_current_gzfile == NULL)
 211         {
 212           logprintf (LOG_NOTQUIET,
 213 _("Error opening GZIP stream to WARC file.\n"));
 214           warc_write_ok = false;
 215           return false;
 216         }
 217     }
 218 #endif
 219
 220   warc_write_string ("WARC/1.0\r\n");
 221   return warc_write_ok;
 222 }
 223
 224 /* Writes a WARC header to the current WARC record.
 225    This method may be run after warc_write_start_record and
 226    before warc_write_block_from_file.  */
 227 static bool
 228 warc_write_header (const char *name, const char *value)
 229 {
 230   if (value)
 231     {
 232       warc_write_string (name);
 233       warc_write_string (": ");
 234       warc_write_string (value);
 235       warc_write_string ("\r\n");
 236     }
 237   return warc_write_ok;
 238 }
 239
 240 /* Copies the contents of DATA_IN to the WARC record.
 241    Adds a Content-Length header to the WARC record.
 242    Run this method after warc_write_header,
 243    then run warc_write_end_record. */
 244 static bool
 245 warc_write_block_from_file (FILE *data_in)
 246 {
 247   /* Add the Content-Length header. */
 248   char *content_length;
 249   fseeko (data_in, 0L, SEEK_END);
 250   if (! asprintf (&content_length, "%ld", ftello (data_in)))
 251     {
 252       warc_write_ok = false;
 253       return false;
 254     }
 255   warc_write_header ("Content-Length", content_length);
 256   free (content_length);
 257
 258   /* End of the WARC header section. */
 259   warc_write_string ("\r\n");
 260
 261   if (fseeko (data_in, 0L, SEEK_SET) != 0)
 262     warc_write_ok = false;
 263
 264   /* Copy the data in the file to the WARC record. */
 265   char buffer[BUFSIZ];
 266   size_t s;
 267   while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0)
 268     {
 269       if (warc_write_buffer (buffer, s) < s)
 270         warc_write_ok = false;
 271     }
 272
 273   return warc_write_ok;
 274 }
 275
 276 /* Run this method to close the current WARC record.
 277
 278    If compression is enabled, this method closes the
 279    current GZIP stream and fills the extra GZIP header
 280    with the uncompressed and compressed length of the
 281    record. */
 282 static bool
 283 warc_write_end_record (void)
 284 {
 285   warc_write_buffer ("\r\n\r\n", 4);
 286
 287 #ifdef HAVE_LIBZ
 288   /* We start a new gzip stream for each record.  */
 289   if (warc_write_ok && warc_current_gzfile)
 290     {
 291       if (gzclose (warc_current_gzfile) != Z_OK)
 292         {
 293           warc_write_ok = false;
 294           return false;
 295         }
 296
 297       fflush (warc_current_file);
 298       fseeko (warc_current_file, 0, SEEK_END);
 299
 300       /* The WARC standard suggests that we add 'skip length' data in the
 301          extra header field of the GZIP stream.
 302
 303          In warc_write_start_record we reserved space for this extra header.
 304          This extra space starts at warc_current_gzfile_offset and fills
 305          EXTRA_GZIP_HEADER_SIZE bytes.  The static GZIP header starts at
 306          warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE.
 307
 308          We need to do three things:
 309          1. Move the static GZIP header to warc_current_gzfile_offset;
 310          2. Set the FEXTRA flag in the GZIP header;
 311          3. Write the extra GZIP header after the static header, that is,
 312             starting at warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE.
 313       */
 314
 315       /* Calculate the uncompressed and compressed sizes. */
 316       off_t current_offset = ftello (warc_current_file);
 317       off_t uncompressed_size = current_offset - warc_current_gzfile_offset;
 318       off_t compressed_size = warc_current_gzfile_uncompressed_size;
 319
 320       /* Go back to the static GZIP header. */
 321       fseeko (warc_current_file, warc_current_gzfile_offset
 322               + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
 323
 324       /* Read the header. */
 325       char static_header[GZIP_STATIC_HEADER_SIZE];
 326       size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE,
 327                              warc_current_file);
 328       if (result != GZIP_STATIC_HEADER_SIZE)
 329         {
 330           warc_write_ok = false;
 331           return false;
 332         }
 333
 334       /* Set the FEXTRA flag in the flags byte of the header. */
 335       static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
 336
 337       /* Write the header back to the file, but starting at
 338          warc_current_gzfile_offset. */
 339       fseeko (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
 340       fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
 341
 342       /* Prepare the extra GZIP header. */
 343       char extra_header[EXTRA_GZIP_HEADER_SIZE];
 344       /* XLEN, the length of the extra header fields.  */
 345       extra_header[0]  = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
 346       extra_header[1]  = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
 347       /* The extra header field identifier for the WARC skip length. */
 348       extra_header[2]  = 's';
 349       extra_header[3]  = 'l';
 350       /* The size of the uncompressed record.  */
 351       extra_header[4]  = (uncompressed_size & 255);
 352       extra_header[5]  = (uncompressed_size >> 8) & 255;
 353       extra_header[6]  = (uncompressed_size >> 16) & 255;
 354       extra_header[7]  = (uncompressed_size >> 24) & 255;
 355       /* The size of the compressed record.  */
 356       extra_header[8]  = (compressed_size & 255);
 357       extra_header[9]  = (compressed_size >> 8) & 255;
 358       extra_header[10] = (compressed_size >> 16) & 255;
 359       extra_header[11] = (compressed_size >> 24) & 255;
 360
 361       /* Write the extra header after the static header. */
 362       fseeko (warc_current_file, warc_current_gzfile_offset
 363               + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
 364       fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
 365
 366       /* Done, move back to the end of the file. */
 367       fflush (warc_current_file);
 368       fseeko (warc_current_file, 0, SEEK_END);
 369     }
 370 #endif /* HAVE_LIBZ */
 371
 372   return warc_write_ok;
 373 }
 374
 375
 376 /* Writes the WARC-Date header for the given timestamp to
 377    the current WARC record.
 378    If timestamp is NULL, the current time will be used.  */
 379 static bool
 380 warc_write_date_header (const char *timestamp)
 381 {
 382   if (timestamp == NULL)
 383     {
 384       char current_timestamp[21];
 385       warc_timestamp (current_timestamp);
 386       timestamp = current_timestamp;
 387     }
 388   return warc_write_header ("WARC-Date", timestamp);
 389 }
 390
 391 /* Writes the WARC-IP-Address header for the given IP to
 392    the current WARC record.  If IP is NULL, no header will
 393    be written.  */
 394 static bool
 395 warc_write_ip_header (ip_address *ip)
 396 {
 397   if (ip != NULL)
 398     return warc_write_header ("WARC-IP-Address", print_address (ip));
 399   else
 400     return warc_write_ok;
 401 }
 402
 403
/* warc_sha1_stream_with_payload is a modified copy of sha1_stream
   from gnulib/sha1.c.  This version calculates two digests in one go.

   Compute SHA1 message digests for bytes read from STREAM, starting
   at the stream's current position.  The digest of everything read
   will be written into the SHA1_DIGEST_SIZE bytes beginning at
   RES_BLOCK.

   If payload_offset >= 0, a second digest will be calculated of the
   portion of the data starting at payload_offset (counted from the
   position where reading began) and continuing to the end of the
   file.  That digest will be written into the SHA1_DIGEST_SIZE bytes
   beginning at RES_PAYLOAD.  If payload_offset < 0, RES_PAYLOAD is
   left untouched.

   Returns 0 on success and 1 if the work buffer cannot be allocated
   or a read error occurs.  */
static int
warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
                               off_t payload_offset)
{
#define BLOCKSIZE 32768

  struct sha1_ctx ctx_block;
  struct sha1_ctx ctx_payload;
  /* POS counts all bytes read so far; SUM counts the bytes currently
     held in BUFFER.  */
  off_t pos;
  off_t sum;

  /* NOTE(review): the extra 72 bytes appear to be inherited from the
     gnulib original; nothing in this function writes past BLOCKSIZE.  */
  char *buffer = malloc (BLOCKSIZE + 72);
  if (!buffer)
    return 1;

  /* Initialize the computation context.  */
  sha1_init_ctx (&ctx_block);
  if (payload_offset >= 0)
    sha1_init_ctx (&ctx_payload);

  pos = 0;

  /* Iterate over full file contents.  */
  while (1)
    {
      /* We read the file in blocks of BLOCKSIZE bytes.  One call of the
         computation function processes the whole buffer so that with the
         next round of the loop another block can be read.  */
      off_t n;
      sum = 0;

      /* Read block.  Take care for partial reads.  */
      while (1)
        {
          n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);

          sum += n;
          pos += n;

          if (sum == BLOCKSIZE)
            break;

          if (n == 0)
            {
              /* Check for the error flag IFF N == 0, so that we don't
                 exit the loop after a partial read due to e.g., EAGAIN
                 or EWOULDBLOCK.  */
              if (ferror (stream))
                {
                  free (buffer);
                  return 1;
                }
              goto process_partial_block;
            }

          /* We've read at least one byte, so ignore errors.  But always
             check for EOF, since feof may be true even though N > 0.
             Otherwise, we could end up calling fread after EOF.  */
          if (feof (stream))
            goto process_partial_block;
        }

      /* Process buffer with BLOCKSIZE bytes.  Note that
                        BLOCKSIZE % 64 == 0
       */
      sha1_process_block (buffer, BLOCKSIZE, &ctx_block);
      if (payload_offset >= 0 && payload_offset < pos)
        {
          /* At least part of the buffer contains data from payload.
             (pos - BLOCKSIZE) is the stream offset of buffer[0].  */
          off_t start_of_payload = payload_offset - (pos - BLOCKSIZE);
          if (start_of_payload <= 0)
            /* All bytes in the buffer belong to the payload. */
            start_of_payload = 0;

          /* Process the payload part of the buffer.
             Note: we can't use  sha1_process_block  here even if we
             process the complete buffer.  Because the payload doesn't
             have to start with a full block, there may still be some
             bytes left from the previous buffer.  Therefore, we need
             to continue with  sha1_process_bytes.  */
          sha1_process_bytes (buffer + start_of_payload,
                              BLOCKSIZE - start_of_payload, &ctx_payload);
        }
    }

 process_partial_block:;

  /* Process any remaining bytes.  */
  if (sum > 0)
    {
      sha1_process_bytes (buffer, sum, &ctx_block);
      if (payload_offset >= 0 && payload_offset < pos)
        {
          /* At least part of the buffer contains data from payload.
             (pos - sum) is the stream offset of buffer[0].  */
          off_t start_of_payload = payload_offset - (pos - sum);
          if (start_of_payload <= 0)
            /* All bytes in the buffer belong to the payload. */
            start_of_payload = 0;

          /* Process the payload part of the buffer. */
          sha1_process_bytes (buffer + start_of_payload,
                              sum - start_of_payload, &ctx_payload);
        }
    }

  /* Construct result in desired memory.  */
  sha1_finish_ctx (&ctx_block,   res_block);
  if (payload_offset >= 0)
    sha1_finish_ctx (&ctx_payload, res_payload);
  free (buffer);
  return 0;

#undef BLOCKSIZE
}
 529
 530 /* Converts the SHA1 digest to a base32-encoded string.
 531    "sha1:DIGEST\0"  (Allocates a new string for the response.)  */
 532 static char *
 533 warc_base32_sha1_digest (char *sha1_digest)
 534 {
 535   // length: "sha1:" + digest + "\0"
 536   char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 );
 537   base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5,
 538                  BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1);
 539   memcpy (sha1_base32, "sha1:", 5);
 540   sha1_base32[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5] = '\0';
 541   return sha1_base32;
 542 }
 543
 544
 545 /* Sets the digest headers of the record.
 546    This method will calculate the block digest and, if payload_offset >= 0,
 547    will also calculate the payload digest of the payload starting at the
 548    provided offset.  */
 549 static void
 550 warc_write_digest_headers (FILE *file, long payload_offset)
 551 {
 552   if (opt.warc_digests_enabled)
 553     {
 554       /* Calculate the block and payload digests. */
 555       char sha1_res_block[SHA1_DIGEST_SIZE];
 556       char sha1_res_payload[SHA1_DIGEST_SIZE];
 557
 558       rewind (file);
 559       if (warc_sha1_stream_with_payload (file, sha1_res_block,
 560           sha1_res_payload, payload_offset) == 0)
 561         {
 562           char *digest;
 563
 564           digest = warc_base32_sha1_digest (sha1_res_block);
 565           warc_write_header ("WARC-Block-Digest", digest);
 566           free (digest);
 567
 568           if (payload_offset >= 0)
 569             {
 570               digest = warc_base32_sha1_digest (sha1_res_payload);
 571               warc_write_header ("WARC-Payload-Digest", digest);
 572               free (digest);
 573             }
 574         }
 575     }
 576 }
 577
 578
 579 /* Fills timestamp with the current time and date.
 580    The UTC time is formatted following ISO 8601, as required
 581    for use in the WARC-Date header.
 582    The timestamp will be 21 characters long. */
 583 void
 584 warc_timestamp (char *timestamp)
 585 {
 586   time_t rawtime;
 587   struct tm * timeinfo;
 588   time ( &rawtime );
 589   timeinfo = gmtime (&rawtime);
 590   strftime (timestamp, 21, "%Y-%m-%dT%H:%M:%SZ", timeinfo);
 591 }
 592
 593 #ifdef HAVE_LIBUUID
 594 /* Fills urn_str with a UUID in the format required
 595    for the WARC-Record-Id header.
 596    The string will be 47 characters long. */
 597 void
 598 warc_uuid_str (char *urn_str)
 599 {
 600   char uuid_str[37];
 601
 602   uuid_t record_id;
 603   uuid_generate (record_id);
 604   uuid_unparse (record_id, uuid_str);
 605
 606   sprintf (urn_str, "<urn:uuid:%s>", uuid_str);
 607 }
 608 #else
 609 /* Fills urn_str with a UUID based on random numbers in the format
 610    required for the WARC-Record-Id header.
 611    (See RFC 4122, UUID version 4.)
 612
 613    Note: this is a fallback method, it is much better to use the
 614    methods provided by libuuid.
 615
 616    The string will be 47 characters long. */
 617 void
 618 warc_uuid_str (char *urn_str)
 619 {
 620   // RFC 4122, a version 4 UUID with only random numbers
 621
 622   unsigned char uuid_data[16];
 623   int i;
 624   for (i=0; i<16; i++)
 625     uuid_data[i] = random_number (255);
 626
 627   // Set the four most significant bits (bits 12 through 15) of the
 628   // time_hi_and_version field to the 4-bit version number
 629   uuid_data[6] = (uuid_data[6] & 0x0F) | 0x40;
 630
 631   // Set the two most significant bits (bits 6 and 7) of the
 632   // clock_seq_hi_and_reserved to zero and one, respectively.
 633   uuid_data[8] = (uuid_data[8] & 0xBF) | 0x80;
 634
 635   sprintf (urn_str,
 636     "<urn:uuid:%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x>",
 637     uuid_data[0], uuid_data[1], uuid_data[2], uuid_data[3], uuid_data[4],
 638     uuid_data[5], uuid_data[6], uuid_data[7], uuid_data[8], uuid_data[9],
 639     uuid_data[10], uuid_data[11], uuid_data[12], uuid_data[13], uuid_data[14],
 640     uuid_data[15]);
 641 }
 642 #endif
 643
 644 /* Write a warcinfo record to the current file.
 645    Updates warc_current_warcinfo_uuid_str. */
 646 static bool
 647 warc_write_warcinfo_record (char *filename)
 648 {
 649   /* Write warc-info record as the first record of the file. */
 650   /* We add the record id of this info record to the other records in the
 651      file. */
 652   warc_current_warcinfo_uuid_str = (char *) malloc (48);
 653   warc_uuid_str (warc_current_warcinfo_uuid_str);
 654
 655   char timestamp[22];
 656   warc_timestamp (timestamp);
 657
 658   char *filename_copy, *filename_basename;
 659   filename_copy = strdup (filename);
 660   filename_basename = strdup (basename (filename_copy));
 661
 662   warc_write_start_record ();
 663   warc_write_header ("WARC-Type", "warcinfo");
 664   warc_write_header ("Content-Type", "application/warc-fields");
 665   warc_write_header ("WARC-Date", timestamp);
 666   warc_write_header ("WARC-Record-ID", warc_current_warcinfo_uuid_str);
 667   warc_write_header ("WARC-Filename", filename_basename);
 668
 669   /* Create content.  */
 670   FILE *warc_tmp = warc_tempfile ();
 671   if (warc_tmp == NULL)
 672     {
 673       free (filename_copy);
 674       free (filename_basename);
 675       return false;
 676     }
 677
 678   fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE);
 679   fprintf (warc_tmp, "format: WARC File Format 1.0\r\n");
 680   fprintf (warc_tmp,
 681 "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n");
 682   fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off"));
 683   fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring);
 684   /* Add the user headers, if any. */
 685   if (opt.warc_user_headers)
 686     {
 687       int i;
 688       for (i = 0; opt.warc_user_headers[i]; i++)
 689         fprintf (warc_tmp, "%s\r\n", opt.warc_user_headers[i]);
 690     }
 691   fprintf(warc_tmp, "\r\n");
 692
 693   warc_write_digest_headers (warc_tmp, -1);
 694   warc_write_block_from_file (warc_tmp);
 695   warc_write_end_record ();
 696
 697   if (! warc_write_ok)
 698     logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
 699
 700   free (filename_copy);
 701   free (filename_basename);
 702   fclose (warc_tmp);
 703   return warc_write_ok;
 704 }
 705
 706 /* Opens a new WARC file.
 707    If META is true, generates a filename ending with 'meta.warc.gz'.
 708
 709    This method will:
 710    1. close the current WARC file (if there is one);
 711    2. increment warc_current_file_number;
 712    3. open a new WARC file;
 713    4. write the initial warcinfo record.
 714
 715    Returns true on success, false otherwise.
 716    */
 717 static bool
 718 warc_start_new_file (bool meta)
 719 {
 720   if (opt.warc_filename == NULL)
 721     return false;
 722
 723   if (warc_current_file != NULL)
 724     fclose (warc_current_file);
 725   if (warc_current_warcinfo_uuid_str)
 726     free (warc_current_warcinfo_uuid_str);
 727   if (warc_current_filename)
 728     free (warc_current_filename);
 729
 730   warc_current_file_number++;
 731
 732   int base_filename_length = strlen (opt.warc_filename);
 733   /* filename format:  base + "-" + 5 digit serial number + ".warc.gz" */
 734   char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
 735   warc_current_filename = new_filename;
 736
 737 #ifdef HAVE_LIBZ
 738   const char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc");
 739 #else
 740   const char *extension = "warc";
 741 #endif
 742
 743   /* If max size is enabled, we add a serial number to the file names. */
 744   if (meta)
 745     sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension);
 746   else if (opt.warc_maxsize > 0)
 747     {
 748       sprintf (new_filename, "%s-%05d.%s", opt.warc_filename,
 749                warc_current_file_number, extension);
 750     }
 751   else
 752     sprintf (new_filename, "%s.%s", opt.warc_filename, extension);
 753
 754   logprintf (LOG_VERBOSE, _("Opening WARC file %s.\n\n"), quote (new_filename));
 755
 756   /* Open the WARC file. */
 757   warc_current_file = fopen (new_filename, "wb+");
 758   if (warc_current_file == NULL)
 759     {
 760       logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"),
 761                  quote (new_filename));
 762       return false;
 763     }
 764
 765   if (! warc_write_warcinfo_record (new_filename))
 766     return false;
 767
 768   /* Add warcinfo uuid to manifest. */
 769   if (warc_manifest_fp)
 770     fprintf (warc_manifest_fp, "%s\n", warc_current_warcinfo_uuid_str);
 771
 772   return true;
 773 }
 774
 775 /* Opens the CDX file for output. */
 776 static bool
 777 warc_start_cdx_file (void)
 778 {
 779   int filename_length = strlen (opt.warc_filename);
 780   char *cdx_filename = alloca (filename_length + 4 + 1);
 781   memcpy (cdx_filename, opt.warc_filename, filename_length);
 782   memcpy (cdx_filename + filename_length, ".cdx", 5);
 783   warc_current_cdx_file = fopen (cdx_filename, "a+");
 784   if (warc_current_cdx_file == NULL)
 785     return false;
 786
 787   /* Print the CDX header.
 788    *
 789    * a - original url
 790    * b - date
 791    * m - mime type
 792    * s - response code
 793    * k - new style checksum
 794    * r - redirect
 795    * M - meta tags
 796    * V - compressed arc file offset
 797    * g - file name
 798    * u - record-id
 799    */
 800   fprintf (warc_current_cdx_file, " CDX a b a m s k r M V g u\n");
 801   fflush (warc_current_cdx_file);
 802
 803   return true;
 804 }
 805
 806 #define CDX_FIELDSEP " \t\r\n"
 807
 808 /* Parse the CDX header and find the field numbers of the original url,
 809    checksum and record ID fields. */
 810 static bool
 811 warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
 812                        int *field_num_checksum, int *field_num_record_id)
 813 {
 814   *field_num_original_url = -1;
 815   *field_num_checksum = -1;
 816   *field_num_record_id = -1;
 817
 818   char *token;
 819   char *save_ptr;
 820   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
 821
 822   if (token != NULL && strcmp (token, "CDX") == 0)
 823     {
 824       int field_num = 0;
 825       while (token != NULL)
 826         {
 827           token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
 828           if (token != NULL)
 829             {
 830               switch (token[0])
 831                 {
 832                 case 'a':
 833                   *field_num_original_url = field_num;
 834                   break;
 835                 case 'k':
 836                   *field_num_checksum = field_num;
 837                   break;
 838                 case 'u':
 839                   *field_num_record_id = field_num;
 840                   break;
 841                 }
 842             }
 843           field_num++;
 844         }
 845     }
 846
 847   return *field_num_original_url != -1
 848          && *field_num_checksum != -1
 849          && *field_num_record_id != -1;
 850 }
 851
 852 /* Parse the CDX record and add it to the warc_cdx_dedup_table hash table. */
 853 static void
 854 warc_process_cdx_line (char *lineptr, int field_num_original_url,
 855                        int field_num_checksum, int field_num_record_id)
 856 {
 857   char *original_url = NULL;
 858   char *checksum = NULL;
 859   char *record_id = NULL;
 860
 861   char *token;
 862   char *save_ptr;
 863   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
 864
 865   /* Read this line to get the fields we need. */
 866   int field_num = 0;
 867   while (token != NULL)
 868     {
 869       char **val;
 870       if (field_num == field_num_original_url)
 871         val = &original_url;
 872       else if (field_num == field_num_checksum)
 873         val = &checksum;
 874       else if (field_num == field_num_record_id)
 875         val = &record_id;
 876       else
 877         val = NULL;
 878
 879       if (val != NULL)
 880         *val = strdup (token);
 881
 882       token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
 883       field_num++;
 884     }
 885
 886   if (original_url != NULL && checksum != NULL && record_id != NULL)
 887     {
 888       /* For some extra efficiency, we decode the base32 encoded
 889          checksum value.  This should produce exactly SHA1_DIGEST_SIZE
 890          bytes.  */
 891       size_t checksum_l;
 892       char * checksum_v;
 893       base32_decode_alloc (checksum, strlen (checksum), &checksum_v,
 894                            &checksum_l);
 895       free (checksum);
 896
 897       if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
 898         {
 899           /* This is a valid line with a valid checksum. */
 900           struct warc_cdx_record *rec;
 901           rec = malloc (sizeof (struct warc_cdx_record));
 902           rec->url = original_url;
 903           rec->uuid = record_id;
 904           memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
 905           hash_table_put (warc_cdx_dedup_table, rec->digest, rec);
 906           free (checksum_v);
 907         }
 908       else
 909         {
 910           free (original_url);
 911           if (checksum_v != NULL)
 912             free (checksum_v);
 913           free (record_id);
 914         }
 915     }
 916   else
 917     {
 918       xfree_null(checksum);
 919       xfree_null(original_url);
 920       xfree_null(record_id);
 921     }
 922 }
 923
 924 /* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
 925    the warc_cdx_dedup_table. */
 926 static bool
 927 warc_load_cdx_dedup_file (void)
 928 {
 929   FILE *f = fopen (opt.warc_cdx_dedup_filename, "r");
 930   if (f == NULL)
 931     return false;
 932
 933   int field_num_original_url = -1;
 934   int field_num_checksum = -1;
 935   int field_num_record_id = -1;
 936
 937   char *lineptr = NULL;
 938   size_t n = 0;
 939   ssize_t line_length;
 940
 941   /* The first line should contain the CDX header.
 942      Format:  " CDX x x x x x"
 943      where x are field type indicators.  For our purposes, we only
 944      need 'a' (the original url), 'k' (the SHA1 checksum) and
 945      'u' (the WARC record id). */
 946   line_length = getline (&lineptr, &n, f);
 947   if (line_length != -1)
 948     warc_parse_cdx_header (lineptr, &field_num_original_url,
 949                            &field_num_checksum, &field_num_record_id);
 950
 951   /* If the file contains all three fields, read the complete file. */
 952   if (field_num_original_url == -1
 953       || field_num_checksum == -1
 954       || field_num_record_id == -1)
 955     {
 956       if (field_num_original_url == -1)
 957         logprintf (LOG_NOTQUIET,
 958 _("CDX file does not list original urls. (Missing column 'a'.)\n"));
 959       if (field_num_checksum == -1)
 960         logprintf (LOG_NOTQUIET,
 961 _("CDX file does not list checksums. (Missing column 'k'.)\n"));
 962       if (field_num_record_id == -1)
 963         logprintf (LOG_NOTQUIET,
 964 _("CDX file does not list record ids. (Missing column 'u'.)\n"));
 965     }
 966   else
 967     {
 968       /* Initialize the table. */
 969       warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest,
 970                                              warc_cmp_sha1_digest);
 971
 972       do
 973         {
 974           line_length = getline (&lineptr, &n, f);
 975           if (line_length != -1)
 976             {
 977               warc_process_cdx_line (lineptr, field_num_original_url,
 978                             field_num_checksum, field_num_record_id);
 979             }
 980
 981         }
 982       while (line_length != -1);
 983
 984       /* Print results. */
 985       int nrecords = hash_table_count (warc_cdx_dedup_table);
 986       logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
 987                                         "Loaded %d records from CDX.\n\n",
 988                                          nrecords),
 989                               nrecords);
 990     }
 991
 992   free (lineptr);
 993   fclose (f);
 994
 995   return true;
 996 }
 997 #undef CDX_FIELDSEP
 998
 999 /* Returns the existing duplicate CDX record for the given url and payload
1000    digest.  Returns NULL if the url is not found or if the payload digest
1001    does not match, or if CDX deduplication is disabled. */
1002 static struct warc_cdx_record *
1003 warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload)
1004 {
1005   if (warc_cdx_dedup_table == NULL)
1006     return NULL;
1007
1008   char *key;
1009   struct warc_cdx_record *rec_existing;
1010   int found = hash_table_get_pair (warc_cdx_dedup_table, sha1_digest_payload,
1011                                    &key, &rec_existing);
1012
1013   if (found && strcmp (rec_existing->url, url) == 0)
1014     return rec_existing;
1015   else
1016     return NULL;
1017 }
1018
1019 /* Initializes the WARC writer (if opt.warc_filename is set).
1020    This should be called before any WARC record is written. */
1021 void
1022 warc_init (void)
1023 {
1024   warc_write_ok = true;
1025
1026   if (opt.warc_filename != NULL)
1027     {
1028       if (opt.warc_cdx_dedup_filename != NULL)
1029         {
1030           if (! warc_load_cdx_dedup_file ())
1031             {
1032               logprintf (LOG_NOTQUIET,
1033                          _("Could not read CDX file %s for deduplication.\n"),
1034                          quote (opt.warc_cdx_dedup_filename));
1035               exit(1);
1036             }
1037         }
1038
1039       warc_manifest_fp = warc_tempfile ();
1040       if (warc_manifest_fp == NULL)
1041         {
1042           logprintf (LOG_NOTQUIET,
1043                      _("Could not open temporary WARC manifest file.\n"));
1044           exit(1);
1045         }
1046
1047       if (opt.warc_keep_log)
1048         {
1049           warc_log_fp = warc_tempfile ();
1050           if (warc_log_fp == NULL)
1051             {
1052               logprintf (LOG_NOTQUIET,
1053                          _("Could not open temporary WARC log file.\n"));
1054               exit(1);
1055             }
1056           log_set_warc_log_fp (warc_log_fp);
1057         }
1058
1059       warc_current_file_number = -1;
1060       if (! warc_start_new_file (false))
1061         {
1062           logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
1063           exit(1);
1064         }
1065
1066       if (opt.warc_cdx_enabled)
1067         {
1068           if (! warc_start_cdx_file ())
1069             {
1070               logprintf (LOG_NOTQUIET,
1071                          _("Could not open CDX file for output.\n"));
1072               exit(1);
1073             }
1074         }
1075     }
1076 }
1077
1078 /* Writes metadata (manifest, configuration, log file) to the WARC file. */
1079 static void
1080 warc_write_metadata (void)
1081 {
1082   /* If there are multiple WARC files, the metadata should be written to a separate file. */
1083   if (opt.warc_maxsize > 0)
1084     warc_start_new_file (true);
1085
1086   char manifest_uuid [48];
1087   warc_uuid_str (manifest_uuid);
1088
1089   fflush (warc_manifest_fp);
1090   warc_write_resource_record (manifest_uuid,
1091                               "metadata://gnu.org/software/wget/warc/MANIFEST.txt",
1092                               NULL, NULL, NULL, "text/plain",
1093                               warc_manifest_fp, -1);
1094   /* warc_write_resource_record has closed warc_manifest_fp. */
1095
1096   FILE * warc_tmp_fp = warc_tempfile ();
1097   if (warc_tmp_fp == NULL)
1098     {
1099       logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n"));
1100       exit(1);
1101     }
1102   fflush (warc_tmp_fp);
1103   fprintf (warc_tmp_fp, "%s\n", program_argstring);
1104
1105   warc_write_resource_record (manifest_uuid,
1106                    "metadata://gnu.org/software/wget/warc/wget_arguments.txt",
1107                               NULL, NULL, NULL, "text/plain",
1108                               warc_tmp_fp, -1);
1109   /* warc_write_resource_record has closed warc_tmp_fp. */
1110
1111   if (warc_log_fp != NULL)
1112     {
1113       warc_write_resource_record (NULL,
1114                               "metadata://gnu.org/software/wget/warc/wget.log",
1115                                   NULL, manifest_uuid, NULL, "text/plain",
1116                                   warc_log_fp, -1);
1117       /* warc_write_resource_record has closed warc_log_fp. */
1118
1119       warc_log_fp = NULL;
1120       log_set_warc_log_fp (NULL);
1121     }
1122 }
1123
1124 /* Finishes the WARC writing.
1125    This should be called at the end of the program. */
1126 void
1127 warc_close (void)
1128 {
1129   if (warc_current_file != NULL)
1130     {
1131       warc_write_metadata ();
1132       free (warc_current_warcinfo_uuid_str);
1133       fclose (warc_current_file);
1134     }
1135   if (warc_current_cdx_file != NULL)
1136     fclose (warc_current_cdx_file);
1137   if (warc_log_fp != NULL)
1138     {
1139       fclose (warc_log_fp);
1140       log_set_warc_log_fp (NULL);
1141     }
1142 }
1143
1144 /* Creates a temporary file for writing WARC output.
1145    The temporary file will be created in opt.warc_tempdir.
1146    Returns the pointer to the temporary file, or NULL. */
1147 FILE *
1148 warc_tempfile (void)
1149 {
1150   char filename[100];
1151   if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1)
1152     return NULL;
1153
1154   int fd = mkstemp (filename);
1155   if (fd < 0)
1156     return NULL;
1157
1158   if (unlink (filename) < 0)
1159     return NULL;
1160
1161   return fdopen (fd, "wb+");
1162 }
1163
1164
1165 /* Writes a request record to the WARC file.
1166    url  is the target uri of the request,
1167    timestamp_str  is the timestamp of the request (generated with warc_timestamp),
1168    record_uuid  is the uuid of the request (generated with warc_uuid_str),
1169    body  is a pointer to a file containing the request headers and body.
1170    ip  is the ip address of the server (or NULL),
1171    Calling this function will close body.
1172    Returns true on success, false on error. */
1173 bool
1174 warc_write_request_record (char *url, char *timestamp_str, char *record_uuid,
1175                            ip_address *ip, FILE *body, off_t payload_offset)
1176 {
1177   warc_write_start_record ();
1178   warc_write_header ("WARC-Type", "request");
1179   warc_write_header ("WARC-Target-URI", url);
1180   warc_write_header ("Content-Type", "application/http;msgtype=request");
1181   warc_write_date_header (timestamp_str);
1182   warc_write_header ("WARC-Record-ID", record_uuid);
1183   warc_write_ip_header (ip);
1184   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1185   warc_write_digest_headers (body, payload_offset);
1186   warc_write_block_from_file (body);
1187   warc_write_end_record ();
1188
1189   fclose (body);
1190
1191   return warc_write_ok;
1192 }
1193
1194 /* Writes a response record to the CDX file.
1195    url  is the target uri of the request/response,
1196    timestamp_str  is the timestamp of the request that generated this response,
1197                   (generated with warc_timestamp),
1198    mime_type  is the mime type of the response body (will be printed to CDX),
1199    response_code  is the HTTP response code (will be printed to CDX),
1200    payload_digest  is the sha1 digest of the payload,
1201    redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
1202    offset  is the position of the WARC record in the WARC file,
1203    warc_filename  is the filename of the WARC,
1204    response_uuid  is the uuid of the response.
1205    Returns true on success, false on error. */
1206 static bool
1207 warc_write_cdx_record (const char *url, const char *timestamp_str,
1208                        const char *mime_type, int response_code,
1209                        const char *payload_digest, const char *redirect_location,
1210                        off_t offset, const char *warc_filename,
1211                        const char *response_uuid)
1212 {
1213   /* Transform the timestamp. */
1214   char timestamp_str_cdx [15];
1215   memcpy (timestamp_str_cdx     , timestamp_str     , 4); /* "YYYY" "-" */
1216   memcpy (timestamp_str_cdx +  4, timestamp_str +  5, 2); /* "mm"   "-" */
1217   memcpy (timestamp_str_cdx +  6, timestamp_str +  8, 2); /* "dd"   "T" */
1218   memcpy (timestamp_str_cdx +  8, timestamp_str + 11, 2); /* "HH"   ":" */
1219   memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM"   ":" */
1220   memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS"   "Z" */
1221   timestamp_str_cdx[14] = '\0';
1222
1223   /* Rewrite the checksum. */
1224   const char *checksum;
1225   if (payload_digest != NULL)
1226     checksum = payload_digest + 5; /* Skip the "sha1:" */
1227   else
1228     checksum = "-";
1229
1230   if (mime_type == NULL || strlen(mime_type) == 0)
1231     mime_type = "-";
1232   if (redirect_location == NULL || strlen(redirect_location) == 0)
1233     redirect_location = "-";
1234
1235   /* Print the CDX line. */
1236   fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %ld %s %s\n", url,
1237            timestamp_str_cdx, url, mime_type, response_code, checksum,
1238            redirect_location, offset, warc_current_filename, response_uuid);
1239   fflush (warc_current_cdx_file);
1240
1241   return true;
1242 }
1243
1244 /* Writes a revisit record to the WARC file.
1245    url  is the target uri of the request/response,
1246    timestamp_str  is the timestamp of the request that generated this response
1247                   (generated with warc_timestamp),
1248    concurrent_to_uuid  is the uuid of the request for that generated this response
1249                  (generated with warc_uuid_str),
1250    refers_to_uuid  is the uuid of the original response
1251                  (generated with warc_uuid_str),
1252    payload_digest  is the sha1 digest of the payload,
1253    ip  is the ip address of the server (or NULL),
1254    body  is a pointer to a file containing the response headers (without payload).
1255    Calling this function will close body.
1256    Returns true on success, false on error. */
1257 static bool
1258 warc_write_revisit_record (char *url, char *timestamp_str,
1259                            char *concurrent_to_uuid, char *payload_digest,
1260                            char *refers_to, ip_address *ip, FILE *body)
1261 {
1262   char revisit_uuid [48];
1263   warc_uuid_str (revisit_uuid);
1264
1265   char *block_digest = NULL;
1266   char sha1_res_block[SHA1_DIGEST_SIZE];
1267   sha1_stream (body, sha1_res_block);
1268   block_digest = warc_base32_sha1_digest (sha1_res_block);
1269
1270   warc_write_start_record ();
1271   warc_write_header ("WARC-Type", "revisit");
1272   warc_write_header ("WARC-Record-ID", revisit_uuid);
1273   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1274   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1275   warc_write_header ("WARC-Refers-To", refers_to);
1276   warc_write_header ("WARC-Profile", "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest");
1277   warc_write_header ("WARC-Truncated", "length");
1278   warc_write_header ("WARC-Target-URI", url);
1279   warc_write_date_header (timestamp_str);
1280   warc_write_ip_header (ip);
1281   warc_write_header ("Content-Type", "application/http;msgtype=response");
1282   warc_write_header ("WARC-Block-Digest", block_digest);
1283   warc_write_header ("WARC-Payload-Digest", payload_digest);
1284   warc_write_block_from_file (body);
1285   warc_write_end_record ();
1286
1287   fclose (body);
1288   free (block_digest);
1289
1290   return warc_write_ok;
1291 }
1292
1293 /* Writes a response record to the WARC file.
1294    url  is the target uri of the request/response,
1295    timestamp_str  is the timestamp of the request that generated this response
1296                   (generated with warc_timestamp),
1297    concurrent_to_uuid  is the uuid of the request for that generated this response
1298                  (generated with warc_uuid_str),
1299    ip  is the ip address of the server (or NULL),
1300    body  is a pointer to a file containing the response headers and body.
1301    mime_type  is the mime type of the response body (will be printed to CDX),
1302    response_code  is the HTTP response code (will be printed to CDX),
1303    redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
1304    Calling this function will close body.
1305    Returns true on success, false on error. */
1306 bool
1307 warc_write_response_record (char *url, char *timestamp_str,
1308                             char *concurrent_to_uuid, ip_address *ip,
1309                             FILE *body, off_t payload_offset, char *mime_type,
1310                             int response_code, char *redirect_location)
1311 {
1312   char *block_digest = NULL;
1313   char *payload_digest = NULL;
1314   char sha1_res_block[SHA1_DIGEST_SIZE];
1315   char sha1_res_payload[SHA1_DIGEST_SIZE];
1316
1317   if (opt.warc_digests_enabled)
1318     {
1319       /* Calculate the block and payload digests. */
1320       rewind (body);
1321       if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload,
1322           payload_offset) == 0)
1323         {
1324           /* Decide (based on url + payload digest) if we have seen this
1325              data before. */
1326           struct warc_cdx_record *rec_existing;
1327           rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload);
1328           if (rec_existing != NULL)
1329             {
1330               bool result;
1331
1332               /* Found an existing record. */
1333               logprintf (LOG_VERBOSE,
1334           _("Found exact match in CDX file. Saving revisit record to WARC.\n"));
1335
1336               /* Remove the payload from the file. */
1337               if (payload_offset > 0)
1338                 {
1339                   if (ftruncate (fileno (body), payload_offset) == -1)
1340                     return false;
1341                 }
1342
1343               /* Send the original payload digest. */
1344               payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1345               result = warc_write_revisit_record (url, timestamp_str,
1346                          concurrent_to_uuid, payload_digest, rec_existing->uuid,
1347                          ip, body);
1348               free (payload_digest);
1349
1350               return result;
1351             }
1352
1353           block_digest = warc_base32_sha1_digest (sha1_res_block);
1354           payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1355         }
1356     }
1357
1358   /* Not a revisit, just store the record. */
1359
1360   char response_uuid [48];
1361   warc_uuid_str (response_uuid);
1362
1363   fseeko (warc_current_file, 0L, SEEK_END);
1364   off_t offset = ftello (warc_current_file);
1365
1366   warc_write_start_record ();
1367   warc_write_header ("WARC-Type", "response");
1368   warc_write_header ("WARC-Record-ID", response_uuid);
1369   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1370   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1371   warc_write_header ("WARC-Target-URI", url);
1372   warc_write_date_header (timestamp_str);
1373   warc_write_ip_header (ip);
1374   warc_write_header ("WARC-Block-Digest", block_digest);
1375   warc_write_header ("WARC-Payload-Digest", payload_digest);
1376   warc_write_header ("Content-Type", "application/http;msgtype=response");
1377   warc_write_block_from_file (body);
1378   warc_write_end_record ();
1379
1380   fclose (body);
1381
1382   if (warc_write_ok && opt.warc_cdx_enabled)
1383     {
1384       /* Add this record to the CDX. */
1385       warc_write_cdx_record (url, timestamp_str, mime_type, response_code,
1386       payload_digest, redirect_location, offset, warc_current_filename,
1387       response_uuid);
1388     }
1389
1390   if (block_digest)
1391     free (block_digest);
1392   if (payload_digest)
1393     free (payload_digest);
1394
1395   return warc_write_ok;
1396 }
1397
1398 /* Writes a resource record to the WARC file.
1399    resource_uuid  is the uuid of the resource (or NULL),
1400    url  is the target uri of the resource,
1401    timestamp_str  is the timestamp (generated with warc_timestamp),
1402    concurrent_to_uuid  is the uuid of the request for that generated this
1403    resource (generated with warc_uuid_str) or NULL,
1404    ip  is the ip address of the server (or NULL),
1405    content_type  is the mime type of the body (or NULL),
1406    body  is a pointer to a file containing the resource data.
1407    Calling this function will close body.
1408    Returns true on success, false on error. */
1409 bool
1410 warc_write_resource_record (char *resource_uuid, const char *url,
1411                  const char *timestamp_str, const char *concurrent_to_uuid,
1412                  ip_address *ip, const char *content_type, FILE *body,
1413                  off_t payload_offset)
1414 {
1415   if (resource_uuid == NULL)
1416     {
1417       resource_uuid = alloca (48);
1418       warc_uuid_str (resource_uuid);
1419     }
1420
1421   if (content_type == NULL)
1422     content_type = "application/octet-stream";
1423
1424   warc_write_start_record ();
1425   warc_write_header ("WARC-Type", "resource");
1426   warc_write_header ("WARC-Record-ID", resource_uuid);
1427   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1428   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1429   warc_write_header ("WARC-Target-URI", url);
1430   warc_write_date_header (timestamp_str);
1431   warc_write_ip_header (ip);
1432   warc_write_digest_headers (body, payload_offset);
1433   warc_write_header ("Content-Type", content_type);
1434   warc_write_block_from_file (body);
1435   warc_write_end_record ();
1436
1437   fclose (body);
1438
1439   return warc_write_ok;
1440 }