sjero.net Git - wget/blob - src/warc.c

   1 /* Utility functions for writing WARC files.
   2    Copyright (C) 2011, 2012 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  18
  19 Additional permission under GNU GPL version 3 section 7
  20
  21 If you modify this program, or any covered work, by linking or
  22 combining it with the OpenSSL project's OpenSSL library (or a
  23 modified version of that library), containing parts covered by the
  24 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  25 grants you additional permission to convey the resulting work.
  26 Corresponding Source for a non-source form of such a combination
  27 shall include the source code for the parts of OpenSSL used as well
  28 as that of the covered work.  */
  29
  30 #define _GNU_SOURCE
  31
  32 #include "wget.h"
  33 #include "hash.h"
  34 #include "utils.h"
  35
  36 #include <stdio.h>
  37 #include <stdlib.h>
  38 #include <string.h>
  39 #include <strings.h>
  40 #include <time.h>
  41 #include <tmpdir.h>
  42 #include <sha1.h>
  43 #include <base32.h>
  44 #include <unistd.h>
  45 #ifdef HAVE_LIBZ
  46 #include <zlib.h>
  47 #endif
  48 #ifdef HAVE_LIBUUID
  49 #include <uuid/uuid.h>
  50 #endif
  51
  52 #ifndef WINDOWS
  53 #include <libgen.h>
  54 #else
  55 #include <fcntl.h>
  56 #endif
  57
  58 #include "warc.h"
  59
  60 #ifndef O_TEMPORARY
  61 #define O_TEMPORARY 0
  62 #endif
  63
/* Defined outside this file; written into the "software:" line of the
   warcinfo record. */
extern char *version_string;

/* Set by main in main.c; written into the "wget-arguments:" line of
   the warcinfo record. */
extern char *program_argstring;


/* The log file (a temporary file that contains a copy
   of the wget log). */
static FILE *warc_log_fp;

/* The manifest file (a temporary file).  warc_start_new_file appends
   one line with the warcinfo record id for every WARC file it opens. */
static FILE *warc_manifest_fp;

/* The current WARC file (or NULL, if WARC is disabled). */
static FILE *warc_current_file;

#ifdef HAVE_LIBZ
/* The gzip stream for the current WARC file
   (or NULL, if WARC or gzip is disabled).  It is opened per record
   by warc_write_start_record on a dup of warc_current_file's
   descriptor and closed by warc_write_end_record. */
static gzFile warc_current_gzfile;

/* The offset in the WARC file at which the current gzip record
   starts, i.e. the start of the space reserved for the extra gzip
   header. */
static off_t warc_current_gzfile_offset;

/* The uncompressed size (so far) of the current record; incremented
   by warc_write_buffer for every buffer handed to gzwrite. */
static off_t warc_current_gzfile_uncompressed_size;
# endif

/* This is true until a warc_write_* method fails.  Once it is false,
   warc_write_string and warc_write_start_record refuse to write. */
static bool warc_write_ok;

/* The current CDX file (or NULL, if CDX is disabled). */
static FILE *warc_current_cdx_file;

/* The record id of the warcinfo record of the current WARC file.
   Heap-allocated by warc_write_warcinfo_record and released by
   warc_start_new_file. */
static char *warc_current_warcinfo_uuid_str;

/* The file name of the current WARC file (heap-allocated by
   warc_start_new_file). */
static char *warc_current_filename;

/* The serial number of the current WARC file.  This number is
   incremented each time a new file is opened and is used in the
   WARC file's filename when opt.warc_maxsize is set. */
static int warc_current_file_number;

/* The table of CDX records, if deduplication is enabled.  Entries are
   struct warc_cdx_record values keyed by their raw SHA1 digest. */
struct hash_table * warc_cdx_dedup_table;

/* Forward declaration: warc_write_start_record calls this before its
   definition further down in the file. */
static bool warc_start_new_file (bool meta);
 114
 115
/* One record loaded from a CDX file for deduplication.  Instances are
   created by warc_process_cdx_line and stored in warc_cdx_dedup_table
   with DIGEST as the key. */
struct warc_cdx_record
{
  /* Original URL field of the CDX line (heap-allocated copy). */
  char *url;
  /* Record-id field of the CDX line (heap-allocated copy). */
  char *uuid;
  /* Raw (base32-decoded) SHA1 checksum of the record. */
  char digest[SHA1_DIGEST_SIZE];
};
 122
 123 static unsigned long
 124 warc_hash_sha1_digest (const void *key)
 125 {
 126   /* We just use some of the first bytes of the digest. */
 127   unsigned long v = 0;
 128   memcpy (&v, key, sizeof (unsigned long));
 129   return v;
 130 }
 131
 132 static int
 133 warc_cmp_sha1_digest (const void *digest1, const void *digest2)
 134 {
 135   return !memcmp (digest1, digest2, SHA1_DIGEST_SIZE);
 136 }
 137
 138
 139
 140 /* Writes SIZE bytes from BUFFER to the current WARC file,
 141    through gzwrite if compression is enabled.
 142    Returns the number of uncompressed bytes written.  */
 143 static size_t
 144 warc_write_buffer (const char *buffer, size_t size)
 145 {
 146 #ifdef HAVE_LIBZ
 147   if (warc_current_gzfile)
 148     {
 149       warc_current_gzfile_uncompressed_size += size;
 150       return gzwrite (warc_current_gzfile, buffer, size);
 151     }
 152   else
 153 #endif
 154     return fwrite (buffer, 1, size, warc_current_file);
 155 }
 156
 157 /* Writes STR to the current WARC file.
 158    Returns false and set warc_write_ok to false if there
 159    is an error.  */
 160 static bool
 161 warc_write_string (const char *str)
 162 {
 163   if (!warc_write_ok)
 164     return false;
 165
 166   size_t n = strlen (str);
 167   if (n != warc_write_buffer (str, n))
 168     warc_write_ok = false;
 169
 170   return warc_write_ok;
 171 }
 172
 173
/* Size in bytes of the extra gzip header block that
   warc_write_end_record builds: 2 (XLEN) + 2 (field id "sl")
   + 2 (field length) + 8 (two 32-bit sizes). */
#define EXTRA_GZIP_HEADER_SIZE 14
/* Number of bytes warc_write_end_record reads and relocates as the
   static gzip header. */
#define GZIP_STATIC_HEADER_SIZE  10
/* Bit OR-ed into the gzip flags byte to announce the extra field. */
#define FLG_FEXTRA          0x04
/* Index of the flags byte inside the static gzip header. */
#define OFF_FLG             3
 178
 179 /* Starts a new WARC record.  Writes the version header.
 180    If opt.warc_maxsize is set and the current file is becoming
 181    too large, this will open a new WARC file.
 182
 183    If compression is enabled, this will start a new
 184    gzip stream in the current WARC file.
 185
 186    Returns false and set warc_write_ok to false if there
 187    is an error.  */
 188 static bool
 189 warc_write_start_record (void)
 190 {
 191   if (!warc_write_ok)
 192     return false;
 193
 194   fflush (warc_current_file);
 195   if (opt.warc_maxsize > 0 && ftello (warc_current_file) >= opt.warc_maxsize)
 196     warc_start_new_file (false);
 197
 198 #ifdef HAVE_LIBZ
 199   /* Start a GZIP stream, if required. */
 200   if (opt.warc_compression_enabled)
 201     {
 202       /* Record the starting offset of the new record. */
 203       warc_current_gzfile_offset = ftello (warc_current_file);
 204
 205       /* Reserve space for the extra GZIP header field.
 206          In warc_write_end_record we will fill this space
 207          with information about the uncompressed and
 208          compressed size of the record. */
 209       fseek (warc_current_file, EXTRA_GZIP_HEADER_SIZE, SEEK_CUR);
 210       fflush (warc_current_file);
 211
 212       /* Start a new GZIP stream. */
 213       warc_current_gzfile = gzdopen (dup (fileno (warc_current_file)), "wb9");
 214       warc_current_gzfile_uncompressed_size = 0;
 215
 216       if (warc_current_gzfile == NULL)
 217         {
 218           logprintf (LOG_NOTQUIET,
 219 _("Error opening GZIP stream to WARC file.\n"));
 220           warc_write_ok = false;
 221           return false;
 222         }
 223     }
 224 #endif
 225
 226   warc_write_string ("WARC/1.0\r\n");
 227   return warc_write_ok;
 228 }
 229
 230 /* Writes a WARC header to the current WARC record.
 231    This method may be run after warc_write_start_record and
 232    before warc_write_block_from_file.  */
 233 static bool
 234 warc_write_header (const char *name, const char *value)
 235 {
 236   if (value)
 237     {
 238       warc_write_string (name);
 239       warc_write_string (": ");
 240       warc_write_string (value);
 241       warc_write_string ("\r\n");
 242     }
 243   return warc_write_ok;
 244 }
 245
 246 /* Copies the contents of DATA_IN to the WARC record.
 247    Adds a Content-Length header to the WARC record.
 248    Run this method after warc_write_header,
 249    then run warc_write_end_record. */
 250 static bool
 251 warc_write_block_from_file (FILE *data_in)
 252 {
 253   /* Add the Content-Length header. */
 254   char content_length[MAX_INT_TO_STRING_LEN(off_t)];
 255   fseeko (data_in, 0L, SEEK_END);
 256   number_to_string (content_length, ftello (data_in));
 257   warc_write_header ("Content-Length", content_length);
 258
 259   /* End of the WARC header section. */
 260   warc_write_string ("\r\n");
 261
 262   if (fseeko (data_in, 0L, SEEK_SET) != 0)
 263     warc_write_ok = false;
 264
 265   /* Copy the data in the file to the WARC record. */
 266   char buffer[BUFSIZ];
 267   size_t s;
 268   while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0)
 269     {
 270       if (warc_write_buffer (buffer, s) < s)
 271         warc_write_ok = false;
 272     }
 273
 274   return warc_write_ok;
 275 }
 276
 277 /* Run this method to close the current WARC record.
 278
 279    If compression is enabled, this method closes the
 280    current GZIP stream and fills the extra GZIP header
 281    with the uncompressed and compressed length of the
 282    record. */
 283 static bool
 284 warc_write_end_record (void)
 285 {
 286   warc_write_buffer ("\r\n\r\n", 4);
 287
 288 #ifdef HAVE_LIBZ
 289   /* We start a new gzip stream for each record.  */
 290   if (warc_write_ok && warc_current_gzfile)
 291     {
 292       if (gzclose (warc_current_gzfile) != Z_OK)
 293         {
 294           warc_write_ok = false;
 295           return false;
 296         }
 297
 298       fflush (warc_current_file);
 299       fseeko (warc_current_file, 0, SEEK_END);
 300
 301       /* The WARC standard suggests that we add 'skip length' data in the
 302          extra header field of the GZIP stream.
 303
 304          In warc_write_start_record we reserved space for this extra header.
 305          This extra space starts at warc_current_gzfile_offset and fills
 306          EXTRA_GZIP_HEADER_SIZE bytes.  The static GZIP header starts at
 307          warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE.
 308
 309          We need to do three things:
 310          1. Move the static GZIP header to warc_current_gzfile_offset;
 311          2. Set the FEXTRA flag in the GZIP header;
 312          3. Write the extra GZIP header after the static header, that is,
 313             starting at warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE.
 314       */
 315
 316       /* Calculate the uncompressed and compressed sizes. */
 317       off_t current_offset = ftello (warc_current_file);
 318       off_t uncompressed_size = current_offset - warc_current_gzfile_offset;
 319       off_t compressed_size = warc_current_gzfile_uncompressed_size;
 320
 321       /* Go back to the static GZIP header. */
 322       fseeko (warc_current_file, warc_current_gzfile_offset
 323               + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
 324
 325       /* Read the header. */
 326       char static_header[GZIP_STATIC_HEADER_SIZE];
 327       size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE,
 328                              warc_current_file);
 329       if (result != GZIP_STATIC_HEADER_SIZE)
 330         {
 331           warc_write_ok = false;
 332           return false;
 333         }
 334
 335       /* Set the FEXTRA flag in the flags byte of the header. */
 336       static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
 337
 338       /* Write the header back to the file, but starting at
 339          warc_current_gzfile_offset. */
 340       fseeko (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
 341       fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
 342
 343       /* Prepare the extra GZIP header. */
 344       char extra_header[EXTRA_GZIP_HEADER_SIZE];
 345       /* XLEN, the length of the extra header fields.  */
 346       extra_header[0]  = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
 347       extra_header[1]  = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
 348       /* The extra header field identifier for the WARC skip length. */
 349       extra_header[2]  = 's';
 350       extra_header[3]  = 'l';
 351       /* The size of the field value (8 bytes).  */
 352       extra_header[4]  = (8 & 255);
 353       extra_header[5]  = ((8 >> 8) & 255);
 354       /* The size of the uncompressed record.  */
 355       extra_header[6]  = (uncompressed_size & 255);
 356       extra_header[7]  = (uncompressed_size >> 8) & 255;
 357       extra_header[8]  = (uncompressed_size >> 16) & 255;
 358       extra_header[9]  = (uncompressed_size >> 24) & 255;
 359       /* The size of the compressed record.  */
 360       extra_header[10] = (compressed_size & 255);
 361       extra_header[11] = (compressed_size >> 8) & 255;
 362       extra_header[12] = (compressed_size >> 16) & 255;
 363       extra_header[13] = (compressed_size >> 24) & 255;
 364
 365       /* Write the extra header after the static header. */
 366       fseeko (warc_current_file, warc_current_gzfile_offset
 367               + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
 368       fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
 369
 370       /* Done, move back to the end of the file. */
 371       fflush (warc_current_file);
 372       fseeko (warc_current_file, 0, SEEK_END);
 373     }
 374 #endif /* HAVE_LIBZ */
 375
 376   return warc_write_ok;
 377 }
 378
 379
 380 /* Writes the WARC-Date header for the given timestamp to
 381    the current WARC record.
 382    If timestamp is NULL, the current time will be used.  */
 383 static bool
 384 warc_write_date_header (const char *timestamp)
 385 {
 386   if (timestamp == NULL)
 387     {
 388       char current_timestamp[21];
 389       warc_timestamp (current_timestamp);
 390       timestamp = current_timestamp;
 391     }
 392   return warc_write_header ("WARC-Date", timestamp);
 393 }
 394
 395 /* Writes the WARC-IP-Address header for the given IP to
 396    the current WARC record.  If IP is NULL, no header will
 397    be written.  */
 398 static bool
 399 warc_write_ip_header (ip_address *ip)
 400 {
 401   if (ip != NULL)
 402     return warc_write_header ("WARC-IP-Address", print_address (ip));
 403   else
 404     return warc_write_ok;
 405 }
 406
 407
/* warc_sha1_stream_with_payload is a modified copy of sha1_stream
   from gnulib/sha1.c.  This version calculates two digests in one go.

   Compute SHA1 message digests for bytes read from STREAM, starting
   at the stream's current position and reading until end of file.
   The digest of everything read is written to RES_BLOCK.

   If payload_offset >= 0, a second digest will be calculated of the
   portion of the data starting payload_offset bytes into what is
   read and continuing to the end of the file.  That digest is
   written to RES_PAYLOAD.  If payload_offset < 0, RES_PAYLOAD is
   not touched.

   Callers in this file pass SHA1_DIGEST_SIZE-byte buffers for both
   results.

   Returns 0 on success.  Returns 1, without writing either result,
   if the work buffer cannot be allocated or a read error occurs.  */
static int
warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
                               off_t payload_offset)
{
#define BLOCKSIZE 32768

  struct sha1_ctx ctx_block;
  struct sha1_ctx ctx_payload;
  /* Total number of bytes read from STREAM so far. */
  off_t pos;
  /* Number of bytes currently held in BUFFER. */
  off_t sum;

  char *buffer = malloc (BLOCKSIZE + 72);
  if (!buffer)
    return 1;

  /* Initialize the computation context.  */
  sha1_init_ctx (&ctx_block);
  if (payload_offset >= 0)
    sha1_init_ctx (&ctx_payload);

  pos = 0;

  /* Iterate over full file contents.  */
  while (1)
    {
      /* We read the file in blocks of BLOCKSIZE bytes.  One call of the
         computation function processes the whole buffer so that with the
         next round of the loop another block can be read.  */
      off_t n;
      sum = 0;

      /* Read block.  Take care for partial reads.  */
      while (1)
        {
          n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);

          sum += n;
          pos += n;

          if (sum == BLOCKSIZE)
            break;

          if (n == 0)
            {
              /* Check for the error flag IFF N == 0, so that we don't
                 exit the loop after a partial read due to e.g., EAGAIN
                 or EWOULDBLOCK.  */
              if (ferror (stream))
                {
                  free (buffer);
                  return 1;
                }
              goto process_partial_block;
            }

          /* We've read at least one byte, so ignore errors.  But always
             check for EOF, since feof may be true even though N > 0.
             Otherwise, we could end up calling fread after EOF.  */
          if (feof (stream))
            goto process_partial_block;
        }

      /* Process buffer with BLOCKSIZE bytes.  Note that
                        BLOCKSIZE % 64 == 0
       */
      sha1_process_block (buffer, BLOCKSIZE, &ctx_block);
      if (payload_offset >= 0 && payload_offset < pos)
        {
          /* At least part of the buffer contains data from payload.
             (pos - BLOCKSIZE) is the stream offset of buffer[0], so
             the difference is the payload's start within the buffer. */
          off_t start_of_payload = payload_offset - (pos - BLOCKSIZE);
          if (start_of_payload <= 0)
            /* All bytes in the buffer belong to the payload. */
            start_of_payload = 0;

          /* Process the payload part of the buffer.
             Note: we can't use  sha1_process_block  here even if we
             process the complete buffer.  Because the payload doesn't
             have to start with a full block, there may still be some
             bytes left from the previous buffer.  Therefore, we need
             to continue with  sha1_process_bytes.  */
          sha1_process_bytes (buffer + start_of_payload,
                              BLOCKSIZE - start_of_payload, &ctx_payload);
        }
    }

 process_partial_block:;

  /* Process any remaining bytes (the final, shorter-than-BLOCKSIZE
     chunk read before end of file).  */
  if (sum > 0)
    {
      sha1_process_bytes (buffer, sum, &ctx_block);
      if (payload_offset >= 0 && payload_offset < pos)
        {
          /* At least part of the buffer contains data from payload.
             (pos - sum) is the stream offset of buffer[0]. */
          off_t start_of_payload = payload_offset - (pos - sum);
          if (start_of_payload <= 0)
            /* All bytes in the buffer belong to the payload. */
            start_of_payload = 0;

          /* Process the payload part of the buffer. */
          sha1_process_bytes (buffer + start_of_payload,
                              sum - start_of_payload, &ctx_payload);
        }
    }

  /* Construct result in desired memory.  */
  sha1_finish_ctx (&ctx_block,   res_block);
  if (payload_offset >= 0)
    sha1_finish_ctx (&ctx_payload, res_payload);
  free (buffer);
  return 0;

#undef BLOCKSIZE
}
 533
 534 /* Converts the SHA1 digest to a base32-encoded string.
 535    "sha1:DIGEST\0"  (Allocates a new string for the response.)  */
 536 static char *
 537 warc_base32_sha1_digest (char *sha1_digest)
 538 {
 539   /* length: "sha1:" + digest + "\0" */
 540   char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 );
 541   base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5,
 542                  BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1);
 543   memcpy (sha1_base32, "sha1:", 5);
 544   sha1_base32[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5] = '\0';
 545   return sha1_base32;
 546 }
 547
 548
 549 /* Sets the digest headers of the record.
 550    This method will calculate the block digest and, if payload_offset >= 0,
 551    will also calculate the payload digest of the payload starting at the
 552    provided offset.  */
 553 static void
 554 warc_write_digest_headers (FILE *file, long payload_offset)
 555 {
 556   if (opt.warc_digests_enabled)
 557     {
 558       /* Calculate the block and payload digests. */
 559       char sha1_res_block[SHA1_DIGEST_SIZE];
 560       char sha1_res_payload[SHA1_DIGEST_SIZE];
 561
 562       rewind (file);
 563       if (warc_sha1_stream_with_payload (file, sha1_res_block,
 564           sha1_res_payload, payload_offset) == 0)
 565         {
 566           char *digest;
 567
 568           digest = warc_base32_sha1_digest (sha1_res_block);
 569           warc_write_header ("WARC-Block-Digest", digest);
 570           free (digest);
 571
 572           if (payload_offset >= 0)
 573             {
 574               digest = warc_base32_sha1_digest (sha1_res_payload);
 575               warc_write_header ("WARC-Payload-Digest", digest);
 576               free (digest);
 577             }
 578         }
 579     }
 580 }
 581
 582
 583 /* Fills timestamp with the current time and date.
 584    The UTC time is formatted following ISO 8601, as required
 585    for use in the WARC-Date header.
 586    The timestamp will be 21 characters long. */
 587 void
 588 warc_timestamp (char *timestamp)
 589 {
 590   time_t rawtime;
 591   struct tm * timeinfo;
 592   time ( &rawtime );
 593   timeinfo = gmtime (&rawtime);
 594   strftime (timestamp, 21, "%Y-%m-%dT%H:%M:%SZ", timeinfo);
 595 }
 596
 597 #ifdef HAVE_LIBUUID
 598 /* Fills urn_str with a UUID in the format required
 599    for the WARC-Record-Id header.
 600    The string will be 47 characters long. */
 601 void
 602 warc_uuid_str (char *urn_str)
 603 {
 604   char uuid_str[37];
 605
 606   uuid_t record_id;
 607   uuid_generate (record_id);
 608   uuid_unparse (record_id, uuid_str);
 609
 610   sprintf (urn_str, "<urn:uuid:%s>", uuid_str);
 611 }
 612 #else
 613 /* Fills urn_str with a UUID based on random numbers in the format
 614    required for the WARC-Record-Id header.
 615    (See RFC 4122, UUID version 4.)
 616
 617    Note: this is a fallback method, it is much better to use the
 618    methods provided by libuuid.
 619
 620    The string will be 47 characters long. */
 621 void
 622 warc_uuid_str (char *urn_str)
 623 {
 624   // RFC 4122, a version 4 UUID with only random numbers
 625
 626   unsigned char uuid_data[16];
 627   int i;
 628   for (i=0; i<16; i++)
 629     uuid_data[i] = random_number (255);
 630
 631   // Set the four most significant bits (bits 12 through 15) of the
 632   // time_hi_and_version field to the 4-bit version number
 633   uuid_data[6] = (uuid_data[6] & 0x0F) | 0x40;
 634
 635   // Set the two most significant bits (bits 6 and 7) of the
 636   // clock_seq_hi_and_reserved to zero and one, respectively.
 637   uuid_data[8] = (uuid_data[8] & 0xBF) | 0x80;
 638
 639   sprintf (urn_str,
 640     "<urn:uuid:%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x>",
 641     uuid_data[0], uuid_data[1], uuid_data[2], uuid_data[3], uuid_data[4],
 642     uuid_data[5], uuid_data[6], uuid_data[7], uuid_data[8], uuid_data[9],
 643     uuid_data[10], uuid_data[11], uuid_data[12], uuid_data[13], uuid_data[14],
 644     uuid_data[15]);
 645 }
 646 #endif
 647
 648 /* Write a warcinfo record to the current file.
 649    Updates warc_current_warcinfo_uuid_str. */
 650 static bool
 651 warc_write_warcinfo_record (char *filename)
 652 {
 653   /* Write warc-info record as the first record of the file. */
 654   /* We add the record id of this info record to the other records in the
 655      file. */
 656   warc_current_warcinfo_uuid_str = (char *) malloc (48);
 657   warc_uuid_str (warc_current_warcinfo_uuid_str);
 658
 659   char timestamp[22];
 660   warc_timestamp (timestamp);
 661
 662   char *filename_copy, *filename_basename;
 663   filename_copy = strdup (filename);
 664   filename_basename = strdup (basename (filename_copy));
 665
 666   warc_write_start_record ();
 667   warc_write_header ("WARC-Type", "warcinfo");
 668   warc_write_header ("Content-Type", "application/warc-fields");
 669   warc_write_header ("WARC-Date", timestamp);
 670   warc_write_header ("WARC-Record-ID", warc_current_warcinfo_uuid_str);
 671   warc_write_header ("WARC-Filename", filename_basename);
 672
 673   /* Create content.  */
 674   FILE *warc_tmp = warc_tempfile ();
 675   if (warc_tmp == NULL)
 676     {
 677       free (filename_copy);
 678       free (filename_basename);
 679       return false;
 680     }
 681
 682   fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE);
 683   fprintf (warc_tmp, "format: WARC File Format 1.0\r\n");
 684   fprintf (warc_tmp,
 685 "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n");
 686   fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off"));
 687   fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring);
 688   /* Add the user headers, if any. */
 689   if (opt.warc_user_headers)
 690     {
 691       int i;
 692       for (i = 0; opt.warc_user_headers[i]; i++)
 693         fprintf (warc_tmp, "%s\r\n", opt.warc_user_headers[i]);
 694     }
 695   fprintf(warc_tmp, "\r\n");
 696
 697   warc_write_digest_headers (warc_tmp, -1);
 698   warc_write_block_from_file (warc_tmp);
 699   warc_write_end_record ();
 700
 701   if (! warc_write_ok)
 702     logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
 703
 704   free (filename_copy);
 705   free (filename_basename);
 706   fclose (warc_tmp);
 707   return warc_write_ok;
 708 }
 709
 710 /* Opens a new WARC file.
 711    If META is true, generates a filename ending with 'meta.warc.gz'.
 712
 713    This method will:
 714    1. close the current WARC file (if there is one);
 715    2. increment warc_current_file_number;
 716    3. open a new WARC file;
 717    4. write the initial warcinfo record.
 718
 719    Returns true on success, false otherwise.
 720    */
 721 static bool
 722 warc_start_new_file (bool meta)
 723 {
 724   if (opt.warc_filename == NULL)
 725     return false;
 726
 727   if (warc_current_file != NULL)
 728     fclose (warc_current_file);
 729   if (warc_current_warcinfo_uuid_str)
 730     free (warc_current_warcinfo_uuid_str);
 731   if (warc_current_filename)
 732     free (warc_current_filename);
 733
 734   warc_current_file_number++;
 735
 736   int base_filename_length = strlen (opt.warc_filename);
 737   /* filename format:  base + "-" + 5 digit serial number + ".warc.gz" */
 738   char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
 739   warc_current_filename = new_filename;
 740
 741 #ifdef HAVE_LIBZ
 742   const char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc");
 743 #else
 744   const char *extension = "warc";
 745 #endif
 746
 747   /* If max size is enabled, we add a serial number to the file names. */
 748   if (meta)
 749     sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension);
 750   else if (opt.warc_maxsize > 0)
 751     {
 752       sprintf (new_filename, "%s-%05d.%s", opt.warc_filename,
 753                warc_current_file_number, extension);
 754     }
 755   else
 756     sprintf (new_filename, "%s.%s", opt.warc_filename, extension);
 757
 758   logprintf (LOG_VERBOSE, _("Opening WARC file %s.\n\n"), quote (new_filename));
 759
 760   /* Open the WARC file. */
 761   warc_current_file = fopen (new_filename, "wb+");
 762   if (warc_current_file == NULL)
 763     {
 764       logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"),
 765                  quote (new_filename));
 766       return false;
 767     }
 768
 769   if (! warc_write_warcinfo_record (new_filename))
 770     return false;
 771
 772   /* Add warcinfo uuid to manifest. */
 773   if (warc_manifest_fp)
 774     fprintf (warc_manifest_fp, "%s\n", warc_current_warcinfo_uuid_str);
 775
 776   return true;
 777 }
 778
 779 /* Opens the CDX file for output. */
 780 static bool
 781 warc_start_cdx_file (void)
 782 {
 783   int filename_length = strlen (opt.warc_filename);
 784   char *cdx_filename = alloca (filename_length + 4 + 1);
 785   memcpy (cdx_filename, opt.warc_filename, filename_length);
 786   memcpy (cdx_filename + filename_length, ".cdx", 5);
 787   warc_current_cdx_file = fopen (cdx_filename, "a+");
 788   if (warc_current_cdx_file == NULL)
 789     return false;
 790
 791   /* Print the CDX header.
 792    *
 793    * a - original url
 794    * b - date
 795    * m - mime type
 796    * s - response code
 797    * k - new style checksum
 798    * r - redirect
 799    * M - meta tags
 800    * V - compressed arc file offset
 801    * g - file name
 802    * u - record-id
 803    */
 804   fprintf (warc_current_cdx_file, " CDX a b a m s k r M V g u\n");
 805   fflush (warc_current_cdx_file);
 806
 807   return true;
 808 }
 809
 810 #define CDX_FIELDSEP " \t\r\n"
 811
 812 /* Parse the CDX header and find the field numbers of the original url,
 813    checksum and record ID fields. */
 814 static bool
 815 warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
 816                        int *field_num_checksum, int *field_num_record_id)
 817 {
 818   *field_num_original_url = -1;
 819   *field_num_checksum = -1;
 820   *field_num_record_id = -1;
 821
 822   char *token;
 823   char *save_ptr;
 824   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
 825
 826   if (token != NULL && strcmp (token, "CDX") == 0)
 827     {
 828       int field_num = 0;
 829       while (token != NULL)
 830         {
 831           token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
 832           if (token != NULL)
 833             {
 834               switch (token[0])
 835                 {
 836                 case 'a':
 837                   *field_num_original_url = field_num;
 838                   break;
 839                 case 'k':
 840                   *field_num_checksum = field_num;
 841                   break;
 842                 case 'u':
 843                   *field_num_record_id = field_num;
 844                   break;
 845                 }
 846             }
 847           field_num++;
 848         }
 849     }
 850
 851   return *field_num_original_url != -1
 852          && *field_num_checksum != -1
 853          && *field_num_record_id != -1;
 854 }
 855
 856 /* Parse the CDX record and add it to the warc_cdx_dedup_table hash table. */
 857 static void
 858 warc_process_cdx_line (char *lineptr, int field_num_original_url,
 859                        int field_num_checksum, int field_num_record_id)
 860 {
 861   char *original_url = NULL;
 862   char *checksum = NULL;
 863   char *record_id = NULL;
 864
 865   char *token;
 866   char *save_ptr;
 867   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
 868
 869   /* Read this line to get the fields we need. */
 870   int field_num = 0;
 871   while (token != NULL)
 872     {
 873       char **val;
 874       if (field_num == field_num_original_url)
 875         val = &original_url;
 876       else if (field_num == field_num_checksum)
 877         val = &checksum;
 878       else if (field_num == field_num_record_id)
 879         val = &record_id;
 880       else
 881         val = NULL;
 882
 883       if (val != NULL)
 884         *val = strdup (token);
 885
 886       token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
 887       field_num++;
 888     }
 889
 890   if (original_url != NULL && checksum != NULL && record_id != NULL)
 891     {
 892       /* For some extra efficiency, we decode the base32 encoded
 893          checksum value.  This should produce exactly SHA1_DIGEST_SIZE
 894          bytes.  */
 895       size_t checksum_l;
 896       char * checksum_v;
 897       base32_decode_alloc (checksum, strlen (checksum), &checksum_v,
 898                            &checksum_l);
 899       free (checksum);
 900
 901       if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
 902         {
 903           /* This is a valid line with a valid checksum. */
 904           struct warc_cdx_record *rec;
 905           rec = malloc (sizeof (struct warc_cdx_record));
 906           rec->url = original_url;
 907           rec->uuid = record_id;
 908           memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
 909           hash_table_put (warc_cdx_dedup_table, rec->digest, rec);
 910           free (checksum_v);
 911         }
 912       else
 913         {
 914           free (original_url);
 915           if (checksum_v != NULL)
 916             free (checksum_v);
 917           free (record_id);
 918         }
 919     }
 920   else
 921     {
 922       xfree_null(checksum);
 923       xfree_null(original_url);
 924       xfree_null(record_id);
 925     }
 926 }
 927
 928 /* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
 929    the warc_cdx_dedup_table. */
 930 static bool
 931 warc_load_cdx_dedup_file (void)
 932 {
 933   FILE *f = fopen (opt.warc_cdx_dedup_filename, "r");
 934   if (f == NULL)
 935     return false;
 936
 937   int field_num_original_url = -1;
 938   int field_num_checksum = -1;
 939   int field_num_record_id = -1;
 940
 941   char *lineptr = NULL;
 942   size_t n = 0;
 943   ssize_t line_length;
 944
 945   /* The first line should contain the CDX header.
 946      Format:  " CDX x x x x x"
 947      where x are field type indicators.  For our purposes, we only
 948      need 'a' (the original url), 'k' (the SHA1 checksum) and
 949      'u' (the WARC record id). */
 950   line_length = getline (&lineptr, &n, f);
 951   if (line_length != -1)
 952     warc_parse_cdx_header (lineptr, &field_num_original_url,
 953                            &field_num_checksum, &field_num_record_id);
 954
 955   /* If the file contains all three fields, read the complete file. */
 956   if (field_num_original_url == -1
 957       || field_num_checksum == -1
 958       || field_num_record_id == -1)
 959     {
 960       if (field_num_original_url == -1)
 961         logprintf (LOG_NOTQUIET,
 962 _("CDX file does not list original urls. (Missing column 'a'.)\n"));
 963       if (field_num_checksum == -1)
 964         logprintf (LOG_NOTQUIET,
 965 _("CDX file does not list checksums. (Missing column 'k'.)\n"));
 966       if (field_num_record_id == -1)
 967         logprintf (LOG_NOTQUIET,
 968 _("CDX file does not list record ids. (Missing column 'u'.)\n"));
 969     }
 970   else
 971     {
 972       /* Initialize the table. */
 973       warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest,
 974                                              warc_cmp_sha1_digest);
 975
 976       do
 977         {
 978           line_length = getline (&lineptr, &n, f);
 979           if (line_length != -1)
 980             {
 981               warc_process_cdx_line (lineptr, field_num_original_url,
 982                             field_num_checksum, field_num_record_id);
 983             }
 984
 985         }
 986       while (line_length != -1);
 987
 988       /* Print results. */
 989       int nrecords = hash_table_count (warc_cdx_dedup_table);
 990       logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
 991                                         "Loaded %d records from CDX.\n\n",
 992                                          nrecords),
 993                               nrecords);
 994     }
 995
 996   free (lineptr);
 997   fclose (f);
 998
 999   return true;
1000 }
1001 #undef CDX_FIELDSEP
1002
1003 /* Returns the existing duplicate CDX record for the given url and payload
1004    digest.  Returns NULL if the url is not found or if the payload digest
1005    does not match, or if CDX deduplication is disabled. */
1006 static struct warc_cdx_record *
1007 warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload)
1008 {
1009   if (warc_cdx_dedup_table == NULL)
1010     return NULL;
1011
1012   struct warc_cdx_record *rec_existing
1013     = hash_table_get (warc_cdx_dedup_table, sha1_digest_payload);
1014
1015   if (rec_existing && strcmp (rec_existing->url, url) == 0)
1016     return rec_existing;
1017   else
1018     return NULL;
1019 }
1020
1021 /* Initializes the WARC writer (if opt.warc_filename is set).
1022    This should be called before any WARC record is written. */
1023 void
1024 warc_init (void)
1025 {
1026   warc_write_ok = true;
1027
1028   if (opt.warc_filename != NULL)
1029     {
1030       if (opt.warc_cdx_dedup_filename != NULL)
1031         {
1032           if (! warc_load_cdx_dedup_file ())
1033             {
1034               logprintf (LOG_NOTQUIET,
1035                          _("Could not read CDX file %s for deduplication.\n"),
1036                          quote (opt.warc_cdx_dedup_filename));
1037               exit(1);
1038             }
1039         }
1040
1041       warc_manifest_fp = warc_tempfile ();
1042       if (warc_manifest_fp == NULL)
1043         {
1044           logprintf (LOG_NOTQUIET,
1045                      _("Could not open temporary WARC manifest file.\n"));
1046           exit(1);
1047         }
1048
1049       if (opt.warc_keep_log)
1050         {
1051           warc_log_fp = warc_tempfile ();
1052           if (warc_log_fp == NULL)
1053             {
1054               logprintf (LOG_NOTQUIET,
1055                          _("Could not open temporary WARC log file.\n"));
1056               exit(1);
1057             }
1058           log_set_warc_log_fp (warc_log_fp);
1059         }
1060
1061       warc_current_file_number = -1;
1062       if (! warc_start_new_file (false))
1063         {
1064           logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
1065           exit(1);
1066         }
1067
1068       if (opt.warc_cdx_enabled)
1069         {
1070           if (! warc_start_cdx_file ())
1071             {
1072               logprintf (LOG_NOTQUIET,
1073                          _("Could not open CDX file for output.\n"));
1074               exit(1);
1075             }
1076         }
1077     }
1078 }
1079
1080 /* Writes metadata (manifest, configuration, log file) to the WARC file. */
1081 static void
1082 warc_write_metadata (void)
1083 {
1084   /* If there are multiple WARC files, the metadata should be written to a separate file. */
1085   if (opt.warc_maxsize > 0)
1086     warc_start_new_file (true);
1087
1088   char manifest_uuid [48];
1089   warc_uuid_str (manifest_uuid);
1090
1091   fflush (warc_manifest_fp);
1092   warc_write_metadata_record (manifest_uuid,
1093                               "metadata://gnu.org/software/wget/warc/MANIFEST.txt",
1094                               NULL, NULL, NULL, "text/plain",
1095                               warc_manifest_fp, -1);
1096   /* warc_write_resource_record has closed warc_manifest_fp. */
1097
1098   FILE * warc_tmp_fp = warc_tempfile ();
1099   if (warc_tmp_fp == NULL)
1100     {
1101       logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n"));
1102       exit(1);
1103     }
1104   fflush (warc_tmp_fp);
1105   fprintf (warc_tmp_fp, "%s\n", program_argstring);
1106
1107   warc_write_resource_record (NULL,
1108                    "metadata://gnu.org/software/wget/warc/wget_arguments.txt",
1109                               NULL, manifest_uuid, NULL, "text/plain",
1110                               warc_tmp_fp, -1);
1111   /* warc_write_resource_record has closed warc_tmp_fp. */
1112
1113   if (warc_log_fp != NULL)
1114     {
1115       warc_write_resource_record (NULL,
1116                               "metadata://gnu.org/software/wget/warc/wget.log",
1117                                   NULL, manifest_uuid, NULL, "text/plain",
1118                                   warc_log_fp, -1);
1119       /* warc_write_resource_record has closed warc_log_fp. */
1120
1121       warc_log_fp = NULL;
1122       log_set_warc_log_fp (NULL);
1123     }
1124 }
1125
1126 /* Finishes the WARC writing.
1127    This should be called at the end of the program. */
1128 void
1129 warc_close (void)
1130 {
1131   if (warc_current_file != NULL)
1132     {
1133       warc_write_metadata ();
1134       free (warc_current_warcinfo_uuid_str);
1135       fclose (warc_current_file);
1136     }
1137   if (warc_current_cdx_file != NULL)
1138     fclose (warc_current_cdx_file);
1139   if (warc_log_fp != NULL)
1140     {
1141       fclose (warc_log_fp);
1142       log_set_warc_log_fp (NULL);
1143     }
1144 }
1145
1146 /* Creates a temporary file for writing WARC output.
1147    The temporary file will be created in opt.warc_tempdir.
1148    Returns the pointer to the temporary file, or NULL. */
1149 FILE *
1150 warc_tempfile (void)
1151 {
1152   char filename[100];
1153   if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1)
1154     return NULL;
1155
1156   int fd = mkostemp (filename, O_TEMPORARY);
1157   if (fd < 0)
1158     return NULL;
1159
1160 #if !O_TEMPORARY
1161   if (unlink (filename) < 0)
1162     return NULL;
1163 #endif
1164
1165
1166   return fdopen (fd, "wb+");
1167 }
1168
1169
1170 /* Writes a request record to the WARC file.
1171    url  is the target uri of the request,
1172    timestamp_str  is the timestamp of the request (generated with warc_timestamp),
1173    record_uuid  is the uuid of the request (generated with warc_uuid_str),
1174    body  is a pointer to a file containing the request headers and body.
1175    ip  is the ip address of the server (or NULL),
1176    Calling this function will close body.
1177    Returns true on success, false on error. */
1178 bool
1179 warc_write_request_record (char *url, char *timestamp_str, char *record_uuid,
1180                            ip_address *ip, FILE *body, off_t payload_offset)
1181 {
1182   warc_write_start_record ();
1183   warc_write_header ("WARC-Type", "request");
1184   warc_write_header ("WARC-Target-URI", url);
1185   warc_write_header ("Content-Type", "application/http;msgtype=request");
1186   warc_write_date_header (timestamp_str);
1187   warc_write_header ("WARC-Record-ID", record_uuid);
1188   warc_write_ip_header (ip);
1189   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1190   warc_write_digest_headers (body, payload_offset);
1191   warc_write_block_from_file (body);
1192   warc_write_end_record ();
1193
1194   fclose (body);
1195
1196   return warc_write_ok;
1197 }
1198
1199 /* Writes a response record to the CDX file.
1200    url  is the target uri of the request/response,
1201    timestamp_str  is the timestamp of the request that generated this response,
1202                   (generated with warc_timestamp),
1203    mime_type  is the mime type of the response body (will be printed to CDX),
1204    response_code  is the HTTP response code (will be printed to CDX),
1205    payload_digest  is the sha1 digest of the payload,
1206    redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
1207    offset  is the position of the WARC record in the WARC file,
1208    warc_filename  is the filename of the WARC,
1209    response_uuid  is the uuid of the response.
1210    Returns true on success, false on error. */
1211 static bool
1212 warc_write_cdx_record (const char *url, const char *timestamp_str,
1213                        const char *mime_type, int response_code,
1214                        const char *payload_digest, const char *redirect_location,
1215                        off_t offset, const char *warc_filename,
1216                        const char *response_uuid)
1217 {
1218   /* Transform the timestamp. */
1219   char timestamp_str_cdx [15];
1220   memcpy (timestamp_str_cdx     , timestamp_str     , 4); /* "YYYY" "-" */
1221   memcpy (timestamp_str_cdx +  4, timestamp_str +  5, 2); /* "mm"   "-" */
1222   memcpy (timestamp_str_cdx +  6, timestamp_str +  8, 2); /* "dd"   "T" */
1223   memcpy (timestamp_str_cdx +  8, timestamp_str + 11, 2); /* "HH"   ":" */
1224   memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM"   ":" */
1225   memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS"   "Z" */
1226   timestamp_str_cdx[14] = '\0';
1227
1228   /* Rewrite the checksum. */
1229   const char *checksum;
1230   if (payload_digest != NULL)
1231     checksum = payload_digest + 5; /* Skip the "sha1:" */
1232   else
1233     checksum = "-";
1234
1235   if (mime_type == NULL || strlen(mime_type) == 0)
1236     mime_type = "-";
1237   if (redirect_location == NULL || strlen(redirect_location) == 0)
1238     redirect_location = "-";
1239
1240   char offset_string[MAX_INT_TO_STRING_LEN(off_t)];
1241   number_to_string (offset_string, offset);
1242
1243   /* Print the CDX line. */
1244   fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %s %s %s\n", url,
1245            timestamp_str_cdx, url, mime_type, response_code, checksum,
1246            redirect_location, offset_string, warc_current_filename,
1247            response_uuid);
1248   fflush (warc_current_cdx_file);
1249
1250   return true;
1251 }
1252
1253 /* Writes a revisit record to the WARC file.
1254    url  is the target uri of the request/response,
1255    timestamp_str  is the timestamp of the request that generated this response
1256                   (generated with warc_timestamp),
1257    concurrent_to_uuid  is the uuid of the request for that generated this response
1258                  (generated with warc_uuid_str),
1259    refers_to_uuid  is the uuid of the original response
1260                  (generated with warc_uuid_str),
1261    payload_digest  is the sha1 digest of the payload,
1262    ip  is the ip address of the server (or NULL),
1263    body  is a pointer to a file containing the response headers (without payload).
1264    Calling this function will close body.
1265    Returns true on success, false on error. */
1266 static bool
1267 warc_write_revisit_record (char *url, char *timestamp_str,
1268                            char *concurrent_to_uuid, char *payload_digest,
1269                            char *refers_to, ip_address *ip, FILE *body)
1270 {
1271   char revisit_uuid [48];
1272   warc_uuid_str (revisit_uuid);
1273
1274   char *block_digest = NULL;
1275   char sha1_res_block[SHA1_DIGEST_SIZE];
1276   sha1_stream (body, sha1_res_block);
1277   block_digest = warc_base32_sha1_digest (sha1_res_block);
1278
1279   warc_write_start_record ();
1280   warc_write_header ("WARC-Type", "revisit");
1281   warc_write_header ("WARC-Record-ID", revisit_uuid);
1282   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1283   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1284   warc_write_header ("WARC-Refers-To", refers_to);
1285   warc_write_header ("WARC-Profile", "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest");
1286   warc_write_header ("WARC-Truncated", "length");
1287   warc_write_header ("WARC-Target-URI", url);
1288   warc_write_date_header (timestamp_str);
1289   warc_write_ip_header (ip);
1290   warc_write_header ("Content-Type", "application/http;msgtype=response");
1291   warc_write_header ("WARC-Block-Digest", block_digest);
1292   warc_write_header ("WARC-Payload-Digest", payload_digest);
1293   warc_write_block_from_file (body);
1294   warc_write_end_record ();
1295
1296   fclose (body);
1297   free (block_digest);
1298
1299   return warc_write_ok;
1300 }
1301
1302 /* Writes a response record to the WARC file.
1303    url  is the target uri of the request/response,
1304    timestamp_str  is the timestamp of the request that generated this response
1305                   (generated with warc_timestamp),
1306    concurrent_to_uuid  is the uuid of the request for that generated this response
1307                  (generated with warc_uuid_str),
1308    ip  is the ip address of the server (or NULL),
1309    body  is a pointer to a file containing the response headers and body.
1310    mime_type  is the mime type of the response body (will be printed to CDX),
1311    response_code  is the HTTP response code (will be printed to CDX),
1312    redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
1313    Calling this function will close body.
1314    Returns true on success, false on error. */
1315 bool
1316 warc_write_response_record (char *url, char *timestamp_str,
1317                             char *concurrent_to_uuid, ip_address *ip,
1318                             FILE *body, off_t payload_offset, char *mime_type,
1319                             int response_code, char *redirect_location)
1320 {
1321   char *block_digest = NULL;
1322   char *payload_digest = NULL;
1323   char sha1_res_block[SHA1_DIGEST_SIZE];
1324   char sha1_res_payload[SHA1_DIGEST_SIZE];
1325
1326   if (opt.warc_digests_enabled)
1327     {
1328       /* Calculate the block and payload digests. */
1329       rewind (body);
1330       if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload,
1331           payload_offset) == 0)
1332         {
1333           /* Decide (based on url + payload digest) if we have seen this
1334              data before. */
1335           struct warc_cdx_record *rec_existing;
1336           rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload);
1337           if (rec_existing != NULL)
1338             {
1339               bool result;
1340
1341               /* Found an existing record. */
1342               logprintf (LOG_VERBOSE,
1343           _("Found exact match in CDX file. Saving revisit record to WARC.\n"));
1344
1345               /* Remove the payload from the file. */
1346               if (payload_offset > 0)
1347                 {
1348                   if (ftruncate (fileno (body), payload_offset) == -1)
1349                     return false;
1350                 }
1351
1352               /* Send the original payload digest. */
1353               payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1354               result = warc_write_revisit_record (url, timestamp_str,
1355                          concurrent_to_uuid, payload_digest, rec_existing->uuid,
1356                          ip, body);
1357               free (payload_digest);
1358
1359               return result;
1360             }
1361
1362           block_digest = warc_base32_sha1_digest (sha1_res_block);
1363           payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1364         }
1365     }
1366
1367   /* Not a revisit, just store the record. */
1368
1369   char response_uuid [48];
1370   warc_uuid_str (response_uuid);
1371
1372   fseeko (warc_current_file, 0L, SEEK_END);
1373   off_t offset = ftello (warc_current_file);
1374
1375   warc_write_start_record ();
1376   warc_write_header ("WARC-Type", "response");
1377   warc_write_header ("WARC-Record-ID", response_uuid);
1378   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1379   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1380   warc_write_header ("WARC-Target-URI", url);
1381   warc_write_date_header (timestamp_str);
1382   warc_write_ip_header (ip);
1383   warc_write_header ("WARC-Block-Digest", block_digest);
1384   warc_write_header ("WARC-Payload-Digest", payload_digest);
1385   warc_write_header ("Content-Type", "application/http;msgtype=response");
1386   warc_write_block_from_file (body);
1387   warc_write_end_record ();
1388
1389   fclose (body);
1390
1391   if (warc_write_ok && opt.warc_cdx_enabled)
1392     {
1393       /* Add this record to the CDX. */
1394       warc_write_cdx_record (url, timestamp_str, mime_type, response_code,
1395       payload_digest, redirect_location, offset, warc_current_filename,
1396       response_uuid);
1397     }
1398
1399   if (block_digest)
1400     free (block_digest);
1401   if (payload_digest)
1402     free (payload_digest);
1403
1404   return warc_write_ok;
1405 }
1406
1407 /* Writes a resource or metadata record to the WARC file.
1408    warc_type  is either "resource" or "metadata",
1409    resource_uuid  is the uuid of the resource (or NULL),
1410    url  is the target uri of the resource,
1411    timestamp_str  is the timestamp (generated with warc_timestamp),
1412    concurrent_to_uuid  is the uuid of the record that generated this,
1413    resource (generated with warc_uuid_str) or NULL,
1414    ip  is the ip address of the server (or NULL),
1415    content_type  is the mime type of the body (or NULL),
1416    body  is a pointer to a file containing the resource data.
1417    Calling this function will close body.
1418    Returns true on success, false on error. */
1419 static bool
1420 warc_write_record (const char *record_type, char *resource_uuid,
1421                  const char *url, const char *timestamp_str,
1422                  const char *concurrent_to_uuid,
1423                  ip_address *ip, const char *content_type, FILE *body,
1424                  off_t payload_offset)
1425 {
1426   if (resource_uuid == NULL)
1427     {
1428       resource_uuid = alloca (48);
1429       warc_uuid_str (resource_uuid);
1430     }
1431
1432   if (content_type == NULL)
1433     content_type = "application/octet-stream";
1434
1435   warc_write_start_record ();
1436   warc_write_header ("WARC-Type", record_type);
1437   warc_write_header ("WARC-Record-ID", resource_uuid);
1438   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1439   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1440   warc_write_header ("WARC-Target-URI", url);
1441   warc_write_date_header (timestamp_str);
1442   warc_write_ip_header (ip);
1443   warc_write_digest_headers (body, payload_offset);
1444   warc_write_header ("Content-Type", content_type);
1445   warc_write_block_from_file (body);
1446   warc_write_end_record ();
1447
1448   fclose (body);
1449
1450   return warc_write_ok;
1451 }
1452
1453 /* Writes a resource record to the WARC file.
1454    resource_uuid  is the uuid of the resource (or NULL),
1455    url  is the target uri of the resource,
1456    timestamp_str  is the timestamp (generated with warc_timestamp),
1457    concurrent_to_uuid  is the uuid of the record that generated this,
1458    resource (generated with warc_uuid_str) or NULL,
1459    ip  is the ip address of the server (or NULL),
1460    content_type  is the mime type of the body (or NULL),
1461    body  is a pointer to a file containing the resource data.
1462    Calling this function will close body.
1463    Returns true on success, false on error. */
1464 bool
1465 warc_write_resource_record (char *resource_uuid, const char *url,
1466                  const char *timestamp_str, const char *concurrent_to_uuid,
1467                  ip_address *ip, const char *content_type, FILE *body,
1468                  off_t payload_offset)
1469 {
1470   return warc_write_record ("resource",
1471       resource_uuid, url, timestamp_str, concurrent_to_uuid,
1472       ip, content_type, body, payload_offset);
1473 }
1474
1475 /* Writes a metadata record to the WARC file.
1476    record_uuid  is the uuid of the record (or NULL),
1477    url  is the target uri of the record,
1478    timestamp_str  is the timestamp (generated with warc_timestamp),
1479    concurrent_to_uuid  is the uuid of the record that generated this,
1480    record (generated with warc_uuid_str) or NULL,
1481    ip  is the ip address of the server (or NULL),
1482    content_type  is the mime type of the body (or NULL),
1483    body  is a pointer to a file containing the record data.
1484    Calling this function will close body.
1485    Returns true on success, false on error. */
1486 bool
1487 warc_write_metadata_record (char *record_uuid, const char *url,
1488                  const char *timestamp_str, const char *concurrent_to_uuid,
1489                  ip_address *ip, const char *content_type, FILE *body,
1490                  off_t payload_offset)
1491 {
1492   return warc_write_record ("metadata",
1493       record_uuid, url, timestamp_str, concurrent_to_uuid,
1494       ip, content_type, body, payload_offset);
1495 }