sjero.net Git - wget/blob - src/warc.c

   1 /* Utility functions for writing WARC files.
   2    Copyright (C) 2011, 2012 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  18
  19 Additional permission under GNU GPL version 3 section 7
  20
  21 If you modify this program, or any covered work, by linking or
  22 combining it with the OpenSSL project's OpenSSL library (or a
  23 modified version of that library), containing parts covered by the
  24 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  25 grants you additional permission to convey the resulting work.
  26 Corresponding Source for a non-source form of such a combination
  27 shall include the source code for the parts of OpenSSL used as well
  28 as that of the covered work.  */
  29
  30 #define _GNU_SOURCE
  31
  32 #include "wget.h"
  33 #include "hash.h"
  34 #include "utils.h"
  35
  36 #include <stdio.h>
  37 #include <stdlib.h>
  38 #include <string.h>
  39 #include <strings.h>
  40 #include <time.h>
  41 #include <tmpdir.h>
  42 #include <sha1.h>
  43 #include <base32.h>
  44 #include <unistd.h>
  45 #ifdef HAVE_LIBZ
  46 #include <zlib.h>
  47 #endif
  48 #ifdef HAVE_LIBUUID
  49 #include <uuid/uuid.h>
  50 #endif
  51
  52 #ifndef WINDOWS
  53 #include <libgen.h>
  54 #else
  55 #include <fcntl.h>
  56 #endif
  57
  58 #include "warc.h"
  59
  60 #ifndef O_TEMPORARY
  61 #define O_TEMPORARY 0
  62 #endif
  63
  64 extern char *version_string;
  65
  66 /* Set by main in main.c */
  67 extern char *program_argstring;
  68
  69
  70 /* The log file (a temporary file that contains a copy
  71    of the wget log). */
  72 static FILE *warc_log_fp;
  73
  74 /* The manifest file (a temporary file that contains the
  75    warcinfo uuid of every file in this crawl). */
  76 static FILE *warc_manifest_fp;
  77
  78 /* The current WARC file (or NULL, if WARC is disabled). */
  79 static FILE *warc_current_file;
  80
  81 #ifdef HAVE_LIBZ
  82 /* The gzip stream for the current WARC file
  83    (or NULL, if WARC or gzip is disabled). */
  84 static gzFile warc_current_gzfile;
  85
  86 /* The offset of the current gzip record in the WARC file. */
  87 static off_t warc_current_gzfile_offset;
  88
  89 /* The uncompressed size (so far) of the current record. */
  90 static off_t warc_current_gzfile_uncompressed_size;
  91 # endif
  92
  93 /* This is true until a warc_write_* method fails. */
  94 static bool warc_write_ok;
  95
  96 /* The current CDX file (or NULL, if CDX is disabled). */
  97 static FILE *warc_current_cdx_file;
  98
  99 /* The record id of the warcinfo record of the current WARC file.  */
 100 static char *warc_current_warcinfo_uuid_str;
 101
 102 /* The file name of the current WARC file. */
 103 static char *warc_current_filename;
 104
 105 /* The serial number of the current WARC file.  This number is
 106    incremented each time a new file is opened and is used in the
 107    WARC file's filename. */
 108 static int warc_current_file_number;
 109
 110 /* The table of CDX records, if deduplication is enabled. */
 111 struct hash_table * warc_cdx_dedup_table;
 112
 113 static bool warc_start_new_file (bool meta);
 114
 115
/* One entry of the CDX deduplication table (warc_cdx_dedup_table).
   Entries are built by warc_process_cdx_line from a single CDX line.  */
struct warc_cdx_record
{
  /* Heap-allocated copy of the line's original-url field.  */
  char *url;
  /* Heap-allocated copy of the line's record-id field.  */
  char *uuid;
  /* Decoded (binary) SHA1 checksum of the line; also used as the
     key under which the entry is stored in the hash table.  */
  char digest[SHA1_DIGEST_SIZE];
};
 122
 123 static unsigned long
 124 warc_hash_sha1_digest (const void *key)
 125 {
 126   /* We just use some of the first bytes of the digest. */
 127   unsigned long v = 0;
 128   memcpy (&v, key, sizeof (unsigned long));
 129   return v;
 130 }
 131
 132 static int
 133 warc_cmp_sha1_digest (const void *digest1, const void *digest2)
 134 {
 135   return !memcmp (digest1, digest2, SHA1_DIGEST_SIZE);
 136 }
 137
 138
 139
 140 /* Writes SIZE bytes from BUFFER to the current WARC file,
 141    through gzwrite if compression is enabled.
 142    Returns the number of uncompressed bytes written.  */
 143 static size_t
 144 warc_write_buffer (const char *buffer, size_t size)
 145 {
 146 #ifdef HAVE_LIBZ
 147   if (warc_current_gzfile)
 148     {
 149       warc_current_gzfile_uncompressed_size += size;
 150       return gzwrite (warc_current_gzfile, buffer, size);
 151     }
 152   else
 153 #endif
 154     return fwrite (buffer, 1, size, warc_current_file);
 155 }
 156
 157 /* Writes STR to the current WARC file.
 158    Returns false and set warc_write_ok to false if there
 159    is an error.  */
 160 static bool
 161 warc_write_string (const char *str)
 162 {
 163   if (!warc_write_ok)
 164     return false;
 165
 166   size_t n = strlen (str);
 167   if (n != warc_write_buffer (str, n))
 168     warc_write_ok = false;
 169
 170   return warc_write_ok;
 171 }
 172
 173
 174 #define EXTRA_GZIP_HEADER_SIZE 14
 175 #define GZIP_STATIC_HEADER_SIZE  10
 176 #define FLG_FEXTRA          0x04
 177 #define OFF_FLG             3
 178
 179 /* Starts a new WARC record.  Writes the version header.
 180    If opt.warc_maxsize is set and the current file is becoming
 181    too large, this will open a new WARC file.
 182
 183    If compression is enabled, this will start a new
 184    gzip stream in the current WARC file.
 185
 186    Returns false and set warc_write_ok to false if there
 187    is an error.  */
 188 static bool
 189 warc_write_start_record (void)
 190 {
 191   if (!warc_write_ok)
 192     return false;
 193
 194   fflush (warc_current_file);
 195   if (opt.warc_maxsize > 0 && ftello (warc_current_file) >= opt.warc_maxsize)
 196     warc_start_new_file (false);
 197
 198 #ifdef HAVE_LIBZ
 199   /* Start a GZIP stream, if required. */
 200   if (opt.warc_compression_enabled)
 201     {
 202       /* Record the starting offset of the new record. */
 203       warc_current_gzfile_offset = ftello (warc_current_file);
 204
 205       /* Reserve space for the extra GZIP header field.
 206          In warc_write_end_record we will fill this space
 207          with information about the uncompressed and
 208          compressed size of the record. */
 209       fseek (warc_current_file, EXTRA_GZIP_HEADER_SIZE, SEEK_CUR);
 210       fflush (warc_current_file);
 211
 212       /* Start a new GZIP stream. */
 213       warc_current_gzfile = gzdopen (dup (fileno (warc_current_file)), "wb9");
 214       warc_current_gzfile_uncompressed_size = 0;
 215
 216       if (warc_current_gzfile == NULL)
 217         {
 218           logprintf (LOG_NOTQUIET,
 219 _("Error opening GZIP stream to WARC file.\n"));
 220           warc_write_ok = false;
 221           return false;
 222         }
 223     }
 224 #endif
 225
 226   warc_write_string ("WARC/1.0\r\n");
 227   return warc_write_ok;
 228 }
 229
 230 /* Writes a WARC header to the current WARC record.
 231    This method may be run after warc_write_start_record and
 232    before warc_write_block_from_file.  */
 233 static bool
 234 warc_write_header (const char *name, const char *value)
 235 {
 236   if (value)
 237     {
 238       warc_write_string (name);
 239       warc_write_string (": ");
 240       warc_write_string (value);
 241       warc_write_string ("\r\n");
 242     }
 243   return warc_write_ok;
 244 }
 245
 246 /* Copies the contents of DATA_IN to the WARC record.
 247    Adds a Content-Length header to the WARC record.
 248    Run this method after warc_write_header,
 249    then run warc_write_end_record. */
 250 static bool
 251 warc_write_block_from_file (FILE *data_in)
 252 {
 253   /* Add the Content-Length header. */
 254   char content_length[MAX_INT_TO_STRING_LEN(off_t)];
 255   fseeko (data_in, 0L, SEEK_END);
 256   number_to_string (content_length, ftello (data_in));
 257   warc_write_header ("Content-Length", content_length);
 258
 259   /* End of the WARC header section. */
 260   warc_write_string ("\r\n");
 261
 262   if (fseeko (data_in, 0L, SEEK_SET) != 0)
 263     warc_write_ok = false;
 264
 265   /* Copy the data in the file to the WARC record. */
 266   char buffer[BUFSIZ];
 267   size_t s;
 268   while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0)
 269     {
 270       if (warc_write_buffer (buffer, s) < s)
 271         warc_write_ok = false;
 272     }
 273
 274   return warc_write_ok;
 275 }
 276
 277 /* Run this method to close the current WARC record.
 278
 279    If compression is enabled, this method closes the
 280    current GZIP stream and fills the extra GZIP header
 281    with the uncompressed and compressed length of the
 282    record. */
 283 static bool
 284 warc_write_end_record (void)
 285 {
 286   warc_write_buffer ("\r\n\r\n", 4);
 287
 288 #ifdef HAVE_LIBZ
 289   /* We start a new gzip stream for each record.  */
 290   if (warc_write_ok && warc_current_gzfile)
 291     {
 292       if (gzclose (warc_current_gzfile) != Z_OK)
 293         {
 294           warc_write_ok = false;
 295           return false;
 296         }
 297
 298       fflush (warc_current_file);
 299       fseeko (warc_current_file, 0, SEEK_END);
 300
 301       /* The WARC standard suggests that we add 'skip length' data in the
 302          extra header field of the GZIP stream.
 303
 304          In warc_write_start_record we reserved space for this extra header.
 305          This extra space starts at warc_current_gzfile_offset and fills
 306          EXTRA_GZIP_HEADER_SIZE bytes.  The static GZIP header starts at
 307          warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE.
 308
 309          We need to do three things:
 310          1. Move the static GZIP header to warc_current_gzfile_offset;
 311          2. Set the FEXTRA flag in the GZIP header;
 312          3. Write the extra GZIP header after the static header, that is,
 313             starting at warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE.
 314       */
 315
 316       /* Calculate the uncompressed and compressed sizes. */
 317       off_t current_offset = ftello (warc_current_file);
 318       off_t uncompressed_size = current_offset - warc_current_gzfile_offset;
 319       off_t compressed_size = warc_current_gzfile_uncompressed_size;
 320
 321       /* Go back to the static GZIP header. */
 322       fseeko (warc_current_file, warc_current_gzfile_offset
 323               + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
 324
 325       /* Read the header. */
 326       char static_header[GZIP_STATIC_HEADER_SIZE];
 327       size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE,
 328                              warc_current_file);
 329       if (result != GZIP_STATIC_HEADER_SIZE)
 330         {
 331           warc_write_ok = false;
 332           return false;
 333         }
 334
 335       /* Set the FEXTRA flag in the flags byte of the header. */
 336       static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
 337
 338       /* Write the header back to the file, but starting at
 339          warc_current_gzfile_offset. */
 340       fseeko (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
 341       fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
 342
 343       /* Prepare the extra GZIP header. */
 344       char extra_header[EXTRA_GZIP_HEADER_SIZE];
 345       /* XLEN, the length of the extra header fields.  */
 346       extra_header[0]  = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
 347       extra_header[1]  = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
 348       /* The extra header field identifier for the WARC skip length. */
 349       extra_header[2]  = 's';
 350       extra_header[3]  = 'l';
 351       /* The size of the field value (8 bytes).  */
 352       extra_header[4]  = (8 & 255);
 353       extra_header[5]  = ((8 >> 8) & 255);
 354       /* The size of the uncompressed record.  */
 355       extra_header[6]  = (uncompressed_size & 255);
 356       extra_header[7]  = (uncompressed_size >> 8) & 255;
 357       extra_header[8]  = (uncompressed_size >> 16) & 255;
 358       extra_header[9]  = (uncompressed_size >> 24) & 255;
 359       /* The size of the compressed record.  */
 360       extra_header[10] = (compressed_size & 255);
 361       extra_header[11] = (compressed_size >> 8) & 255;
 362       extra_header[12] = (compressed_size >> 16) & 255;
 363       extra_header[13] = (compressed_size >> 24) & 255;
 364
 365       /* Write the extra header after the static header. */
 366       fseeko (warc_current_file, warc_current_gzfile_offset
 367               + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
 368       fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
 369
 370       /* Done, move back to the end of the file. */
 371       fflush (warc_current_file);
 372       fseeko (warc_current_file, 0, SEEK_END);
 373     }
 374 #endif /* HAVE_LIBZ */
 375
 376   return warc_write_ok;
 377 }
 378
 379
 380 /* Writes the WARC-Date header for the given timestamp to
 381    the current WARC record.
 382    If timestamp is NULL, the current time will be used.  */
 383 static bool
 384 warc_write_date_header (const char *timestamp)
 385 {
 386   if (timestamp == NULL)
 387     {
 388       char current_timestamp[21];
 389       warc_timestamp (current_timestamp);
 390       timestamp = current_timestamp;
 391     }
 392   return warc_write_header ("WARC-Date", timestamp);
 393 }
 394
 395 /* Writes the WARC-IP-Address header for the given IP to
 396    the current WARC record.  If IP is NULL, no header will
 397    be written.  */
 398 static bool
 399 warc_write_ip_header (ip_address *ip)
 400 {
 401   if (ip != NULL)
 402     return warc_write_header ("WARC-IP-Address", print_address (ip));
 403   else
 404     return warc_write_ok;
 405 }
 406
 407
 408 /* warc_sha1_stream_with_payload is a modified copy of sha1_stream
 409    from gnulib/sha1.c.  This version calculates two digests in one go.
 410
 411    Compute SHA1 message digests for bytes read from STREAM.  The
 412    digest of the complete file will be written into the 16 bytes
 413    beginning at RES_BLOCK.
 414
 415    If payload_offset >= 0, a second digest will be calculated of the
 416    portion of the file starting at payload_offset and continuing to
 417    the end of the file.  The digest number will be written into the
 418    16 bytes beginning ad RES_PAYLOAD.  */
 419 static int
 420 warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
 421                                off_t payload_offset)
 422 {
 423 #define BLOCKSIZE 32768
 424
 425   struct sha1_ctx ctx_block;
 426   struct sha1_ctx ctx_payload;
 427   off_t pos;
 428   off_t sum;
 429
 430   char *buffer = malloc (BLOCKSIZE + 72);
 431   if (!buffer)
 432     return 1;
 433
 434   /* Initialize the computation context.  */
 435   sha1_init_ctx (&ctx_block);
 436   if (payload_offset >= 0)
 437     sha1_init_ctx (&ctx_payload);
 438
 439   pos = 0;
 440
 441   /* Iterate over full file contents.  */
 442   while (1)
 443     {
 444       /* We read the file in blocks of BLOCKSIZE bytes.  One call of the
 445          computation function processes the whole buffer so that with the
 446          next round of the loop another block can be read.  */
 447       off_t n;
 448       sum = 0;
 449
 450       /* Read block.  Take care for partial reads.  */
 451       while (1)
 452         {
 453           n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);
 454
 455           sum += n;
 456           pos += n;
 457
 458           if (sum == BLOCKSIZE)
 459             break;
 460
 461           if (n == 0)
 462             {
 463               /* Check for the error flag IFF N == 0, so that we don't
 464                  exit the loop after a partial read due to e.g., EAGAIN
 465                  or EWOULDBLOCK.  */
 466               if (ferror (stream))
 467                 {
 468                   free (buffer);
 469                   return 1;
 470                 }
 471               goto process_partial_block;
 472             }
 473
 474           /* We've read at least one byte, so ignore errors.  But always
 475              check for EOF, since feof may be true even though N > 0.
 476              Otherwise, we could end up calling fread after EOF.  */
 477           if (feof (stream))
 478             goto process_partial_block;
 479         }
 480
 481       /* Process buffer with BLOCKSIZE bytes.  Note that
 482                         BLOCKSIZE % 64 == 0
 483        */
 484       sha1_process_block (buffer, BLOCKSIZE, &ctx_block);
 485       if (payload_offset >= 0 && payload_offset < pos)
 486         {
 487           /* At least part of the buffer contains data from payload. */
 488           off_t start_of_payload = payload_offset - (pos - BLOCKSIZE);
 489           if (start_of_payload <= 0)
 490             /* All bytes in the buffer belong to the payload. */
 491             start_of_payload = 0;
 492
 493           /* Process the payload part of the buffer.
 494              Note: we can't use  sha1_process_block  here even if we
 495              process the complete buffer.  Because the payload doesn't
 496              have to start with a full block, there may still be some
 497              bytes left from the previous buffer.  Therefore, we need
 498              to continue with  sha1_process_bytes.  */
 499           sha1_process_bytes (buffer + start_of_payload,
 500                               BLOCKSIZE - start_of_payload, &ctx_payload);
 501         }
 502     }
 503
 504  process_partial_block:;
 505
 506   /* Process any remaining bytes.  */
 507   if (sum > 0)
 508     {
 509       sha1_process_bytes (buffer, sum, &ctx_block);
 510       if (payload_offset >= 0 && payload_offset < pos)
 511         {
 512           /* At least part of the buffer contains data from payload. */
 513           off_t start_of_payload = payload_offset - (pos - sum);
 514           if (start_of_payload <= 0)
 515             /* All bytes in the buffer belong to the payload. */
 516             start_of_payload = 0;
 517
 518           /* Process the payload part of the buffer. */
 519           sha1_process_bytes (buffer + start_of_payload,
 520                               sum - start_of_payload, &ctx_payload);
 521         }
 522     }
 523
 524   /* Construct result in desired memory.  */
 525   sha1_finish_ctx (&ctx_block,   res_block);
 526   if (payload_offset >= 0)
 527     sha1_finish_ctx (&ctx_payload, res_payload);
 528   free (buffer);
 529   return 0;
 530
 531 #undef BLOCKSIZE
 532 }
 533
 534 /* Converts the SHA1 digest to a base32-encoded string.
 535    "sha1:DIGEST\0"  (Allocates a new string for the response.)  */
 536 static char *
 537 warc_base32_sha1_digest (char *sha1_digest)
 538 {
 539   /* length: "sha1:" + digest + "\0" */
 540   char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 );
 541   base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5,
 542                  BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1);
 543   memcpy (sha1_base32, "sha1:", 5);
 544   sha1_base32[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5] = '\0';
 545   return sha1_base32;
 546 }
 547
 548
 549 /* Sets the digest headers of the record.
 550    This method will calculate the block digest and, if payload_offset >= 0,
 551    will also calculate the payload digest of the payload starting at the
 552    provided offset.  */
 553 static void
 554 warc_write_digest_headers (FILE *file, long payload_offset)
 555 {
 556   if (opt.warc_digests_enabled)
 557     {
 558       /* Calculate the block and payload digests. */
 559       char sha1_res_block[SHA1_DIGEST_SIZE];
 560       char sha1_res_payload[SHA1_DIGEST_SIZE];
 561
 562       rewind (file);
 563       if (warc_sha1_stream_with_payload (file, sha1_res_block,
 564           sha1_res_payload, payload_offset) == 0)
 565         {
 566           char *digest;
 567
 568           digest = warc_base32_sha1_digest (sha1_res_block);
 569           warc_write_header ("WARC-Block-Digest", digest);
 570           free (digest);
 571
 572           if (payload_offset >= 0)
 573             {
 574               digest = warc_base32_sha1_digest (sha1_res_payload);
 575               warc_write_header ("WARC-Payload-Digest", digest);
 576               free (digest);
 577             }
 578         }
 579     }
 580 }
 581
 582
 583 /* Fills timestamp with the current time and date.
 584    The UTC time is formatted following ISO 8601, as required
 585    for use in the WARC-Date header.
 586    The timestamp will be 21 characters long. */
 587 void
 588 warc_timestamp (char *timestamp)
 589 {
 590   time_t rawtime;
 591   struct tm * timeinfo;
 592   time ( &rawtime );
 593   timeinfo = gmtime (&rawtime);
 594   strftime (timestamp, 21, "%Y-%m-%dT%H:%M:%SZ", timeinfo);
 595 }
 596
 597 #ifdef HAVE_LIBUUID
 598 /* Fills urn_str with a UUID in the format required
 599    for the WARC-Record-Id header.
 600    The string will be 47 characters long. */
 601 void
 602 warc_uuid_str (char *urn_str)
 603 {
 604   char uuid_str[37];
 605
 606   uuid_t record_id;
 607   uuid_generate (record_id);
 608   uuid_unparse (record_id, uuid_str);
 609
 610   sprintf (urn_str, "<urn:uuid:%s>", uuid_str);
 611 }
 612 #else
 613 /* Fills urn_str with a UUID based on random numbers in the format
 614    required for the WARC-Record-Id header.
 615    (See RFC 4122, UUID version 4.)
 616
 617    Note: this is a fallback method, it is much better to use the
 618    methods provided by libuuid.
 619
 620    The string will be 47 characters long. */
 621 void
 622 warc_uuid_str (char *urn_str)
 623 {
 624   // RFC 4122, a version 4 UUID with only random numbers
 625
 626   unsigned char uuid_data[16];
 627   int i;
 628   for (i=0; i<16; i++)
 629     uuid_data[i] = random_number (255);
 630
 631   // Set the four most significant bits (bits 12 through 15) of the
 632   // time_hi_and_version field to the 4-bit version number
 633   uuid_data[6] = (uuid_data[6] & 0x0F) | 0x40;
 634
 635   // Set the two most significant bits (bits 6 and 7) of the
 636   // clock_seq_hi_and_reserved to zero and one, respectively.
 637   uuid_data[8] = (uuid_data[8] & 0xBF) | 0x80;
 638
 639   sprintf (urn_str,
 640     "<urn:uuid:%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x>",
 641     uuid_data[0], uuid_data[1], uuid_data[2], uuid_data[3], uuid_data[4],
 642     uuid_data[5], uuid_data[6], uuid_data[7], uuid_data[8], uuid_data[9],
 643     uuid_data[10], uuid_data[11], uuid_data[12], uuid_data[13], uuid_data[14],
 644     uuid_data[15]);
 645 }
 646 #endif
 647
 648 /* Write a warcinfo record to the current file.
 649    Updates warc_current_warcinfo_uuid_str. */
 650 static bool
 651 warc_write_warcinfo_record (char *filename)
 652 {
 653   /* Write warc-info record as the first record of the file. */
 654   /* We add the record id of this info record to the other records in the
 655      file. */
 656   warc_current_warcinfo_uuid_str = (char *) malloc (48);
 657   warc_uuid_str (warc_current_warcinfo_uuid_str);
 658
 659   char timestamp[22];
 660   warc_timestamp (timestamp);
 661
 662   char *filename_copy, *filename_basename;
 663   filename_copy = strdup (filename);
 664   filename_basename = strdup (basename (filename_copy));
 665
 666   warc_write_start_record ();
 667   warc_write_header ("WARC-Type", "warcinfo");
 668   warc_write_header ("Content-Type", "application/warc-fields");
 669   warc_write_header ("WARC-Date", timestamp);
 670   warc_write_header ("WARC-Record-ID", warc_current_warcinfo_uuid_str);
 671   warc_write_header ("WARC-Filename", filename_basename);
 672
 673   /* Create content.  */
 674   FILE *warc_tmp = warc_tempfile ();
 675   if (warc_tmp == NULL)
 676     {
 677       free (filename_copy);
 678       free (filename_basename);
 679       return false;
 680     }
 681
 682   fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE);
 683   fprintf (warc_tmp, "format: WARC File Format 1.0\r\n");
 684   fprintf (warc_tmp,
 685 "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n");
 686   fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off"));
 687   fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring);
 688   /* Add the user headers, if any. */
 689   if (opt.warc_user_headers)
 690     {
 691       int i;
 692       for (i = 0; opt.warc_user_headers[i]; i++)
 693         fprintf (warc_tmp, "%s\r\n", opt.warc_user_headers[i]);
 694     }
 695   fprintf(warc_tmp, "\r\n");
 696
 697   warc_write_digest_headers (warc_tmp, -1);
 698   warc_write_block_from_file (warc_tmp);
 699   warc_write_end_record ();
 700
 701   if (! warc_write_ok)
 702     logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
 703
 704   free (filename_copy);
 705   free (filename_basename);
 706   fclose (warc_tmp);
 707   return warc_write_ok;
 708 }
 709
 710 /* Opens a new WARC file.
 711    If META is true, generates a filename ending with 'meta.warc.gz'.
 712
 713    This method will:
 714    1. close the current WARC file (if there is one);
 715    2. increment warc_current_file_number;
 716    3. open a new WARC file;
 717    4. write the initial warcinfo record.
 718
 719    Returns true on success, false otherwise.
 720    */
 721 static bool
 722 warc_start_new_file (bool meta)
 723 {
 724   if (opt.warc_filename == NULL)
 725     return false;
 726
 727   if (warc_current_file != NULL)
 728     fclose (warc_current_file);
 729
 730   free (warc_current_warcinfo_uuid_str);
 731   free (warc_current_filename);
 732
 733   warc_current_file_number++;
 734
 735   int base_filename_length = strlen (opt.warc_filename);
 736   /* filename format:  base + "-" + 5 digit serial number + ".warc.gz" */
 737   char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
 738   warc_current_filename = new_filename;
 739
 740 #ifdef __VMS
 741 # define WARC_GZ "warc-gz"
 742 #else /* def __VMS */
 743 # define WARC_GZ "warc.gz"
 744 #endif /* def __VMS [else] */
 745
 746 #ifdef HAVE_LIBZ
 747   const char *extension = (opt.warc_compression_enabled ? WARC_GZ : "warc");
 748 #else
 749   const char *extension = "warc";
 750 #endif
 751
 752   /* If max size is enabled, we add a serial number to the file names. */
 753   if (meta)
 754     sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension);
 755   else if (opt.warc_maxsize > 0)
 756     {
 757       sprintf (new_filename, "%s-%05d.%s", opt.warc_filename,
 758                warc_current_file_number, extension);
 759     }
 760   else
 761     sprintf (new_filename, "%s.%s", opt.warc_filename, extension);
 762
 763   logprintf (LOG_VERBOSE, _("Opening WARC file %s.\n\n"), quote (new_filename));
 764
 765   /* Open the WARC file. */
 766   warc_current_file = fopen (new_filename, "wb+");
 767   if (warc_current_file == NULL)
 768     {
 769       logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"),
 770                  quote (new_filename));
 771       return false;
 772     }
 773
 774   if (! warc_write_warcinfo_record (new_filename))
 775     return false;
 776
 777   /* Add warcinfo uuid to manifest. */
 778   if (warc_manifest_fp)
 779     fprintf (warc_manifest_fp, "%s\n", warc_current_warcinfo_uuid_str);
 780
 781   return true;
 782 }
 783
 784 /* Opens the CDX file for output. */
 785 static bool
 786 warc_start_cdx_file (void)
 787 {
 788   int filename_length = strlen (opt.warc_filename);
 789   char *cdx_filename = alloca (filename_length + 4 + 1);
 790   memcpy (cdx_filename, opt.warc_filename, filename_length);
 791   memcpy (cdx_filename + filename_length, ".cdx", 5);
 792   warc_current_cdx_file = fopen (cdx_filename, "a+");
 793   if (warc_current_cdx_file == NULL)
 794     return false;
 795
 796   /* Print the CDX header.
 797    *
 798    * a - original url
 799    * b - date
 800    * m - mime type
 801    * s - response code
 802    * k - new style checksum
 803    * r - redirect
 804    * M - meta tags
 805    * V - compressed arc file offset
 806    * g - file name
 807    * u - record-id
 808    */
 809   fprintf (warc_current_cdx_file, " CDX a b a m s k r M V g u\n");
 810   fflush (warc_current_cdx_file);
 811
 812   return true;
 813 }
 814
 815 #define CDX_FIELDSEP " \t\r\n"
 816
 817 /* Parse the CDX header and find the field numbers of the original url,
 818    checksum and record ID fields. */
 819 static bool
 820 warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
 821                        int *field_num_checksum, int *field_num_record_id)
 822 {
 823   *field_num_original_url = -1;
 824   *field_num_checksum = -1;
 825   *field_num_record_id = -1;
 826
 827   char *token;
 828   char *save_ptr;
 829   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
 830
 831   if (token != NULL && strcmp (token, "CDX") == 0)
 832     {
 833       int field_num = 0;
 834       while (token != NULL)
 835         {
 836           token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
 837           if (token != NULL)
 838             {
 839               switch (token[0])
 840                 {
 841                 case 'a':
 842                   *field_num_original_url = field_num;
 843                   break;
 844                 case 'k':
 845                   *field_num_checksum = field_num;
 846                   break;
 847                 case 'u':
 848                   *field_num_record_id = field_num;
 849                   break;
 850                 }
 851             }
 852           field_num++;
 853         }
 854     }
 855
 856   return *field_num_original_url != -1
 857          && *field_num_checksum != -1
 858          && *field_num_record_id != -1;
 859 }
 860
 861 /* Parse the CDX record and add it to the warc_cdx_dedup_table hash table. */
 862 static void
 863 warc_process_cdx_line (char *lineptr, int field_num_original_url,
 864                        int field_num_checksum, int field_num_record_id)
 865 {
 866   char *original_url = NULL;
 867   char *checksum = NULL;
 868   char *record_id = NULL;
 869
 870   char *token;
 871   char *save_ptr;
 872   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
 873
 874   /* Read this line to get the fields we need. */
 875   int field_num = 0;
 876   while (token != NULL)
 877     {
 878       char **val;
 879       if (field_num == field_num_original_url)
 880         val = &original_url;
 881       else if (field_num == field_num_checksum)
 882         val = &checksum;
 883       else if (field_num == field_num_record_id)
 884         val = &record_id;
 885       else
 886         val = NULL;
 887
 888       if (val != NULL)
 889         *val = strdup (token);
 890
 891       token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
 892       field_num++;
 893     }
 894
 895   if (original_url != NULL && checksum != NULL && record_id != NULL)
 896     {
 897       /* For some extra efficiency, we decode the base32 encoded
 898          checksum value.  This should produce exactly SHA1_DIGEST_SIZE
 899          bytes.  */
 900       size_t checksum_l;
 901       char * checksum_v;
 902       base32_decode_alloc (checksum, strlen (checksum), &checksum_v,
 903                            &checksum_l);
 904       free (checksum);
 905
 906       if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
 907         {
 908           /* This is a valid line with a valid checksum. */
 909           struct warc_cdx_record *rec;
 910           rec = malloc (sizeof (struct warc_cdx_record));
 911           rec->url = original_url;
 912           rec->uuid = record_id;
 913           memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
 914           hash_table_put (warc_cdx_dedup_table, rec->digest, rec);
 915           free (checksum_v);
 916         }
 917       else
 918         {
 919           free (original_url);
 920           free (checksum_v);
 921           free (record_id);
 922         }
 923     }
 924   else
 925     {
 926       xfree_null(checksum);
 927       xfree_null(original_url);
 928       xfree_null(record_id);
 929     }
 930 }
 931
 932 /* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
 933    the warc_cdx_dedup_table. */
 934 static bool
 935 warc_load_cdx_dedup_file (void)
 936 {
 937   FILE *f = fopen (opt.warc_cdx_dedup_filename, "r");
 938   if (f == NULL)
 939     return false;
 940
 941   int field_num_original_url = -1;
 942   int field_num_checksum = -1;
 943   int field_num_record_id = -1;
 944
 945   char *lineptr = NULL;
 946   size_t n = 0;
 947   ssize_t line_length;
 948
 949   /* The first line should contain the CDX header.
 950      Format:  " CDX x x x x x"
 951      where x are field type indicators.  For our purposes, we only
 952      need 'a' (the original url), 'k' (the SHA1 checksum) and
 953      'u' (the WARC record id). */
 954   line_length = getline (&lineptr, &n, f);
 955   if (line_length != -1)
 956     warc_parse_cdx_header (lineptr, &field_num_original_url,
 957                            &field_num_checksum, &field_num_record_id);
 958
 959   /* If the file contains all three fields, read the complete file. */
 960   if (field_num_original_url == -1
 961       || field_num_checksum == -1
 962       || field_num_record_id == -1)
 963     {
 964       if (field_num_original_url == -1)
 965         logprintf (LOG_NOTQUIET,
 966 _("CDX file does not list original urls. (Missing column 'a'.)\n"));
 967       if (field_num_checksum == -1)
 968         logprintf (LOG_NOTQUIET,
 969 _("CDX file does not list checksums. (Missing column 'k'.)\n"));
 970       if (field_num_record_id == -1)
 971         logprintf (LOG_NOTQUIET,
 972 _("CDX file does not list record ids. (Missing column 'u'.)\n"));
 973     }
 974   else
 975     {
 976       /* Initialize the table. */
 977       warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest,
 978                                              warc_cmp_sha1_digest);
 979
 980       do
 981         {
 982           line_length = getline (&lineptr, &n, f);
 983           if (line_length != -1)
 984             {
 985               warc_process_cdx_line (lineptr, field_num_original_url,
 986                             field_num_checksum, field_num_record_id);
 987             }
 988
 989         }
 990       while (line_length != -1);
 991
 992       /* Print results. */
 993       int nrecords = hash_table_count (warc_cdx_dedup_table);
 994       logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
 995                                         "Loaded %d records from CDX.\n\n",
 996                                          nrecords),
 997                               nrecords);
 998     }
 999
1000   free (lineptr);
1001   fclose (f);
1002
1003   return true;
1004 }
1005 #undef CDX_FIELDSEP
1006
1007 /* Returns the existing duplicate CDX record for the given url and payload
1008    digest.  Returns NULL if the url is not found or if the payload digest
1009    does not match, or if CDX deduplication is disabled. */
1010 static struct warc_cdx_record *
1011 warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload)
1012 {
1013   if (warc_cdx_dedup_table == NULL)
1014     return NULL;
1015
1016   struct warc_cdx_record *rec_existing
1017     = hash_table_get (warc_cdx_dedup_table, sha1_digest_payload);
1018
1019   if (rec_existing && strcmp (rec_existing->url, url) == 0)
1020     return rec_existing;
1021   else
1022     return NULL;
1023 }
1024
1025 /* Initializes the WARC writer (if opt.warc_filename is set).
1026    This should be called before any WARC record is written. */
1027 void
1028 warc_init (void)
1029 {
1030   warc_write_ok = true;
1031
1032   if (opt.warc_filename != NULL)
1033     {
1034       if (opt.warc_cdx_dedup_filename != NULL)
1035         {
1036           if (! warc_load_cdx_dedup_file ())
1037             {
1038               logprintf (LOG_NOTQUIET,
1039                          _("Could not read CDX file %s for deduplication.\n"),
1040                          quote (opt.warc_cdx_dedup_filename));
1041               exit(1);
1042             }
1043         }
1044
1045       warc_manifest_fp = warc_tempfile ();
1046       if (warc_manifest_fp == NULL)
1047         {
1048           logprintf (LOG_NOTQUIET,
1049                      _("Could not open temporary WARC manifest file.\n"));
1050           exit(1);
1051         }
1052
1053       if (opt.warc_keep_log)
1054         {
1055           warc_log_fp = warc_tempfile ();
1056           if (warc_log_fp == NULL)
1057             {
1058               logprintf (LOG_NOTQUIET,
1059                          _("Could not open temporary WARC log file.\n"));
1060               exit(1);
1061             }
1062           log_set_warc_log_fp (warc_log_fp);
1063         }
1064
1065       warc_current_file_number = -1;
1066       if (! warc_start_new_file (false))
1067         {
1068           logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
1069           exit(1);
1070         }
1071
1072       if (opt.warc_cdx_enabled)
1073         {
1074           if (! warc_start_cdx_file ())
1075             {
1076               logprintf (LOG_NOTQUIET,
1077                          _("Could not open CDX file for output.\n"));
1078               exit(1);
1079             }
1080         }
1081     }
1082 }
1083
1084 /* Writes metadata (manifest, configuration, log file) to the WARC file. */
1085 static void
1086 warc_write_metadata (void)
1087 {
1088   /* If there are multiple WARC files, the metadata should be written to a separate file. */
1089   if (opt.warc_maxsize > 0)
1090     warc_start_new_file (true);
1091
1092   char manifest_uuid [48];
1093   warc_uuid_str (manifest_uuid);
1094
1095   fflush (warc_manifest_fp);
1096   warc_write_metadata_record (manifest_uuid,
1097                               "metadata://gnu.org/software/wget/warc/MANIFEST.txt",
1098                               NULL, NULL, NULL, "text/plain",
1099                               warc_manifest_fp, -1);
1100   /* warc_write_resource_record has closed warc_manifest_fp. */
1101
1102   FILE * warc_tmp_fp = warc_tempfile ();
1103   if (warc_tmp_fp == NULL)
1104     {
1105       logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n"));
1106       exit(1);
1107     }
1108   fflush (warc_tmp_fp);
1109   fprintf (warc_tmp_fp, "%s\n", program_argstring);
1110
1111   warc_write_resource_record (NULL,
1112                    "metadata://gnu.org/software/wget/warc/wget_arguments.txt",
1113                               NULL, manifest_uuid, NULL, "text/plain",
1114                               warc_tmp_fp, -1);
1115   /* warc_write_resource_record has closed warc_tmp_fp. */
1116
1117   if (warc_log_fp != NULL)
1118     {
1119       warc_write_resource_record (NULL,
1120                               "metadata://gnu.org/software/wget/warc/wget.log",
1121                                   NULL, manifest_uuid, NULL, "text/plain",
1122                                   warc_log_fp, -1);
1123       /* warc_write_resource_record has closed warc_log_fp. */
1124
1125       warc_log_fp = NULL;
1126       log_set_warc_log_fp (NULL);
1127     }
1128 }
1129
1130 /* Finishes the WARC writing.
1131    This should be called at the end of the program. */
1132 void
1133 warc_close (void)
1134 {
1135   if (warc_current_file != NULL)
1136     {
1137       warc_write_metadata ();
1138       free (warc_current_warcinfo_uuid_str);
1139       fclose (warc_current_file);
1140     }
1141   if (warc_current_cdx_file != NULL)
1142     fclose (warc_current_cdx_file);
1143   if (warc_log_fp != NULL)
1144     {
1145       fclose (warc_log_fp);
1146       log_set_warc_log_fp (NULL);
1147     }
1148 }
1149
1150 /* Creates a temporary file for writing WARC output.
1151    The temporary file will be created in opt.warc_tempdir.
1152    Returns the pointer to the temporary file, or NULL. */
1153 FILE *
1154 warc_tempfile (void)
1155 {
1156   char filename[100];
1157   if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1)
1158     return NULL;
1159
1160 #ifdef __VMS
1161   /* 2013-07-12 SMS.
1162    * mkostemp()+unlink()+fdopen() scheme causes trouble on VMS, so use
1163    * mktemp() to uniquify the (VMS-style) name, and then use a normal
1164    * fopen() with a "create temp file marked for delete" option.
1165    */
1166   {
1167     char *tfn;
1168
1169     tfn = mktemp (filename);            /* Get unique name from template. */
1170     if (tfn == NULL)
1171       return NULL;
1172     return fopen (tfn, "w+", "fop=tmd");    /* Create auto-delete temp file. */
1173   }
1174 #else /* def __VMS */
1175   int fd = mkostemp (filename, O_TEMPORARY);
1176   if (fd < 0)
1177     return NULL;
1178
1179 #if !O_TEMPORARY
1180   if (unlink (filename) < 0)
1181     return NULL;
1182 #endif
1183
1184   return fdopen (fd, "wb+");
1185 #endif /* def __VMS [else] */
1186 }
1187
1188
1189 /* Writes a request record to the WARC file.
1190    url  is the target uri of the request,
1191    timestamp_str  is the timestamp of the request (generated with warc_timestamp),
1192    record_uuid  is the uuid of the request (generated with warc_uuid_str),
1193    body  is a pointer to a file containing the request headers and body.
1194    ip  is the ip address of the server (or NULL),
1195    Calling this function will close body.
1196    Returns true on success, false on error. */
1197 bool
1198 warc_write_request_record (char *url, char *timestamp_str, char *record_uuid,
1199                            ip_address *ip, FILE *body, off_t payload_offset)
1200 {
1201   warc_write_start_record ();
1202   warc_write_header ("WARC-Type", "request");
1203   warc_write_header ("WARC-Target-URI", url);
1204   warc_write_header ("Content-Type", "application/http;msgtype=request");
1205   warc_write_date_header (timestamp_str);
1206   warc_write_header ("WARC-Record-ID", record_uuid);
1207   warc_write_ip_header (ip);
1208   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1209   warc_write_digest_headers (body, payload_offset);
1210   warc_write_block_from_file (body);
1211   warc_write_end_record ();
1212
1213   fclose (body);
1214
1215   return warc_write_ok;
1216 }
1217
1218 /* Writes a response record to the CDX file.
1219    url  is the target uri of the request/response,
1220    timestamp_str  is the timestamp of the request that generated this response,
1221                   (generated with warc_timestamp),
1222    mime_type  is the mime type of the response body (will be printed to CDX),
1223    response_code  is the HTTP response code (will be printed to CDX),
1224    payload_digest  is the sha1 digest of the payload,
1225    redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
1226    offset  is the position of the WARC record in the WARC file,
1227    warc_filename  is the filename of the WARC,
1228    response_uuid  is the uuid of the response.
1229    Returns true on success, false on error. */
1230 static bool
1231 warc_write_cdx_record (const char *url, const char *timestamp_str,
1232                        const char *mime_type, int response_code,
1233                        const char *payload_digest, const char *redirect_location,
1234                        off_t offset, const char *warc_filename,
1235                        const char *response_uuid)
1236 {
1237   /* Transform the timestamp. */
1238   char timestamp_str_cdx [15];
1239   memcpy (timestamp_str_cdx     , timestamp_str     , 4); /* "YYYY" "-" */
1240   memcpy (timestamp_str_cdx +  4, timestamp_str +  5, 2); /* "mm"   "-" */
1241   memcpy (timestamp_str_cdx +  6, timestamp_str +  8, 2); /* "dd"   "T" */
1242   memcpy (timestamp_str_cdx +  8, timestamp_str + 11, 2); /* "HH"   ":" */
1243   memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM"   ":" */
1244   memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS"   "Z" */
1245   timestamp_str_cdx[14] = '\0';
1246
1247   /* Rewrite the checksum. */
1248   const char *checksum;
1249   if (payload_digest != NULL)
1250     checksum = payload_digest + 5; /* Skip the "sha1:" */
1251   else
1252     checksum = "-";
1253
1254   if (mime_type == NULL || strlen(mime_type) == 0)
1255     mime_type = "-";
1256   if (redirect_location == NULL || strlen(redirect_location) == 0)
1257     redirect_location = "-";
1258
1259   char offset_string[MAX_INT_TO_STRING_LEN(off_t)];
1260   number_to_string (offset_string, offset);
1261
1262   /* Print the CDX line. */
1263   fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %s %s %s\n", url,
1264            timestamp_str_cdx, url, mime_type, response_code, checksum,
1265            redirect_location, offset_string, warc_current_filename,
1266            response_uuid);
1267   fflush (warc_current_cdx_file);
1268
1269   return true;
1270 }
1271
1272 /* Writes a revisit record to the WARC file.
1273    url  is the target uri of the request/response,
1274    timestamp_str  is the timestamp of the request that generated this response
1275                   (generated with warc_timestamp),
1276    concurrent_to_uuid  is the uuid of the request for that generated this response
1277                  (generated with warc_uuid_str),
1278    refers_to_uuid  is the uuid of the original response
1279                  (generated with warc_uuid_str),
1280    payload_digest  is the sha1 digest of the payload,
1281    ip  is the ip address of the server (or NULL),
1282    body  is a pointer to a file containing the response headers (without payload).
1283    Calling this function will close body.
1284    Returns true on success, false on error. */
1285 static bool
1286 warc_write_revisit_record (char *url, char *timestamp_str,
1287                            char *concurrent_to_uuid, char *payload_digest,
1288                            char *refers_to, ip_address *ip, FILE *body)
1289 {
1290   char revisit_uuid [48];
1291   warc_uuid_str (revisit_uuid);
1292
1293   char *block_digest = NULL;
1294   char sha1_res_block[SHA1_DIGEST_SIZE];
1295   sha1_stream (body, sha1_res_block);
1296   block_digest = warc_base32_sha1_digest (sha1_res_block);
1297
1298   warc_write_start_record ();
1299   warc_write_header ("WARC-Type", "revisit");
1300   warc_write_header ("WARC-Record-ID", revisit_uuid);
1301   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1302   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1303   warc_write_header ("WARC-Refers-To", refers_to);
1304   warc_write_header ("WARC-Profile", "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest");
1305   warc_write_header ("WARC-Truncated", "length");
1306   warc_write_header ("WARC-Target-URI", url);
1307   warc_write_date_header (timestamp_str);
1308   warc_write_ip_header (ip);
1309   warc_write_header ("Content-Type", "application/http;msgtype=response");
1310   warc_write_header ("WARC-Block-Digest", block_digest);
1311   warc_write_header ("WARC-Payload-Digest", payload_digest);
1312   warc_write_block_from_file (body);
1313   warc_write_end_record ();
1314
1315   fclose (body);
1316   free (block_digest);
1317
1318   return warc_write_ok;
1319 }
1320
1321 /* Writes a response record to the WARC file.
1322    url  is the target uri of the request/response,
1323    timestamp_str  is the timestamp of the request that generated this response
1324                   (generated with warc_timestamp),
1325    concurrent_to_uuid  is the uuid of the request for that generated this response
1326                  (generated with warc_uuid_str),
1327    ip  is the ip address of the server (or NULL),
1328    body  is a pointer to a file containing the response headers and body.
1329    mime_type  is the mime type of the response body (will be printed to CDX),
1330    response_code  is the HTTP response code (will be printed to CDX),
1331    redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
1332    Calling this function will close body.
1333    Returns true on success, false on error. */
1334 bool
1335 warc_write_response_record (char *url, char *timestamp_str,
1336                             char *concurrent_to_uuid, ip_address *ip,
1337                             FILE *body, off_t payload_offset, char *mime_type,
1338                             int response_code, char *redirect_location)
1339 {
1340   char *block_digest = NULL;
1341   char *payload_digest = NULL;
1342   char sha1_res_block[SHA1_DIGEST_SIZE];
1343   char sha1_res_payload[SHA1_DIGEST_SIZE];
1344
1345   if (opt.warc_digests_enabled)
1346     {
1347       /* Calculate the block and payload digests. */
1348       rewind (body);
1349       if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload,
1350           payload_offset) == 0)
1351         {
1352           /* Decide (based on url + payload digest) if we have seen this
1353              data before. */
1354           struct warc_cdx_record *rec_existing;
1355           rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload);
1356           if (rec_existing != NULL)
1357             {
1358               bool result;
1359
1360               /* Found an existing record. */
1361               logprintf (LOG_VERBOSE,
1362           _("Found exact match in CDX file. Saving revisit record to WARC.\n"));
1363
1364               /* Remove the payload from the file. */
1365               if (payload_offset > 0)
1366                 {
1367                   if (ftruncate (fileno (body), payload_offset) == -1)
1368                     return false;
1369                 }
1370
1371               /* Send the original payload digest. */
1372               payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1373               result = warc_write_revisit_record (url, timestamp_str,
1374                          concurrent_to_uuid, payload_digest, rec_existing->uuid,
1375                          ip, body);
1376               free (payload_digest);
1377
1378               return result;
1379             }
1380
1381           block_digest = warc_base32_sha1_digest (sha1_res_block);
1382           payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1383         }
1384     }
1385
1386   /* Not a revisit, just store the record. */
1387
1388   char response_uuid [48];
1389   warc_uuid_str (response_uuid);
1390
1391   fseeko (warc_current_file, 0L, SEEK_END);
1392   off_t offset = ftello (warc_current_file);
1393
1394   warc_write_start_record ();
1395   warc_write_header ("WARC-Type", "response");
1396   warc_write_header ("WARC-Record-ID", response_uuid);
1397   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1398   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1399   warc_write_header ("WARC-Target-URI", url);
1400   warc_write_date_header (timestamp_str);
1401   warc_write_ip_header (ip);
1402   warc_write_header ("WARC-Block-Digest", block_digest);
1403   warc_write_header ("WARC-Payload-Digest", payload_digest);
1404   warc_write_header ("Content-Type", "application/http;msgtype=response");
1405   warc_write_block_from_file (body);
1406   warc_write_end_record ();
1407
1408   fclose (body);
1409
1410   if (warc_write_ok && opt.warc_cdx_enabled)
1411     {
1412       /* Add this record to the CDX. */
1413       warc_write_cdx_record (url, timestamp_str, mime_type, response_code,
1414       payload_digest, redirect_location, offset, warc_current_filename,
1415       response_uuid);
1416     }
1417
1418   free (block_digest);
1419   free (payload_digest);
1420
1421   return warc_write_ok;
1422 }
1423
1424 /* Writes a resource or metadata record to the WARC file.
1425    warc_type  is either "resource" or "metadata",
1426    resource_uuid  is the uuid of the resource (or NULL),
1427    url  is the target uri of the resource,
1428    timestamp_str  is the timestamp (generated with warc_timestamp),
1429    concurrent_to_uuid  is the uuid of the record that generated this,
1430    resource (generated with warc_uuid_str) or NULL,
1431    ip  is the ip address of the server (or NULL),
1432    content_type  is the mime type of the body (or NULL),
1433    body  is a pointer to a file containing the resource data.
1434    Calling this function will close body.
1435    Returns true on success, false on error. */
1436 static bool
1437 warc_write_record (const char *record_type, char *resource_uuid,
1438                  const char *url, const char *timestamp_str,
1439                  const char *concurrent_to_uuid,
1440                  ip_address *ip, const char *content_type, FILE *body,
1441                  off_t payload_offset)
1442 {
1443   if (resource_uuid == NULL)
1444     {
1445       resource_uuid = alloca (48);
1446       warc_uuid_str (resource_uuid);
1447     }
1448
1449   if (content_type == NULL)
1450     content_type = "application/octet-stream";
1451
1452   warc_write_start_record ();
1453   warc_write_header ("WARC-Type", record_type);
1454   warc_write_header ("WARC-Record-ID", resource_uuid);
1455   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1456   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1457   warc_write_header ("WARC-Target-URI", url);
1458   warc_write_date_header (timestamp_str);
1459   warc_write_ip_header (ip);
1460   warc_write_digest_headers (body, payload_offset);
1461   warc_write_header ("Content-Type", content_type);
1462   warc_write_block_from_file (body);
1463   warc_write_end_record ();
1464
1465   fclose (body);
1466
1467   return warc_write_ok;
1468 }
1469
1470 /* Writes a resource record to the WARC file.
1471    resource_uuid  is the uuid of the resource (or NULL),
1472    url  is the target uri of the resource,
1473    timestamp_str  is the timestamp (generated with warc_timestamp),
1474    concurrent_to_uuid  is the uuid of the record that generated this,
1475    resource (generated with warc_uuid_str) or NULL,
1476    ip  is the ip address of the server (or NULL),
1477    content_type  is the mime type of the body (or NULL),
1478    body  is a pointer to a file containing the resource data.
1479    Calling this function will close body.
1480    Returns true on success, false on error. */
1481 bool
1482 warc_write_resource_record (char *resource_uuid, const char *url,
1483                  const char *timestamp_str, const char *concurrent_to_uuid,
1484                  ip_address *ip, const char *content_type, FILE *body,
1485                  off_t payload_offset)
1486 {
1487   return warc_write_record ("resource",
1488       resource_uuid, url, timestamp_str, concurrent_to_uuid,
1489       ip, content_type, body, payload_offset);
1490 }
1491
1492 /* Writes a metadata record to the WARC file.
1493    record_uuid  is the uuid of the record (or NULL),
1494    url  is the target uri of the record,
1495    timestamp_str  is the timestamp (generated with warc_timestamp),
1496    concurrent_to_uuid  is the uuid of the record that generated this,
1497    record (generated with warc_uuid_str) or NULL,
1498    ip  is the ip address of the server (or NULL),
1499    content_type  is the mime type of the body (or NULL),
1500    body  is a pointer to a file containing the record data.
1501    Calling this function will close body.
1502    Returns true on success, false on error. */
1503 bool
1504 warc_write_metadata_record (char *record_uuid, const char *url,
1505                  const char *timestamp_str, const char *concurrent_to_uuid,
1506                  ip_address *ip, const char *content_type, FILE *body,
1507                  off_t payload_offset)
1508 {
1509   return warc_write_record ("metadata",
1510       record_uuid, url, timestamp_str, concurrent_to_uuid,
1511       ip, content_type, body, payload_offset);
1512 }