sjero.net Git - wget/blob - src/warc.c

   1 /* Utility functions for writing WARC files.
   2    Copyright (C) 2011, 2012 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  18
  19 Additional permission under GNU GPL version 3 section 7
  20
  21 If you modify this program, or any covered work, by linking or
  22 combining it with the OpenSSL project's OpenSSL library (or a
  23 modified version of that library), containing parts covered by the
  24 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  25 grants you additional permission to convey the resulting work.
  26 Corresponding Source for a non-source form of such a combination
  27 shall include the source code for the parts of OpenSSL used as well
  28 as that of the covered work.  */
  29
  30 #define _GNU_SOURCE
  31
  32 #include "wget.h"
  33 #include "hash.h"
  34 #include "utils.h"
  35
  36 #include <stdio.h>
  37 #include <stdlib.h>
  38 #include <string.h>
  39 #include <strings.h>
  40 #include <time.h>
  41 #include <tmpdir.h>
  42 #include <sha1.h>
  43 #include <base32.h>
  44 #include <unistd.h>
  45 #ifdef HAVE_LIBZ
  46 #include <zlib.h>
  47 #endif
  48 #ifdef HAVE_LIBUUID
  49 #include <uuid/uuid.h>
  50 #endif
  51
  52 #ifndef WINDOWS
  53 #include <libgen.h>
  54 #else
  55 #include <fcntl.h>
  56 #endif
  57
  58 #include "warc.h"
  59
  60 #ifndef O_TEMPORARY
  61 #define O_TEMPORARY 0
  62 #endif
  63
  64 extern char *version_string;
  65
  66 /* Set by main in main.c */
  67 extern char *program_argstring;
  68
  69
  70 /* The log file (a temporary file that contains a copy
  71    of the wget log). */
  72 static FILE *warc_log_fp;
  73
  74 /* The manifest file (a temporary file that contains the
  75    warcinfo uuid of every file in this crawl). */
  76 static FILE *warc_manifest_fp;
  77
  78 /* The current WARC file (or NULL, if WARC is disabled). */
  79 static FILE *warc_current_file;
  80
  81 #ifdef HAVE_LIBZ
  82 /* The gzip stream for the current WARC file
  83    (or NULL, if WARC or gzip is disabled). */
  84 static gzFile warc_current_gzfile;
  85
  86 /* The offset of the current gzip record in the WARC file. */
  87 static off_t warc_current_gzfile_offset;
  88
  89 /* The uncompressed size (so far) of the current record. */
  90 static off_t warc_current_gzfile_uncompressed_size;
  91 # endif
  92
  93 /* This is true until a warc_write_* method fails. */
  94 static bool warc_write_ok;
  95
  96 /* The current CDX file (or NULL, if CDX is disabled). */
  97 static FILE *warc_current_cdx_file;
  98
  99 /* The record id of the warcinfo record of the current WARC file.  */
 100 static char *warc_current_warcinfo_uuid_str;
 101
 102 /* The file name of the current WARC file. */
 103 static char *warc_current_filename;
 104
 105 /* The serial number of the current WARC file.  This number is
 106    incremented each time a new file is opened and is used in the
 107    WARC file's filename. */
 108 static int warc_current_file_number;
 109
 110 /* The table of CDX records, if deduplication is enabled. */
 111 struct hash_table * warc_cdx_dedup_table;
 112
 113 static bool warc_start_new_file (bool meta);
 114
 115
/* One entry of the CDX deduplication table (warc_cdx_dedup_table).
   Entries are built by warc_process_cdx_line from a single CDX line.  */
struct warc_cdx_record
{
  /* Heap-allocated copy of the line's original-url field.  */
  char *url;
  /* Heap-allocated copy of the line's record-id field.  */
  char *uuid;
  /* The decoded (raw, not base32) SHA1 checksum.  This member is also
     what is passed to hash_table_put as the key.  */
  char digest[SHA1_DIGEST_SIZE];
};
 122
 123 static unsigned long
 124 warc_hash_sha1_digest (const void *key)
 125 {
 126   /* We just use some of the first bytes of the digest. */
 127   unsigned long v = 0;
 128   memcpy (&v, key, sizeof (unsigned long));
 129   return v;
 130 }
 131
 132 static int
 133 warc_cmp_sha1_digest (const void *digest1, const void *digest2)
 134 {
 135   return !memcmp (digest1, digest2, SHA1_DIGEST_SIZE);
 136 }
 137
 138
 139
 140 /* Writes SIZE bytes from BUFFER to the current WARC file,
 141    through gzwrite if compression is enabled.
 142    Returns the number of uncompressed bytes written.  */
 143 static size_t
 144 warc_write_buffer (const char *buffer, size_t size)
 145 {
 146 #ifdef HAVE_LIBZ
 147   if (warc_current_gzfile)
 148     {
 149       warc_current_gzfile_uncompressed_size += size;
 150       return gzwrite (warc_current_gzfile, buffer, size);
 151     }
 152   else
 153 #endif
 154     return fwrite (buffer, 1, size, warc_current_file);
 155 }
 156
 157 /* Writes STR to the current WARC file.
 158    Returns false and set warc_write_ok to false if there
 159    is an error.  */
 160 static bool
 161 warc_write_string (const char *str)
 162 {
 163   if (!warc_write_ok)
 164     return false;
 165
 166   size_t n = strlen (str);
 167   if (n != warc_write_buffer (str, n))
 168     warc_write_ok = false;
 169
 170   return warc_write_ok;
 171 }
 172
 173
 174 #define EXTRA_GZIP_HEADER_SIZE 14
 175 #define GZIP_STATIC_HEADER_SIZE  10
 176 #define FLG_FEXTRA          0x04
 177 #define OFF_FLG             3
 178
 179 /* Starts a new WARC record.  Writes the version header.
 180    If opt.warc_maxsize is set and the current file is becoming
 181    too large, this will open a new WARC file.
 182
 183    If compression is enabled, this will start a new
 184    gzip stream in the current WARC file.
 185
 186    Returns false and set warc_write_ok to false if there
 187    is an error.  */
 188 static bool
 189 warc_write_start_record (void)
 190 {
 191   if (!warc_write_ok)
 192     return false;
 193
 194   fflush (warc_current_file);
 195   if (opt.warc_maxsize > 0 && ftello (warc_current_file) >= opt.warc_maxsize)
 196     warc_start_new_file (false);
 197
 198 #ifdef HAVE_LIBZ
 199   /* Start a GZIP stream, if required. */
 200   if (opt.warc_compression_enabled)
 201     {
 202       /* Record the starting offset of the new record. */
 203       warc_current_gzfile_offset = ftello (warc_current_file);
 204
 205       /* Reserve space for the extra GZIP header field.
 206          In warc_write_end_record we will fill this space
 207          with information about the uncompressed and
 208          compressed size of the record. */
 209       fseek (warc_current_file, EXTRA_GZIP_HEADER_SIZE, SEEK_CUR);
 210       fflush (warc_current_file);
 211
 212       /* Start a new GZIP stream. */
 213       warc_current_gzfile = gzdopen (dup (fileno (warc_current_file)), "wb9");
 214       warc_current_gzfile_uncompressed_size = 0;
 215
 216       if (warc_current_gzfile == NULL)
 217         {
 218           logprintf (LOG_NOTQUIET,
 219 _("Error opening GZIP stream to WARC file.\n"));
 220           warc_write_ok = false;
 221           return false;
 222         }
 223     }
 224 #endif
 225
 226   warc_write_string ("WARC/1.0\r\n");
 227   return warc_write_ok;
 228 }
 229
 230 /* Writes a WARC header to the current WARC record.
 231    This method may be run after warc_write_start_record and
 232    before warc_write_block_from_file.  */
 233 static bool
 234 warc_write_header (const char *name, const char *value)
 235 {
 236   if (value)
 237     {
 238       warc_write_string (name);
 239       warc_write_string (": ");
 240       warc_write_string (value);
 241       warc_write_string ("\r\n");
 242     }
 243   return warc_write_ok;
 244 }
 245
 246 /* Copies the contents of DATA_IN to the WARC record.
 247    Adds a Content-Length header to the WARC record.
 248    Run this method after warc_write_header,
 249    then run warc_write_end_record. */
 250 static bool
 251 warc_write_block_from_file (FILE *data_in)
 252 {
 253   /* Add the Content-Length header. */
 254   char content_length[MAX_INT_TO_STRING_LEN(off_t)];
 255   fseeko (data_in, 0L, SEEK_END);
 256   number_to_string (content_length, ftello (data_in));
 257   warc_write_header ("Content-Length", content_length);
 258
 259   /* End of the WARC header section. */
 260   warc_write_string ("\r\n");
 261
 262   if (fseeko (data_in, 0L, SEEK_SET) != 0)
 263     warc_write_ok = false;
 264
 265   /* Copy the data in the file to the WARC record. */
 266   char buffer[BUFSIZ];
 267   size_t s;
 268   while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0)
 269     {
 270       if (warc_write_buffer (buffer, s) < s)
 271         warc_write_ok = false;
 272     }
 273
 274   return warc_write_ok;
 275 }
 276
 277 /* Run this method to close the current WARC record.
 278
 279    If compression is enabled, this method closes the
 280    current GZIP stream and fills the extra GZIP header
 281    with the uncompressed and compressed length of the
 282    record. */
 283 static bool
 284 warc_write_end_record (void)
 285 {
 286   warc_write_buffer ("\r\n\r\n", 4);
 287
 288 #ifdef HAVE_LIBZ
 289   /* We start a new gzip stream for each record.  */
 290   if (warc_write_ok && warc_current_gzfile)
 291     {
 292       if (gzclose (warc_current_gzfile) != Z_OK)
 293         {
 294           warc_write_ok = false;
 295           return false;
 296         }
 297
 298       fflush (warc_current_file);
 299       fseeko (warc_current_file, 0, SEEK_END);
 300
 301       /* The WARC standard suggests that we add 'skip length' data in the
 302          extra header field of the GZIP stream.
 303
 304          In warc_write_start_record we reserved space for this extra header.
 305          This extra space starts at warc_current_gzfile_offset and fills
 306          EXTRA_GZIP_HEADER_SIZE bytes.  The static GZIP header starts at
 307          warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE.
 308
 309          We need to do three things:
 310          1. Move the static GZIP header to warc_current_gzfile_offset;
 311          2. Set the FEXTRA flag in the GZIP header;
 312          3. Write the extra GZIP header after the static header, that is,
 313             starting at warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE.
 314       */
 315
 316       /* Calculate the uncompressed and compressed sizes. */
 317       off_t current_offset = ftello (warc_current_file);
 318       off_t uncompressed_size = current_offset - warc_current_gzfile_offset;
 319       off_t compressed_size = warc_current_gzfile_uncompressed_size;
 320
 321       /* Go back to the static GZIP header. */
 322       fseeko (warc_current_file, warc_current_gzfile_offset
 323               + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
 324
 325       /* Read the header. */
 326       char static_header[GZIP_STATIC_HEADER_SIZE];
 327       size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE,
 328                              warc_current_file);
 329       if (result != GZIP_STATIC_HEADER_SIZE)
 330         {
 331           warc_write_ok = false;
 332           return false;
 333         }
 334
 335       /* Set the FEXTRA flag in the flags byte of the header. */
 336       static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
 337
 338       /* Write the header back to the file, but starting at
 339          warc_current_gzfile_offset. */
 340       fseeko (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
 341       fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
 342
 343       /* Prepare the extra GZIP header. */
 344       char extra_header[EXTRA_GZIP_HEADER_SIZE];
 345       /* XLEN, the length of the extra header fields.  */
 346       extra_header[0]  = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
 347       extra_header[1]  = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
 348       /* The extra header field identifier for the WARC skip length. */
 349       extra_header[2]  = 's';
 350       extra_header[3]  = 'l';
 351       /* The size of the field value (8 bytes).  */
 352       extra_header[4]  = (8 & 255);
 353       extra_header[5]  = ((8 >> 8) & 255);
 354       /* The size of the uncompressed record.  */
 355       extra_header[6]  = (uncompressed_size & 255);
 356       extra_header[7]  = (uncompressed_size >> 8) & 255;
 357       extra_header[8]  = (uncompressed_size >> 16) & 255;
 358       extra_header[9]  = (uncompressed_size >> 24) & 255;
 359       /* The size of the compressed record.  */
 360       extra_header[10] = (compressed_size & 255);
 361       extra_header[11] = (compressed_size >> 8) & 255;
 362       extra_header[12] = (compressed_size >> 16) & 255;
 363       extra_header[13] = (compressed_size >> 24) & 255;
 364
 365       /* Write the extra header after the static header. */
 366       fseeko (warc_current_file, warc_current_gzfile_offset
 367               + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
 368       fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
 369
 370       /* Done, move back to the end of the file. */
 371       fflush (warc_current_file);
 372       fseeko (warc_current_file, 0, SEEK_END);
 373     }
 374 #endif /* HAVE_LIBZ */
 375
 376   return warc_write_ok;
 377 }
 378
 379
 380 /* Writes the WARC-Date header for the given timestamp to
 381    the current WARC record.
 382    If timestamp is NULL, the current time will be used.  */
 383 static bool
 384 warc_write_date_header (const char *timestamp)
 385 {
 386   if (timestamp == NULL)
 387     {
 388       char current_timestamp[21];
 389       warc_timestamp (current_timestamp);
 390       timestamp = current_timestamp;
 391     }
 392   return warc_write_header ("WARC-Date", timestamp);
 393 }
 394
 395 /* Writes the WARC-IP-Address header for the given IP to
 396    the current WARC record.  If IP is NULL, no header will
 397    be written.  */
 398 static bool
 399 warc_write_ip_header (ip_address *ip)
 400 {
 401   if (ip != NULL)
 402     return warc_write_header ("WARC-IP-Address", print_address (ip));
 403   else
 404     return warc_write_ok;
 405 }
 406
 407
 408 /* warc_sha1_stream_with_payload is a modified copy of sha1_stream
 409    from gnulib/sha1.c.  This version calculates two digests in one go.
 410
 411    Compute SHA1 message digests for bytes read from STREAM.  The
 412    digest of the complete file will be written into the 16 bytes
 413    beginning at RES_BLOCK.
 414
 415    If payload_offset >= 0, a second digest will be calculated of the
 416    portion of the file starting at payload_offset and continuing to
 417    the end of the file.  The digest number will be written into the
 418    16 bytes beginning ad RES_PAYLOAD.  */
 419 static int
 420 warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
 421                                off_t payload_offset)
 422 {
 423 #define BLOCKSIZE 32768
 424
 425   struct sha1_ctx ctx_block;
 426   struct sha1_ctx ctx_payload;
 427   off_t pos;
 428   off_t sum;
 429
 430   char *buffer = malloc (BLOCKSIZE + 72);
 431   if (!buffer)
 432     return 1;
 433
 434   /* Initialize the computation context.  */
 435   sha1_init_ctx (&ctx_block);
 436   if (payload_offset >= 0)
 437     sha1_init_ctx (&ctx_payload);
 438
 439   pos = 0;
 440
 441   /* Iterate over full file contents.  */
 442   while (1)
 443     {
 444       /* We read the file in blocks of BLOCKSIZE bytes.  One call of the
 445          computation function processes the whole buffer so that with the
 446          next round of the loop another block can be read.  */
 447       off_t n;
 448       sum = 0;
 449
 450       /* Read block.  Take care for partial reads.  */
 451       while (1)
 452         {
 453           n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);
 454
 455           sum += n;
 456           pos += n;
 457
 458           if (sum == BLOCKSIZE)
 459             break;
 460
 461           if (n == 0)
 462             {
 463               /* Check for the error flag IFF N == 0, so that we don't
 464                  exit the loop after a partial read due to e.g., EAGAIN
 465                  or EWOULDBLOCK.  */
 466               if (ferror (stream))
 467                 {
 468                   free (buffer);
 469                   return 1;
 470                 }
 471               goto process_partial_block;
 472             }
 473
 474           /* We've read at least one byte, so ignore errors.  But always
 475              check for EOF, since feof may be true even though N > 0.
 476              Otherwise, we could end up calling fread after EOF.  */
 477           if (feof (stream))
 478             goto process_partial_block;
 479         }
 480
 481       /* Process buffer with BLOCKSIZE bytes.  Note that
 482                         BLOCKSIZE % 64 == 0
 483        */
 484       sha1_process_block (buffer, BLOCKSIZE, &ctx_block);
 485       if (payload_offset >= 0 && payload_offset < pos)
 486         {
 487           /* At least part of the buffer contains data from payload. */
 488           off_t start_of_payload = payload_offset - (pos - BLOCKSIZE);
 489           if (start_of_payload <= 0)
 490             /* All bytes in the buffer belong to the payload. */
 491             start_of_payload = 0;
 492
 493           /* Process the payload part of the buffer.
 494              Note: we can't use  sha1_process_block  here even if we
 495              process the complete buffer.  Because the payload doesn't
 496              have to start with a full block, there may still be some
 497              bytes left from the previous buffer.  Therefore, we need
 498              to continue with  sha1_process_bytes.  */
 499           sha1_process_bytes (buffer + start_of_payload,
 500                               BLOCKSIZE - start_of_payload, &ctx_payload);
 501         }
 502     }
 503
 504  process_partial_block:;
 505
 506   /* Process any remaining bytes.  */
 507   if (sum > 0)
 508     {
 509       sha1_process_bytes (buffer, sum, &ctx_block);
 510       if (payload_offset >= 0 && payload_offset < pos)
 511         {
 512           /* At least part of the buffer contains data from payload. */
 513           off_t start_of_payload = payload_offset - (pos - sum);
 514           if (start_of_payload <= 0)
 515             /* All bytes in the buffer belong to the payload. */
 516             start_of_payload = 0;
 517
 518           /* Process the payload part of the buffer. */
 519           sha1_process_bytes (buffer + start_of_payload,
 520                               sum - start_of_payload, &ctx_payload);
 521         }
 522     }
 523
 524   /* Construct result in desired memory.  */
 525   sha1_finish_ctx (&ctx_block,   res_block);
 526   if (payload_offset >= 0)
 527     sha1_finish_ctx (&ctx_payload, res_payload);
 528   free (buffer);
 529   return 0;
 530
 531 #undef BLOCKSIZE
 532 }
 533
 534 /* Converts the SHA1 digest to a base32-encoded string.
 535    "sha1:DIGEST\0"  (Allocates a new string for the response.)  */
 536 static char *
 537 warc_base32_sha1_digest (char *sha1_digest)
 538 {
 539   /* length: "sha1:" + digest + "\0" */
 540   char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 );
 541   base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5,
 542                  BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1);
 543   memcpy (sha1_base32, "sha1:", 5);
 544   sha1_base32[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5] = '\0';
 545   return sha1_base32;
 546 }
 547
 548
 549 /* Sets the digest headers of the record.
 550    This method will calculate the block digest and, if payload_offset >= 0,
 551    will also calculate the payload digest of the payload starting at the
 552    provided offset.  */
 553 static void
 554 warc_write_digest_headers (FILE *file, long payload_offset)
 555 {
 556   if (opt.warc_digests_enabled)
 557     {
 558       /* Calculate the block and payload digests. */
 559       char sha1_res_block[SHA1_DIGEST_SIZE];
 560       char sha1_res_payload[SHA1_DIGEST_SIZE];
 561
 562       rewind (file);
 563       if (warc_sha1_stream_with_payload (file, sha1_res_block,
 564           sha1_res_payload, payload_offset) == 0)
 565         {
 566           char *digest;
 567
 568           digest = warc_base32_sha1_digest (sha1_res_block);
 569           warc_write_header ("WARC-Block-Digest", digest);
 570           free (digest);
 571
 572           if (payload_offset >= 0)
 573             {
 574               digest = warc_base32_sha1_digest (sha1_res_payload);
 575               warc_write_header ("WARC-Payload-Digest", digest);
 576               free (digest);
 577             }
 578         }
 579     }
 580 }
 581
 582
 583 /* Fills timestamp with the current time and date.
 584    The UTC time is formatted following ISO 8601, as required
 585    for use in the WARC-Date header.
 586    The timestamp will be 21 characters long. */
 587 void
 588 warc_timestamp (char *timestamp)
 589 {
 590   time_t rawtime;
 591   struct tm * timeinfo;
 592   time ( &rawtime );
 593   timeinfo = gmtime (&rawtime);
 594   strftime (timestamp, 21, "%Y-%m-%dT%H:%M:%SZ", timeinfo);
 595 }
 596
 597 #ifdef HAVE_LIBUUID
 598 /* Fills urn_str with a UUID in the format required
 599    for the WARC-Record-Id header.
 600    The string will be 47 characters long. */
 601 void
 602 warc_uuid_str (char *urn_str)
 603 {
 604   char uuid_str[37];
 605
 606   uuid_t record_id;
 607   uuid_generate (record_id);
 608   uuid_unparse (record_id, uuid_str);
 609
 610   sprintf (urn_str, "<urn:uuid:%s>", uuid_str);
 611 }
 612 #else
 613 /* Fills urn_str with a UUID based on random numbers in the format
 614    required for the WARC-Record-Id header.
 615    (See RFC 4122, UUID version 4.)
 616
 617    Note: this is a fallback method, it is much better to use the
 618    methods provided by libuuid.
 619
 620    The string will be 47 characters long. */
 621 void
 622 warc_uuid_str (char *urn_str)
 623 {
 624   // RFC 4122, a version 4 UUID with only random numbers
 625
 626   unsigned char uuid_data[16];
 627   int i;
 628   for (i=0; i<16; i++)
 629     uuid_data[i] = random_number (255);
 630
 631   // Set the four most significant bits (bits 12 through 15) of the
 632   // time_hi_and_version field to the 4-bit version number
 633   uuid_data[6] = (uuid_data[6] & 0x0F) | 0x40;
 634
 635   // Set the two most significant bits (bits 6 and 7) of the
 636   // clock_seq_hi_and_reserved to zero and one, respectively.
 637   uuid_data[8] = (uuid_data[8] & 0xBF) | 0x80;
 638
 639   sprintf (urn_str,
 640     "<urn:uuid:%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x>",
 641     uuid_data[0], uuid_data[1], uuid_data[2], uuid_data[3], uuid_data[4],
 642     uuid_data[5], uuid_data[6], uuid_data[7], uuid_data[8], uuid_data[9],
 643     uuid_data[10], uuid_data[11], uuid_data[12], uuid_data[13], uuid_data[14],
 644     uuid_data[15]);
 645 }
 646 #endif
 647
 648 /* Write a warcinfo record to the current file.
 649    Updates warc_current_warcinfo_uuid_str. */
 650 static bool
 651 warc_write_warcinfo_record (char *filename)
 652 {
 653   /* Write warc-info record as the first record of the file. */
 654   /* We add the record id of this info record to the other records in the
 655      file. */
 656   warc_current_warcinfo_uuid_str = (char *) malloc (48);
 657   warc_uuid_str (warc_current_warcinfo_uuid_str);
 658
 659   char timestamp[22];
 660   warc_timestamp (timestamp);
 661
 662   char *filename_copy, *filename_basename;
 663   filename_copy = strdup (filename);
 664   filename_basename = strdup (basename (filename_copy));
 665
 666   warc_write_start_record ();
 667   warc_write_header ("WARC-Type", "warcinfo");
 668   warc_write_header ("Content-Type", "application/warc-fields");
 669   warc_write_header ("WARC-Date", timestamp);
 670   warc_write_header ("WARC-Record-ID", warc_current_warcinfo_uuid_str);
 671   warc_write_header ("WARC-Filename", filename_basename);
 672
 673   /* Create content.  */
 674   FILE *warc_tmp = warc_tempfile ();
 675   if (warc_tmp == NULL)
 676     {
 677       free (filename_copy);
 678       free (filename_basename);
 679       return false;
 680     }
 681
 682   fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE);
 683   fprintf (warc_tmp, "format: WARC File Format 1.0\r\n");
 684   fprintf (warc_tmp,
 685 "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n");
 686   fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off"));
 687   fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring);
 688   /* Add the user headers, if any. */
 689   if (opt.warc_user_headers)
 690     {
 691       int i;
 692       for (i = 0; opt.warc_user_headers[i]; i++)
 693         fprintf (warc_tmp, "%s\r\n", opt.warc_user_headers[i]);
 694     }
 695   fprintf(warc_tmp, "\r\n");
 696
 697   warc_write_digest_headers (warc_tmp, -1);
 698   warc_write_block_from_file (warc_tmp);
 699   warc_write_end_record ();
 700
 701   if (! warc_write_ok)
 702     logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
 703
 704   free (filename_copy);
 705   free (filename_basename);
 706   fclose (warc_tmp);
 707   return warc_write_ok;
 708 }
 709
 710 /* Opens a new WARC file.
 711    If META is true, generates a filename ending with 'meta.warc.gz'.
 712
 713    This method will:
 714    1. close the current WARC file (if there is one);
 715    2. increment warc_current_file_number;
 716    3. open a new WARC file;
 717    4. write the initial warcinfo record.
 718
 719    Returns true on success, false otherwise.
 720    */
 721 static bool
 722 warc_start_new_file (bool meta)
 723 {
 724   if (opt.warc_filename == NULL)
 725     return false;
 726
 727   if (warc_current_file != NULL)
 728     fclose (warc_current_file);
 729   if (warc_current_warcinfo_uuid_str)
 730     free (warc_current_warcinfo_uuid_str);
 731   if (warc_current_filename)
 732     free (warc_current_filename);
 733
 734   warc_current_file_number++;
 735
 736   int base_filename_length = strlen (opt.warc_filename);
 737   /* filename format:  base + "-" + 5 digit serial number + ".warc.gz" */
 738   char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
 739   warc_current_filename = new_filename;
 740
 741 #ifdef __VMS
 742 # define WARC_GZ "warc-gz"
 743 #else /* def __VMS */
 744 # define WARC_GZ "warc.gz"
 745 #endif /* def __VMS [else] */
 746
 747 #ifdef HAVE_LIBZ
 748   const char *extension = (opt.warc_compression_enabled ? WARC_GZ : "warc");
 749 #else
 750   const char *extension = "warc";
 751 #endif
 752
 753   /* If max size is enabled, we add a serial number to the file names. */
 754   if (meta)
 755     sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension);
 756   else if (opt.warc_maxsize > 0)
 757     {
 758       sprintf (new_filename, "%s-%05d.%s", opt.warc_filename,
 759                warc_current_file_number, extension);
 760     }
 761   else
 762     sprintf (new_filename, "%s.%s", opt.warc_filename, extension);
 763
 764   logprintf (LOG_VERBOSE, _("Opening WARC file %s.\n\n"), quote (new_filename));
 765
 766   /* Open the WARC file. */
 767   warc_current_file = fopen (new_filename, "wb+");
 768   if (warc_current_file == NULL)
 769     {
 770       logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"),
 771                  quote (new_filename));
 772       return false;
 773     }
 774
 775   if (! warc_write_warcinfo_record (new_filename))
 776     return false;
 777
 778   /* Add warcinfo uuid to manifest. */
 779   if (warc_manifest_fp)
 780     fprintf (warc_manifest_fp, "%s\n", warc_current_warcinfo_uuid_str);
 781
 782   return true;
 783 }
 784
 785 /* Opens the CDX file for output. */
 786 static bool
 787 warc_start_cdx_file (void)
 788 {
 789   int filename_length = strlen (opt.warc_filename);
 790   char *cdx_filename = alloca (filename_length + 4 + 1);
 791   memcpy (cdx_filename, opt.warc_filename, filename_length);
 792   memcpy (cdx_filename + filename_length, ".cdx", 5);
 793   warc_current_cdx_file = fopen (cdx_filename, "a+");
 794   if (warc_current_cdx_file == NULL)
 795     return false;
 796
 797   /* Print the CDX header.
 798    *
 799    * a - original url
 800    * b - date
 801    * m - mime type
 802    * s - response code
 803    * k - new style checksum
 804    * r - redirect
 805    * M - meta tags
 806    * V - compressed arc file offset
 807    * g - file name
 808    * u - record-id
 809    */
 810   fprintf (warc_current_cdx_file, " CDX a b a m s k r M V g u\n");
 811   fflush (warc_current_cdx_file);
 812
 813   return true;
 814 }
 815
 816 #define CDX_FIELDSEP " \t\r\n"
 817
 818 /* Parse the CDX header and find the field numbers of the original url,
 819    checksum and record ID fields. */
 820 static bool
 821 warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
 822                        int *field_num_checksum, int *field_num_record_id)
 823 {
 824   *field_num_original_url = -1;
 825   *field_num_checksum = -1;
 826   *field_num_record_id = -1;
 827
 828   char *token;
 829   char *save_ptr;
 830   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
 831
 832   if (token != NULL && strcmp (token, "CDX") == 0)
 833     {
 834       int field_num = 0;
 835       while (token != NULL)
 836         {
 837           token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
 838           if (token != NULL)
 839             {
 840               switch (token[0])
 841                 {
 842                 case 'a':
 843                   *field_num_original_url = field_num;
 844                   break;
 845                 case 'k':
 846                   *field_num_checksum = field_num;
 847                   break;
 848                 case 'u':
 849                   *field_num_record_id = field_num;
 850                   break;
 851                 }
 852             }
 853           field_num++;
 854         }
 855     }
 856
 857   return *field_num_original_url != -1
 858          && *field_num_checksum != -1
 859          && *field_num_record_id != -1;
 860 }
 861
 862 /* Parse the CDX record and add it to the warc_cdx_dedup_table hash table. */
 863 static void
 864 warc_process_cdx_line (char *lineptr, int field_num_original_url,
 865                        int field_num_checksum, int field_num_record_id)
 866 {
 867   char *original_url = NULL;
 868   char *checksum = NULL;
 869   char *record_id = NULL;
 870
 871   char *token;
 872   char *save_ptr;
 873   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
 874
 875   /* Read this line to get the fields we need. */
 876   int field_num = 0;
 877   while (token != NULL)
 878     {
 879       char **val;
 880       if (field_num == field_num_original_url)
 881         val = &original_url;
 882       else if (field_num == field_num_checksum)
 883         val = &checksum;
 884       else if (field_num == field_num_record_id)
 885         val = &record_id;
 886       else
 887         val = NULL;
 888
 889       if (val != NULL)
 890         *val = strdup (token);
 891
 892       token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
 893       field_num++;
 894     }
 895
 896   if (original_url != NULL && checksum != NULL && record_id != NULL)
 897     {
 898       /* For some extra efficiency, we decode the base32 encoded
 899          checksum value.  This should produce exactly SHA1_DIGEST_SIZE
 900          bytes.  */
 901       size_t checksum_l;
 902       char * checksum_v;
 903       base32_decode_alloc (checksum, strlen (checksum), &checksum_v,
 904                            &checksum_l);
 905       free (checksum);
 906
 907       if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
 908         {
 909           /* This is a valid line with a valid checksum. */
 910           struct warc_cdx_record *rec;
 911           rec = malloc (sizeof (struct warc_cdx_record));
 912           rec->url = original_url;
 913           rec->uuid = record_id;
 914           memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
 915           hash_table_put (warc_cdx_dedup_table, rec->digest, rec);
 916           free (checksum_v);
 917         }
 918       else
 919         {
 920           free (original_url);
 921           if (checksum_v != NULL)
 922             free (checksum_v);
 923           free (record_id);
 924         }
 925     }
 926   else
 927     {
 928       xfree_null(checksum);
 929       xfree_null(original_url);
 930       xfree_null(record_id);
 931     }
 932 }
 933
 934 /* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
 935    the warc_cdx_dedup_table. */
 936 static bool
 937 warc_load_cdx_dedup_file (void)
 938 {
 939   FILE *f = fopen (opt.warc_cdx_dedup_filename, "r");
 940   if (f == NULL)
 941     return false;
 942
 943   int field_num_original_url = -1;
 944   int field_num_checksum = -1;
 945   int field_num_record_id = -1;
 946
 947   char *lineptr = NULL;
 948   size_t n = 0;
 949   ssize_t line_length;
 950
 951   /* The first line should contain the CDX header.
 952      Format:  " CDX x x x x x"
 953      where x are field type indicators.  For our purposes, we only
 954      need 'a' (the original url), 'k' (the SHA1 checksum) and
 955      'u' (the WARC record id). */
 956   line_length = getline (&lineptr, &n, f);
 957   if (line_length != -1)
 958     warc_parse_cdx_header (lineptr, &field_num_original_url,
 959                            &field_num_checksum, &field_num_record_id);
 960
 961   /* If the file contains all three fields, read the complete file. */
 962   if (field_num_original_url == -1
 963       || field_num_checksum == -1
 964       || field_num_record_id == -1)
 965     {
 966       if (field_num_original_url == -1)
 967         logprintf (LOG_NOTQUIET,
 968 _("CDX file does not list original urls. (Missing column 'a'.)\n"));
 969       if (field_num_checksum == -1)
 970         logprintf (LOG_NOTQUIET,
 971 _("CDX file does not list checksums. (Missing column 'k'.)\n"));
 972       if (field_num_record_id == -1)
 973         logprintf (LOG_NOTQUIET,
 974 _("CDX file does not list record ids. (Missing column 'u'.)\n"));
 975     }
 976   else
 977     {
 978       /* Initialize the table. */
 979       warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest,
 980                                              warc_cmp_sha1_digest);
 981
 982       do
 983         {
 984           line_length = getline (&lineptr, &n, f);
 985           if (line_length != -1)
 986             {
 987               warc_process_cdx_line (lineptr, field_num_original_url,
 988                             field_num_checksum, field_num_record_id);
 989             }
 990
 991         }
 992       while (line_length != -1);
 993
 994       /* Print results. */
 995       int nrecords = hash_table_count (warc_cdx_dedup_table);
 996       logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
 997                                         "Loaded %d records from CDX.\n\n",
 998                                          nrecords),
 999                               nrecords);
1000     }
1001
1002   free (lineptr);
1003   fclose (f);
1004
1005   return true;
1006 }
1007 #undef CDX_FIELDSEP
1008
1009 /* Returns the existing duplicate CDX record for the given url and payload
1010    digest.  Returns NULL if the url is not found or if the payload digest
1011    does not match, or if CDX deduplication is disabled. */
1012 static struct warc_cdx_record *
1013 warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload)
1014 {
1015   if (warc_cdx_dedup_table == NULL)
1016     return NULL;
1017
1018   struct warc_cdx_record *rec_existing
1019     = hash_table_get (warc_cdx_dedup_table, sha1_digest_payload);
1020
1021   if (rec_existing && strcmp (rec_existing->url, url) == 0)
1022     return rec_existing;
1023   else
1024     return NULL;
1025 }
1026
1027 /* Initializes the WARC writer (if opt.warc_filename is set).
1028    This should be called before any WARC record is written. */
1029 void
1030 warc_init (void)
1031 {
1032   warc_write_ok = true;
1033
1034   if (opt.warc_filename != NULL)
1035     {
1036       if (opt.warc_cdx_dedup_filename != NULL)
1037         {
1038           if (! warc_load_cdx_dedup_file ())
1039             {
1040               logprintf (LOG_NOTQUIET,
1041                          _("Could not read CDX file %s for deduplication.\n"),
1042                          quote (opt.warc_cdx_dedup_filename));
1043               exit(1);
1044             }
1045         }
1046
1047       warc_manifest_fp = warc_tempfile ();
1048       if (warc_manifest_fp == NULL)
1049         {
1050           logprintf (LOG_NOTQUIET,
1051                      _("Could not open temporary WARC manifest file.\n"));
1052           exit(1);
1053         }
1054
1055       if (opt.warc_keep_log)
1056         {
1057           warc_log_fp = warc_tempfile ();
1058           if (warc_log_fp == NULL)
1059             {
1060               logprintf (LOG_NOTQUIET,
1061                          _("Could not open temporary WARC log file.\n"));
1062               exit(1);
1063             }
1064           log_set_warc_log_fp (warc_log_fp);
1065         }
1066
1067       warc_current_file_number = -1;
1068       if (! warc_start_new_file (false))
1069         {
1070           logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
1071           exit(1);
1072         }
1073
1074       if (opt.warc_cdx_enabled)
1075         {
1076           if (! warc_start_cdx_file ())
1077             {
1078               logprintf (LOG_NOTQUIET,
1079                          _("Could not open CDX file for output.\n"));
1080               exit(1);
1081             }
1082         }
1083     }
1084 }
1085
1086 /* Writes metadata (manifest, configuration, log file) to the WARC file. */
1087 static void
1088 warc_write_metadata (void)
1089 {
1090   /* If there are multiple WARC files, the metadata should be written to a separate file. */
1091   if (opt.warc_maxsize > 0)
1092     warc_start_new_file (true);
1093
1094   char manifest_uuid [48];
1095   warc_uuid_str (manifest_uuid);
1096
1097   fflush (warc_manifest_fp);
1098   warc_write_metadata_record (manifest_uuid,
1099                               "metadata://gnu.org/software/wget/warc/MANIFEST.txt",
1100                               NULL, NULL, NULL, "text/plain",
1101                               warc_manifest_fp, -1);
1102   /* warc_write_resource_record has closed warc_manifest_fp. */
1103
1104   FILE * warc_tmp_fp = warc_tempfile ();
1105   if (warc_tmp_fp == NULL)
1106     {
1107       logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n"));
1108       exit(1);
1109     }
1110   fflush (warc_tmp_fp);
1111   fprintf (warc_tmp_fp, "%s\n", program_argstring);
1112
1113   warc_write_resource_record (NULL,
1114                    "metadata://gnu.org/software/wget/warc/wget_arguments.txt",
1115                               NULL, manifest_uuid, NULL, "text/plain",
1116                               warc_tmp_fp, -1);
1117   /* warc_write_resource_record has closed warc_tmp_fp. */
1118
1119   if (warc_log_fp != NULL)
1120     {
1121       warc_write_resource_record (NULL,
1122                               "metadata://gnu.org/software/wget/warc/wget.log",
1123                                   NULL, manifest_uuid, NULL, "text/plain",
1124                                   warc_log_fp, -1);
1125       /* warc_write_resource_record has closed warc_log_fp. */
1126
1127       warc_log_fp = NULL;
1128       log_set_warc_log_fp (NULL);
1129     }
1130 }
1131
1132 /* Finishes the WARC writing.
1133    This should be called at the end of the program. */
1134 void
1135 warc_close (void)
1136 {
1137   if (warc_current_file != NULL)
1138     {
1139       warc_write_metadata ();
1140       free (warc_current_warcinfo_uuid_str);
1141       fclose (warc_current_file);
1142     }
1143   if (warc_current_cdx_file != NULL)
1144     fclose (warc_current_cdx_file);
1145   if (warc_log_fp != NULL)
1146     {
1147       fclose (warc_log_fp);
1148       log_set_warc_log_fp (NULL);
1149     }
1150 }
1151
1152 /* Creates a temporary file for writing WARC output.
1153    The temporary file will be created in opt.warc_tempdir.
1154    Returns the pointer to the temporary file, or NULL. */
1155 FILE *
1156 warc_tempfile (void)
1157 {
1158   char filename[100];
1159   if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1)
1160     return NULL;
1161
1162 #ifdef __VMS
1163   /* 2013-07-12 SMS.
1164    * mkostemp()+unlink()+fdopen() scheme causes trouble on VMS, so use
1165    * mktemp() to uniquify the (VMS-style) name, and then use a normal
1166    * fopen() with a "create temp file marked for delete" option.
1167    */
1168   {
1169     char *tfn;
1170
1171     tfn = mktemp (filename);            /* Get unique name from template. */
1172     if (tfn == NULL)
1173       return NULL;
1174     return fopen (tfn, "w+", "fop=tmd");    /* Create auto-delete temp file. */
1175   }
1176 #else /* def __VMS */
1177   int fd = mkostemp (filename, O_TEMPORARY);
1178   if (fd < 0)
1179     return NULL;
1180
1181 #if !O_TEMPORARY
1182   if (unlink (filename) < 0)
1183     return NULL;
1184 #endif
1185
1186   return fdopen (fd, "wb+");
1187 #endif /* def __VMS [else] */
1188 }
1189
1190
1191 /* Writes a request record to the WARC file.
1192    url  is the target uri of the request,
1193    timestamp_str  is the timestamp of the request (generated with warc_timestamp),
1194    record_uuid  is the uuid of the request (generated with warc_uuid_str),
1195    body  is a pointer to a file containing the request headers and body.
1196    ip  is the ip address of the server (or NULL),
1197    Calling this function will close body.
1198    Returns true on success, false on error. */
1199 bool
1200 warc_write_request_record (char *url, char *timestamp_str, char *record_uuid,
1201                            ip_address *ip, FILE *body, off_t payload_offset)
1202 {
1203   warc_write_start_record ();
1204   warc_write_header ("WARC-Type", "request");
1205   warc_write_header ("WARC-Target-URI", url);
1206   warc_write_header ("Content-Type", "application/http;msgtype=request");
1207   warc_write_date_header (timestamp_str);
1208   warc_write_header ("WARC-Record-ID", record_uuid);
1209   warc_write_ip_header (ip);
1210   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1211   warc_write_digest_headers (body, payload_offset);
1212   warc_write_block_from_file (body);
1213   warc_write_end_record ();
1214
1215   fclose (body);
1216
1217   return warc_write_ok;
1218 }
1219
1220 /* Writes a response record to the CDX file.
1221    url  is the target uri of the request/response,
1222    timestamp_str  is the timestamp of the request that generated this response,
1223                   (generated with warc_timestamp),
1224    mime_type  is the mime type of the response body (will be printed to CDX),
1225    response_code  is the HTTP response code (will be printed to CDX),
1226    payload_digest  is the sha1 digest of the payload,
1227    redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
1228    offset  is the position of the WARC record in the WARC file,
1229    warc_filename  is the filename of the WARC,
1230    response_uuid  is the uuid of the response.
1231    Returns true on success, false on error. */
1232 static bool
1233 warc_write_cdx_record (const char *url, const char *timestamp_str,
1234                        const char *mime_type, int response_code,
1235                        const char *payload_digest, const char *redirect_location,
1236                        off_t offset, const char *warc_filename,
1237                        const char *response_uuid)
1238 {
1239   /* Transform the timestamp. */
1240   char timestamp_str_cdx [15];
1241   memcpy (timestamp_str_cdx     , timestamp_str     , 4); /* "YYYY" "-" */
1242   memcpy (timestamp_str_cdx +  4, timestamp_str +  5, 2); /* "mm"   "-" */
1243   memcpy (timestamp_str_cdx +  6, timestamp_str +  8, 2); /* "dd"   "T" */
1244   memcpy (timestamp_str_cdx +  8, timestamp_str + 11, 2); /* "HH"   ":" */
1245   memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM"   ":" */
1246   memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS"   "Z" */
1247   timestamp_str_cdx[14] = '\0';
1248
1249   /* Rewrite the checksum. */
1250   const char *checksum;
1251   if (payload_digest != NULL)
1252     checksum = payload_digest + 5; /* Skip the "sha1:" */
1253   else
1254     checksum = "-";
1255
1256   if (mime_type == NULL || strlen(mime_type) == 0)
1257     mime_type = "-";
1258   if (redirect_location == NULL || strlen(redirect_location) == 0)
1259     redirect_location = "-";
1260
1261   char offset_string[MAX_INT_TO_STRING_LEN(off_t)];
1262   number_to_string (offset_string, offset);
1263
1264   /* Print the CDX line. */
1265   fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %s %s %s\n", url,
1266            timestamp_str_cdx, url, mime_type, response_code, checksum,
1267            redirect_location, offset_string, warc_current_filename,
1268            response_uuid);
1269   fflush (warc_current_cdx_file);
1270
1271   return true;
1272 }
1273
1274 /* Writes a revisit record to the WARC file.
1275    url  is the target uri of the request/response,
1276    timestamp_str  is the timestamp of the request that generated this response
1277                   (generated with warc_timestamp),
1278    concurrent_to_uuid  is the uuid of the request for that generated this response
1279                  (generated with warc_uuid_str),
1280    refers_to_uuid  is the uuid of the original response
1281                  (generated with warc_uuid_str),
1282    payload_digest  is the sha1 digest of the payload,
1283    ip  is the ip address of the server (or NULL),
1284    body  is a pointer to a file containing the response headers (without payload).
1285    Calling this function will close body.
1286    Returns true on success, false on error. */
1287 static bool
1288 warc_write_revisit_record (char *url, char *timestamp_str,
1289                            char *concurrent_to_uuid, char *payload_digest,
1290                            char *refers_to, ip_address *ip, FILE *body)
1291 {
1292   char revisit_uuid [48];
1293   warc_uuid_str (revisit_uuid);
1294
1295   char *block_digest = NULL;
1296   char sha1_res_block[SHA1_DIGEST_SIZE];
1297   sha1_stream (body, sha1_res_block);
1298   block_digest = warc_base32_sha1_digest (sha1_res_block);
1299
1300   warc_write_start_record ();
1301   warc_write_header ("WARC-Type", "revisit");
1302   warc_write_header ("WARC-Record-ID", revisit_uuid);
1303   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1304   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1305   warc_write_header ("WARC-Refers-To", refers_to);
1306   warc_write_header ("WARC-Profile", "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest");
1307   warc_write_header ("WARC-Truncated", "length");
1308   warc_write_header ("WARC-Target-URI", url);
1309   warc_write_date_header (timestamp_str);
1310   warc_write_ip_header (ip);
1311   warc_write_header ("Content-Type", "application/http;msgtype=response");
1312   warc_write_header ("WARC-Block-Digest", block_digest);
1313   warc_write_header ("WARC-Payload-Digest", payload_digest);
1314   warc_write_block_from_file (body);
1315   warc_write_end_record ();
1316
1317   fclose (body);
1318   free (block_digest);
1319
1320   return warc_write_ok;
1321 }
1322
1323 /* Writes a response record to the WARC file.
1324    url  is the target uri of the request/response,
1325    timestamp_str  is the timestamp of the request that generated this response
1326                   (generated with warc_timestamp),
1327    concurrent_to_uuid  is the uuid of the request for that generated this response
1328                  (generated with warc_uuid_str),
1329    ip  is the ip address of the server (or NULL),
1330    body  is a pointer to a file containing the response headers and body.
1331    mime_type  is the mime type of the response body (will be printed to CDX),
1332    response_code  is the HTTP response code (will be printed to CDX),
1333    redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
1334    Calling this function will close body.
1335    Returns true on success, false on error. */
1336 bool
1337 warc_write_response_record (char *url, char *timestamp_str,
1338                             char *concurrent_to_uuid, ip_address *ip,
1339                             FILE *body, off_t payload_offset, char *mime_type,
1340                             int response_code, char *redirect_location)
1341 {
1342   char *block_digest = NULL;
1343   char *payload_digest = NULL;
1344   char sha1_res_block[SHA1_DIGEST_SIZE];
1345   char sha1_res_payload[SHA1_DIGEST_SIZE];
1346
1347   if (opt.warc_digests_enabled)
1348     {
1349       /* Calculate the block and payload digests. */
1350       rewind (body);
1351       if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload,
1352           payload_offset) == 0)
1353         {
1354           /* Decide (based on url + payload digest) if we have seen this
1355              data before. */
1356           struct warc_cdx_record *rec_existing;
1357           rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload);
1358           if (rec_existing != NULL)
1359             {
1360               bool result;
1361
1362               /* Found an existing record. */
1363               logprintf (LOG_VERBOSE,
1364           _("Found exact match in CDX file. Saving revisit record to WARC.\n"));
1365
1366               /* Remove the payload from the file. */
1367               if (payload_offset > 0)
1368                 {
1369                   if (ftruncate (fileno (body), payload_offset) == -1)
1370                     return false;
1371                 }
1372
1373               /* Send the original payload digest. */
1374               payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1375               result = warc_write_revisit_record (url, timestamp_str,
1376                          concurrent_to_uuid, payload_digest, rec_existing->uuid,
1377                          ip, body);
1378               free (payload_digest);
1379
1380               return result;
1381             }
1382
1383           block_digest = warc_base32_sha1_digest (sha1_res_block);
1384           payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1385         }
1386     }
1387
1388   /* Not a revisit, just store the record. */
1389
1390   char response_uuid [48];
1391   warc_uuid_str (response_uuid);
1392
1393   fseeko (warc_current_file, 0L, SEEK_END);
1394   off_t offset = ftello (warc_current_file);
1395
1396   warc_write_start_record ();
1397   warc_write_header ("WARC-Type", "response");
1398   warc_write_header ("WARC-Record-ID", response_uuid);
1399   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1400   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1401   warc_write_header ("WARC-Target-URI", url);
1402   warc_write_date_header (timestamp_str);
1403   warc_write_ip_header (ip);
1404   warc_write_header ("WARC-Block-Digest", block_digest);
1405   warc_write_header ("WARC-Payload-Digest", payload_digest);
1406   warc_write_header ("Content-Type", "application/http;msgtype=response");
1407   warc_write_block_from_file (body);
1408   warc_write_end_record ();
1409
1410   fclose (body);
1411
1412   if (warc_write_ok && opt.warc_cdx_enabled)
1413     {
1414       /* Add this record to the CDX. */
1415       warc_write_cdx_record (url, timestamp_str, mime_type, response_code,
1416       payload_digest, redirect_location, offset, warc_current_filename,
1417       response_uuid);
1418     }
1419
1420   if (block_digest)
1421     free (block_digest);
1422   if (payload_digest)
1423     free (payload_digest);
1424
1425   return warc_write_ok;
1426 }
1427
1428 /* Writes a resource or metadata record to the WARC file.
1429    warc_type  is either "resource" or "metadata",
1430    resource_uuid  is the uuid of the resource (or NULL),
1431    url  is the target uri of the resource,
1432    timestamp_str  is the timestamp (generated with warc_timestamp),
1433    concurrent_to_uuid  is the uuid of the record that generated this,
1434    resource (generated with warc_uuid_str) or NULL,
1435    ip  is the ip address of the server (or NULL),
1436    content_type  is the mime type of the body (or NULL),
1437    body  is a pointer to a file containing the resource data.
1438    Calling this function will close body.
1439    Returns true on success, false on error. */
1440 static bool
1441 warc_write_record (const char *record_type, char *resource_uuid,
1442                  const char *url, const char *timestamp_str,
1443                  const char *concurrent_to_uuid,
1444                  ip_address *ip, const char *content_type, FILE *body,
1445                  off_t payload_offset)
1446 {
1447   if (resource_uuid == NULL)
1448     {
1449       resource_uuid = alloca (48);
1450       warc_uuid_str (resource_uuid);
1451     }
1452
1453   if (content_type == NULL)
1454     content_type = "application/octet-stream";
1455
1456   warc_write_start_record ();
1457   warc_write_header ("WARC-Type", record_type);
1458   warc_write_header ("WARC-Record-ID", resource_uuid);
1459   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1460   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1461   warc_write_header ("WARC-Target-URI", url);
1462   warc_write_date_header (timestamp_str);
1463   warc_write_ip_header (ip);
1464   warc_write_digest_headers (body, payload_offset);
1465   warc_write_header ("Content-Type", content_type);
1466   warc_write_block_from_file (body);
1467   warc_write_end_record ();
1468
1469   fclose (body);
1470
1471   return warc_write_ok;
1472 }
1473
1474 /* Writes a resource record to the WARC file.
1475    resource_uuid  is the uuid of the resource (or NULL),
1476    url  is the target uri of the resource,
1477    timestamp_str  is the timestamp (generated with warc_timestamp),
1478    concurrent_to_uuid  is the uuid of the record that generated this,
1479    resource (generated with warc_uuid_str) or NULL,
1480    ip  is the ip address of the server (or NULL),
1481    content_type  is the mime type of the body (or NULL),
1482    body  is a pointer to a file containing the resource data.
1483    Calling this function will close body.
1484    Returns true on success, false on error. */
1485 bool
1486 warc_write_resource_record (char *resource_uuid, const char *url,
1487                  const char *timestamp_str, const char *concurrent_to_uuid,
1488                  ip_address *ip, const char *content_type, FILE *body,
1489                  off_t payload_offset)
1490 {
1491   return warc_write_record ("resource",
1492       resource_uuid, url, timestamp_str, concurrent_to_uuid,
1493       ip, content_type, body, payload_offset);
1494 }
1495
1496 /* Writes a metadata record to the WARC file.
1497    record_uuid  is the uuid of the record (or NULL),
1498    url  is the target uri of the record,
1499    timestamp_str  is the timestamp (generated with warc_timestamp),
1500    concurrent_to_uuid  is the uuid of the record that generated this,
1501    record (generated with warc_uuid_str) or NULL,
1502    ip  is the ip address of the server (or NULL),
1503    content_type  is the mime type of the body (or NULL),
1504    body  is a pointer to a file containing the record data.
1505    Calling this function will close body.
1506    Returns true on success, false on error. */
1507 bool
1508 warc_write_metadata_record (char *record_uuid, const char *url,
1509                  const char *timestamp_str, const char *concurrent_to_uuid,
1510                  ip_address *ip, const char *content_type, FILE *body,
1511                  off_t payload_offset)
1512 {
1513   return warc_write_record ("metadata",
1514       record_uuid, url, timestamp_str, concurrent_to_uuid,
1515       ip, content_type, body, payload_offset);
1516 }