sjero.net Git - wget/blob - src/warc.c

   1 /* Utility functions for writing WARC files. */
   2 #define _GNU_SOURCE
   3
   4 #include "wget.h"
   5 #include "hash.h"
   6 #include "utils.h"
   7
   8 #include <stdio.h>
   9 #include <stdlib.h>
  10 #include <string.h>
  11 #include <strings.h>
  12 #include <time.h>
  13 #include <tmpdir.h>
  14 #include <sha1.h>
  15 #include <base32.h>
  16 #include <unistd.h>
  17 #ifdef HAVE_LIBZ
  18 #include <zlib.h>
  19 #endif
  20 #ifdef HAVE_LIBUUID
  21 #include <uuid/uuid.h>
  22 #endif
  23
  24 #ifndef WINDOWS
  25 #include <libgen.h>
  26 #endif
  27
  28 #include "warc.h"
  29
  30 extern char *version_string;
  31
  32 /* Set by main in main.c */
  33 extern char *program_argstring;
  34
  35
  36 /* The log file (a temporary file that contains a copy
  37    of the wget log). */
  38 static FILE *warc_log_fp;
  39
  40 /* The manifest file (a temporary file that contains the
  41    warcinfo uuid of every file in this crawl). */
  42 static FILE *warc_manifest_fp;
  43
  44 /* The current WARC file (or NULL, if WARC is disabled). */
  45 static FILE *warc_current_file;
  46
  47 #ifdef HAVE_LIBZ
  48 /* The gzip stream for the current WARC file
  49    (or NULL, if WARC or gzip is disabled). */
  50 static gzFile *warc_current_gzfile;
  51
  52 /* The offset of the current gzip record in the WARC file. */
  53 static size_t warc_current_gzfile_offset;
  54
  55 /* The uncompressed size (so far) of the current record. */
  56 static size_t warc_current_gzfile_uncompressed_size;
  57 # endif
  58
  59 /* This is true until a warc_write_* method fails. */
  60 static bool warc_write_ok;
  61
  62 /* The current CDX file (or NULL, if CDX is disabled). */
  63 static FILE *warc_current_cdx_file;
  64
  65 /* The record id of the warcinfo record of the current WARC file.  */
  66 static char *warc_current_warcinfo_uuid_str;
  67
  68 /* The file name of the current WARC file. */
  69 static char *warc_current_filename;
  70
  71 /* The serial number of the current WARC file.  This number is
  72    incremented each time a new file is opened and is used in the
  73    WARC file's filename. */
  74 static int warc_current_file_number;
  75
  76 /* The table of CDX records, if deduplication is enabled. */
  77 struct hash_table * warc_cdx_dedup_table;
  78
  79 static bool warc_start_new_file (bool meta);
  80
  81
/* One entry of the CDX deduplication table.  Entries are created by
   warc_process_cdx_line and stored in warc_cdx_dedup_table, keyed by
   the DIGEST member.  */
struct warc_cdx_record
{
  /* Original URL of the record (heap-allocated copy of the CDX field). */
  char *url;
  /* Record id of the record (heap-allocated copy of the CDX field). */
  char *uuid;
  /* Binary SHA-1 checksum, decoded from the base32 CDX field. */
  char digest[SHA1_DIGEST_SIZE];
};
  88
  89 static unsigned long
  90 warc_hash_sha1_digest (const void *key)
  91 {
  92   /* We just use some of the first bytes of the digest. */
  93   unsigned long v = 0;
  94   memcpy (&v, key, sizeof (unsigned long));
  95   return v;
  96 }
  97
  98 static int
  99 warc_cmp_sha1_digest (const void *digest1, const void *digest2)
 100 {
 101   return !memcmp (digest1, digest2, SHA1_DIGEST_SIZE);
 102 }
 103
 104
 105
 106 /* Writes SIZE bytes from BUFFER to the current WARC file,
 107    through gzwrite if compression is enabled.
 108    Returns the number of uncompressed bytes written.  */
 109 static size_t
 110 warc_write_buffer (const char *buffer, size_t size)
 111 {
 112 #ifdef HAVE_LIBZ
 113   if (warc_current_gzfile)
 114     {
 115       warc_current_gzfile_uncompressed_size += size;
 116       return gzwrite (warc_current_gzfile, buffer, size);
 117     }
 118   else
 119 #endif
 120     return fwrite (buffer, 1, size, warc_current_file);
 121 }
 122
 123 /* Writes STR to the current WARC file.
 124    Returns false and set warc_write_ok to false if there
 125    is an error.  */
 126 static bool
 127 warc_write_string (const char *str)
 128 {
 129   if (!warc_write_ok)
 130     return false;
 131
 132   size_t n = strlen (str);
 133   if (n != warc_write_buffer (str, n))
 134     warc_write_ok = false;
 135
 136   return warc_write_ok;
 137 }
 138
 139
/* Number of bytes reserved in front of each gzip member for the extra
   header written by warc_write_end_record: 2 bytes XLEN, 2 bytes
   field identifier and two 4-byte lengths.  */
#define EXTRA_GZIP_HEADER_SIZE 12
/* Number of bytes of the fixed gzip header that warc_write_end_record
   moves in front of the extra header.  */
#define GZIP_STATIC_HEADER_SIZE  10
/* Bit set in the gzip flags byte to announce an extra header field. */
#define FLG_FEXTRA          0x04
/* Offset of the flags byte inside the fixed gzip header. */
#define OFF_FLG             3
 144
 145 /* Starts a new WARC record.  Writes the version header.
 146    If opt.warc_maxsize is set and the current file is becoming
 147    too large, this will open a new WARC file.
 148
 149    If compression is enabled, this will start a new
 150    gzip stream in the current WARC file.
 151
 152    Returns false and set warc_write_ok to false if there
 153    is an error.  */
 154 static bool
 155 warc_write_start_record ()
 156 {
 157   if (!warc_write_ok)
 158     return false;
 159
 160   fflush (warc_current_file);
 161   if (opt.warc_maxsize > 0 && ftell (warc_current_file) >= opt.warc_maxsize)
 162     warc_start_new_file (false);
 163
 164 #ifdef HAVE_LIBZ
 165   /* Start a GZIP stream, if required. */
 166   if (opt.warc_compression_enabled)
 167     {
 168       /* Record the starting offset of the new record. */
 169       warc_current_gzfile_offset = ftell (warc_current_file);
 170
 171       /* Reserve space for the extra GZIP header field.
 172          In warc_write_end_record we will fill this space
 173          with information about the uncompressed and
 174          compressed size of the record. */
 175       fprintf (warc_current_file, "XXXXXXXXXXXX");
 176       fflush (warc_current_file);
 177
 178       /* Start a new GZIP stream. */
 179       warc_current_gzfile = gzdopen (dup (fileno (warc_current_file)), "wb9");
 180       warc_current_gzfile_uncompressed_size = 0;
 181
 182       if (warc_current_gzfile == NULL)
 183         {
 184           logprintf (LOG_NOTQUIET, _("Error opening GZIP stream to WARC file.\n"));
 185           warc_write_ok = false;
 186           return false;
 187         }
 188     }
 189 #endif
 190
 191   warc_write_string ("WARC/1.0\r\n");
 192   return warc_write_ok;
 193 }
 194
 195 /* Writes a WARC header to the current WARC record.
 196    This method may be run after warc_write_start_record and
 197    before warc_write_block_from_file.  */
 198 static bool
 199 warc_write_header (const char *name, const char *value)
 200 {
 201   if (value)
 202     {
 203       warc_write_string (name);
 204       warc_write_string (": ");
 205       warc_write_string (value);
 206       warc_write_string ("\r\n");
 207     }
 208   return warc_write_ok;
 209 }
 210
 211 /* Copies the contents of DATA_IN to the WARC record.
 212    Adds a Content-Length header to the WARC record.
 213    Run this method after warc_write_header,
 214    then run warc_write_end_record. */
 215 static bool
 216 warc_write_block_from_file (FILE *data_in)
 217 {
 218   /* Add the Content-Length header. */
 219   char *content_length;
 220   fseek (data_in, 0L, SEEK_END);
 221   if (! asprintf (&content_length, "%ld", ftell (data_in)))
 222     {
 223       warc_write_ok = false;
 224       return false;
 225     }
 226   warc_write_header ("Content-Length", content_length);
 227   free (content_length);
 228
 229   /* End of the WARC header section. */
 230   warc_write_string ("\r\n");
 231
 232   if (fseek (data_in, 0L, SEEK_SET) != 0)
 233     warc_write_ok = false;
 234
 235   /* Copy the data in the file to the WARC record. */
 236   char buffer[BUFSIZ];
 237   size_t s;
 238   while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0)
 239     {
 240       if (warc_write_buffer (buffer, s) < s)
 241         warc_write_ok = false;
 242     }
 243
 244   return warc_write_ok;
 245 }
 246
 247 /* Run this method to close the current WARC record.
 248
 249    If compression is enabled, this method closes the
 250    current GZIP stream and fills the extra GZIP header
 251    with the uncompressed and compressed length of the
 252    record. */
 253 static bool
 254 warc_write_end_record ()
 255 {
 256   warc_write_buffer ("\r\n\r\n", 4);
 257
 258 #ifdef HAVE_LIBZ
 259   /* We start a new gzip stream for each record.  */
 260   if (warc_write_ok && warc_current_gzfile)
 261     {
 262       if (gzclose (warc_current_gzfile) != Z_OK)
 263         {
 264           warc_write_ok = false;
 265           return false;
 266         }
 267
 268       fflush (warc_current_file);
 269       fseek (warc_current_file, 0, SEEK_END);
 270
 271       /* The WARC standard suggests that we add 'skip length' data in the
 272          extra header field of the GZIP stream.
 273
 274          In warc_write_start_record we reserved space for this extra header.
 275          This extra space starts at warc_current_gzfile_offset and fills
 276          EXTRA_GZIP_HEADER_SIZE bytes.  The static GZIP header starts at
 277          warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE.
 278
 279          We need to do three things:
 280          1. Move the static GZIP header to warc_current_gzfile_offset;
 281          2. Set the FEXTRA flag in the GZIP header;
 282          3. Write the extra GZIP header after the static header, that is,
 283             starting at warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE.
 284       */
 285
 286       /* Calculate the uncompressed and compressed sizes. */
 287       size_t current_offset = ftell (warc_current_file);
 288       size_t uncompressed_size = current_offset - warc_current_gzfile_offset;
 289       size_t compressed_size = warc_current_gzfile_uncompressed_size;
 290
 291       /* Go back to the static GZIP header. */
 292       fseek (warc_current_file, warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
 293
 294       /* Read the header. */
 295       char static_header[GZIP_STATIC_HEADER_SIZE];
 296       size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
 297       if (result != GZIP_STATIC_HEADER_SIZE)
 298         {
 299           warc_write_ok = false;
 300           return false;
 301         }
 302
 303       /* Set the FEXTRA flag in the flags byte of the header. */
 304       static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
 305
 306       /* Write the header back to the file, but starting at warc_current_gzfile_offset. */
 307       fseek (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
 308       fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
 309
 310       /* Prepare the extra GZIP header. */
 311       char extra_header[EXTRA_GZIP_HEADER_SIZE];
 312       /* XLEN, the length of the extra header fields.  */
 313       extra_header[0]  = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
 314       extra_header[1]  = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
 315       /* The extra header field identifier for the WARC skip length. */
 316       extra_header[2]  = 's';
 317       extra_header[3]  = 'l';
 318       /* The size of the uncompressed record.  */
 319       extra_header[4]  = (uncompressed_size & 255);
 320       extra_header[5]  = (uncompressed_size >> 8) & 255;
 321       extra_header[6]  = (uncompressed_size >> 16) & 255;
 322       extra_header[7]  = (uncompressed_size >> 24) & 255;
 323       /* The size of the compressed record.  */
 324       extra_header[8]  = (compressed_size & 255);
 325       extra_header[9]  = (compressed_size >> 8) & 255;
 326       extra_header[10] = (compressed_size >> 16) & 255;
 327       extra_header[11] = (compressed_size >> 24) & 255;
 328
 329       /* Write the extra header after the static header. */
 330       fseek (warc_current_file, warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
 331       fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
 332
 333       /* Done, move back to the end of the file. */
 334       fflush (warc_current_file);
 335       fseek (warc_current_file, 0, SEEK_END);
 336     }
 337 #endif /* HAVE_LIBZ */
 338
 339   return warc_write_ok;
 340 }
 341
 342
 343 /* Writes the WARC-Date header for the given timestamp to
 344    the current WARC record.
 345    If timestamp is NULL, the current time will be used.  */
 346 static bool
 347 warc_write_date_header (char *timestamp)
 348 {
 349   if (timestamp == NULL)
 350     {
 351       char current_timestamp[21];
 352       warc_timestamp (current_timestamp);
 353       timestamp = current_timestamp;
 354     }
 355   return warc_write_header ("WARC-Date", timestamp);
 356 }
 357
 358 /* Writes the WARC-IP-Address header for the given IP to
 359    the current WARC record.  If IP is NULL, no header will
 360    be written.  */
 361 static bool
 362 warc_write_ip_header (ip_address *ip)
 363 {
 364   if (ip != NULL)
 365     return warc_write_header ("WARC-IP-Address", print_address (ip));
 366   else
 367     return warc_write_ok;
 368 }
 369
 370
 371 /* warc_sha1_stream_with_payload is a modified copy of sha1_stream
 372    from gnulib/sha1.c.  This version calculates two digests in one go.
 373
 374    Compute SHA1 message digests for bytes read from STREAM.  The
 375    digest of the complete file will be written into the 16 bytes
 376    beginning at RES_BLOCK.
 377
 378    If payload_offset >= 0, a second digest will be calculated of the
 379    portion of the file starting at payload_offset and continuing to
 380    the end of the file.  The digest number will be written into the
 381    16 bytes beginning ad RES_PAYLOAD.  */
 382 static int
 383 warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, long int payload_offset)
 384 {
 385 #define BLOCKSIZE 32768
 386
 387   struct sha1_ctx ctx_block;
 388   struct sha1_ctx ctx_payload;
 389   long int pos;
 390   size_t sum;
 391
 392   char *buffer = malloc (BLOCKSIZE + 72);
 393   if (!buffer)
 394     return 1;
 395
 396   /* Initialize the computation context.  */
 397   sha1_init_ctx (&ctx_block);
 398   if (payload_offset >= 0)
 399     sha1_init_ctx (&ctx_payload);
 400
 401   pos = 0;
 402
 403   /* Iterate over full file contents.  */
 404   while (1)
 405     {
 406       /* We read the file in blocks of BLOCKSIZE bytes.  One call of the
 407          computation function processes the whole buffer so that with the
 408          next round of the loop another block can be read.  */
 409       size_t n;
 410       sum = 0;
 411
 412       /* Read block.  Take care for partial reads.  */
 413       while (1)
 414         {
 415           n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);
 416
 417           sum += n;
 418           pos += n;
 419
 420           if (sum == BLOCKSIZE)
 421             break;
 422
 423           if (n == 0)
 424             {
 425               /* Check for the error flag IFF N == 0, so that we don't
 426                  exit the loop after a partial read due to e.g., EAGAIN
 427                  or EWOULDBLOCK.  */
 428               if (ferror (stream))
 429                 {
 430                   free (buffer);
 431                   return 1;
 432                 }
 433               goto process_partial_block;
 434             }
 435
 436           /* We've read at least one byte, so ignore errors.  But always
 437              check for EOF, since feof may be true even though N > 0.
 438              Otherwise, we could end up calling fread after EOF.  */
 439           if (feof (stream))
 440             goto process_partial_block;
 441         }
 442
 443       /* Process buffer with BLOCKSIZE bytes.  Note that
 444                         BLOCKSIZE % 64 == 0
 445        */
 446       sha1_process_block (buffer, BLOCKSIZE, &ctx_block);
 447       if (payload_offset >= 0 && payload_offset < pos)
 448         {
 449           /* At least part of the buffer contains data from payload. */
 450           int start_of_payload = payload_offset - (pos - BLOCKSIZE);
 451           if (start_of_payload <= 0)
 452             /* All bytes in the buffer belong to the payload. */
 453             start_of_payload = 0;
 454
 455           /* Process the payload part of the buffer.
 456              Note: we can't use  sha1_process_block  here even if we
 457              process the complete buffer.  Because the payload doesn't
 458              have to start with a full block, there may still be some
 459              bytes left from the previous buffer.  Therefore, we need
 460              to continue with  sha1_process_bytes.  */
 461           sha1_process_bytes (buffer + start_of_payload, BLOCKSIZE - start_of_payload, &ctx_payload);
 462         }
 463     }
 464
 465  process_partial_block:;
 466
 467   /* Process any remaining bytes.  */
 468   if (sum > 0)
 469     {
 470       sha1_process_bytes (buffer, sum, &ctx_block);
 471       if (payload_offset >= 0 && payload_offset < pos)
 472         {
 473           /* At least part of the buffer contains data from payload. */
 474           int start_of_payload = payload_offset - (pos - sum);
 475           if (start_of_payload <= 0)
 476             /* All bytes in the buffer belong to the payload. */
 477             start_of_payload = 0;
 478
 479           /* Process the payload part of the buffer. */
 480           sha1_process_bytes (buffer + start_of_payload, sum - start_of_payload, &ctx_payload);
 481         }
 482     }
 483
 484   /* Construct result in desired memory.  */
 485   sha1_finish_ctx (&ctx_block,   res_block);
 486   if (payload_offset >= 0)
 487     sha1_finish_ctx (&ctx_payload, res_payload);
 488   free (buffer);
 489   return 0;
 490
 491 #undef BLOCKSIZE
 492 }
 493
 494 /* Converts the SHA1 digest to a base32-encoded string.
 495    "sha1:DIGEST\0"  (Allocates a new string for the response.)  */
 496 static char *
 497 warc_base32_sha1_digest (char *sha1_digest)
 498 {
 499   // length: "sha1:" + digest + "\0"
 500   char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 );
 501   base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5, BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1);
 502   memcpy (sha1_base32, "sha1:", 5);
 503   sha1_base32[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5] = '\0';
 504   return sha1_base32;
 505 }
 506
 507
 508 /* Sets the digest headers of the record.
 509    This method will calculate the block digest and, if payload_offset >= 0,
 510    will also calculate the payload digest of the payload starting at the
 511    provided offset.  */
 512 static void
 513 warc_write_digest_headers (FILE *file, long payload_offset)
 514 {
 515   if (opt.warc_digests_enabled)
 516     {
 517       /* Calculate the block and payload digests. */
 518       char sha1_res_block[SHA1_DIGEST_SIZE];
 519       char sha1_res_payload[SHA1_DIGEST_SIZE];
 520
 521       rewind (file);
 522       if (warc_sha1_stream_with_payload (file, sha1_res_block, sha1_res_payload, payload_offset) == 0)
 523         {
 524           char *digest;
 525
 526           digest = warc_base32_sha1_digest (sha1_res_block);
 527           warc_write_header ("WARC-Block-Digest", digest);
 528           free (digest);
 529
 530           if (payload_offset >= 0)
 531             {
 532               digest = warc_base32_sha1_digest (sha1_res_payload);
 533               warc_write_header ("WARC-Payload-Digest", digest);
 534               free (digest);
 535             }
 536         }
 537     }
 538 }
 539
 540
 541 /* Fills timestamp with the current time and date.
 542    The UTC time is formatted following ISO 8601, as required
 543    for use in the WARC-Date header.
 544    The timestamp will be 21 characters long. */
 545 void
 546 warc_timestamp (char *timestamp)
 547 {
 548   time_t rawtime;
 549   struct tm * timeinfo;
 550   time ( &rawtime );
 551   timeinfo = gmtime (&rawtime);
 552   strftime (timestamp, 21, "%Y-%m-%dT%H:%M:%SZ", timeinfo);
 553 }
 554
 555 /* Fills uuid_str with a UUID based on random numbers.
 556    (See RFC 4122, UUID version 4.)
 557
 558    Note: this is a fallback method, it is much better to use the
 559    methods provided by libuuid.
 560
 561    The uuid_str will be 36 characters long. */
 562 static void
 563 warc_uuid_random (char *uuid_str)
 564 {
 565   // RFC 4122, a version 4 UUID with only random numbers
 566
 567   unsigned char uuid_data[16];
 568   int i;
 569   for (i=0; i<16; i++)
 570     uuid_data[i] = random_number (255);
 571
 572   // Set the four most significant bits (bits 12 through 15) of the
 573   // time_hi_and_version field to the 4-bit version number
 574   uuid_data[6] = (uuid_data[6] & 0x0F) | 0x40;
 575
 576   // Set the two most significant bits (bits 6 and 7) of the
 577   // clock_seq_hi_and_reserved to zero and one, respectively.
 578   uuid_data[8] = (uuid_data[8] & 0xBF) | 0x80;
 579
 580   sprintf (uuid_str,
 581     "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
 582     uuid_data[0], uuid_data[1], uuid_data[2], uuid_data[3], uuid_data[4],
 583     uuid_data[5], uuid_data[6], uuid_data[7], uuid_data[8], uuid_data[9],
 584     uuid_data[10], uuid_data[11], uuid_data[12], uuid_data[13], uuid_data[14],
 585     uuid_data[15]);
 586 }
 587
 588 /* Fills urn_str with a UUID in the format required
 589    for the WARC-Record-Id header.
 590    The string will be 47 characters long. */
 591 void
 592 warc_uuid_str (char *urn_str)
 593 {
 594   char uuid_str[37];
 595
 596 # ifdef HAVE_LIBUUID
 597   uuid_t record_id;
 598   uuid_generate (record_id);
 599   uuid_unparse (record_id, uuid_str);
 600 # else
 601   warc_uuid_random (uuid_str);
 602 # endif
 603
 604   sprintf (urn_str, "<urn:uuid:%s>", uuid_str);
 605 }
 606
 607 /* Write a warcinfo record to the current file.
 608    Updates warc_current_warcinfo_uuid_str. */
 609 bool
 610 warc_write_warcinfo_record (char *filename)
 611 {
 612   /* Write warc-info record as the first record of the file. */
 613   /* We add the record id of this info record to the other records in the file. */
 614   warc_current_warcinfo_uuid_str = (char *) malloc (48);
 615   warc_uuid_str (warc_current_warcinfo_uuid_str);
 616
 617   char timestamp[22];
 618   warc_timestamp (timestamp);
 619
 620   char *filename_copy, *filename_basename;
 621   filename_copy = strdup (filename);
 622   filename_basename = strdup (basename (filename_copy));
 623
 624   warc_write_start_record ();
 625   warc_write_header ("WARC-Type", "warcinfo");
 626   warc_write_header ("Content-Type", "application/warc-fields");
 627   warc_write_header ("WARC-Date", timestamp);
 628   warc_write_header ("WARC-Record-ID", warc_current_warcinfo_uuid_str);
 629   warc_write_header ("WARC-Filename", filename_basename);
 630
 631   /* Create content.  */
 632   FILE *warc_tmp = warc_tempfile ();
 633   if (warc_tmp == NULL)
 634     {
 635       free (filename_copy);
 636       free (filename_basename);
 637       return false;
 638     }
 639
 640   fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE);
 641   fprintf (warc_tmp, "format: WARC File Format 1.0\r\n");
 642   fprintf (warc_tmp, "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n");
 643   fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off"));
 644   fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring);
 645   /* Add the user headers, if any. */
 646   if (opt.warc_user_headers)
 647     {
 648       int i;
 649       for (i = 0; opt.warc_user_headers[i]; i++)
 650         fprintf (warc_tmp, "%s\r\n", opt.warc_user_headers[i]);
 651     }
 652   fprintf(warc_tmp, "\r\n");
 653
 654   warc_write_digest_headers (warc_tmp, -1);
 655   warc_write_block_from_file (warc_tmp);
 656   warc_write_end_record ();
 657
 658   if (! warc_write_ok)
 659     {
 660       logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
 661     }
 662
 663   free (filename_copy);
 664   free (filename_basename);
 665   fclose (warc_tmp);
 666   return warc_write_ok;
 667 }
 668
 669 /* Opens a new WARC file.
 670    If META is true, generates a filename ending with 'meta.warc.gz'.
 671
 672    This method will:
 673    1. close the current WARC file (if there is one);
 674    2. increment warc_current_file_number;
 675    3. open a new WARC file;
 676    4. write the initial warcinfo record.
 677
 678    Returns true on success, false otherwise.
 679    */
 680 static bool
 681 warc_start_new_file (bool meta)
 682 {
 683   if (opt.warc_filename == NULL)
 684     return false;
 685
 686   if (warc_current_file != NULL)
 687     fclose (warc_current_file);
 688   if (warc_current_warcinfo_uuid_str)
 689     free (warc_current_warcinfo_uuid_str);
 690   if (warc_current_filename)
 691     free (warc_current_filename);
 692
 693   warc_current_file_number++;
 694
 695   int base_filename_length = strlen (opt.warc_filename);
 696   /* filename format:  base + "-" + 5 digit serial number + ".warc.gz" */
 697   char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
 698   warc_current_filename = new_filename;
 699
 700 #ifdef HAVE_LIBZ
 701   char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc");
 702 #else
 703   char *extension = "warc";
 704 #endif
 705
 706   /* If max size is enabled, we add a serial number to the file names. */
 707   if (meta)
 708     sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension);
 709   else if (opt.warc_maxsize > 0)
 710     sprintf (new_filename, "%s-%05d.%s", opt.warc_filename, warc_current_file_number, extension);
 711   else
 712     sprintf (new_filename, "%s.%s", opt.warc_filename, extension);
 713
 714   logprintf (LOG_VERBOSE, _("Opening WARC file %s.\n\n"), quote (new_filename));
 715
 716   /* Open the WARC file. */
 717   warc_current_file = fopen (new_filename, "wb+");
 718   if (warc_current_file == NULL)
 719     {
 720       logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"), quote (new_filename));
 721       return false;
 722     }
 723
 724   if (! warc_write_warcinfo_record (new_filename))
 725     return false;
 726
 727   /* Add warcinfo uuid to manifest. */
 728   if (warc_manifest_fp)
 729     fprintf (warc_manifest_fp, "%s\n", warc_current_warcinfo_uuid_str);
 730
 731   return true;
 732 }
 733
 734 /* Opens the CDX file for output. */
 735 static bool
 736 warc_start_cdx_file ()
 737 {
 738   int filename_length = strlen (opt.warc_filename);
 739   char *cdx_filename = alloca (filename_length + 4 + 1);
 740   memcpy (cdx_filename, opt.warc_filename, filename_length);
 741   memcpy (cdx_filename + filename_length, ".cdx", 5);
 742   warc_current_cdx_file = fopen (cdx_filename, "a+");
 743   if (warc_current_cdx_file == NULL)
 744     return false;
 745
 746   /* Print the CDX header.
 747    *
 748    * a - original url
 749    * b - date
 750    * m - mime type
 751    * s - response code
 752    * k - new style checksum
 753    * r - redirect
 754    * M - meta tags
 755    * V - compressed arc file offset
 756    * g - file name
 757    * u - record-id
 758    */
 759   fprintf (warc_current_cdx_file, " CDX a b a m s k r M V g u\n");
 760   fflush (warc_current_cdx_file);
 761
 762   return true;
 763 }
 764
 765 #define CDX_FIELDSEP " \t\r\n"
 766
 767 /* Parse the CDX header and find the field numbers of the original url,
 768    checksum and record ID fields. */
 769 static bool
 770 warc_parse_cdx_header (char *lineptr, int *field_num_original_url, int *field_num_checksum, int *field_num_record_id)
 771 {
 772   *field_num_original_url = -1;
 773   *field_num_checksum = -1;
 774   *field_num_record_id = -1;
 775
 776   char *token;
 777   char *save_ptr;
 778   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
 779
 780   if (token != NULL && strcmp (token, "CDX") == 0)
 781     {
 782       int field_num = 0;
 783       while (token != NULL)
 784         {
 785           token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
 786           if (token != NULL)
 787             {
 788               switch (token[0])
 789                 {
 790                 case 'a':
 791                   *field_num_original_url = field_num;
 792                   break;
 793                 case 'k':
 794                   *field_num_checksum = field_num;
 795                   break;
 796                 case 'u':
 797                   *field_num_record_id = field_num;
 798                   break;
 799                 }
 800             }
 801           field_num++;
 802         }
 803     }
 804
 805   return *field_num_original_url != -1
 806          && *field_num_checksum != -1
 807          && *field_num_record_id != -1;
 808 }
 809
 810 /* Parse the CDX record and add it to the warc_cdx_dedup_table hash table. */
 811 static void
 812 warc_process_cdx_line (char *lineptr, int field_num_original_url, int field_num_checksum, int field_num_record_id)
 813 {
 814   char *original_url = NULL;
 815   char *checksum = NULL;
 816   char *record_id = NULL;
 817
 818   char *token;
 819   char *save_ptr;
 820   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
 821
 822   /* Read this line to get the fields we need. */
 823   int field_num = 0;
 824   while (token != NULL)
 825     {
 826       char **val;
 827       if (field_num == field_num_original_url)
 828         val = &original_url;
 829       else if (field_num == field_num_checksum)
 830         val = &checksum;
 831       else if (field_num == field_num_record_id)
 832         val = &record_id;
 833       else
 834         val = NULL;
 835
 836       if (val != NULL)
 837         *val = strdup (token);
 838
 839       token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
 840       field_num++;
 841     }
 842
 843   if (original_url != NULL && checksum != NULL && record_id != NULL)
 844     {
 845       /* For some extra efficiency, we decode the base32 encoded
 846          checksum value.  This should produce exactly SHA1_DIGEST_SIZE
 847          bytes.  */
 848       size_t checksum_l;
 849       char * checksum_v;
 850       base32_decode_alloc (checksum, strlen (checksum), &checksum_v, &checksum_l);
 851       free (checksum);
 852
 853       if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
 854         {
 855           /* This is a valid line with a valid checksum. */
 856           struct warc_cdx_record * rec = malloc (sizeof (struct warc_cdx_record));
 857           rec->url = original_url;
 858           rec->uuid = record_id;
 859           memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
 860           hash_table_put (warc_cdx_dedup_table, rec->digest, rec);
 861           free (checksum_v);
 862         }
 863       else
 864         {
 865           free (original_url);
 866           if (checksum_v != NULL)
 867             free (checksum_v);
 868           free (record_id);
 869         }
 870     }
 871 }
 872
 873 /* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
 874    the warc_cdx_dedup_table. */
 875 bool
 876 warc_load_cdx_dedup_file ()
 877 {
 878   FILE *f = fopen (opt.warc_cdx_dedup_filename, "r");
 879   if (f == NULL)
 880     return false;
 881
 882   int field_num_original_url = -1;
 883   int field_num_checksum = -1;
 884   int field_num_record_id = -1;
 885
 886   char *lineptr = NULL;
 887   size_t n = 0;
 888   size_t line_length;
 889
 890   /* The first line should contain the CDX header.
 891      Format:  " CDX x x x x x"
 892      where x are field type indicators.  For our purposes, we only
 893      need 'a' (the original url), 'k' (the SHA1 checksum) and
 894      'u' (the WARC record id). */
 895   line_length = getline (&lineptr, &n, f);
 896   if (line_length != -1)
 897     warc_parse_cdx_header (lineptr, &field_num_original_url, &field_num_checksum, &field_num_record_id);
 898
 899   /* If the file contains all three fields, read the complete file. */
 900   if (field_num_original_url == -1
 901       || field_num_checksum == -1
 902       || field_num_record_id == -1)
 903     {
 904       if (field_num_original_url == -1)
 905         logprintf (LOG_NOTQUIET, _("CDX file does not list original urls. (Missing column 'a'.)\n"));
 906       if (field_num_checksum == -1)
 907         logprintf (LOG_NOTQUIET, _("CDX file does not list checksums. (Missing column 'k'.)\n"));
 908       if (field_num_record_id == -1)
 909         logprintf (LOG_NOTQUIET, _("CDX file does not list record ids. (Missing column 'u'.)\n"));
 910     }
 911   else
 912     {
 913       /* Initialize the table. */
 914       warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest, warc_cmp_sha1_digest);
 915
 916       do
 917         {
 918           line_length = getline (&lineptr, &n, f);
 919           if (line_length != -1)
 920             warc_process_cdx_line (lineptr, field_num_original_url, field_num_checksum, field_num_record_id);
 921
 922         }
 923       while (line_length != -1);
 924
 925       /* Print results. */
 926       int nrecords = hash_table_count (warc_cdx_dedup_table);
 927       logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
 928                                         "Loaded %d records from CDX.\n\n", nrecords),
 929                               nrecords);
 930     }
 931
 932   fclose (f);
 933
 934   return true;
 935 }
 936 #undef CDX_FIELDSEP
 937
 938 /* Returns the existing duplicate CDX record for the given url and payload
 939    digest.  Returns NULL if the url is not found or if the payload digest
 940    does not match, or if CDX deduplication is disabled. */
 941 static struct warc_cdx_record *
 942 warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload)
 943 {
 944   if (warc_cdx_dedup_table == NULL)
 945     return NULL;
 946
 947   char *key;
 948   struct warc_cdx_record *rec_existing;
 949   hash_table_get_pair (warc_cdx_dedup_table, sha1_digest_payload, &key, &rec_existing);
 950
 951   if (rec_existing != NULL && strcmp (rec_existing->url, url) == 0)
 952     return rec_existing;
 953   else
 954     return NULL;
 955 }
 956
 957 /* Initializes the WARC writer (if opt.warc_filename is set).
 958    This should be called before any WARC record is written. */
 959 void
 960 warc_init ()
 961 {
 962   warc_write_ok = true;
 963
 964   if (opt.warc_filename != NULL)
 965     {
 966       if (opt.warc_cdx_dedup_filename != NULL)
 967         {
 968           if (! warc_load_cdx_dedup_file ())
 969             {
 970               logprintf (LOG_NOTQUIET,
 971                          _("Could not read CDX file %s for deduplication.\n"),
 972                          quote (opt.warc_cdx_dedup_filename));
 973               exit(1);
 974             }
 975         }
 976
 977       warc_manifest_fp = warc_tempfile ();
 978       if (warc_manifest_fp == NULL)
 979         {
 980           logprintf (LOG_NOTQUIET, _("Could not open temporary WARC manifest file.\n"));
 981           exit(1);
 982         }
 983
 984       if (opt.warc_keep_log)
 985         {
 986           warc_log_fp = warc_tempfile ();
 987           if (warc_log_fp == NULL)
 988             {
 989               logprintf (LOG_NOTQUIET, _("Could not open temporary WARC log file.\n"));
 990               exit(1);
 991             }
 992           log_set_warc_log_fp (warc_log_fp);
 993         }
 994
 995       warc_current_file_number = -1;
 996       if (! warc_start_new_file (false))
 997         {
 998           logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
 999           exit(1);
1000         }
1001
1002       if (opt.warc_cdx_enabled)
1003         {
1004           if (! warc_start_cdx_file ())
1005             {
1006               logprintf (LOG_NOTQUIET, _("Could not open CDX file for output.\n"));
1007               exit(1);
1008             }
1009         }
1010     }
1011 }
1012
1013 /* Writes metadata (manifest, configuration, log file) to the WARC file. */
1014 void
1015 warc_write_metadata ()
1016 {
1017   /* If there are multiple WARC files, the metadata should be written to a separate file. */
1018   if (opt.warc_maxsize > 0)
1019     warc_start_new_file (true);
1020
1021   char manifest_uuid [48];
1022   warc_uuid_str (manifest_uuid);
1023
1024   fflush (warc_manifest_fp);
1025   warc_write_resource_record (manifest_uuid,
1026                               "metadata://gnu.org/software/wget/warc/MANIFEST.txt",
1027                               NULL, NULL, NULL, "text/plain",
1028                               warc_manifest_fp, -1);
1029   /* warc_write_resource_record has closed warc_manifest_fp. */
1030
1031   FILE * warc_tmp_fp = warc_tempfile ();
1032   if (warc_tmp_fp == NULL)
1033     {
1034       logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n"));
1035       exit(1);
1036     }
1037   fflush (warc_tmp_fp);
1038   fprintf (warc_tmp_fp, "%s\n", program_argstring);
1039
1040   warc_write_resource_record (manifest_uuid,
1041                               "metadata://gnu.org/software/wget/warc/wget_arguments.txt",
1042                               NULL, NULL, NULL, "text/plain",
1043                               warc_tmp_fp, -1);
1044   /* warc_write_resource_record has closed warc_tmp_fp. */
1045
1046   if (warc_log_fp != NULL)
1047     {
1048       warc_write_resource_record (NULL,
1049                                   "metadata://gnu.org/software/wget/warc/wget.log",
1050                                   NULL, manifest_uuid, NULL, "text/plain",
1051                                   warc_log_fp, -1);
1052       /* warc_write_resource_record has closed warc_log_fp. */
1053
1054       warc_log_fp = NULL;
1055       log_set_warc_log_fp (NULL);
1056     }
1057 }
1058
1059 /* Finishes the WARC writing.
1060    This should be called at the end of the program. */
1061 void
1062 warc_close ()
1063 {
1064   if (warc_current_file != NULL)
1065     {
1066       warc_write_metadata ();
1067       free (warc_current_warcinfo_uuid_str);
1068       fclose (warc_current_file);
1069     }
1070   if (warc_current_cdx_file != NULL)
1071     fclose (warc_current_cdx_file);
1072   if (warc_log_fp != NULL)
1073     {
1074       fclose (warc_log_fp);
1075       log_set_warc_log_fp (NULL);
1076     }
1077 }
1078
1079 /* Creates a temporary file for writing WARC output.
1080    The temporary file will be created in opt.warc_tempdir.
1081    Returns the pointer to the temporary file, or NULL. */
1082 FILE *
1083 warc_tempfile ()
1084 {
1085   char filename[100];
1086   if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1)
1087     return NULL;
1088
1089   int fd = mkstemp (filename);
1090   if (fd < 0)
1091     return NULL;
1092
1093   if (unlink (filename) < 0)
1094     return NULL;
1095
1096   return fdopen (fd, "wb+");
1097 }
1098
1099
1100 /* Writes a request record to the WARC file.
1101    url  is the target uri of the request,
1102    timestamp_str  is the timestamp of the request (generated with warc_timestamp),
1103    record_uuid  is the uuid of the request (generated with warc_uuid_str),
1104    body  is a pointer to a file containing the request headers and body.
1105    ip  is the ip address of the server (or NULL),
1106    Calling this function will close body.
1107    Returns true on success, false on error. */
1108 bool
1109 warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, ip_address *ip, FILE *body, long int payload_offset)
1110 {
1111   warc_write_start_record ();
1112   warc_write_header ("WARC-Type", "request");
1113   warc_write_header ("WARC-Target-URI", url);
1114   warc_write_header ("Content-Type", "application/http;msgtype=request");
1115   warc_write_date_header (timestamp_str);
1116   warc_write_header ("WARC-Record-ID", record_uuid);
1117   warc_write_ip_header (ip);
1118   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1119   warc_write_digest_headers (body, payload_offset);
1120   warc_write_block_from_file (body);
1121   warc_write_end_record ();
1122
1123   fclose (body);
1124
1125   return warc_write_ok;
1126 }
1127
1128 /* Writes a response record to the CDX file.
1129    url  is the target uri of the request/response,
1130    timestamp_str  is the timestamp of the request that generated this response,
1131                   (generated with warc_timestamp),
1132    mime_type  is the mime type of the response body (will be printed to CDX),
1133    response_code  is the HTTP response code (will be printed to CDX),
1134    payload_digest  is the sha1 digest of the payload,
1135    redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
1136    offset  is the position of the WARC record in the WARC file,
1137    warc_filename  is the filename of the WARC,
1138    response_uuid  is the uuid of the response.
1139    Returns true on success, false on error. */
1140 static bool
1141 warc_write_cdx_record (char *url, char *timestamp_str, char *mime_type, int response_code, char *payload_digest, char *redirect_location, size_t offset, char *warc_filename, char *response_uuid)
1142 {
1143   /* Transform the timestamp. */
1144   char timestamp_str_cdx [15];
1145   memcpy (timestamp_str_cdx     , timestamp_str     , 4); /* "YYYY" "-" */
1146   memcpy (timestamp_str_cdx +  4, timestamp_str +  5, 2); /* "mm"   "-" */
1147   memcpy (timestamp_str_cdx +  6, timestamp_str +  8, 2); /* "dd"   "T" */
1148   memcpy (timestamp_str_cdx +  8, timestamp_str + 11, 2); /* "HH"   ":" */
1149   memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM"   ":" */
1150   memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS"   "Z" */
1151   timestamp_str_cdx[14] = '\0';
1152
1153   /* Rewrite the checksum. */
1154   char *checksum;
1155   if (payload_digest != NULL)
1156     checksum = payload_digest + 5; /* Skip the "sha1:" */
1157   else
1158     checksum = "-";
1159
1160   if (mime_type == NULL || strlen(mime_type) == 0)
1161     mime_type = "-";
1162   if (redirect_location == NULL || strlen(redirect_location) == 0)
1163     redirect_location = "-";
1164
1165   /* Print the CDX line. */
1166   fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %ld %s %s\n", url, timestamp_str_cdx, url, mime_type, response_code, checksum, redirect_location, offset, warc_current_filename, response_uuid);
1167   fflush (warc_current_cdx_file);
1168
1169   return true;
1170 }
1171
1172 /* Writes a revisit record to the WARC file.
1173    url  is the target uri of the request/response,
1174    timestamp_str  is the timestamp of the request that generated this response
1175                   (generated with warc_timestamp),
1176    concurrent_to_uuid  is the uuid of the request for that generated this response
1177                  (generated with warc_uuid_str),
1178    refers_to_uuid  is the uuid of the original response
1179                  (generated with warc_uuid_str),
1180    payload_digest  is the sha1 digest of the payload,
1181    ip  is the ip address of the server (or NULL),
1182    body  is a pointer to a file containing the response headers (without payload).
1183    Calling this function will close body.
1184    Returns true on success, false on error. */
1185 static bool
1186 warc_write_revisit_record (char *url, char *timestamp_str, char *concurrent_to_uuid, char *payload_digest, char *refers_to, ip_address *ip, FILE *body)
1187 {
1188   char revisit_uuid [48];
1189   warc_uuid_str (revisit_uuid);
1190
1191   char *block_digest = NULL;
1192   char sha1_res_block[SHA1_DIGEST_SIZE];
1193   sha1_stream (body, sha1_res_block);
1194   block_digest = warc_base32_sha1_digest (sha1_res_block);
1195
1196   warc_write_start_record ();
1197   warc_write_header ("WARC-Type", "revisit");
1198   warc_write_header ("WARC-Record-ID", revisit_uuid);
1199   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1200   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1201   warc_write_header ("WARC-Refers-To", refers_to);
1202   warc_write_header ("WARC-Profile", "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest");
1203   warc_write_header ("WARC-Truncated", "length");
1204   warc_write_header ("WARC-Target-URI", url);
1205   warc_write_date_header (timestamp_str);
1206   warc_write_ip_header (ip);
1207   warc_write_header ("Content-Type", "application/http;msgtype=response");
1208   warc_write_header ("WARC-Block-Digest", block_digest);
1209   warc_write_header ("WARC-Payload-Digest", payload_digest);
1210   warc_write_block_from_file (body);
1211   warc_write_end_record ();
1212
1213   fclose (body);
1214   free (block_digest);
1215
1216   return warc_write_ok;
1217 }
1218
1219 /* Writes a response record to the WARC file.
1220    url  is the target uri of the request/response,
1221    timestamp_str  is the timestamp of the request that generated this response
1222                   (generated with warc_timestamp),
1223    concurrent_to_uuid  is the uuid of the request for that generated this response
1224                  (generated with warc_uuid_str),
1225    ip  is the ip address of the server (or NULL),
1226    body  is a pointer to a file containing the response headers and body.
1227    mime_type  is the mime type of the response body (will be printed to CDX),
1228    response_code  is the HTTP response code (will be printed to CDX),
1229    redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
1230    Calling this function will close body.
1231    Returns true on success, false on error. */
1232 bool
1233 warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, long int payload_offset, char *mime_type, int response_code, char *redirect_location)
1234 {
1235   char *block_digest = NULL;
1236   char *payload_digest = NULL;
1237   char sha1_res_block[SHA1_DIGEST_SIZE];
1238   char sha1_res_payload[SHA1_DIGEST_SIZE];
1239
1240   if (opt.warc_digests_enabled)
1241     {
1242       /* Calculate the block and payload digests. */
1243       rewind (body);
1244       if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload, payload_offset) == 0)
1245         {
1246           /* Decide (based on url + payload digest) if we have seen this
1247              data before. */
1248           struct warc_cdx_record *rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload);
1249           if (rec_existing != NULL)
1250             {
1251               /* Found an existing record. */
1252               logprintf (LOG_VERBOSE, _("Found exact match in CDX file. Saving revisit record to WARC.\n"));
1253
1254               /* Remove the payload from the file. */
1255               if (payload_offset > 0)
1256                 {
1257                   if (ftruncate (fileno (body), payload_offset) == -1)
1258                     return false;
1259                 }
1260
1261               /* Send the original payload digest. */
1262               payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1263               bool result = warc_write_revisit_record (url, timestamp_str, concurrent_to_uuid, payload_digest, rec_existing->uuid, ip, body);
1264               free (payload_digest);
1265
1266               return result;
1267             }
1268
1269           block_digest = warc_base32_sha1_digest (sha1_res_block);
1270           payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1271         }
1272     }
1273
1274   /* Not a revisit, just store the record. */
1275
1276   char response_uuid [48];
1277   warc_uuid_str (response_uuid);
1278
1279   fseek (warc_current_file, 0L, SEEK_END);
1280   size_t offset = ftell (warc_current_file);
1281
1282   warc_write_start_record ();
1283   warc_write_header ("WARC-Type", "response");
1284   warc_write_header ("WARC-Record-ID", response_uuid);
1285   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1286   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1287   warc_write_header ("WARC-Target-URI", url);
1288   warc_write_date_header (timestamp_str);
1289   warc_write_ip_header (ip);
1290   warc_write_header ("WARC-Block-Digest", block_digest);
1291   warc_write_header ("WARC-Payload-Digest", payload_digest);
1292   warc_write_header ("Content-Type", "application/http;msgtype=response");
1293   warc_write_block_from_file (body);
1294   warc_write_end_record ();
1295
1296   fclose (body);
1297
1298   if (warc_write_ok && opt.warc_cdx_enabled)
1299     {
1300       /* Add this record to the CDX. */
1301       warc_write_cdx_record (url, timestamp_str, mime_type, response_code, payload_digest, redirect_location, offset, warc_current_filename, response_uuid);
1302     }
1303
1304   if (block_digest)
1305     free (block_digest);
1306   if (payload_digest)
1307     free (payload_digest);
1308
1309   return warc_write_ok;
1310 }
1311
1312 /* Writes a resource record to the WARC file.
1313    resource_uuid  is the uuid of the resource (or NULL),
1314    url  is the target uri of the resource,
1315    timestamp_str  is the timestamp (generated with warc_timestamp),
1316    concurrent_to_uuid  is the uuid of the request for that generated this resource
1317                  (generated with warc_uuid_str) or NULL,
1318    ip  is the ip address of the server (or NULL),
1319    content_type  is the mime type of the body (or NULL),
1320    body  is a pointer to a file containing the resource data.
1321    Calling this function will close body.
1322    Returns true on success, false on error. */
1323 bool
1324 warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, long int payload_offset)
1325 {
1326   if (resource_uuid == NULL)
1327     {
1328       resource_uuid = alloca (48);
1329       warc_uuid_str (resource_uuid);
1330     }
1331
1332   if (content_type == NULL)
1333     content_type = "application/octet-stream";
1334
1335   warc_write_start_record ();
1336   warc_write_header ("WARC-Type", "resource");
1337   warc_write_header ("WARC-Record-ID", resource_uuid);
1338   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1339   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1340   warc_write_header ("WARC-Target-URI", url);
1341   warc_write_date_header (timestamp_str);
1342   warc_write_ip_header (ip);
1343   warc_write_digest_headers (body, payload_offset);
1344   warc_write_header ("Content-Type", content_type);
1345   warc_write_block_from_file (body);
1346   warc_write_end_record ();
1347
1348   fclose (body);
1349
1350   return warc_write_ok;
1351 }
1352