sjero.net Git - wget/blob - src/warc.c

   1 /* Utility functions for writing WARC files. */
   2 #define _GNU_SOURCE
   3
   4 #include "wget.h"
   5 #include "hash.h"
   6 #include "utils.h"
   7
   8 #include <stdio.h>
   9 #include <stdlib.h>
  10 #include <string.h>
  11 #include <strings.h>
  12 #include <time.h>
  13 #include <tmpdir.h>
  14 #include <sha1.h>
  15 #include <base32.h>
  16 #include <unistd.h>
  17 #include <zlib.h>
  18 #ifdef HAVE_LIBUUID
  19 #include <uuid/uuid.h>
  20 #endif
  21
  22 #include "warc.h"
  23
  24 extern char *version_string;
  25
  26 /* Set by main in main.c */
  27 extern char *program_argstring;
  28
  29
  30 /* The log file (a temporary file that contains a copy
  31    of the wget log). */
  32 static FILE *warc_log_fp;
  33
  34 /* The manifest file (a temporary file that contains the
  35    warcinfo uuid of every file in this crawl). */
  36 static FILE *warc_manifest_fp;
  37
  38 /* The current WARC file (or NULL, if WARC is disabled). */
  39 static FILE *warc_current_file;
  40
  41 /* The gzip stream for the current WARC file
  42    (or NULL, if WARC or gzip is disabled). */
  43 static gzFile *warc_current_gzfile;
  44
  45 /* The offset of the current gzip record in the WARC file. */
  46 static size_t warc_current_gzfile_offset;
  47
  48 /* The uncompressed size (so far) of the current record. */
  49 static size_t warc_current_gzfile_uncompressed_size;
  50
  51 /* This is true until a warc_write_* method fails. */
  52 static bool warc_write_ok;
  53
  54 /* The current CDX file (or NULL, if CDX is disabled). */
  55 static FILE *warc_current_cdx_file;
  56
  57 /* The record id of the warcinfo record of the current WARC file.  */
  58 static char *warc_current_warcinfo_uuid_str;
  59
  60 /* The file name of the current WARC file. */
  61 static char *warc_current_filename;
  62
  63 /* The serial number of the current WARC file.  This number is
  64    incremented each time a new file is opened and is used in the
  65    WARC file's filename. */
  66 static int warc_current_file_number;
  67
  68 /* The table of CDX records, if deduplication is enabled. */
  69 struct hash_table * warc_cdx_dedup_table;
  70
  71 static bool warc_start_new_file (bool meta);
  72
  73
/* One entry of the CDX deduplication table.  Entries are created in
   warc_process_cdx_line and stored in warc_cdx_dedup_table with the
   digest member as the key. */
struct warc_cdx_record
{
  char *url;    /* Heap-allocated copy of the original-url field. */
  char *uuid;   /* Heap-allocated copy of the record-id field. */
  char digest[SHA1_DIGEST_SIZE];  /* Decoded (binary) checksum field. */
};
  80
  81 static unsigned long
  82 warc_hash_sha1_digest (const void *key)
  83 {
  84   /* We just use some of the first bytes of the digest. */
  85   unsigned long v = 0;
  86   memcpy (&v, key, sizeof (unsigned long));
  87   return v;
  88 }
  89
  90 static int
  91 warc_cmp_sha1_digest (const void *digest1, const void *digest2)
  92 {
  93   return !memcmp (digest1, digest2, SHA1_DIGEST_SIZE);
  94 }
  95
  96
  97
  98 /* Writes SIZE bytes from BUFFER to the current WARC file,
  99    through gzwrite if compression is enabled.
 100    Returns the number of uncompressed bytes written.  */
 101 static size_t
 102 warc_write_buffer (const char *buffer, size_t size)
 103 {
 104   if (warc_current_gzfile)
 105     {
 106       warc_current_gzfile_uncompressed_size += size;
 107       return gzwrite (warc_current_gzfile, buffer, size);
 108     }
 109   else
 110     return fwrite (buffer, 1, size, warc_current_file);
 111 }
 112
 113 /* Writes STR to the current WARC file.
 114    Returns false and set warc_write_ok to false if there
 115    is an error.  */
 116 static bool
 117 warc_write_string (const char *str)
 118 {
 119   if (!warc_write_ok)
 120     return false;
 121
 122   size_t n = strlen (str);
 123   if (n != warc_write_buffer (str, n))
 124     warc_write_ok = false;
 125
 126   return warc_write_ok;
 127 }
 128
 129
/* Number of bytes reserved in front of each gzip stream by
   warc_write_start_record and later filled in by
   warc_write_end_record with the extra header. */
#define EXTRA_GZIP_HEADER_SIZE 12
/* Number of bytes of the gzip header that warc_write_end_record
   reads back and moves in front of the extra header. */
#define GZIP_STATIC_HEADER_SIZE  10
/* Bit set in the flags byte to announce the extra header. */
#define FLG_FEXTRA          0x04
/* Index of the flags byte within the static gzip header. */
#define OFF_FLG             3
 134
 135 /* Starts a new WARC record.  Writes the version header.
 136    If opt.warc_maxsize is set and the current file is becoming
 137    too large, this will open a new WARC file.
 138
 139    If compression is enabled, this will start a new
 140    gzip stream in the current WARC file.
 141
 142    Returns false and set warc_write_ok to false if there
 143    is an error.  */
 144 static bool
 145 warc_write_start_record ()
 146 {
 147   if (!warc_write_ok)
 148     return false;
 149
 150   fflush (warc_current_file);
 151   if (opt.warc_maxsize > 0 && ftell (warc_current_file) >= opt.warc_maxsize)
 152     warc_start_new_file (false);
 153
 154   /* Start a GZIP stream, if required. */
 155   if (opt.warc_compression_enabled)
 156     {
 157       /* Record the starting offset of the new record. */
 158       warc_current_gzfile_offset = ftell (warc_current_file);
 159
 160       /* Reserve space for the extra GZIP header field.
 161          In warc_write_end_record we will fill this space
 162          with information about the uncompressed and
 163          compressed size of the record. */
 164       fprintf (warc_current_file, "XXXXXXXXXXXX");
 165       fflush (warc_current_file);
 166
 167       /* Start a new GZIP stream. */
 168       warc_current_gzfile = gzdopen (dup (fileno (warc_current_file)), "wb+9");
 169       warc_current_gzfile_uncompressed_size = 0;
 170
 171       if (warc_current_gzfile == NULL)
 172         {
 173           logprintf (LOG_NOTQUIET, _("Error opening GZIP stream to WARC file.\n"));
 174           warc_write_ok = false;
 175           return false;
 176         }
 177     }
 178
 179   warc_write_string ("WARC/1.0\r\n");
 180   return warc_write_ok;
 181 }
 182
 183 /* Writes a WARC header to the current WARC record.
 184    This method may be run after warc_write_start_record and
 185    before warc_write_block_from_file.  */
 186 static bool
 187 warc_write_header (const char *name, const char *value)
 188 {
 189   if (value)
 190     {
 191       warc_write_string (name);
 192       warc_write_string (": ");
 193       warc_write_string (value);
 194       warc_write_string ("\r\n");
 195     }
 196   return warc_write_ok;
 197 }
 198
 199 /* Copies the contents of DATA_IN to the WARC record.
 200    Adds a Content-Length header to the WARC record.
 201    Run this method after warc_write_header,
 202    then run warc_write_end_record. */
 203 static bool
 204 warc_write_block_from_file (FILE *data_in)
 205 {
 206   /* Add the Content-Length header. */
 207   char *content_length;
 208   fseek (data_in, 0L, SEEK_END);
 209   if (! asprintf (&content_length, "%ld", ftell (data_in)))
 210     {
 211       warc_write_ok = false;
 212       return false;
 213     }
 214   warc_write_header ("Content-Length", content_length);
 215   free (content_length);
 216
 217   /* End of the WARC header section. */
 218   warc_write_string ("\r\n");
 219
 220   if (fseek (data_in, 0L, SEEK_SET) != 0)
 221     warc_write_ok = false;
 222
 223   /* Copy the data in the file to the WARC record. */
 224   char buffer[BUFSIZ];
 225   size_t s;
 226   while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0)
 227     {
 228       if (warc_write_buffer (buffer, s) < s)
 229         warc_write_ok = false;
 230     }
 231
 232   return warc_write_ok;
 233 }
 234
 235 /* Run this method to close the current WARC record.
 236
 237    If compression is enabled, this method closes the
 238    current GZIP stream and fills the extra GZIP header
 239    with the uncompressed and compressed length of the
 240    record. */
 241 static bool
 242 warc_write_end_record ()
 243 {
 244   warc_write_buffer ("\r\n\r\n", 4);
 245
 246   /* We start a new gzip stream for each record.  */
 247   if (warc_write_ok && warc_current_gzfile)
 248     {
 249       if (gzclose (warc_current_gzfile) != Z_OK)
 250         {
 251           warc_write_ok = false;
 252           return false;
 253         }
 254
 255       fflush (warc_current_file);
 256       fseek (warc_current_file, 0, SEEK_END);
 257
 258       /* The WARC standard suggests that we add 'skip length' data in the
 259          extra header field of the GZIP stream.
 260
 261          In warc_write_start_record we reserved space for this extra header.
 262          This extra space starts at warc_current_gzfile_offset and fills
 263          EXTRA_GZIP_HEADER_SIZE bytes.  The static GZIP header starts at
 264          warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE.
 265
 266          We need to do three things:
 267          1. Move the static GZIP header to warc_current_gzfile_offset;
 268          2. Set the FEXTRA flag in the GZIP header;
 269          3. Write the extra GZIP header after the static header, that is,
 270             starting at warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE.
 271       */
 272
 273       /* Calculate the uncompressed and compressed sizes. */
 274       size_t current_offset = ftell (warc_current_file);
 275       size_t uncompressed_size = current_offset - warc_current_gzfile_offset;
 276       size_t compressed_size = warc_current_gzfile_uncompressed_size;
 277
 278       /* Go back to the static GZIP header. */
 279       fseek (warc_current_file, warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
 280
 281       /* Read the header. */
 282       char static_header[GZIP_STATIC_HEADER_SIZE];
 283       size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
 284       if (result != GZIP_STATIC_HEADER_SIZE)
 285         {
 286           warc_write_ok = false;
 287           return false;
 288         }
 289
 290       /* Set the FEXTRA flag in the flags byte of the header. */
 291       static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
 292
 293       /* Write the header back to the file, but starting at warc_current_gzfile_offset. */
 294       fseek (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
 295       fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
 296
 297       /* Prepare the extra GZIP header. */
 298       char extra_header[EXTRA_GZIP_HEADER_SIZE];
 299       /* XLEN, the length of the extra header fields.  */
 300       extra_header[0]  = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
 301       extra_header[1]  = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
 302       /* The extra header field identifier for the WARC skip length. */
 303       extra_header[2]  = 's';
 304       extra_header[3]  = 'l';
 305       /* The size of the uncompressed record.  */
 306       extra_header[4]  = (uncompressed_size & 255);
 307       extra_header[5]  = (uncompressed_size >> 8) & 255;
 308       extra_header[6]  = (uncompressed_size >> 16) & 255;
 309       extra_header[7]  = (uncompressed_size >> 24) & 255;
 310       /* The size of the compressed record.  */
 311       extra_header[8]  = (compressed_size & 255);
 312       extra_header[9]  = (compressed_size >> 8) & 255;
 313       extra_header[10] = (compressed_size >> 16) & 255;
 314       extra_header[11] = (compressed_size >> 24) & 255;
 315
 316       /* Write the extra header after the static header. */
 317       fseek (warc_current_file, warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
 318       fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
 319
 320       /* Done, move back to the end of the file. */
 321       fflush (warc_current_file);
 322       fseek (warc_current_file, 0, SEEK_END);
 323     }
 324
 325   return warc_write_ok;
 326 }
 327
 328
 329 /* Writes the WARC-Date header for the given timestamp to
 330    the current WARC record.
 331    If timestamp is NULL, the current time will be used.  */
 332 static bool
 333 warc_write_date_header (char *timestamp)
 334 {
 335   if (timestamp == NULL)
 336     {
 337       char current_timestamp[21];
 338       warc_timestamp (current_timestamp);
 339       timestamp = current_timestamp;
 340     }
 341   return warc_write_header ("WARC-Date", timestamp);
 342 }
 343
 344 /* Writes the WARC-IP-Address header for the given IP to
 345    the current WARC record.  If IP is NULL, no header will
 346    be written.  */
 347 static bool
 348 warc_write_ip_header (ip_address *ip)
 349 {
 350   if (ip != NULL)
 351     return warc_write_header ("WARC-IP-Address", print_address (ip));
 352   else
 353     return warc_write_ok;
 354 }
 355
 356
/* warc_sha1_stream_with_payload is a modified copy of sha1_stream
   from gnulib/sha1.c.  This version calculates two digests in one go.

   Compute SHA1 message digests for bytes read from STREAM, starting
   at its current position and continuing until end of file.  The
   digest of everything read is written to RES_BLOCK, which must have
   room for a SHA1 digest (the caller in this file passes a buffer of
   SHA1_DIGEST_SIZE bytes).

   If payload_offset >= 0, a second digest will be calculated of the
   portion of the data starting at payload_offset and continuing to
   the end of the file.  That digest is written to RES_PAYLOAD, which
   must have room for a SHA1 digest as well.  If payload_offset < 0,
   RES_PAYLOAD is not written.

   Returns 0 on success, 1 if the read buffer cannot be allocated or
   if reading from STREAM fails.  */
static int
warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, long int payload_offset)
{
#define BLOCKSIZE 32768

  struct sha1_ctx ctx_block;
  struct sha1_ctx ctx_payload;
  /* Number of bytes read from STREAM so far. */
  long int pos;
  /* Number of bytes currently held in BUFFER. */
  size_t sum;

  char *buffer = malloc (BLOCKSIZE + 72);
  if (!buffer)
    return 1;

  /* Initialize the computation context.  */
  sha1_init_ctx (&ctx_block);
  if (payload_offset >= 0)
    sha1_init_ctx (&ctx_payload);

  pos = 0;

  /* Iterate over full file contents.  */
  while (1)
    {
      /* We read the file in blocks of BLOCKSIZE bytes.  One call of the
         computation function processes the whole buffer so that with the
         next round of the loop another block can be read.  */
      size_t n;
      sum = 0;

      /* Read block.  Take care for partial reads.  */
      while (1)
        {
          n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);

          sum += n;
          pos += n;

          if (sum == BLOCKSIZE)
            break;

          if (n == 0)
            {
              /* Check for the error flag IFF N == 0, so that we don't
                 exit the loop after a partial read due to e.g., EAGAIN
                 or EWOULDBLOCK.  */
              if (ferror (stream))
                {
                  free (buffer);
                  return 1;
                }
              goto process_partial_block;
            }

          /* We've read at least one byte, so ignore errors.  But always
             check for EOF, since feof may be true even though N > 0.
             Otherwise, we could end up calling fread after EOF.  */
          if (feof (stream))
            goto process_partial_block;
        }

      /* Process buffer with BLOCKSIZE bytes.  Note that
                        BLOCKSIZE % 64 == 0
       */
      sha1_process_block (buffer, BLOCKSIZE, &ctx_block);
      if (payload_offset >= 0 && payload_offset < pos)
        {
          /* At least part of the buffer contains data from payload.
             pos - BLOCKSIZE is the stream offset of buffer[0].  */
          int start_of_payload = payload_offset - (pos - BLOCKSIZE);
          if (start_of_payload <= 0)
            /* All bytes in the buffer belong to the payload. */
            start_of_payload = 0;

          /* Process the payload part of the buffer.
             Note: we can't use  sha1_process_block  here even if we
             process the complete buffer.  Because the payload doesn't
             have to start with a full block, there may still be some
             bytes left from the previous buffer.  Therefore, we need
             to continue with  sha1_process_bytes.  */
          sha1_process_bytes (buffer + start_of_payload, BLOCKSIZE - start_of_payload, &ctx_payload);
        }
    }

 process_partial_block:;

  /* Process any remaining bytes.  */
  if (sum > 0)
    {
      sha1_process_bytes (buffer, sum, &ctx_block);
      if (payload_offset >= 0 && payload_offset < pos)
        {
          /* At least part of the buffer contains data from payload.
             pos - sum is the stream offset of buffer[0].  */
          int start_of_payload = payload_offset - (pos - sum);
          if (start_of_payload <= 0)
            /* All bytes in the buffer belong to the payload. */
            start_of_payload = 0;

          /* Process the payload part of the buffer. */
          sha1_process_bytes (buffer + start_of_payload, sum - start_of_payload, &ctx_payload);
        }
    }

  /* Construct result in desired memory.  */
  sha1_finish_ctx (&ctx_block,   res_block);
  if (payload_offset >= 0)
    sha1_finish_ctx (&ctx_payload, res_payload);
  free (buffer);
  return 0;

#undef BLOCKSIZE
}
 479
 480 /* Converts the SHA1 digest to a base32-encoded string.
 481    "sha1:DIGEST\0"  (Allocates a new string for the response.)  */
 482 static char *
 483 warc_base32_sha1_digest (char *sha1_digest)
 484 {
 485   // length: "sha1:" + digest + "\0"
 486   char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 );
 487   base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5, BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1);
 488   memcpy (sha1_base32, "sha1:", 5);
 489   sha1_base32[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5] = '\0';
 490   return sha1_base32;
 491 }
 492
 493
 494 /* Sets the digest headers of the record.
 495    This method will calculate the block digest and, if payload_offset >= 0,
 496    will also calculate the payload digest of the payload starting at the
 497    provided offset.  */
 498 static void
 499 warc_write_digest_headers (FILE *file, long payload_offset)
 500 {
 501   if (opt.warc_digests_enabled)
 502     {
 503       /* Calculate the block and payload digests. */
 504       char sha1_res_block[SHA1_DIGEST_SIZE];
 505       char sha1_res_payload[SHA1_DIGEST_SIZE];
 506
 507       rewind (file);
 508       if (warc_sha1_stream_with_payload (file, sha1_res_block, sha1_res_payload, payload_offset) == 0)
 509         {
 510           char *digest;
 511
 512           digest = warc_base32_sha1_digest (sha1_res_block);
 513           warc_write_header ("WARC-Block-Digest", digest);
 514           free (digest);
 515
 516           if (payload_offset >= 0)
 517             {
 518               digest = warc_base32_sha1_digest (sha1_res_payload);
 519               warc_write_header ("WARC-Payload-Digest", digest);
 520               free (digest);
 521             }
 522         }
 523     }
 524 }
 525
 526
 527 /* Fills timestamp with the current time and date.
 528    The UTC time is formatted following ISO 8601, as required
 529    for use in the WARC-Date header.
 530    The timestamp will be 21 characters long. */
 531 void
 532 warc_timestamp (char *timestamp)
 533 {
 534   time_t rawtime;
 535   struct tm * timeinfo;
 536   time ( &rawtime );
 537   timeinfo = gmtime (&rawtime);
 538   strftime (timestamp, 21, "%Y-%m-%dT%H:%M:%SZ", timeinfo);
 539 }
 540
 541 /* Fills uuid_str with a UUID based on random numbers.
 542    (See RFC 4122, UUID version 4.)
 543
 544    Note: this is a fallback method, it is much better to use the
 545    methods provided by libuuid.
 546
 547    The uuid_str will be 36 characters long. */
 548 static void
 549 warc_uuid_random (char *uuid_str)
 550 {
 551   // RFC 4122, a version 4 UUID with only random numbers
 552
 553   unsigned char uuid_data[16];
 554   int i;
 555   for (i=0; i<16; i++)
 556     uuid_data[i] = random_number (255);
 557
 558   // Set the four most significant bits (bits 12 through 15) of the
 559   // time_hi_and_version field to the 4-bit version number
 560   uuid_data[6] = (uuid_data[6] & 0x0F) | 0x40;
 561
 562   // Set the two most significant bits (bits 6 and 7) of the
 563   // clock_seq_hi_and_reserved to zero and one, respectively.
 564   uuid_data[8] = (uuid_data[8] & 0xBF) | 0x80;
 565
 566   sprintf (uuid_str,
 567     "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
 568     uuid_data[0], uuid_data[1], uuid_data[2], uuid_data[3], uuid_data[4],
 569     uuid_data[5], uuid_data[6], uuid_data[7], uuid_data[8], uuid_data[9],
 570     uuid_data[10], uuid_data[11], uuid_data[12], uuid_data[13], uuid_data[14],
 571     uuid_data[15]);
 572 }
 573
 574 /* Fills urn_str with a UUID in the format required
 575    for the WARC-Record-Id header.
 576    The string will be 47 characters long. */
 577 void
 578 warc_uuid_str (char *urn_str)
 579 {
 580   char uuid_str[37];
 581
 582 # ifdef HAVE_LIBUUID
 583   uuid_t record_id;
 584   uuid_generate (record_id);
 585   uuid_unparse (record_id, uuid_str);
 586 # else
 587   warc_uuid_random (uuid_str);
 588 # endif
 589
 590   sprintf (urn_str, "<urn:uuid:%s>", uuid_str);
 591 }
 592
 593 /* Write a warcinfo record to the current file.
 594    Updates warc_current_warcinfo_uuid_str. */
 595 bool
 596 warc_write_warcinfo_record (char *filename)
 597 {
 598   /* Write warc-info record as the first record of the file. */
 599   /* We add the record id of this info record to the other records in the file. */
 600   warc_current_warcinfo_uuid_str = (char *) malloc (48);
 601   warc_uuid_str (warc_current_warcinfo_uuid_str);
 602
 603   char timestamp[22];
 604   warc_timestamp (timestamp);
 605
 606   char *filename_copy, *filename_basename;
 607   filename_copy = strdup (filename);
 608   filename_basename = basename (filename_copy);
 609
 610   warc_write_start_record ();
 611   warc_write_header ("WARC-Type", "warcinfo");
 612   warc_write_header ("Content-Type", "application/warc-fields");
 613   warc_write_header ("WARC-Date", timestamp);
 614   warc_write_header ("WARC-Record-ID", warc_current_warcinfo_uuid_str);
 615   warc_write_header ("WARC-Filename", filename_basename);
 616
 617   /* Create content.  */
 618   FILE *warc_tmp = warc_tempfile ();
 619   if (warc_tmp == NULL)
 620     {
 621       free (filename_copy);
 622       return false;
 623     }
 624
 625   fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE);
 626   fprintf (warc_tmp, "format: WARC File Format 1.0\r\n");
 627   fprintf (warc_tmp, "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n");
 628   fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off"));
 629   fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring);
 630   /* Add the user headers, if any. */
 631   if (opt.warc_user_headers)
 632     {
 633       int i;
 634       for (i = 0; opt.warc_user_headers[i]; i++)
 635         fprintf (warc_tmp, "%s\r\n", opt.warc_user_headers[i]);
 636     }
 637   fprintf(warc_tmp, "\r\n");
 638
 639   warc_write_digest_headers (warc_tmp, -1);
 640   warc_write_block_from_file (warc_tmp);
 641   warc_write_end_record ();
 642
 643   if (! warc_write_ok)
 644     {
 645       logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
 646     }
 647
 648   free (filename_copy);
 649   fclose (warc_tmp);
 650   return warc_write_ok;
 651 }
 652
 653 /* Opens a new WARC file.
 654    If META is true, generates a filename ending with 'meta.warc.gz'.
 655
 656    This method will:
 657    1. close the current WARC file (if there is one);
 658    2. increment warc_current_file_number;
 659    3. open a new WARC file;
 660    4. write the initial warcinfo record.
 661
 662    Returns true on success, false otherwise.
 663    */
 664 static bool
 665 warc_start_new_file (bool meta)
 666 {
 667   if (opt.warc_filename == NULL)
 668     return false;
 669
 670   if (warc_current_file != NULL)
 671     fclose (warc_current_file);
 672   if (warc_current_warcinfo_uuid_str)
 673     free (warc_current_warcinfo_uuid_str);
 674   if (warc_current_filename)
 675     free (warc_current_filename);
 676
 677   warc_current_file_number++;
 678
 679   int base_filename_length = strlen (opt.warc_filename);
 680   /* filename format:  base + "-" + 5 digit serial number + ".warc.gz" */
 681   char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
 682   warc_current_filename = new_filename;
 683
 684   char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc");
 685
 686   /* If max size is enabled, we add a serial number to the file names. */
 687   if (meta)
 688     sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension);
 689   else if (opt.warc_maxsize > 0)
 690     sprintf (new_filename, "%s-%05d.%s", opt.warc_filename, warc_current_file_number, extension);
 691   else
 692     sprintf (new_filename, "%s.%s", opt.warc_filename, extension);
 693
 694   logprintf (LOG_VERBOSE, _("Opening WARC file %s.\n\n"), quote (new_filename));
 695
 696   /* Open the WARC file. */
 697   warc_current_file = fopen (new_filename, "wb+");
 698   if (warc_current_file == NULL)
 699     {
 700       logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"), quote (new_filename));
 701       return false;
 702     }
 703
 704   if (! warc_write_warcinfo_record (new_filename))
 705     return false;
 706
 707   /* Add warcinfo uuid to manifest. */
 708   if (warc_manifest_fp)
 709     fprintf (warc_manifest_fp, "%s\n", warc_current_warcinfo_uuid_str);
 710
 711   return true;
 712 }
 713
 714 /* Opens the CDX file for output. */
 715 static bool
 716 warc_start_cdx_file ()
 717 {
 718   int filename_length = strlen (opt.warc_filename);
 719   char *cdx_filename = alloca (filename_length + 4 + 1);
 720   memcpy (cdx_filename, opt.warc_filename, filename_length);
 721   memcpy (cdx_filename + filename_length, ".cdx", 5);
 722   warc_current_cdx_file = fopen (cdx_filename, "a+");
 723   if (warc_current_cdx_file == NULL)
 724     return false;
 725
 726   /* Print the CDX header.
 727    *
 728    * a - original url
 729    * b - date
 730    * m - mime type
 731    * s - response code
 732    * k - new style checksum
 733    * r - redirect
 734    * M - meta tags
 735    * V - compressed arc file offset
 736    * g - file name
 737    * u - record-id
 738    */
 739   fprintf (warc_current_cdx_file, " CDX a b a m s k r M V g u\n");
 740   fflush (warc_current_cdx_file);
 741
 742   return true;
 743 }
 744
 745 #define CDX_FIELDSEP " \t\r\n"
 746
 747 /* Parse the CDX header and find the field numbers of the original url,
 748    checksum and record ID fields. */
 749 static bool
 750 warc_parse_cdx_header (char *lineptr, int *field_num_original_url, int *field_num_checksum, int *field_num_record_id)
 751 {
 752   *field_num_original_url = -1;
 753   *field_num_checksum = -1;
 754   *field_num_record_id = -1;
 755
 756   char *token;
 757   char *save_ptr;
 758   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
 759
 760   if (token != NULL && strcmp (token, "CDX") == 0)
 761     {
 762       int field_num = 0;
 763       while (token != NULL)
 764         {
 765           token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
 766           if (token != NULL)
 767             {
 768               switch (token[0])
 769                 {
 770                 case 'a':
 771                   *field_num_original_url = field_num;
 772                   break;
 773                 case 'k':
 774                   *field_num_checksum = field_num;
 775                   break;
 776                 case 'u':
 777                   *field_num_record_id = field_num;
 778                   break;
 779                 }
 780             }
 781           field_num++;
 782         }
 783     }
 784
 785   return *field_num_original_url != -1
 786          && *field_num_checksum != -1
 787          && *field_num_record_id != -1;
 788 }
 789
 790 /* Parse the CDX record and add it to the warc_cdx_dedup_table hash table. */
 791 static void
 792 warc_process_cdx_line (char *lineptr, int field_num_original_url, int field_num_checksum, int field_num_record_id)
 793 {
 794   char *original_url = NULL;
 795   char *checksum = NULL;
 796   char *record_id = NULL;
 797
 798   char *token;
 799   char *save_ptr;
 800   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
 801
 802   /* Read this line to get the fields we need. */
 803   int field_num = 0;
 804   while (token != NULL)
 805     {
 806       char **val;
 807       if (field_num == field_num_original_url)
 808         val = &original_url;
 809       else if (field_num == field_num_checksum)
 810         val = &checksum;
 811       else if (field_num == field_num_record_id)
 812         val = &record_id;
 813       else
 814         val = NULL;
 815
 816       if (val != NULL)
 817         *val = strdup (token);
 818
 819       token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
 820       field_num++;
 821     }
 822
 823   if (original_url != NULL && checksum != NULL && record_id != NULL)
 824     {
 825       /* For some extra efficiency, we decode the base32 encoded
 826          checksum value.  This should produce exactly SHA1_DIGEST_SIZE
 827          bytes.  */
 828       size_t checksum_l;
 829       char * checksum_v;
 830       base32_decode_alloc (checksum, strlen (checksum), &checksum_v, &checksum_l);
 831       free (checksum);
 832
 833       if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
 834         {
 835           /* This is a valid line with a valid checksum. */
 836           struct warc_cdx_record * rec = malloc (sizeof (struct warc_cdx_record));
 837           rec->url = original_url;
 838           rec->uuid = record_id;
 839           memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
 840           hash_table_put (warc_cdx_dedup_table, rec->digest, rec);
 841           free (checksum_v);
 842         }
 843       else
 844         {
 845           free (original_url);
 846           if (checksum_v != NULL)
 847             free (checksum_v);
 848           free (record_id);
 849         }
 850     }
 851 }
 852
 853 /* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
 854    the warc_cdx_dedup_table. */
 855 bool
 856 warc_load_cdx_dedup_file ()
 857 {
 858   FILE *f = fopen (opt.warc_cdx_dedup_filename, "r");
 859   if (f == NULL)
 860     return false;
 861
 862   int field_num_original_url = -1;
 863   int field_num_checksum = -1;
 864   int field_num_record_id = -1;
 865
 866   char *lineptr = NULL;
 867   size_t n = 0;
 868   size_t line_length;
 869
 870   /* The first line should contain the CDX header.
 871      Format:  " CDX x x x x x"
 872      where x are field type indicators.  For our purposes, we only
 873      need 'a' (the original url), 'k' (the SHA1 checksum) and
 874      'u' (the WARC record id). */
 875   line_length = getline (&lineptr, &n, f);
 876   if (line_length != -1)
 877     warc_parse_cdx_header (lineptr, &field_num_original_url, &field_num_checksum, &field_num_record_id);
 878
 879   /* If the file contains all three fields, read the complete file. */
 880   if (field_num_original_url == -1
 881       || field_num_checksum == -1
 882       || field_num_record_id == -1)
 883     {
 884       if (field_num_original_url == -1)
 885         logprintf (LOG_NOTQUIET, _("CDX file does not list original urls. (Missing column 'a'.)\n"));
 886       if (field_num_checksum == -1)
 887         logprintf (LOG_NOTQUIET, _("CDX file does not list checksums. (Missing column 'k'.)\n"));
 888       if (field_num_record_id == -1)
 889         logprintf (LOG_NOTQUIET, _("CDX file does not list record ids. (Missing column 'u'.)\n"));
 890     }
 891   else
 892     {
 893       /* Initialize the table. */
 894       warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest, warc_cmp_sha1_digest);
 895
 896       do
 897         {
 898           line_length = getline (&lineptr, &n, f);
 899           if (line_length != -1)
 900             warc_process_cdx_line (lineptr, field_num_original_url, field_num_checksum, field_num_record_id);
 901
 902         }
 903       while (line_length != -1);
 904
 905       /* Print results. */
 906       int nrecords = hash_table_count (warc_cdx_dedup_table);
 907       logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
 908                                         "Loaded %d records from CDX.\n\n", nrecords),
 909                               nrecords);
 910     }
 911
 912   fclose (f);
 913
 914   return true;
 915 }
 916 #undef CDX_FIELDSEP
 917
 918 /* Returns the existing duplicate CDX record for the given url and payload
 919    digest.  Returns NULL if the url is not found or if the payload digest
 920    does not match, or if CDX deduplication is disabled. */
 921 static struct warc_cdx_record *
 922 warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload)
 923 {
 924   if (warc_cdx_dedup_table == NULL)
 925     return NULL;
 926
 927   char *key;
 928   struct warc_cdx_record *rec_existing;
 929   hash_table_get_pair (warc_cdx_dedup_table, sha1_digest_payload, &key, &rec_existing);
 930
 931   if (rec_existing != NULL && strcmp (rec_existing->url, url) == 0)
 932     return rec_existing;
 933   else
 934     return NULL;
 935 }
 936
 937 /* Initializes the WARC writer (if opt.warc_filename is set).
 938    This should be called before any WARC record is written. */
 939 void
 940 warc_init ()
 941 {
 942   warc_write_ok = true;
 943
 944   if (opt.warc_filename != NULL)
 945     {
 946       if (opt.warc_cdx_dedup_filename != NULL)
 947         {
 948           if (! warc_load_cdx_dedup_file ())
 949             {
 950               logprintf (LOG_NOTQUIET,
 951                          _("Could not read CDX file %s for deduplication.\n"),
 952                          quote (opt.warc_cdx_dedup_filename));
 953               exit(1);
 954             }
 955         }
 956
 957       warc_manifest_fp = warc_tempfile ();
 958       if (warc_manifest_fp == NULL)
 959         {
 960           logprintf (LOG_NOTQUIET, _("Could not open temporary WARC manifest file.\n"));
 961           exit(1);
 962         }
 963
 964       if (opt.warc_keep_log)
 965         {
 966           warc_log_fp = warc_tempfile ();
 967           if (warc_log_fp == NULL)
 968             {
 969               logprintf (LOG_NOTQUIET, _("Could not open temporary WARC log file.\n"));
 970               exit(1);
 971             }
 972           log_set_warc_log_fp (warc_log_fp);
 973         }
 974
 975       warc_current_file_number = -1;
 976       if (! warc_start_new_file (false))
 977         {
 978           logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
 979           exit(1);
 980         }
 981
 982       if (opt.warc_cdx_enabled)
 983         {
 984           if (! warc_start_cdx_file ())
 985             {
 986               logprintf (LOG_NOTQUIET, _("Could not open CDX file for output.\n"));
 987               exit(1);
 988             }
 989         }
 990     }
 991 }
 992
 993 /* Writes metadata (manifest, configuration, log file) to the WARC file. */
 994 void
 995 warc_write_metadata ()
 996 {
 997   /* If there are multiple WARC files, the metadata should be written to a separate file. */
 998   if (opt.warc_maxsize > 0)
 999     warc_start_new_file (true);
1000
1001   char manifest_uuid [48];
1002   warc_uuid_str (manifest_uuid);
1003
1004   fflush (warc_manifest_fp);
1005   warc_write_resource_record (manifest_uuid,
1006                               "metadata://gnu.org/software/wget/warc/MANIFEST.txt",
1007                               NULL, NULL, NULL, "text/plain",
1008                               warc_manifest_fp, -1);
1009   /* warc_write_resource_record has closed warc_manifest_fp. */
1010
1011   FILE * warc_tmp_fp = warc_tempfile ();
1012   if (warc_tmp_fp == NULL)
1013     {
1014       logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n"));
1015       exit(1);
1016     }
1017   fflush (warc_tmp_fp);
1018   fprintf (warc_tmp_fp, "%s\n", program_argstring);
1019
1020   warc_write_resource_record (manifest_uuid,
1021                               "metadata://gnu.org/software/wget/warc/wget_arguments.txt",
1022                               NULL, NULL, NULL, "text/plain",
1023                               warc_tmp_fp, -1);
1024   /* warc_write_resource_record has closed warc_tmp_fp. */
1025
1026   if (warc_log_fp != NULL)
1027     {
1028       warc_write_resource_record (NULL,
1029                                   "metadata://gnu.org/software/wget/warc/wget.log",
1030                                   NULL, manifest_uuid, NULL, "text/plain",
1031                                   warc_log_fp, -1);
1032       /* warc_write_resource_record has closed warc_log_fp. */
1033
1034       warc_log_fp = NULL;
1035       log_set_warc_log_fp (NULL);
1036     }
1037 }
1038
1039 /* Finishes the WARC writing.
1040    This should be called at the end of the program. */
1041 void
1042 warc_close ()
1043 {
1044   if (warc_current_file != NULL)
1045     {
1046       warc_write_metadata ();
1047       free (warc_current_warcinfo_uuid_str);
1048       fclose (warc_current_file);
1049     }
1050   if (warc_current_cdx_file != NULL)
1051     fclose (warc_current_cdx_file);
1052   if (warc_log_fp != NULL)
1053     {
1054       fclose (warc_log_fp);
1055       log_set_warc_log_fp (NULL);
1056     }
1057 }
1058
1059 /* Creates a temporary file for writing WARC output.
1060    The temporary file will be created in opt.warc_tempdir.
1061    Returns the pointer to the temporary file, or NULL. */
1062 FILE *
1063 warc_tempfile ()
1064 {
1065   char filename[100];
1066   if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1)
1067     return NULL;
1068
1069   int fd = mkstemp (filename);
1070   if (fd < 0)
1071     return NULL;
1072
1073   if (unlink (filename) < 0)
1074     return NULL;
1075
1076   return fdopen (fd, "wb+");
1077 }
1078
1079
1080 /* Writes a request record to the WARC file.
1081    url  is the target uri of the request,
1082    timestamp_str  is the timestamp of the request (generated with warc_timestamp),
1083    record_uuid  is the uuid of the request (generated with warc_uuid_str),
1084    body  is a pointer to a file containing the request headers and body.
1085    ip  is the ip address of the server (or NULL),
1086    Calling this function will close body.
1087    Returns true on success, false on error. */
1088 bool
1089 warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, ip_address *ip, FILE *body, long int payload_offset)
1090 {
1091   warc_write_start_record ();
1092   warc_write_header ("WARC-Type", "request");
1093   warc_write_header ("WARC-Target-URI", url);
1094   warc_write_header ("Content-Type", "application/http;msgtype=request");
1095   warc_write_date_header (timestamp_str);
1096   warc_write_header ("WARC-Record-ID", record_uuid);
1097   warc_write_ip_header (ip);
1098   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1099   warc_write_digest_headers (body, payload_offset);
1100   warc_write_block_from_file (body);
1101   warc_write_end_record ();
1102
1103   fclose (body);
1104
1105   return warc_write_ok;
1106 }
1107
1108 /* Writes a response record to the CDX file.
1109    url  is the target uri of the request/response,
1110    timestamp_str  is the timestamp of the request that generated this response,
1111                   (generated with warc_timestamp),
1112    mime_type  is the mime type of the response body (will be printed to CDX),
1113    response_code  is the HTTP response code (will be printed to CDX),
1114    payload_digest  is the sha1 digest of the payload,
1115    redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
1116    offset  is the position of the WARC record in the WARC file,
1117    warc_filename  is the filename of the WARC,
1118    response_uuid  is the uuid of the response.
1119    Returns true on success, false on error. */
1120 static bool
1121 warc_write_cdx_record (char *url, char *timestamp_str, char *mime_type, int response_code, char *payload_digest, char *redirect_location, size_t offset, char *warc_filename, char *response_uuid)
1122 {
1123   /* Transform the timestamp. */
1124   char timestamp_str_cdx [15];
1125   memcpy (timestamp_str_cdx     , timestamp_str     , 4); /* "YYYY" "-" */
1126   memcpy (timestamp_str_cdx +  4, timestamp_str +  5, 2); /* "mm"   "-" */
1127   memcpy (timestamp_str_cdx +  6, timestamp_str +  8, 2); /* "dd"   "T" */
1128   memcpy (timestamp_str_cdx +  8, timestamp_str + 11, 2); /* "HH"   ":" */
1129   memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM"   ":" */
1130   memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS"   "Z" */
1131   timestamp_str_cdx[14] = '\0';
1132
1133   /* Rewrite the checksum. */
1134   char *checksum;
1135   if (payload_digest != NULL)
1136     checksum = payload_digest + 5; /* Skip the "sha1:" */
1137   else
1138     checksum = "-";
1139
1140   if (mime_type == NULL || strlen(mime_type) == 0)
1141     mime_type = "-";
1142   if (redirect_location == NULL || strlen(redirect_location) == 0)
1143     redirect_location = "-";
1144
1145   /* Print the CDX line. */
1146   fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %ld %s %s\n", url, timestamp_str_cdx, url, mime_type, response_code, checksum, redirect_location, offset, warc_current_filename, response_uuid);
1147   fflush (warc_current_cdx_file);
1148
1149   return true;
1150 }
1151
1152 /* Writes a revisit record to the WARC file.
1153    url  is the target uri of the request/response,
1154    timestamp_str  is the timestamp of the request that generated this response
1155                   (generated with warc_timestamp),
1156    concurrent_to_uuid  is the uuid of the request for that generated this response
1157                  (generated with warc_uuid_str),
1158    refers_to_uuid  is the uuid of the original response
1159                  (generated with warc_uuid_str),
1160    payload_digest  is the sha1 digest of the payload,
1161    ip  is the ip address of the server (or NULL),
1162    body  is a pointer to a file containing the response headers (without payload).
1163    Calling this function will close body.
1164    Returns true on success, false on error. */
1165 static bool
1166 warc_write_revisit_record (char *url, char *timestamp_str, char *concurrent_to_uuid, char *payload_digest, char *refers_to, ip_address *ip, FILE *body)
1167 {
1168   char revisit_uuid [48];
1169   warc_uuid_str (revisit_uuid);
1170
1171   char *block_digest = NULL;
1172   char sha1_res_block[SHA1_DIGEST_SIZE];
1173   sha1_stream (body, sha1_res_block);
1174   block_digest = warc_base32_sha1_digest (sha1_res_block);
1175
1176   warc_write_start_record ();
1177   warc_write_header ("WARC-Type", "revisit");
1178   warc_write_header ("WARC-Record-ID", revisit_uuid);
1179   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1180   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1181   warc_write_header ("WARC-Refers-To", refers_to);
1182   warc_write_header ("WARC-Profile", "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest");
1183   warc_write_header ("WARC-Truncated", "length");
1184   warc_write_header ("WARC-Target-URI", url);
1185   warc_write_date_header (timestamp_str);
1186   warc_write_ip_header (ip);
1187   warc_write_header ("Content-Type", "application/http;msgtype=response");
1188   warc_write_header ("WARC-Block-Digest", block_digest);
1189   warc_write_header ("WARC-Payload-Digest", payload_digest);
1190   warc_write_block_from_file (body);
1191   warc_write_end_record ();
1192
1193   fclose (body);
1194   free (block_digest);
1195
1196   return warc_write_ok;
1197 }
1198
1199 /* Writes a response record to the WARC file.
1200    url  is the target uri of the request/response,
1201    timestamp_str  is the timestamp of the request that generated this response
1202                   (generated with warc_timestamp),
1203    concurrent_to_uuid  is the uuid of the request for that generated this response
1204                  (generated with warc_uuid_str),
1205    ip  is the ip address of the server (or NULL),
1206    body  is a pointer to a file containing the response headers and body.
1207    mime_type  is the mime type of the response body (will be printed to CDX),
1208    response_code  is the HTTP response code (will be printed to CDX),
1209    redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
1210    Calling this function will close body.
1211    Returns true on success, false on error. */
1212 bool
1213 warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, long int payload_offset, char *mime_type, int response_code, char *redirect_location)
1214 {
1215   char *block_digest = NULL;
1216   char *payload_digest = NULL;
1217   char sha1_res_block[SHA1_DIGEST_SIZE];
1218   char sha1_res_payload[SHA1_DIGEST_SIZE];
1219
1220   if (opt.warc_digests_enabled)
1221     {
1222       /* Calculate the block and payload digests. */
1223       rewind (body);
1224       if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload, payload_offset) == 0)
1225         {
1226           /* Decide (based on url + payload digest) if we have seen this
1227              data before. */
1228           struct warc_cdx_record *rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload);
1229           if (rec_existing != NULL)
1230             {
1231               /* Found an existing record. */
1232               logprintf (LOG_VERBOSE, _("Found exact match in CDX file. Saving revisit record to WARC.\n"));
1233
1234               /* Remove the payload from the file. */
1235               if (payload_offset > 0)
1236                 {
1237                   if (ftruncate (fileno (body), payload_offset) == -1)
1238                     return false;
1239                 }
1240
1241               /* Send the original payload digest. */
1242               payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1243               bool result = warc_write_revisit_record (url, timestamp_str, concurrent_to_uuid, payload_digest, rec_existing->uuid, ip, body);
1244               free (payload_digest);
1245
1246               return result;
1247             }
1248
1249           block_digest = warc_base32_sha1_digest (sha1_res_block);
1250           payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1251         }
1252     }
1253
1254   /* Not a revisit, just store the record. */
1255
1256   char response_uuid [48];
1257   warc_uuid_str (response_uuid);
1258
1259   fseek (warc_current_file, 0L, SEEK_END);
1260   size_t offset = ftell (warc_current_file);
1261
1262   warc_write_start_record ();
1263   warc_write_header ("WARC-Type", "response");
1264   warc_write_header ("WARC-Record-ID", response_uuid);
1265   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1266   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1267   warc_write_header ("WARC-Target-URI", url);
1268   warc_write_date_header (timestamp_str);
1269   warc_write_ip_header (ip);
1270   warc_write_header ("WARC-Block-Digest", block_digest);
1271   warc_write_header ("WARC-Payload-Digest", payload_digest);
1272   warc_write_header ("Content-Type", "application/http;msgtype=response");
1273   warc_write_block_from_file (body);
1274   warc_write_end_record ();
1275
1276   fclose (body);
1277
1278   if (warc_write_ok && opt.warc_cdx_enabled)
1279     {
1280       /* Add this record to the CDX. */
1281       warc_write_cdx_record (url, timestamp_str, mime_type, response_code, payload_digest, redirect_location, offset, warc_current_filename, response_uuid);
1282     }
1283
1284   if (block_digest)
1285     free (block_digest);
1286   if (payload_digest)
1287     free (payload_digest);
1288
1289   return warc_write_ok;
1290 }
1291
1292 /* Writes a resource record to the WARC file.
1293    resource_uuid  is the uuid of the resource (or NULL),
1294    url  is the target uri of the resource,
1295    timestamp_str  is the timestamp (generated with warc_timestamp),
1296    concurrent_to_uuid  is the uuid of the request for that generated this resource
1297                  (generated with warc_uuid_str) or NULL,
1298    ip  is the ip address of the server (or NULL),
1299    content_type  is the mime type of the body (or NULL),
1300    body  is a pointer to a file containing the resource data.
1301    Calling this function will close body.
1302    Returns true on success, false on error. */
1303 bool
1304 warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, long int payload_offset)
1305 {
1306   if (resource_uuid == NULL)
1307     {
1308       resource_uuid = alloca (48);
1309       warc_uuid_str (resource_uuid);
1310     }
1311
1312   if (content_type == NULL)
1313     content_type = "application/octet-stream";
1314
1315   warc_write_start_record ();
1316   warc_write_header ("WARC-Type", "resource");
1317   warc_write_header ("WARC-Record-ID", resource_uuid);
1318   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1319   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1320   warc_write_header ("WARC-Target-URI", url);
1321   warc_write_date_header (timestamp_str);
1322   warc_write_ip_header (ip);
1323   warc_write_digest_headers (body, payload_offset);
1324   warc_write_header ("Content-Type", content_type);
1325   warc_write_block_from_file (body);
1326   warc_write_end_record ();
1327
1328   fclose (body);
1329
1330   return warc_write_ok;
1331 }
1332