sjero.net Git - wget/blob - src/warc.c

   1 /* Utility functions for writing WARC files. */
   2 #define _GNU_SOURCE
   3
   4 #include "wget.h"
   5 #include "hash.h"
   6 #include "utils.h"
   7
   8 #include <stdio.h>
   9 #include <stdlib.h>
  10 #include <string.h>
  11 #include <strings.h>
  12 #include <time.h>
  13 #include <tmpdir.h>
  14 #include <sha1.h>
  15 #include <base32.h>
  16 #include <unistd.h>
  17 #include <zlib.h>
  18 #ifdef HAVE_LIBUUID
  19 #include <uuid/uuid.h>
  20 #endif
  21
  22 #ifndef WINDOWS
  23 #include <libgen.h>
  24 #endif
  25
  26 #include "warc.h"
  27
  28 extern char *version_string;
  29
  30 /* Set by main in main.c */
  31 extern char *program_argstring;
  32
  33
  34 /* The log file (a temporary file that contains a copy
  35    of the wget log). */
  36 static FILE *warc_log_fp;
  37
  38 /* The manifest file (a temporary file that contains the
  39    warcinfo uuid of every file in this crawl). */
  40 static FILE *warc_manifest_fp;
  41
  42 /* The current WARC file (or NULL, if WARC is disabled). */
  43 static FILE *warc_current_file;
  44
  45 /* The gzip stream for the current WARC file
  46    (or NULL, if WARC or gzip is disabled). */
  47 static gzFile *warc_current_gzfile;
  48
  49 /* The offset of the current gzip record in the WARC file. */
  50 static size_t warc_current_gzfile_offset;
  51
  52 /* The uncompressed size (so far) of the current record. */
  53 static size_t warc_current_gzfile_uncompressed_size;
  54
  55 /* This is true until a warc_write_* method fails. */
  56 static bool warc_write_ok;
  57
  58 /* The current CDX file (or NULL, if CDX is disabled). */
  59 static FILE *warc_current_cdx_file;
  60
  61 /* The record id of the warcinfo record of the current WARC file.  */
  62 static char *warc_current_warcinfo_uuid_str;
  63
  64 /* The file name of the current WARC file. */
  65 static char *warc_current_filename;
  66
  67 /* The serial number of the current WARC file.  This number is
  68    incremented each time a new file is opened and is used in the
  69    WARC file's filename. */
  70 static int warc_current_file_number;
  71
  72 /* The table of CDX records, if deduplication is enabled. */
  73 struct hash_table * warc_cdx_dedup_table;
  74
  75 static bool warc_start_new_file (bool meta);
  76
  77
/* One entry of the CDX deduplication table.  Entries are created
   while loading the CDX file and are stored in warc_cdx_dedup_table
   with the DIGEST member as the hash key.  */
struct warc_cdx_record
{
  char *url;                      /* Original url column of the CDX line (heap copy).  */
  char *uuid;                     /* Record id column of the CDX line (heap copy).  */
  char digest[SHA1_DIGEST_SIZE];  /* Checksum column, base32-decoded to raw bytes.  */
};
  84
  85 static unsigned long
  86 warc_hash_sha1_digest (const void *key)
  87 {
  88   /* We just use some of the first bytes of the digest. */
  89   unsigned long v = 0;
  90   memcpy (&v, key, sizeof (unsigned long));
  91   return v;
  92 }
  93
  94 static int
  95 warc_cmp_sha1_digest (const void *digest1, const void *digest2)
  96 {
  97   return !memcmp (digest1, digest2, SHA1_DIGEST_SIZE);
  98 }
  99
 100
 101
 102 /* Writes SIZE bytes from BUFFER to the current WARC file,
 103    through gzwrite if compression is enabled.
 104    Returns the number of uncompressed bytes written.  */
 105 static size_t
 106 warc_write_buffer (const char *buffer, size_t size)
 107 {
 108   if (warc_current_gzfile)
 109     {
 110       warc_current_gzfile_uncompressed_size += size;
 111       return gzwrite (warc_current_gzfile, buffer, size);
 112     }
 113   else
 114     return fwrite (buffer, 1, size, warc_current_file);
 115 }
 116
 117 /* Writes STR to the current WARC file.
 118    Returns false and set warc_write_ok to false if there
 119    is an error.  */
 120 static bool
 121 warc_write_string (const char *str)
 122 {
 123   if (!warc_write_ok)
 124     return false;
 125
 126   size_t n = strlen (str);
 127   if (n != warc_write_buffer (str, n))
 128     warc_write_ok = false;
 129
 130   return warc_write_ok;
 131 }
 132
 133
 134 #define EXTRA_GZIP_HEADER_SIZE 12
 135 #define GZIP_STATIC_HEADER_SIZE  10
 136 #define FLG_FEXTRA          0x04
 137 #define OFF_FLG             3
 138
 139 /* Starts a new WARC record.  Writes the version header.
 140    If opt.warc_maxsize is set and the current file is becoming
 141    too large, this will open a new WARC file.
 142
 143    If compression is enabled, this will start a new
 144    gzip stream in the current WARC file.
 145
 146    Returns false and set warc_write_ok to false if there
 147    is an error.  */
 148 static bool
 149 warc_write_start_record ()
 150 {
 151   if (!warc_write_ok)
 152     return false;
 153
 154   fflush (warc_current_file);
 155   if (opt.warc_maxsize > 0 && ftell (warc_current_file) >= opt.warc_maxsize)
 156     warc_start_new_file (false);
 157
 158   /* Start a GZIP stream, if required. */
 159   if (opt.warc_compression_enabled)
 160     {
 161       /* Record the starting offset of the new record. */
 162       warc_current_gzfile_offset = ftell (warc_current_file);
 163
 164       /* Reserve space for the extra GZIP header field.
 165          In warc_write_end_record we will fill this space
 166          with information about the uncompressed and
 167          compressed size of the record. */
 168       fprintf (warc_current_file, "XXXXXXXXXXXX");
 169       fflush (warc_current_file);
 170
 171       /* Start a new GZIP stream. */
 172       warc_current_gzfile = gzdopen (dup (fileno (warc_current_file)), "wb9");
 173       warc_current_gzfile_uncompressed_size = 0;
 174
 175       if (warc_current_gzfile == NULL)
 176         {
 177           logprintf (LOG_NOTQUIET, _("Error opening GZIP stream to WARC file.\n"));
 178           warc_write_ok = false;
 179           return false;
 180         }
 181     }
 182
 183   warc_write_string ("WARC/1.0\r\n");
 184   return warc_write_ok;
 185 }
 186
 187 /* Writes a WARC header to the current WARC record.
 188    This method may be run after warc_write_start_record and
 189    before warc_write_block_from_file.  */
 190 static bool
 191 warc_write_header (const char *name, const char *value)
 192 {
 193   if (value)
 194     {
 195       warc_write_string (name);
 196       warc_write_string (": ");
 197       warc_write_string (value);
 198       warc_write_string ("\r\n");
 199     }
 200   return warc_write_ok;
 201 }
 202
 203 /* Copies the contents of DATA_IN to the WARC record.
 204    Adds a Content-Length header to the WARC record.
 205    Run this method after warc_write_header,
 206    then run warc_write_end_record. */
 207 static bool
 208 warc_write_block_from_file (FILE *data_in)
 209 {
 210   /* Add the Content-Length header. */
 211   char *content_length;
 212   fseek (data_in, 0L, SEEK_END);
 213   if (! asprintf (&content_length, "%ld", ftell (data_in)))
 214     {
 215       warc_write_ok = false;
 216       return false;
 217     }
 218   warc_write_header ("Content-Length", content_length);
 219   free (content_length);
 220
 221   /* End of the WARC header section. */
 222   warc_write_string ("\r\n");
 223
 224   if (fseek (data_in, 0L, SEEK_SET) != 0)
 225     warc_write_ok = false;
 226
 227   /* Copy the data in the file to the WARC record. */
 228   char buffer[BUFSIZ];
 229   size_t s;
 230   while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0)
 231     {
 232       if (warc_write_buffer (buffer, s) < s)
 233         warc_write_ok = false;
 234     }
 235
 236   return warc_write_ok;
 237 }
 238
 239 /* Run this method to close the current WARC record.
 240
 241    If compression is enabled, this method closes the
 242    current GZIP stream and fills the extra GZIP header
 243    with the uncompressed and compressed length of the
 244    record. */
 245 static bool
 246 warc_write_end_record ()
 247 {
 248   warc_write_buffer ("\r\n\r\n", 4);
 249
 250   /* We start a new gzip stream for each record.  */
 251   if (warc_write_ok && warc_current_gzfile)
 252     {
 253       if (gzclose (warc_current_gzfile) != Z_OK)
 254         {
 255           warc_write_ok = false;
 256           return false;
 257         }
 258
 259       fflush (warc_current_file);
 260       fseek (warc_current_file, 0, SEEK_END);
 261
 262       /* The WARC standard suggests that we add 'skip length' data in the
 263          extra header field of the GZIP stream.
 264
 265          In warc_write_start_record we reserved space for this extra header.
 266          This extra space starts at warc_current_gzfile_offset and fills
 267          EXTRA_GZIP_HEADER_SIZE bytes.  The static GZIP header starts at
 268          warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE.
 269
 270          We need to do three things:
 271          1. Move the static GZIP header to warc_current_gzfile_offset;
 272          2. Set the FEXTRA flag in the GZIP header;
 273          3. Write the extra GZIP header after the static header, that is,
 274             starting at warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE.
 275       */
 276
 277       /* Calculate the uncompressed and compressed sizes. */
 278       size_t current_offset = ftell (warc_current_file);
 279       size_t uncompressed_size = current_offset - warc_current_gzfile_offset;
 280       size_t compressed_size = warc_current_gzfile_uncompressed_size;
 281
 282       /* Go back to the static GZIP header. */
 283       fseek (warc_current_file, warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
 284
 285       /* Read the header. */
 286       char static_header[GZIP_STATIC_HEADER_SIZE];
 287       size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
 288       if (result != GZIP_STATIC_HEADER_SIZE)
 289         {
 290           warc_write_ok = false;
 291           return false;
 292         }
 293
 294       /* Set the FEXTRA flag in the flags byte of the header. */
 295       static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
 296
 297       /* Write the header back to the file, but starting at warc_current_gzfile_offset. */
 298       fseek (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
 299       fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
 300
 301       /* Prepare the extra GZIP header. */
 302       char extra_header[EXTRA_GZIP_HEADER_SIZE];
 303       /* XLEN, the length of the extra header fields.  */
 304       extra_header[0]  = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
 305       extra_header[1]  = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
 306       /* The extra header field identifier for the WARC skip length. */
 307       extra_header[2]  = 's';
 308       extra_header[3]  = 'l';
 309       /* The size of the uncompressed record.  */
 310       extra_header[4]  = (uncompressed_size & 255);
 311       extra_header[5]  = (uncompressed_size >> 8) & 255;
 312       extra_header[6]  = (uncompressed_size >> 16) & 255;
 313       extra_header[7]  = (uncompressed_size >> 24) & 255;
 314       /* The size of the compressed record.  */
 315       extra_header[8]  = (compressed_size & 255);
 316       extra_header[9]  = (compressed_size >> 8) & 255;
 317       extra_header[10] = (compressed_size >> 16) & 255;
 318       extra_header[11] = (compressed_size >> 24) & 255;
 319
 320       /* Write the extra header after the static header. */
 321       fseek (warc_current_file, warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
 322       fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
 323
 324       /* Done, move back to the end of the file. */
 325       fflush (warc_current_file);
 326       fseek (warc_current_file, 0, SEEK_END);
 327     }
 328
 329   return warc_write_ok;
 330 }
 331
 332
 333 /* Writes the WARC-Date header for the given timestamp to
 334    the current WARC record.
 335    If timestamp is NULL, the current time will be used.  */
 336 static bool
 337 warc_write_date_header (char *timestamp)
 338 {
 339   if (timestamp == NULL)
 340     {
 341       char current_timestamp[21];
 342       warc_timestamp (current_timestamp);
 343       timestamp = current_timestamp;
 344     }
 345   return warc_write_header ("WARC-Date", timestamp);
 346 }
 347
 348 /* Writes the WARC-IP-Address header for the given IP to
 349    the current WARC record.  If IP is NULL, no header will
 350    be written.  */
 351 static bool
 352 warc_write_ip_header (ip_address *ip)
 353 {
 354   if (ip != NULL)
 355     return warc_write_header ("WARC-IP-Address", print_address (ip));
 356   else
 357     return warc_write_ok;
 358 }
 359
 360
 361 /* warc_sha1_stream_with_payload is a modified copy of sha1_stream
 362    from gnulib/sha1.c.  This version calculates two digests in one go.
 363
 364    Compute SHA1 message digests for bytes read from STREAM.  The
 365    digest of the complete file will be written into the 16 bytes
 366    beginning at RES_BLOCK.
 367
 368    If payload_offset >= 0, a second digest will be calculated of the
 369    portion of the file starting at payload_offset and continuing to
 370    the end of the file.  The digest number will be written into the
 371    16 bytes beginning ad RES_PAYLOAD.  */
 372 static int
 373 warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, long int payload_offset)
 374 {
 375 #define BLOCKSIZE 32768
 376
 377   struct sha1_ctx ctx_block;
 378   struct sha1_ctx ctx_payload;
 379   long int pos;
 380   size_t sum;
 381
 382   char *buffer = malloc (BLOCKSIZE + 72);
 383   if (!buffer)
 384     return 1;
 385
 386   /* Initialize the computation context.  */
 387   sha1_init_ctx (&ctx_block);
 388   if (payload_offset >= 0)
 389     sha1_init_ctx (&ctx_payload);
 390
 391   pos = 0;
 392
 393   /* Iterate over full file contents.  */
 394   while (1)
 395     {
 396       /* We read the file in blocks of BLOCKSIZE bytes.  One call of the
 397          computation function processes the whole buffer so that with the
 398          next round of the loop another block can be read.  */
 399       size_t n;
 400       sum = 0;
 401
 402       /* Read block.  Take care for partial reads.  */
 403       while (1)
 404         {
 405           n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);
 406
 407           sum += n;
 408           pos += n;
 409
 410           if (sum == BLOCKSIZE)
 411             break;
 412
 413           if (n == 0)
 414             {
 415               /* Check for the error flag IFF N == 0, so that we don't
 416                  exit the loop after a partial read due to e.g., EAGAIN
 417                  or EWOULDBLOCK.  */
 418               if (ferror (stream))
 419                 {
 420                   free (buffer);
 421                   return 1;
 422                 }
 423               goto process_partial_block;
 424             }
 425
 426           /* We've read at least one byte, so ignore errors.  But always
 427              check for EOF, since feof may be true even though N > 0.
 428              Otherwise, we could end up calling fread after EOF.  */
 429           if (feof (stream))
 430             goto process_partial_block;
 431         }
 432
 433       /* Process buffer with BLOCKSIZE bytes.  Note that
 434                         BLOCKSIZE % 64 == 0
 435        */
 436       sha1_process_block (buffer, BLOCKSIZE, &ctx_block);
 437       if (payload_offset >= 0 && payload_offset < pos)
 438         {
 439           /* At least part of the buffer contains data from payload. */
 440           int start_of_payload = payload_offset - (pos - BLOCKSIZE);
 441           if (start_of_payload <= 0)
 442             /* All bytes in the buffer belong to the payload. */
 443             start_of_payload = 0;
 444
 445           /* Process the payload part of the buffer.
 446              Note: we can't use  sha1_process_block  here even if we
 447              process the complete buffer.  Because the payload doesn't
 448              have to start with a full block, there may still be some
 449              bytes left from the previous buffer.  Therefore, we need
 450              to continue with  sha1_process_bytes.  */
 451           sha1_process_bytes (buffer + start_of_payload, BLOCKSIZE - start_of_payload, &ctx_payload);
 452         }
 453     }
 454
 455  process_partial_block:;
 456
 457   /* Process any remaining bytes.  */
 458   if (sum > 0)
 459     {
 460       sha1_process_bytes (buffer, sum, &ctx_block);
 461       if (payload_offset >= 0 && payload_offset < pos)
 462         {
 463           /* At least part of the buffer contains data from payload. */
 464           int start_of_payload = payload_offset - (pos - sum);
 465           if (start_of_payload <= 0)
 466             /* All bytes in the buffer belong to the payload. */
 467             start_of_payload = 0;
 468
 469           /* Process the payload part of the buffer. */
 470           sha1_process_bytes (buffer + start_of_payload, sum - start_of_payload, &ctx_payload);
 471         }
 472     }
 473
 474   /* Construct result in desired memory.  */
 475   sha1_finish_ctx (&ctx_block,   res_block);
 476   if (payload_offset >= 0)
 477     sha1_finish_ctx (&ctx_payload, res_payload);
 478   free (buffer);
 479   return 0;
 480
 481 #undef BLOCKSIZE
 482 }
 483
 484 /* Converts the SHA1 digest to a base32-encoded string.
 485    "sha1:DIGEST\0"  (Allocates a new string for the response.)  */
 486 static char *
 487 warc_base32_sha1_digest (char *sha1_digest)
 488 {
 489   // length: "sha1:" + digest + "\0"
 490   char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 );
 491   base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5, BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1);
 492   memcpy (sha1_base32, "sha1:", 5);
 493   sha1_base32[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5] = '\0';
 494   return sha1_base32;
 495 }
 496
 497
 498 /* Sets the digest headers of the record.
 499    This method will calculate the block digest and, if payload_offset >= 0,
 500    will also calculate the payload digest of the payload starting at the
 501    provided offset.  */
 502 static void
 503 warc_write_digest_headers (FILE *file, long payload_offset)
 504 {
 505   if (opt.warc_digests_enabled)
 506     {
 507       /* Calculate the block and payload digests. */
 508       char sha1_res_block[SHA1_DIGEST_SIZE];
 509       char sha1_res_payload[SHA1_DIGEST_SIZE];
 510
 511       rewind (file);
 512       if (warc_sha1_stream_with_payload (file, sha1_res_block, sha1_res_payload, payload_offset) == 0)
 513         {
 514           char *digest;
 515
 516           digest = warc_base32_sha1_digest (sha1_res_block);
 517           warc_write_header ("WARC-Block-Digest", digest);
 518           free (digest);
 519
 520           if (payload_offset >= 0)
 521             {
 522               digest = warc_base32_sha1_digest (sha1_res_payload);
 523               warc_write_header ("WARC-Payload-Digest", digest);
 524               free (digest);
 525             }
 526         }
 527     }
 528 }
 529
 530
 531 /* Fills timestamp with the current time and date.
 532    The UTC time is formatted following ISO 8601, as required
 533    for use in the WARC-Date header.
 534    The timestamp will be 21 characters long. */
 535 void
 536 warc_timestamp (char *timestamp)
 537 {
 538   time_t rawtime;
 539   struct tm * timeinfo;
 540   time ( &rawtime );
 541   timeinfo = gmtime (&rawtime);
 542   strftime (timestamp, 21, "%Y-%m-%dT%H:%M:%SZ", timeinfo);
 543 }
 544
 545 /* Fills uuid_str with a UUID based on random numbers.
 546    (See RFC 4122, UUID version 4.)
 547
 548    Note: this is a fallback method, it is much better to use the
 549    methods provided by libuuid.
 550
 551    The uuid_str will be 36 characters long. */
 552 static void
 553 warc_uuid_random (char *uuid_str)
 554 {
 555   // RFC 4122, a version 4 UUID with only random numbers
 556
 557   unsigned char uuid_data[16];
 558   int i;
 559   for (i=0; i<16; i++)
 560     uuid_data[i] = random_number (255);
 561
 562   // Set the four most significant bits (bits 12 through 15) of the
 563   // time_hi_and_version field to the 4-bit version number
 564   uuid_data[6] = (uuid_data[6] & 0x0F) | 0x40;
 565
 566   // Set the two most significant bits (bits 6 and 7) of the
 567   // clock_seq_hi_and_reserved to zero and one, respectively.
 568   uuid_data[8] = (uuid_data[8] & 0xBF) | 0x80;
 569
 570   sprintf (uuid_str,
 571     "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
 572     uuid_data[0], uuid_data[1], uuid_data[2], uuid_data[3], uuid_data[4],
 573     uuid_data[5], uuid_data[6], uuid_data[7], uuid_data[8], uuid_data[9],
 574     uuid_data[10], uuid_data[11], uuid_data[12], uuid_data[13], uuid_data[14],
 575     uuid_data[15]);
 576 }
 577
 578 /* Fills urn_str with a UUID in the format required
 579    for the WARC-Record-Id header.
 580    The string will be 47 characters long. */
 581 void
 582 warc_uuid_str (char *urn_str)
 583 {
 584   char uuid_str[37];
 585
 586 # ifdef HAVE_LIBUUID
 587   uuid_t record_id;
 588   uuid_generate (record_id);
 589   uuid_unparse (record_id, uuid_str);
 590 # else
 591   warc_uuid_random (uuid_str);
 592 # endif
 593
 594   sprintf (urn_str, "<urn:uuid:%s>", uuid_str);
 595 }
 596
 597 /* Write a warcinfo record to the current file.
 598    Updates warc_current_warcinfo_uuid_str. */
 599 bool
 600 warc_write_warcinfo_record (char *filename)
 601 {
 602   /* Write warc-info record as the first record of the file. */
 603   /* We add the record id of this info record to the other records in the file. */
 604   warc_current_warcinfo_uuid_str = (char *) malloc (48);
 605   warc_uuid_str (warc_current_warcinfo_uuid_str);
 606
 607   char timestamp[22];
 608   warc_timestamp (timestamp);
 609
 610   char *filename_copy, *filename_basename;
 611   filename_copy = strdup (filename);
 612   filename_basename = strdup (basename (filename_copy));
 613
 614   warc_write_start_record ();
 615   warc_write_header ("WARC-Type", "warcinfo");
 616   warc_write_header ("Content-Type", "application/warc-fields");
 617   warc_write_header ("WARC-Date", timestamp);
 618   warc_write_header ("WARC-Record-ID", warc_current_warcinfo_uuid_str);
 619   warc_write_header ("WARC-Filename", filename_basename);
 620
 621   /* Create content.  */
 622   FILE *warc_tmp = warc_tempfile ();
 623   if (warc_tmp == NULL)
 624     {
 625       free (filename_copy);
 626       free (filename_basename);
 627       return false;
 628     }
 629
 630   fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE);
 631   fprintf (warc_tmp, "format: WARC File Format 1.0\r\n");
 632   fprintf (warc_tmp, "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n");
 633   fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off"));
 634   fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring);
 635   /* Add the user headers, if any. */
 636   if (opt.warc_user_headers)
 637     {
 638       int i;
 639       for (i = 0; opt.warc_user_headers[i]; i++)
 640         fprintf (warc_tmp, "%s\r\n", opt.warc_user_headers[i]);
 641     }
 642   fprintf(warc_tmp, "\r\n");
 643
 644   warc_write_digest_headers (warc_tmp, -1);
 645   warc_write_block_from_file (warc_tmp);
 646   warc_write_end_record ();
 647
 648   if (! warc_write_ok)
 649     {
 650       logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
 651     }
 652
 653   free (filename_copy);
 654   free (filename_basename);
 655   fclose (warc_tmp);
 656   return warc_write_ok;
 657 }
 658
 659 /* Opens a new WARC file.
 660    If META is true, generates a filename ending with 'meta.warc.gz'.
 661
 662    This method will:
 663    1. close the current WARC file (if there is one);
 664    2. increment warc_current_file_number;
 665    3. open a new WARC file;
 666    4. write the initial warcinfo record.
 667
 668    Returns true on success, false otherwise.
 669    */
 670 static bool
 671 warc_start_new_file (bool meta)
 672 {
 673   if (opt.warc_filename == NULL)
 674     return false;
 675
 676   if (warc_current_file != NULL)
 677     fclose (warc_current_file);
 678   if (warc_current_warcinfo_uuid_str)
 679     free (warc_current_warcinfo_uuid_str);
 680   if (warc_current_filename)
 681     free (warc_current_filename);
 682
 683   warc_current_file_number++;
 684
 685   int base_filename_length = strlen (opt.warc_filename);
 686   /* filename format:  base + "-" + 5 digit serial number + ".warc.gz" */
 687   char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
 688   warc_current_filename = new_filename;
 689
 690   char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc");
 691
 692   /* If max size is enabled, we add a serial number to the file names. */
 693   if (meta)
 694     sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension);
 695   else if (opt.warc_maxsize > 0)
 696     sprintf (new_filename, "%s-%05d.%s", opt.warc_filename, warc_current_file_number, extension);
 697   else
 698     sprintf (new_filename, "%s.%s", opt.warc_filename, extension);
 699
 700   logprintf (LOG_VERBOSE, _("Opening WARC file %s.\n\n"), quote (new_filename));
 701
 702   /* Open the WARC file. */
 703   warc_current_file = fopen (new_filename, "wb+");
 704   if (warc_current_file == NULL)
 705     {
 706       logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"), quote (new_filename));
 707       return false;
 708     }
 709
 710   if (! warc_write_warcinfo_record (new_filename))
 711     return false;
 712
 713   /* Add warcinfo uuid to manifest. */
 714   if (warc_manifest_fp)
 715     fprintf (warc_manifest_fp, "%s\n", warc_current_warcinfo_uuid_str);
 716
 717   return true;
 718 }
 719
 720 /* Opens the CDX file for output. */
 721 static bool
 722 warc_start_cdx_file ()
 723 {
 724   int filename_length = strlen (opt.warc_filename);
 725   char *cdx_filename = alloca (filename_length + 4 + 1);
 726   memcpy (cdx_filename, opt.warc_filename, filename_length);
 727   memcpy (cdx_filename + filename_length, ".cdx", 5);
 728   warc_current_cdx_file = fopen (cdx_filename, "a+");
 729   if (warc_current_cdx_file == NULL)
 730     return false;
 731
 732   /* Print the CDX header.
 733    *
 734    * a - original url
 735    * b - date
 736    * m - mime type
 737    * s - response code
 738    * k - new style checksum
 739    * r - redirect
 740    * M - meta tags
 741    * V - compressed arc file offset
 742    * g - file name
 743    * u - record-id
 744    */
 745   fprintf (warc_current_cdx_file, " CDX a b a m s k r M V g u\n");
 746   fflush (warc_current_cdx_file);
 747
 748   return true;
 749 }
 750
 751 #define CDX_FIELDSEP " \t\r\n"
 752
 753 /* Parse the CDX header and find the field numbers of the original url,
 754    checksum and record ID fields. */
 755 static bool
 756 warc_parse_cdx_header (char *lineptr, int *field_num_original_url, int *field_num_checksum, int *field_num_record_id)
 757 {
 758   *field_num_original_url = -1;
 759   *field_num_checksum = -1;
 760   *field_num_record_id = -1;
 761
 762   char *token;
 763   char *save_ptr;
 764   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
 765
 766   if (token != NULL && strcmp (token, "CDX") == 0)
 767     {
 768       int field_num = 0;
 769       while (token != NULL)
 770         {
 771           token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
 772           if (token != NULL)
 773             {
 774               switch (token[0])
 775                 {
 776                 case 'a':
 777                   *field_num_original_url = field_num;
 778                   break;
 779                 case 'k':
 780                   *field_num_checksum = field_num;
 781                   break;
 782                 case 'u':
 783                   *field_num_record_id = field_num;
 784                   break;
 785                 }
 786             }
 787           field_num++;
 788         }
 789     }
 790
 791   return *field_num_original_url != -1
 792          && *field_num_checksum != -1
 793          && *field_num_record_id != -1;
 794 }
 795
 796 /* Parse the CDX record and add it to the warc_cdx_dedup_table hash table. */
 797 static void
 798 warc_process_cdx_line (char *lineptr, int field_num_original_url, int field_num_checksum, int field_num_record_id)
 799 {
 800   char *original_url = NULL;
 801   char *checksum = NULL;
 802   char *record_id = NULL;
 803
 804   char *token;
 805   char *save_ptr;
 806   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
 807
 808   /* Read this line to get the fields we need. */
 809   int field_num = 0;
 810   while (token != NULL)
 811     {
 812       char **val;
 813       if (field_num == field_num_original_url)
 814         val = &original_url;
 815       else if (field_num == field_num_checksum)
 816         val = &checksum;
 817       else if (field_num == field_num_record_id)
 818         val = &record_id;
 819       else
 820         val = NULL;
 821
 822       if (val != NULL)
 823         *val = strdup (token);
 824
 825       token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
 826       field_num++;
 827     }
 828
 829   if (original_url != NULL && checksum != NULL && record_id != NULL)
 830     {
 831       /* For some extra efficiency, we decode the base32 encoded
 832          checksum value.  This should produce exactly SHA1_DIGEST_SIZE
 833          bytes.  */
 834       size_t checksum_l;
 835       char * checksum_v;
 836       base32_decode_alloc (checksum, strlen (checksum), &checksum_v, &checksum_l);
 837       free (checksum);
 838
 839       if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
 840         {
 841           /* This is a valid line with a valid checksum. */
 842           struct warc_cdx_record * rec = malloc (sizeof (struct warc_cdx_record));
 843           rec->url = original_url;
 844           rec->uuid = record_id;
 845           memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
 846           hash_table_put (warc_cdx_dedup_table, rec->digest, rec);
 847           free (checksum_v);
 848         }
 849       else
 850         {
 851           free (original_url);
 852           if (checksum_v != NULL)
 853             free (checksum_v);
 854           free (record_id);
 855         }
 856     }
 857 }
 858
 859 /* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
 860    the warc_cdx_dedup_table. */
 861 bool
 862 warc_load_cdx_dedup_file ()
 863 {
 864   FILE *f = fopen (opt.warc_cdx_dedup_filename, "r");
 865   if (f == NULL)
 866     return false;
 867
 868   int field_num_original_url = -1;
 869   int field_num_checksum = -1;
 870   int field_num_record_id = -1;
 871
 872   char *lineptr = NULL;
 873   size_t n = 0;
 874   size_t line_length;
 875
 876   /* The first line should contain the CDX header.
 877      Format:  " CDX x x x x x"
 878      where x are field type indicators.  For our purposes, we only
 879      need 'a' (the original url), 'k' (the SHA1 checksum) and
 880      'u' (the WARC record id). */
 881   line_length = getline (&lineptr, &n, f);
 882   if (line_length != -1)
 883     warc_parse_cdx_header (lineptr, &field_num_original_url, &field_num_checksum, &field_num_record_id);
 884
 885   /* If the file contains all three fields, read the complete file. */
 886   if (field_num_original_url == -1
 887       || field_num_checksum == -1
 888       || field_num_record_id == -1)
 889     {
 890       if (field_num_original_url == -1)
 891         logprintf (LOG_NOTQUIET, _("CDX file does not list original urls. (Missing column 'a'.)\n"));
 892       if (field_num_checksum == -1)
 893         logprintf (LOG_NOTQUIET, _("CDX file does not list checksums. (Missing column 'k'.)\n"));
 894       if (field_num_record_id == -1)
 895         logprintf (LOG_NOTQUIET, _("CDX file does not list record ids. (Missing column 'u'.)\n"));
 896     }
 897   else
 898     {
 899       /* Initialize the table. */
 900       warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest, warc_cmp_sha1_digest);
 901
 902       do
 903         {
 904           line_length = getline (&lineptr, &n, f);
 905           if (line_length != -1)
 906             warc_process_cdx_line (lineptr, field_num_original_url, field_num_checksum, field_num_record_id);
 907
 908         }
 909       while (line_length != -1);
 910
 911       /* Print results. */
 912       int nrecords = hash_table_count (warc_cdx_dedup_table);
 913       logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
 914                                         "Loaded %d records from CDX.\n\n", nrecords),
 915                               nrecords);
 916     }
 917
 918   fclose (f);
 919
 920   return true;
 921 }
 922 #undef CDX_FIELDSEP
 923
 924 /* Returns the existing duplicate CDX record for the given url and payload
 925    digest.  Returns NULL if the url is not found or if the payload digest
 926    does not match, or if CDX deduplication is disabled. */
 927 static struct warc_cdx_record *
 928 warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload)
 929 {
 930   if (warc_cdx_dedup_table == NULL)
 931     return NULL;
 932
 933   char *key;
 934   struct warc_cdx_record *rec_existing;
 935   hash_table_get_pair (warc_cdx_dedup_table, sha1_digest_payload, &key, &rec_existing);
 936
 937   if (rec_existing != NULL && strcmp (rec_existing->url, url) == 0)
 938     return rec_existing;
 939   else
 940     return NULL;
 941 }
 942
 943 /* Initializes the WARC writer (if opt.warc_filename is set).
 944    This should be called before any WARC record is written. */
 945 void
 946 warc_init ()
 947 {
 948   warc_write_ok = true;
 949
 950   if (opt.warc_filename != NULL)
 951     {
 952       if (opt.warc_cdx_dedup_filename != NULL)
 953         {
 954           if (! warc_load_cdx_dedup_file ())
 955             {
 956               logprintf (LOG_NOTQUIET,
 957                          _("Could not read CDX file %s for deduplication.\n"),
 958                          quote (opt.warc_cdx_dedup_filename));
 959               exit(1);
 960             }
 961         }
 962
 963       warc_manifest_fp = warc_tempfile ();
 964       if (warc_manifest_fp == NULL)
 965         {
 966           logprintf (LOG_NOTQUIET, _("Could not open temporary WARC manifest file.\n"));
 967           exit(1);
 968         }
 969
 970       if (opt.warc_keep_log)
 971         {
 972           warc_log_fp = warc_tempfile ();
 973           if (warc_log_fp == NULL)
 974             {
 975               logprintf (LOG_NOTQUIET, _("Could not open temporary WARC log file.\n"));
 976               exit(1);
 977             }
 978           log_set_warc_log_fp (warc_log_fp);
 979         }
 980
 981       warc_current_file_number = -1;
 982       if (! warc_start_new_file (false))
 983         {
 984           logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
 985           exit(1);
 986         }
 987
 988       if (opt.warc_cdx_enabled)
 989         {
 990           if (! warc_start_cdx_file ())
 991             {
 992               logprintf (LOG_NOTQUIET, _("Could not open CDX file for output.\n"));
 993               exit(1);
 994             }
 995         }
 996     }
 997 }
 998
 999 /* Writes metadata (manifest, configuration, log file) to the WARC file. */
1000 void
1001 warc_write_metadata ()
1002 {
1003   /* If there are multiple WARC files, the metadata should be written to a separate file. */
1004   if (opt.warc_maxsize > 0)
1005     warc_start_new_file (true);
1006
1007   char manifest_uuid [48];
1008   warc_uuid_str (manifest_uuid);
1009
1010   fflush (warc_manifest_fp);
1011   warc_write_resource_record (manifest_uuid,
1012                               "metadata://gnu.org/software/wget/warc/MANIFEST.txt",
1013                               NULL, NULL, NULL, "text/plain",
1014                               warc_manifest_fp, -1);
1015   /* warc_write_resource_record has closed warc_manifest_fp. */
1016
1017   FILE * warc_tmp_fp = warc_tempfile ();
1018   if (warc_tmp_fp == NULL)
1019     {
1020       logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n"));
1021       exit(1);
1022     }
1023   fflush (warc_tmp_fp);
1024   fprintf (warc_tmp_fp, "%s\n", program_argstring);
1025
1026   warc_write_resource_record (manifest_uuid,
1027                               "metadata://gnu.org/software/wget/warc/wget_arguments.txt",
1028                               NULL, NULL, NULL, "text/plain",
1029                               warc_tmp_fp, -1);
1030   /* warc_write_resource_record has closed warc_tmp_fp. */
1031
1032   if (warc_log_fp != NULL)
1033     {
1034       warc_write_resource_record (NULL,
1035                                   "metadata://gnu.org/software/wget/warc/wget.log",
1036                                   NULL, manifest_uuid, NULL, "text/plain",
1037                                   warc_log_fp, -1);
1038       /* warc_write_resource_record has closed warc_log_fp. */
1039
1040       warc_log_fp = NULL;
1041       log_set_warc_log_fp (NULL);
1042     }
1043 }
1044
1045 /* Finishes the WARC writing.
1046    This should be called at the end of the program. */
1047 void
1048 warc_close ()
1049 {
1050   if (warc_current_file != NULL)
1051     {
1052       warc_write_metadata ();
1053       free (warc_current_warcinfo_uuid_str);
1054       fclose (warc_current_file);
1055     }
1056   if (warc_current_cdx_file != NULL)
1057     fclose (warc_current_cdx_file);
1058   if (warc_log_fp != NULL)
1059     {
1060       fclose (warc_log_fp);
1061       log_set_warc_log_fp (NULL);
1062     }
1063 }
1064
1065 /* Creates a temporary file for writing WARC output.
1066    The temporary file will be created in opt.warc_tempdir.
1067    Returns the pointer to the temporary file, or NULL. */
1068 FILE *
1069 warc_tempfile ()
1070 {
1071   char filename[100];
1072   if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1)
1073     return NULL;
1074
1075   int fd = mkstemp (filename);
1076   if (fd < 0)
1077     return NULL;
1078
1079   if (unlink (filename) < 0)
1080     return NULL;
1081
1082   return fdopen (fd, "wb+");
1083 }
1084
1085
1086 /* Writes a request record to the WARC file.
1087    url  is the target uri of the request,
1088    timestamp_str  is the timestamp of the request (generated with warc_timestamp),
1089    record_uuid  is the uuid of the request (generated with warc_uuid_str),
1090    body  is a pointer to a file containing the request headers and body.
1091    ip  is the ip address of the server (or NULL),
1092    Calling this function will close body.
1093    Returns true on success, false on error. */
1094 bool
1095 warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, ip_address *ip, FILE *body, long int payload_offset)
1096 {
1097   warc_write_start_record ();
1098   warc_write_header ("WARC-Type", "request");
1099   warc_write_header ("WARC-Target-URI", url);
1100   warc_write_header ("Content-Type", "application/http;msgtype=request");
1101   warc_write_date_header (timestamp_str);
1102   warc_write_header ("WARC-Record-ID", record_uuid);
1103   warc_write_ip_header (ip);
1104   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1105   warc_write_digest_headers (body, payload_offset);
1106   warc_write_block_from_file (body);
1107   warc_write_end_record ();
1108
1109   fclose (body);
1110
1111   return warc_write_ok;
1112 }
1113
1114 /* Writes a response record to the CDX file.
1115    url  is the target uri of the request/response,
1116    timestamp_str  is the timestamp of the request that generated this response,
1117                   (generated with warc_timestamp),
1118    mime_type  is the mime type of the response body (will be printed to CDX),
1119    response_code  is the HTTP response code (will be printed to CDX),
1120    payload_digest  is the sha1 digest of the payload,
1121    redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
1122    offset  is the position of the WARC record in the WARC file,
1123    warc_filename  is the filename of the WARC,
1124    response_uuid  is the uuid of the response.
1125    Returns true on success, false on error. */
1126 static bool
1127 warc_write_cdx_record (char *url, char *timestamp_str, char *mime_type, int response_code, char *payload_digest, char *redirect_location, size_t offset, char *warc_filename, char *response_uuid)
1128 {
1129   /* Transform the timestamp. */
1130   char timestamp_str_cdx [15];
1131   memcpy (timestamp_str_cdx     , timestamp_str     , 4); /* "YYYY" "-" */
1132   memcpy (timestamp_str_cdx +  4, timestamp_str +  5, 2); /* "mm"   "-" */
1133   memcpy (timestamp_str_cdx +  6, timestamp_str +  8, 2); /* "dd"   "T" */
1134   memcpy (timestamp_str_cdx +  8, timestamp_str + 11, 2); /* "HH"   ":" */
1135   memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM"   ":" */
1136   memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS"   "Z" */
1137   timestamp_str_cdx[14] = '\0';
1138
1139   /* Rewrite the checksum. */
1140   char *checksum;
1141   if (payload_digest != NULL)
1142     checksum = payload_digest + 5; /* Skip the "sha1:" */
1143   else
1144     checksum = "-";
1145
1146   if (mime_type == NULL || strlen(mime_type) == 0)
1147     mime_type = "-";
1148   if (redirect_location == NULL || strlen(redirect_location) == 0)
1149     redirect_location = "-";
1150
1151   /* Print the CDX line. */
1152   fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %ld %s %s\n", url, timestamp_str_cdx, url, mime_type, response_code, checksum, redirect_location, offset, warc_current_filename, response_uuid);
1153   fflush (warc_current_cdx_file);
1154
1155   return true;
1156 }
1157
1158 /* Writes a revisit record to the WARC file.
1159    url  is the target uri of the request/response,
1160    timestamp_str  is the timestamp of the request that generated this response
1161                   (generated with warc_timestamp),
1162    concurrent_to_uuid  is the uuid of the request for that generated this response
1163                  (generated with warc_uuid_str),
1164    refers_to_uuid  is the uuid of the original response
1165                  (generated with warc_uuid_str),
1166    payload_digest  is the sha1 digest of the payload,
1167    ip  is the ip address of the server (or NULL),
1168    body  is a pointer to a file containing the response headers (without payload).
1169    Calling this function will close body.
1170    Returns true on success, false on error. */
1171 static bool
1172 warc_write_revisit_record (char *url, char *timestamp_str, char *concurrent_to_uuid, char *payload_digest, char *refers_to, ip_address *ip, FILE *body)
1173 {
1174   char revisit_uuid [48];
1175   warc_uuid_str (revisit_uuid);
1176
1177   char *block_digest = NULL;
1178   char sha1_res_block[SHA1_DIGEST_SIZE];
1179   sha1_stream (body, sha1_res_block);
1180   block_digest = warc_base32_sha1_digest (sha1_res_block);
1181
1182   warc_write_start_record ();
1183   warc_write_header ("WARC-Type", "revisit");
1184   warc_write_header ("WARC-Record-ID", revisit_uuid);
1185   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1186   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1187   warc_write_header ("WARC-Refers-To", refers_to);
1188   warc_write_header ("WARC-Profile", "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest");
1189   warc_write_header ("WARC-Truncated", "length");
1190   warc_write_header ("WARC-Target-URI", url);
1191   warc_write_date_header (timestamp_str);
1192   warc_write_ip_header (ip);
1193   warc_write_header ("Content-Type", "application/http;msgtype=response");
1194   warc_write_header ("WARC-Block-Digest", block_digest);
1195   warc_write_header ("WARC-Payload-Digest", payload_digest);
1196   warc_write_block_from_file (body);
1197   warc_write_end_record ();
1198
1199   fclose (body);
1200   free (block_digest);
1201
1202   return warc_write_ok;
1203 }
1204
1205 /* Writes a response record to the WARC file.
1206    url  is the target uri of the request/response,
1207    timestamp_str  is the timestamp of the request that generated this response
1208                   (generated with warc_timestamp),
1209    concurrent_to_uuid  is the uuid of the request for that generated this response
1210                  (generated with warc_uuid_str),
1211    ip  is the ip address of the server (or NULL),
1212    body  is a pointer to a file containing the response headers and body.
1213    mime_type  is the mime type of the response body (will be printed to CDX),
1214    response_code  is the HTTP response code (will be printed to CDX),
1215    redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
1216    Calling this function will close body.
1217    Returns true on success, false on error. */
1218 bool
1219 warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, long int payload_offset, char *mime_type, int response_code, char *redirect_location)
1220 {
1221   char *block_digest = NULL;
1222   char *payload_digest = NULL;
1223   char sha1_res_block[SHA1_DIGEST_SIZE];
1224   char sha1_res_payload[SHA1_DIGEST_SIZE];
1225
1226   if (opt.warc_digests_enabled)
1227     {
1228       /* Calculate the block and payload digests. */
1229       rewind (body);
1230       if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload, payload_offset) == 0)
1231         {
1232           /* Decide (based on url + payload digest) if we have seen this
1233              data before. */
1234           struct warc_cdx_record *rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload);
1235           if (rec_existing != NULL)
1236             {
1237               /* Found an existing record. */
1238               logprintf (LOG_VERBOSE, _("Found exact match in CDX file. Saving revisit record to WARC.\n"));
1239
1240               /* Remove the payload from the file. */
1241               if (payload_offset > 0)
1242                 {
1243                   if (ftruncate (fileno (body), payload_offset) == -1)
1244                     return false;
1245                 }
1246
1247               /* Send the original payload digest. */
1248               payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1249               bool result = warc_write_revisit_record (url, timestamp_str, concurrent_to_uuid, payload_digest, rec_existing->uuid, ip, body);
1250               free (payload_digest);
1251
1252               return result;
1253             }
1254
1255           block_digest = warc_base32_sha1_digest (sha1_res_block);
1256           payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1257         }
1258     }
1259
1260   /* Not a revisit, just store the record. */
1261
1262   char response_uuid [48];
1263   warc_uuid_str (response_uuid);
1264
1265   fseek (warc_current_file, 0L, SEEK_END);
1266   size_t offset = ftell (warc_current_file);
1267
1268   warc_write_start_record ();
1269   warc_write_header ("WARC-Type", "response");
1270   warc_write_header ("WARC-Record-ID", response_uuid);
1271   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1272   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1273   warc_write_header ("WARC-Target-URI", url);
1274   warc_write_date_header (timestamp_str);
1275   warc_write_ip_header (ip);
1276   warc_write_header ("WARC-Block-Digest", block_digest);
1277   warc_write_header ("WARC-Payload-Digest", payload_digest);
1278   warc_write_header ("Content-Type", "application/http;msgtype=response");
1279   warc_write_block_from_file (body);
1280   warc_write_end_record ();
1281
1282   fclose (body);
1283
1284   if (warc_write_ok && opt.warc_cdx_enabled)
1285     {
1286       /* Add this record to the CDX. */
1287       warc_write_cdx_record (url, timestamp_str, mime_type, response_code, payload_digest, redirect_location, offset, warc_current_filename, response_uuid);
1288     }
1289
1290   if (block_digest)
1291     free (block_digest);
1292   if (payload_digest)
1293     free (payload_digest);
1294
1295   return warc_write_ok;
1296 }
1297
1298 /* Writes a resource record to the WARC file.
1299    resource_uuid  is the uuid of the resource (or NULL),
1300    url  is the target uri of the resource,
1301    timestamp_str  is the timestamp (generated with warc_timestamp),
1302    concurrent_to_uuid  is the uuid of the request for that generated this resource
1303                  (generated with warc_uuid_str) or NULL,
1304    ip  is the ip address of the server (or NULL),
1305    content_type  is the mime type of the body (or NULL),
1306    body  is a pointer to a file containing the resource data.
1307    Calling this function will close body.
1308    Returns true on success, false on error. */
1309 bool
1310 warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, long int payload_offset)
1311 {
1312   if (resource_uuid == NULL)
1313     {
1314       resource_uuid = alloca (48);
1315       warc_uuid_str (resource_uuid);
1316     }
1317
1318   if (content_type == NULL)
1319     content_type = "application/octet-stream";
1320
1321   warc_write_start_record ();
1322   warc_write_header ("WARC-Type", "resource");
1323   warc_write_header ("WARC-Record-ID", resource_uuid);
1324   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1325   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1326   warc_write_header ("WARC-Target-URI", url);
1327   warc_write_date_header (timestamp_str);
1328   warc_write_ip_header (ip);
1329   warc_write_digest_headers (body, payload_offset);
1330   warc_write_header ("Content-Type", content_type);
1331   warc_write_block_from_file (body);
1332   warc_write_end_record ();
1333
1334   fclose (body);
1335
1336   return warc_write_ok;
1337 }
1338