sjero.net Git - wget/blob - src/warc.c

   1 /* Utility functions for writing WARC files.
   2    Copyright (C) 2011, 2012 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Wget.
   5
   6 GNU Wget is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3 of the License, or (at
   9 your option) any later version.
  10
  11 GNU Wget is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  18
  19 Additional permission under GNU GPL version 3 section 7
  20
  21 If you modify this program, or any covered work, by linking or
  22 combining it with the OpenSSL project's OpenSSL library (or a
  23 modified version of that library), containing parts covered by the
  24 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
  25 grants you additional permission to convey the resulting work.
  26 Corresponding Source for a non-source form of such a combination
  27 shall include the source code for the parts of OpenSSL used as well
  28 as that of the covered work.  */
  29
  30 #define _GNU_SOURCE
  31
  32 #include "wget.h"
  33 #include "hash.h"
  34 #include "utils.h"
  35
  36 #include <stdio.h>
  37 #include <stdlib.h>
  38 #include <string.h>
  39 #include <strings.h>
  40 #include <time.h>
  41 #include <tmpdir.h>
  42 #include <sha1.h>
  43 #include <base32.h>
  44 #include <unistd.h>
  45 #ifdef HAVE_LIBZ
  46 #include <zlib.h>
  47 #endif
  48 #ifdef HAVE_LIBUUID
  49 #include <uuid/uuid.h>
  50 #endif
  51
  52 #ifndef WINDOWS
  53 #include <libgen.h>
  54 #endif
  55
  56 #include "warc.h"
  57
/* The Wget version string; defined outside this file.  It is written
   into the "software:" line of each warcinfo record. */
extern char *version_string;

/* Set by main in main.c */
extern char *program_argstring;


/* The log file (a temporary file that contains a copy
   of the wget log). */
static FILE *warc_log_fp;

/* The manifest file (a temporary file that contains the
   warcinfo uuid of every file in this crawl). */
static FILE *warc_manifest_fp;

/* The current WARC file (or NULL, if WARC is disabled). */
static FILE *warc_current_file;

#ifdef HAVE_LIBZ
/* The gzip stream for the current WARC file
   (or NULL, if WARC or gzip is disabled). */
static gzFile warc_current_gzfile;

/* The offset of the current gzip record in the WARC file. */
static off_t warc_current_gzfile_offset;

/* The uncompressed size (so far) of the current record. */
static off_t warc_current_gzfile_uncompressed_size;
# endif

/* This is true until a warc_write_* method fails. */
static bool warc_write_ok;

/* The current CDX file (or NULL, if CDX is disabled). */
static FILE *warc_current_cdx_file;

/* The record id of the warcinfo record of the current WARC file.  */
static char *warc_current_warcinfo_uuid_str;

/* The file name of the current WARC file. */
static char *warc_current_filename;

/* The serial number of the current WARC file.  This number is
   incremented each time a new file is opened and is used in the
   WARC file's filename. */
static int warc_current_file_number;

/* The table of CDX records, if deduplication is enabled.
   Keys are the raw SHA1 digests stored inside the records. */
struct hash_table * warc_cdx_dedup_table;

/* Forward declaration: warc_write_start_record calls this when the
   current file has reached opt.warc_maxsize. */
static bool warc_start_new_file (bool meta);
 108
 109
/* One entry of the CDX deduplication table, built from a single line
   of a CDX file by warc_process_cdx_line. */
struct warc_cdx_record
{
  char *url;                      /* Copy of the original-url field. */
  char *uuid;                     /* Copy of the record-id field. */
  char digest[SHA1_DIGEST_SIZE];  /* Base32-decoded checksum field;
                                     also used as the hash key. */
};
 116
 117 static unsigned long
 118 warc_hash_sha1_digest (const void *key)
 119 {
 120   /* We just use some of the first bytes of the digest. */
 121   unsigned long v = 0;
 122   memcpy (&v, key, sizeof (unsigned long));
 123   return v;
 124 }
 125
 126 static int
 127 warc_cmp_sha1_digest (const void *digest1, const void *digest2)
 128 {
 129   return !memcmp (digest1, digest2, SHA1_DIGEST_SIZE);
 130 }
 131
 132
 133
 134 /* Writes SIZE bytes from BUFFER to the current WARC file,
 135    through gzwrite if compression is enabled.
 136    Returns the number of uncompressed bytes written.  */
 137 static size_t
 138 warc_write_buffer (const char *buffer, size_t size)
 139 {
 140 #ifdef HAVE_LIBZ
 141   if (warc_current_gzfile)
 142     {
 143       warc_current_gzfile_uncompressed_size += size;
 144       return gzwrite (warc_current_gzfile, buffer, size);
 145     }
 146   else
 147 #endif
 148     return fwrite (buffer, 1, size, warc_current_file);
 149 }
 150
 151 /* Writes STR to the current WARC file.
 152    Returns false and set warc_write_ok to false if there
 153    is an error.  */
 154 static bool
 155 warc_write_string (const char *str)
 156 {
 157   if (!warc_write_ok)
 158     return false;
 159
 160   size_t n = strlen (str);
 161   if (n != warc_write_buffer (str, n))
 162     warc_write_ok = false;
 163
 164   return warc_write_ok;
 165 }
 166
 167
/* Number of bytes reserved in front of each gzip member for the extra
   header that warc_write_end_record fills in: 2 bytes XLEN, 2 bytes
   field id ("sl"), 2 bytes field length and 8 bytes of size data. */
#define EXTRA_GZIP_HEADER_SIZE 14
/* Number of bytes of the fixed gzip header that warc_write_end_record
   moves in front of the extra header. */
#define GZIP_STATIC_HEADER_SIZE  10
/* Bit set in the gzip flags byte to announce an extra header field. */
#define FLG_FEXTRA          0x04
/* Index of the flags byte inside the fixed gzip header. */
#define OFF_FLG             3
 172
 173 /* Starts a new WARC record.  Writes the version header.
 174    If opt.warc_maxsize is set and the current file is becoming
 175    too large, this will open a new WARC file.
 176
 177    If compression is enabled, this will start a new
 178    gzip stream in the current WARC file.
 179
 180    Returns false and set warc_write_ok to false if there
 181    is an error.  */
 182 static bool
 183 warc_write_start_record (void)
 184 {
 185   if (!warc_write_ok)
 186     return false;
 187
 188   fflush (warc_current_file);
 189   if (opt.warc_maxsize > 0 && ftello (warc_current_file) >= opt.warc_maxsize)
 190     warc_start_new_file (false);
 191
 192 #ifdef HAVE_LIBZ
 193   /* Start a GZIP stream, if required. */
 194   if (opt.warc_compression_enabled)
 195     {
 196       /* Record the starting offset of the new record. */
 197       warc_current_gzfile_offset = ftello (warc_current_file);
 198
 199       /* Reserve space for the extra GZIP header field.
 200          In warc_write_end_record we will fill this space
 201          with information about the uncompressed and
 202          compressed size of the record. */
 203       fseek (warc_current_file, EXTRA_GZIP_HEADER_SIZE, SEEK_CUR);
 204       fflush (warc_current_file);
 205
 206       /* Start a new GZIP stream. */
 207       warc_current_gzfile = gzdopen (dup (fileno (warc_current_file)), "wb9");
 208       warc_current_gzfile_uncompressed_size = 0;
 209
 210       if (warc_current_gzfile == NULL)
 211         {
 212           logprintf (LOG_NOTQUIET,
 213 _("Error opening GZIP stream to WARC file.\n"));
 214           warc_write_ok = false;
 215           return false;
 216         }
 217     }
 218 #endif
 219
 220   warc_write_string ("WARC/1.0\r\n");
 221   return warc_write_ok;
 222 }
 223
 224 /* Writes a WARC header to the current WARC record.
 225    This method may be run after warc_write_start_record and
 226    before warc_write_block_from_file.  */
 227 static bool
 228 warc_write_header (const char *name, const char *value)
 229 {
 230   if (value)
 231     {
 232       warc_write_string (name);
 233       warc_write_string (": ");
 234       warc_write_string (value);
 235       warc_write_string ("\r\n");
 236     }
 237   return warc_write_ok;
 238 }
 239
 240 /* Copies the contents of DATA_IN to the WARC record.
 241    Adds a Content-Length header to the WARC record.
 242    Run this method after warc_write_header,
 243    then run warc_write_end_record. */
 244 static bool
 245 warc_write_block_from_file (FILE *data_in)
 246 {
 247   /* Add the Content-Length header. */
 248   char content_length[MAX_INT_TO_STRING_LEN(off_t)];
 249   fseeko (data_in, 0L, SEEK_END);
 250   number_to_string (content_length, ftello (data_in));
 251   warc_write_header ("Content-Length", content_length);
 252
 253   /* End of the WARC header section. */
 254   warc_write_string ("\r\n");
 255
 256   if (fseeko (data_in, 0L, SEEK_SET) != 0)
 257     warc_write_ok = false;
 258
 259   /* Copy the data in the file to the WARC record. */
 260   char buffer[BUFSIZ];
 261   size_t s;
 262   while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0)
 263     {
 264       if (warc_write_buffer (buffer, s) < s)
 265         warc_write_ok = false;
 266     }
 267
 268   return warc_write_ok;
 269 }
 270
 271 /* Run this method to close the current WARC record.
 272
 273    If compression is enabled, this method closes the
 274    current GZIP stream and fills the extra GZIP header
 275    with the uncompressed and compressed length of the
 276    record. */
 277 static bool
 278 warc_write_end_record (void)
 279 {
 280   warc_write_buffer ("\r\n\r\n", 4);
 281
 282 #ifdef HAVE_LIBZ
 283   /* We start a new gzip stream for each record.  */
 284   if (warc_write_ok && warc_current_gzfile)
 285     {
 286       if (gzclose (warc_current_gzfile) != Z_OK)
 287         {
 288           warc_write_ok = false;
 289           return false;
 290         }
 291
 292       fflush (warc_current_file);
 293       fseeko (warc_current_file, 0, SEEK_END);
 294
 295       /* The WARC standard suggests that we add 'skip length' data in the
 296          extra header field of the GZIP stream.
 297
 298          In warc_write_start_record we reserved space for this extra header.
 299          This extra space starts at warc_current_gzfile_offset and fills
 300          EXTRA_GZIP_HEADER_SIZE bytes.  The static GZIP header starts at
 301          warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE.
 302
 303          We need to do three things:
 304          1. Move the static GZIP header to warc_current_gzfile_offset;
 305          2. Set the FEXTRA flag in the GZIP header;
 306          3. Write the extra GZIP header after the static header, that is,
 307             starting at warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE.
 308       */
 309
 310       /* Calculate the uncompressed and compressed sizes. */
 311       off_t current_offset = ftello (warc_current_file);
 312       off_t uncompressed_size = current_offset - warc_current_gzfile_offset;
 313       off_t compressed_size = warc_current_gzfile_uncompressed_size;
 314
 315       /* Go back to the static GZIP header. */
 316       fseeko (warc_current_file, warc_current_gzfile_offset
 317               + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
 318
 319       /* Read the header. */
 320       char static_header[GZIP_STATIC_HEADER_SIZE];
 321       size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE,
 322                              warc_current_file);
 323       if (result != GZIP_STATIC_HEADER_SIZE)
 324         {
 325           warc_write_ok = false;
 326           return false;
 327         }
 328
 329       /* Set the FEXTRA flag in the flags byte of the header. */
 330       static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
 331
 332       /* Write the header back to the file, but starting at
 333          warc_current_gzfile_offset. */
 334       fseeko (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
 335       fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
 336
 337       /* Prepare the extra GZIP header. */
 338       char extra_header[EXTRA_GZIP_HEADER_SIZE];
 339       /* XLEN, the length of the extra header fields.  */
 340       extra_header[0]  = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
 341       extra_header[1]  = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
 342       /* The extra header field identifier for the WARC skip length. */
 343       extra_header[2]  = 's';
 344       extra_header[3]  = 'l';
 345       /* The size of the field value (8 bytes).  */
 346       extra_header[4]  = (8 & 255);
 347       extra_header[5]  = ((8 >> 8) & 255);
 348       /* The size of the uncompressed record.  */
 349       extra_header[6]  = (uncompressed_size & 255);
 350       extra_header[7]  = (uncompressed_size >> 8) & 255;
 351       extra_header[8]  = (uncompressed_size >> 16) & 255;
 352       extra_header[9]  = (uncompressed_size >> 24) & 255;
 353       /* The size of the compressed record.  */
 354       extra_header[10] = (compressed_size & 255);
 355       extra_header[11] = (compressed_size >> 8) & 255;
 356       extra_header[12] = (compressed_size >> 16) & 255;
 357       extra_header[13] = (compressed_size >> 24) & 255;
 358
 359       /* Write the extra header after the static header. */
 360       fseeko (warc_current_file, warc_current_gzfile_offset
 361               + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
 362       fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
 363
 364       /* Done, move back to the end of the file. */
 365       fflush (warc_current_file);
 366       fseeko (warc_current_file, 0, SEEK_END);
 367     }
 368 #endif /* HAVE_LIBZ */
 369
 370   return warc_write_ok;
 371 }
 372
 373
 374 /* Writes the WARC-Date header for the given timestamp to
 375    the current WARC record.
 376    If timestamp is NULL, the current time will be used.  */
 377 static bool
 378 warc_write_date_header (const char *timestamp)
 379 {
 380   if (timestamp == NULL)
 381     {
 382       char current_timestamp[21];
 383       warc_timestamp (current_timestamp);
 384       timestamp = current_timestamp;
 385     }
 386   return warc_write_header ("WARC-Date", timestamp);
 387 }
 388
 389 /* Writes the WARC-IP-Address header for the given IP to
 390    the current WARC record.  If IP is NULL, no header will
 391    be written.  */
 392 static bool
 393 warc_write_ip_header (ip_address *ip)
 394 {
 395   if (ip != NULL)
 396     return warc_write_header ("WARC-IP-Address", print_address (ip));
 397   else
 398     return warc_write_ok;
 399 }
 400
 401
/* warc_sha1_stream_with_payload is a modified copy of sha1_stream
   from gnulib/sha1.c.  This version calculates two digests in one go.

   Compute SHA1 message digests for bytes read from STREAM, starting
   at its current position.  The digest of everything read will be
   written into the SHA1_DIGEST_SIZE bytes beginning at RES_BLOCK.

   If payload_offset >= 0, a second digest will be calculated of the
   portion of the data starting payload_offset bytes after the first
   byte read and continuing to the end of the file.  That digest will
   be written into the SHA1_DIGEST_SIZE bytes beginning at RES_PAYLOAD.
   If payload_offset < 0, RES_PAYLOAD is not touched.

   Returns 0 on success and 1 if the read buffer cannot be allocated
   or a read error occurs.  */
static int
warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
                               off_t payload_offset)
{
#define BLOCKSIZE 32768

  struct sha1_ctx ctx_block;
  struct sha1_ctx ctx_payload;
  /* Total number of bytes read from STREAM so far.  */
  off_t pos;
  /* Number of bytes currently held in BUFFER.  */
  off_t sum;

  char *buffer = malloc (BLOCKSIZE + 72);
  if (!buffer)
    return 1;

  /* Initialize the computation context.  */
  sha1_init_ctx (&ctx_block);
  if (payload_offset >= 0)
    sha1_init_ctx (&ctx_payload);

  pos = 0;

  /* Iterate over full file contents.  */
  while (1)
    {
      /* We read the file in blocks of BLOCKSIZE bytes.  One call of the
         computation function processes the whole buffer so that with the
         next round of the loop another block can be read.  */
      off_t n;
      sum = 0;

      /* Read block.  Take care for partial reads.  */
      while (1)
        {
          n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);

          sum += n;
          pos += n;

          if (sum == BLOCKSIZE)
            break;

          if (n == 0)
            {
              /* Check for the error flag IFF N == 0, so that we don't
                 exit the loop after a partial read due to e.g., EAGAIN
                 or EWOULDBLOCK.  */
              if (ferror (stream))
                {
                  free (buffer);
                  return 1;
                }
              goto process_partial_block;
            }

          /* We've read at least one byte, so ignore errors.  But always
             check for EOF, since feof may be true even though N > 0.
             Otherwise, we could end up calling fread after EOF.  */
          if (feof (stream))
            goto process_partial_block;
        }

      /* Process buffer with BLOCKSIZE bytes.  Note that
                        BLOCKSIZE % 64 == 0
       */
      sha1_process_block (buffer, BLOCKSIZE, &ctx_block);
      if (payload_offset >= 0 && payload_offset < pos)
        {
          /* At least part of the buffer contains data from payload.
             (pos - BLOCKSIZE) is the stream position of buffer[0].  */
          off_t start_of_payload = payload_offset - (pos - BLOCKSIZE);
          if (start_of_payload <= 0)
            /* All bytes in the buffer belong to the payload. */
            start_of_payload = 0;

          /* Process the payload part of the buffer.
             Note: we can't use  sha1_process_block  here even if we
             process the complete buffer.  Because the payload doesn't
             have to start with a full block, there may still be some
             bytes left from the previous buffer.  Therefore, we need
             to continue with  sha1_process_bytes.  */
          sha1_process_bytes (buffer + start_of_payload,
                              BLOCKSIZE - start_of_payload, &ctx_payload);
        }
    }

 process_partial_block:;

  /* Process any remaining bytes.  */
  if (sum > 0)
    {
      sha1_process_bytes (buffer, sum, &ctx_block);
      if (payload_offset >= 0 && payload_offset < pos)
        {
          /* At least part of the buffer contains data from payload.
             (pos - sum) is the stream position of buffer[0].  */
          off_t start_of_payload = payload_offset - (pos - sum);
          if (start_of_payload <= 0)
            /* All bytes in the buffer belong to the payload. */
            start_of_payload = 0;

          /* Process the payload part of the buffer. */
          sha1_process_bytes (buffer + start_of_payload,
                              sum - start_of_payload, &ctx_payload);
        }
    }

  /* Construct result in desired memory.  */
  sha1_finish_ctx (&ctx_block,   res_block);
  if (payload_offset >= 0)
    sha1_finish_ctx (&ctx_payload, res_payload);
  free (buffer);
  return 0;

#undef BLOCKSIZE
}
 527
 528 /* Converts the SHA1 digest to a base32-encoded string.
 529    "sha1:DIGEST\0"  (Allocates a new string for the response.)  */
 530 static char *
 531 warc_base32_sha1_digest (char *sha1_digest)
 532 {
 533   /* length: "sha1:" + digest + "\0" */
 534   char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 );
 535   base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5,
 536                  BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1);
 537   memcpy (sha1_base32, "sha1:", 5);
 538   sha1_base32[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5] = '\0';
 539   return sha1_base32;
 540 }
 541
 542
 543 /* Sets the digest headers of the record.
 544    This method will calculate the block digest and, if payload_offset >= 0,
 545    will also calculate the payload digest of the payload starting at the
 546    provided offset.  */
 547 static void
 548 warc_write_digest_headers (FILE *file, long payload_offset)
 549 {
 550   if (opt.warc_digests_enabled)
 551     {
 552       /* Calculate the block and payload digests. */
 553       char sha1_res_block[SHA1_DIGEST_SIZE];
 554       char sha1_res_payload[SHA1_DIGEST_SIZE];
 555
 556       rewind (file);
 557       if (warc_sha1_stream_with_payload (file, sha1_res_block,
 558           sha1_res_payload, payload_offset) == 0)
 559         {
 560           char *digest;
 561
 562           digest = warc_base32_sha1_digest (sha1_res_block);
 563           warc_write_header ("WARC-Block-Digest", digest);
 564           free (digest);
 565
 566           if (payload_offset >= 0)
 567             {
 568               digest = warc_base32_sha1_digest (sha1_res_payload);
 569               warc_write_header ("WARC-Payload-Digest", digest);
 570               free (digest);
 571             }
 572         }
 573     }
 574 }
 575
 576
 577 /* Fills timestamp with the current time and date.
 578    The UTC time is formatted following ISO 8601, as required
 579    for use in the WARC-Date header.
 580    The timestamp will be 21 characters long. */
 581 void
 582 warc_timestamp (char *timestamp)
 583 {
 584   time_t rawtime;
 585   struct tm * timeinfo;
 586   time ( &rawtime );
 587   timeinfo = gmtime (&rawtime);
 588   strftime (timestamp, 21, "%Y-%m-%dT%H:%M:%SZ", timeinfo);
 589 }
 590
 591 #ifdef HAVE_LIBUUID
 592 /* Fills urn_str with a UUID in the format required
 593    for the WARC-Record-Id header.
 594    The string will be 47 characters long. */
 595 void
 596 warc_uuid_str (char *urn_str)
 597 {
 598   char uuid_str[37];
 599
 600   uuid_t record_id;
 601   uuid_generate (record_id);
 602   uuid_unparse (record_id, uuid_str);
 603
 604   sprintf (urn_str, "<urn:uuid:%s>", uuid_str);
 605 }
 606 #else
 607 /* Fills urn_str with a UUID based on random numbers in the format
 608    required for the WARC-Record-Id header.
 609    (See RFC 4122, UUID version 4.)
 610
 611    Note: this is a fallback method, it is much better to use the
 612    methods provided by libuuid.
 613
 614    The string will be 47 characters long. */
 615 void
 616 warc_uuid_str (char *urn_str)
 617 {
 618   // RFC 4122, a version 4 UUID with only random numbers
 619
 620   unsigned char uuid_data[16];
 621   int i;
 622   for (i=0; i<16; i++)
 623     uuid_data[i] = random_number (255);
 624
 625   // Set the four most significant bits (bits 12 through 15) of the
 626   // time_hi_and_version field to the 4-bit version number
 627   uuid_data[6] = (uuid_data[6] & 0x0F) | 0x40;
 628
 629   // Set the two most significant bits (bits 6 and 7) of the
 630   // clock_seq_hi_and_reserved to zero and one, respectively.
 631   uuid_data[8] = (uuid_data[8] & 0xBF) | 0x80;
 632
 633   sprintf (urn_str,
 634     "<urn:uuid:%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x>",
 635     uuid_data[0], uuid_data[1], uuid_data[2], uuid_data[3], uuid_data[4],
 636     uuid_data[5], uuid_data[6], uuid_data[7], uuid_data[8], uuid_data[9],
 637     uuid_data[10], uuid_data[11], uuid_data[12], uuid_data[13], uuid_data[14],
 638     uuid_data[15]);
 639 }
 640 #endif
 641
 642 /* Write a warcinfo record to the current file.
 643    Updates warc_current_warcinfo_uuid_str. */
 644 static bool
 645 warc_write_warcinfo_record (char *filename)
 646 {
 647   /* Write warc-info record as the first record of the file. */
 648   /* We add the record id of this info record to the other records in the
 649      file. */
 650   warc_current_warcinfo_uuid_str = (char *) malloc (48);
 651   warc_uuid_str (warc_current_warcinfo_uuid_str);
 652
 653   char timestamp[22];
 654   warc_timestamp (timestamp);
 655
 656   char *filename_copy, *filename_basename;
 657   filename_copy = strdup (filename);
 658   filename_basename = strdup (basename (filename_copy));
 659
 660   warc_write_start_record ();
 661   warc_write_header ("WARC-Type", "warcinfo");
 662   warc_write_header ("Content-Type", "application/warc-fields");
 663   warc_write_header ("WARC-Date", timestamp);
 664   warc_write_header ("WARC-Record-ID", warc_current_warcinfo_uuid_str);
 665   warc_write_header ("WARC-Filename", filename_basename);
 666
 667   /* Create content.  */
 668   FILE *warc_tmp = warc_tempfile ();
 669   if (warc_tmp == NULL)
 670     {
 671       free (filename_copy);
 672       free (filename_basename);
 673       return false;
 674     }
 675
 676   fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE);
 677   fprintf (warc_tmp, "format: WARC File Format 1.0\r\n");
 678   fprintf (warc_tmp,
 679 "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n");
 680   fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off"));
 681   fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring);
 682   /* Add the user headers, if any. */
 683   if (opt.warc_user_headers)
 684     {
 685       int i;
 686       for (i = 0; opt.warc_user_headers[i]; i++)
 687         fprintf (warc_tmp, "%s\r\n", opt.warc_user_headers[i]);
 688     }
 689   fprintf(warc_tmp, "\r\n");
 690
 691   warc_write_digest_headers (warc_tmp, -1);
 692   warc_write_block_from_file (warc_tmp);
 693   warc_write_end_record ();
 694
 695   if (! warc_write_ok)
 696     logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
 697
 698   free (filename_copy);
 699   free (filename_basename);
 700   fclose (warc_tmp);
 701   return warc_write_ok;
 702 }
 703
 704 /* Opens a new WARC file.
 705    If META is true, generates a filename ending with 'meta.warc.gz'.
 706
 707    This method will:
 708    1. close the current WARC file (if there is one);
 709    2. increment warc_current_file_number;
 710    3. open a new WARC file;
 711    4. write the initial warcinfo record.
 712
 713    Returns true on success, false otherwise.
 714    */
 715 static bool
 716 warc_start_new_file (bool meta)
 717 {
 718   if (opt.warc_filename == NULL)
 719     return false;
 720
 721   if (warc_current_file != NULL)
 722     fclose (warc_current_file);
 723   if (warc_current_warcinfo_uuid_str)
 724     free (warc_current_warcinfo_uuid_str);
 725   if (warc_current_filename)
 726     free (warc_current_filename);
 727
 728   warc_current_file_number++;
 729
 730   int base_filename_length = strlen (opt.warc_filename);
 731   /* filename format:  base + "-" + 5 digit serial number + ".warc.gz" */
 732   char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
 733   warc_current_filename = new_filename;
 734
 735 #ifdef HAVE_LIBZ
 736   const char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc");
 737 #else
 738   const char *extension = "warc";
 739 #endif
 740
 741   /* If max size is enabled, we add a serial number to the file names. */
 742   if (meta)
 743     sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension);
 744   else if (opt.warc_maxsize > 0)
 745     {
 746       sprintf (new_filename, "%s-%05d.%s", opt.warc_filename,
 747                warc_current_file_number, extension);
 748     }
 749   else
 750     sprintf (new_filename, "%s.%s", opt.warc_filename, extension);
 751
 752   logprintf (LOG_VERBOSE, _("Opening WARC file %s.\n\n"), quote (new_filename));
 753
 754   /* Open the WARC file. */
 755   warc_current_file = fopen (new_filename, "wb+");
 756   if (warc_current_file == NULL)
 757     {
 758       logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"),
 759                  quote (new_filename));
 760       return false;
 761     }
 762
 763   if (! warc_write_warcinfo_record (new_filename))
 764     return false;
 765
 766   /* Add warcinfo uuid to manifest. */
 767   if (warc_manifest_fp)
 768     fprintf (warc_manifest_fp, "%s\n", warc_current_warcinfo_uuid_str);
 769
 770   return true;
 771 }
 772
 773 /* Opens the CDX file for output. */
 774 static bool
 775 warc_start_cdx_file (void)
 776 {
 777   int filename_length = strlen (opt.warc_filename);
 778   char *cdx_filename = alloca (filename_length + 4 + 1);
 779   memcpy (cdx_filename, opt.warc_filename, filename_length);
 780   memcpy (cdx_filename + filename_length, ".cdx", 5);
 781   warc_current_cdx_file = fopen (cdx_filename, "a+");
 782   if (warc_current_cdx_file == NULL)
 783     return false;
 784
 785   /* Print the CDX header.
 786    *
 787    * a - original url
 788    * b - date
 789    * m - mime type
 790    * s - response code
 791    * k - new style checksum
 792    * r - redirect
 793    * M - meta tags
 794    * V - compressed arc file offset
 795    * g - file name
 796    * u - record-id
 797    */
 798   fprintf (warc_current_cdx_file, " CDX a b a m s k r M V g u\n");
 799   fflush (warc_current_cdx_file);
 800
 801   return true;
 802 }
 803
 804 #define CDX_FIELDSEP " \t\r\n"
 805
 806 /* Parse the CDX header and find the field numbers of the original url,
 807    checksum and record ID fields. */
 808 static bool
 809 warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
 810                        int *field_num_checksum, int *field_num_record_id)
 811 {
 812   *field_num_original_url = -1;
 813   *field_num_checksum = -1;
 814   *field_num_record_id = -1;
 815
 816   char *token;
 817   char *save_ptr;
 818   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
 819
 820   if (token != NULL && strcmp (token, "CDX") == 0)
 821     {
 822       int field_num = 0;
 823       while (token != NULL)
 824         {
 825           token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
 826           if (token != NULL)
 827             {
 828               switch (token[0])
 829                 {
 830                 case 'a':
 831                   *field_num_original_url = field_num;
 832                   break;
 833                 case 'k':
 834                   *field_num_checksum = field_num;
 835                   break;
 836                 case 'u':
 837                   *field_num_record_id = field_num;
 838                   break;
 839                 }
 840             }
 841           field_num++;
 842         }
 843     }
 844
 845   return *field_num_original_url != -1
 846          && *field_num_checksum != -1
 847          && *field_num_record_id != -1;
 848 }
 849
 850 /* Parse the CDX record and add it to the warc_cdx_dedup_table hash table. */
 851 static void
 852 warc_process_cdx_line (char *lineptr, int field_num_original_url,
 853                        int field_num_checksum, int field_num_record_id)
 854 {
 855   char *original_url = NULL;
 856   char *checksum = NULL;
 857   char *record_id = NULL;
 858
 859   char *token;
 860   char *save_ptr;
 861   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
 862
 863   /* Read this line to get the fields we need. */
 864   int field_num = 0;
 865   while (token != NULL)
 866     {
 867       char **val;
 868       if (field_num == field_num_original_url)
 869         val = &original_url;
 870       else if (field_num == field_num_checksum)
 871         val = &checksum;
 872       else if (field_num == field_num_record_id)
 873         val = &record_id;
 874       else
 875         val = NULL;
 876
 877       if (val != NULL)
 878         *val = strdup (token);
 879
 880       token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
 881       field_num++;
 882     }
 883
 884   if (original_url != NULL && checksum != NULL && record_id != NULL)
 885     {
 886       /* For some extra efficiency, we decode the base32 encoded
 887          checksum value.  This should produce exactly SHA1_DIGEST_SIZE
 888          bytes.  */
 889       size_t checksum_l;
 890       char * checksum_v;
 891       base32_decode_alloc (checksum, strlen (checksum), &checksum_v,
 892                            &checksum_l);
 893       free (checksum);
 894
 895       if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
 896         {
 897           /* This is a valid line with a valid checksum. */
 898           struct warc_cdx_record *rec;
 899           rec = malloc (sizeof (struct warc_cdx_record));
 900           rec->url = original_url;
 901           rec->uuid = record_id;
 902           memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
 903           hash_table_put (warc_cdx_dedup_table, rec->digest, rec);
 904           free (checksum_v);
 905         }
 906       else
 907         {
 908           free (original_url);
 909           if (checksum_v != NULL)
 910             free (checksum_v);
 911           free (record_id);
 912         }
 913     }
 914   else
 915     {
 916       xfree_null(checksum);
 917       xfree_null(original_url);
 918       xfree_null(record_id);
 919     }
 920 }
 921
 922 /* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
 923    the warc_cdx_dedup_table. */
 924 static bool
 925 warc_load_cdx_dedup_file (void)
 926 {
 927   FILE *f = fopen (opt.warc_cdx_dedup_filename, "r");
 928   if (f == NULL)
 929     return false;
 930
 931   int field_num_original_url = -1;
 932   int field_num_checksum = -1;
 933   int field_num_record_id = -1;
 934
 935   char *lineptr = NULL;
 936   size_t n = 0;
 937   ssize_t line_length;
 938
 939   /* The first line should contain the CDX header.
 940      Format:  " CDX x x x x x"
 941      where x are field type indicators.  For our purposes, we only
 942      need 'a' (the original url), 'k' (the SHA1 checksum) and
 943      'u' (the WARC record id). */
 944   line_length = getline (&lineptr, &n, f);
 945   if (line_length != -1)
 946     warc_parse_cdx_header (lineptr, &field_num_original_url,
 947                            &field_num_checksum, &field_num_record_id);
 948
 949   /* If the file contains all three fields, read the complete file. */
 950   if (field_num_original_url == -1
 951       || field_num_checksum == -1
 952       || field_num_record_id == -1)
 953     {
 954       if (field_num_original_url == -1)
 955         logprintf (LOG_NOTQUIET,
 956 _("CDX file does not list original urls. (Missing column 'a'.)\n"));
 957       if (field_num_checksum == -1)
 958         logprintf (LOG_NOTQUIET,
 959 _("CDX file does not list checksums. (Missing column 'k'.)\n"));
 960       if (field_num_record_id == -1)
 961         logprintf (LOG_NOTQUIET,
 962 _("CDX file does not list record ids. (Missing column 'u'.)\n"));
 963     }
 964   else
 965     {
 966       /* Initialize the table. */
 967       warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest,
 968                                              warc_cmp_sha1_digest);
 969
 970       do
 971         {
 972           line_length = getline (&lineptr, &n, f);
 973           if (line_length != -1)
 974             {
 975               warc_process_cdx_line (lineptr, field_num_original_url,
 976                             field_num_checksum, field_num_record_id);
 977             }
 978
 979         }
 980       while (line_length != -1);
 981
 982       /* Print results. */
 983       int nrecords = hash_table_count (warc_cdx_dedup_table);
 984       logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
 985                                         "Loaded %d records from CDX.\n\n",
 986                                          nrecords),
 987                               nrecords);
 988     }
 989
 990   free (lineptr);
 991   fclose (f);
 992
 993   return true;
 994 }
 995 #undef CDX_FIELDSEP
 996
 997 /* Returns the existing duplicate CDX record for the given url and payload
 998    digest.  Returns NULL if the url is not found or if the payload digest
 999    does not match, or if CDX deduplication is disabled. */
1000 static struct warc_cdx_record *
1001 warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload)
1002 {
1003   if (warc_cdx_dedup_table == NULL)
1004     return NULL;
1005
1006   struct warc_cdx_record *rec_existing
1007     = hash_table_get (warc_cdx_dedup_table, sha1_digest_payload);
1008
1009   if (rec_existing && strcmp (rec_existing->url, url) == 0)
1010     return rec_existing;
1011   else
1012     return NULL;
1013 }
1014
1015 /* Initializes the WARC writer (if opt.warc_filename is set).
1016    This should be called before any WARC record is written. */
1017 void
1018 warc_init (void)
1019 {
1020   warc_write_ok = true;
1021
1022   if (opt.warc_filename != NULL)
1023     {
1024       if (opt.warc_cdx_dedup_filename != NULL)
1025         {
1026           if (! warc_load_cdx_dedup_file ())
1027             {
1028               logprintf (LOG_NOTQUIET,
1029                          _("Could not read CDX file %s for deduplication.\n"),
1030                          quote (opt.warc_cdx_dedup_filename));
1031               exit(1);
1032             }
1033         }
1034
1035       warc_manifest_fp = warc_tempfile ();
1036       if (warc_manifest_fp == NULL)
1037         {
1038           logprintf (LOG_NOTQUIET,
1039                      _("Could not open temporary WARC manifest file.\n"));
1040           exit(1);
1041         }
1042
1043       if (opt.warc_keep_log)
1044         {
1045           warc_log_fp = warc_tempfile ();
1046           if (warc_log_fp == NULL)
1047             {
1048               logprintf (LOG_NOTQUIET,
1049                          _("Could not open temporary WARC log file.\n"));
1050               exit(1);
1051             }
1052           log_set_warc_log_fp (warc_log_fp);
1053         }
1054
1055       warc_current_file_number = -1;
1056       if (! warc_start_new_file (false))
1057         {
1058           logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
1059           exit(1);
1060         }
1061
1062       if (opt.warc_cdx_enabled)
1063         {
1064           if (! warc_start_cdx_file ())
1065             {
1066               logprintf (LOG_NOTQUIET,
1067                          _("Could not open CDX file for output.\n"));
1068               exit(1);
1069             }
1070         }
1071     }
1072 }
1073
1074 /* Writes metadata (manifest, configuration, log file) to the WARC file. */
1075 static void
1076 warc_write_metadata (void)
1077 {
1078   /* If there are multiple WARC files, the metadata should be written to a separate file. */
1079   if (opt.warc_maxsize > 0)
1080     warc_start_new_file (true);
1081
1082   char manifest_uuid [48];
1083   warc_uuid_str (manifest_uuid);
1084
1085   fflush (warc_manifest_fp);
1086   warc_write_metadata_record (manifest_uuid,
1087                               "metadata://gnu.org/software/wget/warc/MANIFEST.txt",
1088                               NULL, NULL, NULL, "text/plain",
1089                               warc_manifest_fp, -1);
1090   /* warc_write_resource_record has closed warc_manifest_fp. */
1091
1092   FILE * warc_tmp_fp = warc_tempfile ();
1093   if (warc_tmp_fp == NULL)
1094     {
1095       logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n"));
1096       exit(1);
1097     }
1098   fflush (warc_tmp_fp);
1099   fprintf (warc_tmp_fp, "%s\n", program_argstring);
1100
1101   warc_write_resource_record (NULL,
1102                    "metadata://gnu.org/software/wget/warc/wget_arguments.txt",
1103                               NULL, manifest_uuid, NULL, "text/plain",
1104                               warc_tmp_fp, -1);
1105   /* warc_write_resource_record has closed warc_tmp_fp. */
1106
1107   if (warc_log_fp != NULL)
1108     {
1109       warc_write_resource_record (NULL,
1110                               "metadata://gnu.org/software/wget/warc/wget.log",
1111                                   NULL, manifest_uuid, NULL, "text/plain",
1112                                   warc_log_fp, -1);
1113       /* warc_write_resource_record has closed warc_log_fp. */
1114
1115       warc_log_fp = NULL;
1116       log_set_warc_log_fp (NULL);
1117     }
1118 }
1119
1120 /* Finishes the WARC writing.
1121    This should be called at the end of the program. */
1122 void
1123 warc_close (void)
1124 {
1125   if (warc_current_file != NULL)
1126     {
1127       warc_write_metadata ();
1128       free (warc_current_warcinfo_uuid_str);
1129       fclose (warc_current_file);
1130     }
1131   if (warc_current_cdx_file != NULL)
1132     fclose (warc_current_cdx_file);
1133   if (warc_log_fp != NULL)
1134     {
1135       fclose (warc_log_fp);
1136       log_set_warc_log_fp (NULL);
1137     }
1138 }
1139
1140 /* Creates a temporary file for writing WARC output.
1141    The temporary file will be created in opt.warc_tempdir.
1142    Returns the pointer to the temporary file, or NULL. */
1143 FILE *
1144 warc_tempfile (void)
1145 {
1146   char filename[100];
1147   if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1)
1148     return NULL;
1149
1150   int fd = mkstemp (filename);
1151   if (fd < 0)
1152     return NULL;
1153
1154   if (unlink (filename) < 0)
1155     return NULL;
1156
1157   return fdopen (fd, "wb+");
1158 }
1159
1160
1161 /* Writes a request record to the WARC file.
1162    url  is the target uri of the request,
1163    timestamp_str  is the timestamp of the request (generated with warc_timestamp),
1164    record_uuid  is the uuid of the request (generated with warc_uuid_str),
1165    body  is a pointer to a file containing the request headers and body.
1166    ip  is the ip address of the server (or NULL),
1167    Calling this function will close body.
1168    Returns true on success, false on error. */
1169 bool
1170 warc_write_request_record (char *url, char *timestamp_str, char *record_uuid,
1171                            ip_address *ip, FILE *body, off_t payload_offset)
1172 {
1173   warc_write_start_record ();
1174   warc_write_header ("WARC-Type", "request");
1175   warc_write_header ("WARC-Target-URI", url);
1176   warc_write_header ("Content-Type", "application/http;msgtype=request");
1177   warc_write_date_header (timestamp_str);
1178   warc_write_header ("WARC-Record-ID", record_uuid);
1179   warc_write_ip_header (ip);
1180   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1181   warc_write_digest_headers (body, payload_offset);
1182   warc_write_block_from_file (body);
1183   warc_write_end_record ();
1184
1185   fclose (body);
1186
1187   return warc_write_ok;
1188 }
1189
1190 /* Writes a response record to the CDX file.
1191    url  is the target uri of the request/response,
1192    timestamp_str  is the timestamp of the request that generated this response,
1193                   (generated with warc_timestamp),
1194    mime_type  is the mime type of the response body (will be printed to CDX),
1195    response_code  is the HTTP response code (will be printed to CDX),
1196    payload_digest  is the sha1 digest of the payload,
1197    redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
1198    offset  is the position of the WARC record in the WARC file,
1199    warc_filename  is the filename of the WARC,
1200    response_uuid  is the uuid of the response.
1201    Returns true on success, false on error. */
1202 static bool
1203 warc_write_cdx_record (const char *url, const char *timestamp_str,
1204                        const char *mime_type, int response_code,
1205                        const char *payload_digest, const char *redirect_location,
1206                        off_t offset, const char *warc_filename,
1207                        const char *response_uuid)
1208 {
1209   /* Transform the timestamp. */
1210   char timestamp_str_cdx [15];
1211   memcpy (timestamp_str_cdx     , timestamp_str     , 4); /* "YYYY" "-" */
1212   memcpy (timestamp_str_cdx +  4, timestamp_str +  5, 2); /* "mm"   "-" */
1213   memcpy (timestamp_str_cdx +  6, timestamp_str +  8, 2); /* "dd"   "T" */
1214   memcpy (timestamp_str_cdx +  8, timestamp_str + 11, 2); /* "HH"   ":" */
1215   memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM"   ":" */
1216   memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS"   "Z" */
1217   timestamp_str_cdx[14] = '\0';
1218
1219   /* Rewrite the checksum. */
1220   const char *checksum;
1221   if (payload_digest != NULL)
1222     checksum = payload_digest + 5; /* Skip the "sha1:" */
1223   else
1224     checksum = "-";
1225
1226   if (mime_type == NULL || strlen(mime_type) == 0)
1227     mime_type = "-";
1228   if (redirect_location == NULL || strlen(redirect_location) == 0)
1229     redirect_location = "-";
1230
1231   char offset_string[MAX_INT_TO_STRING_LEN(off_t)];
1232   number_to_string (offset_string, offset);
1233
1234   /* Print the CDX line. */
1235   fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %s %s %s\n", url,
1236            timestamp_str_cdx, url, mime_type, response_code, checksum,
1237            redirect_location, offset_string, warc_current_filename,
1238            response_uuid);
1239   fflush (warc_current_cdx_file);
1240
1241   return true;
1242 }
1243
1244 /* Writes a revisit record to the WARC file.
1245    url  is the target uri of the request/response,
1246    timestamp_str  is the timestamp of the request that generated this response
1247                   (generated with warc_timestamp),
1248    concurrent_to_uuid  is the uuid of the request for that generated this response
1249                  (generated with warc_uuid_str),
1250    refers_to_uuid  is the uuid of the original response
1251                  (generated with warc_uuid_str),
1252    payload_digest  is the sha1 digest of the payload,
1253    ip  is the ip address of the server (or NULL),
1254    body  is a pointer to a file containing the response headers (without payload).
1255    Calling this function will close body.
1256    Returns true on success, false on error. */
1257 static bool
1258 warc_write_revisit_record (char *url, char *timestamp_str,
1259                            char *concurrent_to_uuid, char *payload_digest,
1260                            char *refers_to, ip_address *ip, FILE *body)
1261 {
1262   char revisit_uuid [48];
1263   warc_uuid_str (revisit_uuid);
1264
1265   char *block_digest = NULL;
1266   char sha1_res_block[SHA1_DIGEST_SIZE];
1267   sha1_stream (body, sha1_res_block);
1268   block_digest = warc_base32_sha1_digest (sha1_res_block);
1269
1270   warc_write_start_record ();
1271   warc_write_header ("WARC-Type", "revisit");
1272   warc_write_header ("WARC-Record-ID", revisit_uuid);
1273   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1274   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1275   warc_write_header ("WARC-Refers-To", refers_to);
1276   warc_write_header ("WARC-Profile", "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest");
1277   warc_write_header ("WARC-Truncated", "length");
1278   warc_write_header ("WARC-Target-URI", url);
1279   warc_write_date_header (timestamp_str);
1280   warc_write_ip_header (ip);
1281   warc_write_header ("Content-Type", "application/http;msgtype=response");
1282   warc_write_header ("WARC-Block-Digest", block_digest);
1283   warc_write_header ("WARC-Payload-Digest", payload_digest);
1284   warc_write_block_from_file (body);
1285   warc_write_end_record ();
1286
1287   fclose (body);
1288   free (block_digest);
1289
1290   return warc_write_ok;
1291 }
1292
1293 /* Writes a response record to the WARC file.
1294    url  is the target uri of the request/response,
1295    timestamp_str  is the timestamp of the request that generated this response
1296                   (generated with warc_timestamp),
1297    concurrent_to_uuid  is the uuid of the request for that generated this response
1298                  (generated with warc_uuid_str),
1299    ip  is the ip address of the server (or NULL),
1300    body  is a pointer to a file containing the response headers and body.
1301    mime_type  is the mime type of the response body (will be printed to CDX),
1302    response_code  is the HTTP response code (will be printed to CDX),
1303    redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
1304    Calling this function will close body.
1305    Returns true on success, false on error. */
1306 bool
1307 warc_write_response_record (char *url, char *timestamp_str,
1308                             char *concurrent_to_uuid, ip_address *ip,
1309                             FILE *body, off_t payload_offset, char *mime_type,
1310                             int response_code, char *redirect_location)
1311 {
1312   char *block_digest = NULL;
1313   char *payload_digest = NULL;
1314   char sha1_res_block[SHA1_DIGEST_SIZE];
1315   char sha1_res_payload[SHA1_DIGEST_SIZE];
1316
1317   if (opt.warc_digests_enabled)
1318     {
1319       /* Calculate the block and payload digests. */
1320       rewind (body);
1321       if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload,
1322           payload_offset) == 0)
1323         {
1324           /* Decide (based on url + payload digest) if we have seen this
1325              data before. */
1326           struct warc_cdx_record *rec_existing;
1327           rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload);
1328           if (rec_existing != NULL)
1329             {
1330               bool result;
1331
1332               /* Found an existing record. */
1333               logprintf (LOG_VERBOSE,
1334           _("Found exact match in CDX file. Saving revisit record to WARC.\n"));
1335
1336               /* Remove the payload from the file. */
1337               if (payload_offset > 0)
1338                 {
1339                   if (ftruncate (fileno (body), payload_offset) == -1)
1340                     return false;
1341                 }
1342
1343               /* Send the original payload digest. */
1344               payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1345               result = warc_write_revisit_record (url, timestamp_str,
1346                          concurrent_to_uuid, payload_digest, rec_existing->uuid,
1347                          ip, body);
1348               free (payload_digest);
1349
1350               return result;
1351             }
1352
1353           block_digest = warc_base32_sha1_digest (sha1_res_block);
1354           payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1355         }
1356     }
1357
1358   /* Not a revisit, just store the record. */
1359
1360   char response_uuid [48];
1361   warc_uuid_str (response_uuid);
1362
1363   fseeko (warc_current_file, 0L, SEEK_END);
1364   off_t offset = ftello (warc_current_file);
1365
1366   warc_write_start_record ();
1367   warc_write_header ("WARC-Type", "response");
1368   warc_write_header ("WARC-Record-ID", response_uuid);
1369   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1370   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1371   warc_write_header ("WARC-Target-URI", url);
1372   warc_write_date_header (timestamp_str);
1373   warc_write_ip_header (ip);
1374   warc_write_header ("WARC-Block-Digest", block_digest);
1375   warc_write_header ("WARC-Payload-Digest", payload_digest);
1376   warc_write_header ("Content-Type", "application/http;msgtype=response");
1377   warc_write_block_from_file (body);
1378   warc_write_end_record ();
1379
1380   fclose (body);
1381
1382   if (warc_write_ok && opt.warc_cdx_enabled)
1383     {
1384       /* Add this record to the CDX. */
1385       warc_write_cdx_record (url, timestamp_str, mime_type, response_code,
1386       payload_digest, redirect_location, offset, warc_current_filename,
1387       response_uuid);
1388     }
1389
1390   if (block_digest)
1391     free (block_digest);
1392   if (payload_digest)
1393     free (payload_digest);
1394
1395   return warc_write_ok;
1396 }
1397
1398 /* Writes a resource or metadata record to the WARC file.
1399    warc_type  is either "resource" or "metadata",
1400    resource_uuid  is the uuid of the resource (or NULL),
1401    url  is the target uri of the resource,
1402    timestamp_str  is the timestamp (generated with warc_timestamp),
1403    concurrent_to_uuid  is the uuid of the record that generated this,
1404    resource (generated with warc_uuid_str) or NULL,
1405    ip  is the ip address of the server (or NULL),
1406    content_type  is the mime type of the body (or NULL),
1407    body  is a pointer to a file containing the resource data.
1408    Calling this function will close body.
1409    Returns true on success, false on error. */
1410 static bool
1411 warc_write_record (const char *record_type, char *resource_uuid,
1412                  const char *url, const char *timestamp_str,
1413                  const char *concurrent_to_uuid,
1414                  ip_address *ip, const char *content_type, FILE *body,
1415                  off_t payload_offset)
1416 {
1417   if (resource_uuid == NULL)
1418     {
1419       resource_uuid = alloca (48);
1420       warc_uuid_str (resource_uuid);
1421     }
1422
1423   if (content_type == NULL)
1424     content_type = "application/octet-stream";
1425
1426   warc_write_start_record ();
1427   warc_write_header ("WARC-Type", record_type);
1428   warc_write_header ("WARC-Record-ID", resource_uuid);
1429   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1430   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1431   warc_write_header ("WARC-Target-URI", url);
1432   warc_write_date_header (timestamp_str);
1433   warc_write_ip_header (ip);
1434   warc_write_digest_headers (body, payload_offset);
1435   warc_write_header ("Content-Type", content_type);
1436   warc_write_block_from_file (body);
1437   warc_write_end_record ();
1438
1439   fclose (body);
1440
1441   return warc_write_ok;
1442 }
1443
1444 /* Writes a resource record to the WARC file.
1445    resource_uuid  is the uuid of the resource (or NULL),
1446    url  is the target uri of the resource,
1447    timestamp_str  is the timestamp (generated with warc_timestamp),
1448    concurrent_to_uuid  is the uuid of the record that generated this,
1449    resource (generated with warc_uuid_str) or NULL,
1450    ip  is the ip address of the server (or NULL),
1451    content_type  is the mime type of the body (or NULL),
1452    body  is a pointer to a file containing the resource data.
1453    Calling this function will close body.
1454    Returns true on success, false on error. */
1455 bool
1456 warc_write_resource_record (char *resource_uuid, const char *url,
1457                  const char *timestamp_str, const char *concurrent_to_uuid,
1458                  ip_address *ip, const char *content_type, FILE *body,
1459                  off_t payload_offset)
1460 {
1461   return warc_write_record ("resource",
1462       resource_uuid, url, timestamp_str, concurrent_to_uuid,
1463       ip, content_type, body, payload_offset);
1464 }
1465
1466 /* Writes a metadata record to the WARC file.
1467    record_uuid  is the uuid of the record (or NULL),
1468    url  is the target uri of the record,
1469    timestamp_str  is the timestamp (generated with warc_timestamp),
1470    concurrent_to_uuid  is the uuid of the record that generated this,
1471    record (generated with warc_uuid_str) or NULL,
1472    ip  is the ip address of the server (or NULL),
1473    content_type  is the mime type of the body (or NULL),
1474    body  is a pointer to a file containing the record data.
1475    Calling this function will close body.
1476    Returns true on success, false on error. */
1477 bool
1478 warc_write_metadata_record (char *record_uuid, const char *url,
1479                  const char *timestamp_str, const char *concurrent_to_uuid,
1480                  ip_address *ip, const char *content_type, FILE *body,
1481                  off_t payload_offset)
1482 {
1483   return warc_write_record ("metadata",
1484       record_uuid, url, timestamp_str, concurrent_to_uuid,
1485       ip, content_type, body, payload_offset);
1486 }