1 /* Utility functions for writing WARC files.
2 Copyright (C) 2011, 2012 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget. If not, see <http://www.gnu.org/licenses/>.
19 Additional permission under GNU GPL version 3 section 7
21 If you modify this program, or any covered work, by linking or
22 combining it with the OpenSSL project's OpenSSL library (or a
23 modified version of that library), containing parts covered by the
24 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
25 grants you additional permission to convey the resulting work.
26 Corresponding Source for a non-source form of such a combination
27 shall include the source code for the parts of OpenSSL used as well
28 as that of the covered work. */
49 #include <uuid/uuid.h>
58 extern char *version_string;
60 /* Set by main in main.c */
61 extern char *program_argstring;
64 /* The log file (a temporary file that contains a copy of the wget log). */
// Assigned from warc_tempfile () in warc_init when opt.warc_keep_log is set,
// and handed to the logger through log_set_warc_log_fp.
66 static FILE *warc_log_fp;
68 /* The manifest file (a temporary file that contains the
69 warcinfo uuid of every file in this crawl). */
// Created in warc_init; warc_start_new_file appends one uuid line per file.
70 static FILE *warc_manifest_fp;
72 /* The current WARC file (or NULL, if WARC is disabled). */
// Opened with fopen in warc_start_new_file and closed in warc_close.
73 static FILE *warc_current_file;
76 /* The gzip stream for the current WARC file
77 (or NULL, if WARC or gzip is disabled). */
78 static gzFile *warc_current_gzfile;
80 /* The offset of the current gzip record in the WARC file. */
// Taken from ftell () in warc_write_start_record before the placeholder
// for the extra gzip header is written.
81 static size_t warc_current_gzfile_offset;
83 /* The uncompressed size (so far) of the current record. */
// Reset in warc_write_start_record and advanced in warc_write_buffer.
84 static size_t warc_current_gzfile_uncompressed_size;
87 /* This is true until a warc_write_* method fails. */
// Initialised to true in warc_init; the visible code only ever clears it.
88 static bool warc_write_ok;
90 /* The current CDX file (or NULL, if CDX is disabled). */
91 static FILE *warc_current_cdx_file;
93 /* The record id of the warcinfo record of the current WARC file. */
// Heap string allocated in warc_write_warcinfo_record and released in
// warc_start_new_file and warc_close.
94 static char *warc_current_warcinfo_uuid_str;
96 /* The file name of the current WARC file. */
97 static char *warc_current_filename;
99 /* The serial number of the current WARC file. This number is
100 incremented each time a new file is opened and is used in the
101 WARC file's filename. */
// warc_init starts it at -1, so the first file opened gets number 0.
102 static int warc_current_file_number;
104 /* The table of CDX records, if deduplication is enabled. */
// NOTE(review): not static, unlike the rest of the state here -- confirm
// whether another translation unit really needs this symbol.
105 struct hash_table * warc_cdx_dedup_table;
// Forward declaration: warc_write_start_record calls this before its
// definition appears.
107 static bool warc_start_new_file (bool meta);
// One entry of the CDX dedup table.  warc_process_cdx_line fills the url,
// uuid and digest members; the url and uuid member declarations are not
// part of this extract.
110 struct warc_cdx_record
// Raw (already base32-decoded) SHA-1 digest; also used as the hash key.
114 char digest[SHA1_DIGEST_SIZE];
// Hash callback passed to hash_table_new for the CDX dedup table.
// KEY is the digest array of a warc_cdx_record (see hash_table_put in
// warc_process_cdx_line).
118 warc_hash_sha1_digest (const void *key)
120 /* We just use some of the first bytes of the digest. */
// Exactly sizeof (unsigned long) leading bytes of the digest are copied
// into v; memcpy avoids any alignment assumption about KEY.
122 memcpy (&v, key, sizeof (unsigned long));
// Equality callback passed to hash_table_new for the CDX dedup table.
// Yields nonzero when the two SHA1_DIGEST_SIZE-byte digests are identical
// and zero otherwise (note the negation of memcmp).
127 warc_cmp_sha1_digest (const void *digest1, const void *digest2)
129 return !memcmp (digest1, digest2, SHA1_DIGEST_SIZE);
134 /* Writes SIZE bytes from BUFFER to the current WARC file,
135 through gzwrite if compression is enabled.
136 Returns the number of uncompressed bytes written. */
138 warc_write_buffer (const char *buffer, size_t size)
// A non-NULL gzip handle means a compressed record is currently open.
141 if (warc_current_gzfile)
// The running uncompressed size is advanced by SIZE before the write is
// attempted, whatever gzwrite then reports.
143 warc_current_gzfile_uncompressed_size += size;
144 return gzwrite (warc_current_gzfile, buffer, size);
// Uncompressed output goes straight to the stdio stream.
148 return fwrite (buffer, 1, size, warc_current_file);
151 /* Writes STR to the current WARC file.
152 Returns false and set warc_write_ok to false if there
155 warc_write_string (const char *str)
// STR must be a NUL-terminated string; its terminator is not written.
160 size_t n = strlen (str);
// A short write from warc_write_buffer is treated as failure.
161 if (n != warc_write_buffer (str, n))
162 warc_write_ok = false;
164 return warc_write_ok;
// Constants used by warc_write_end_record when it patches the gzip header.
// EXTRA_GZIP_HEADER_SIZE equals the length of the 12-character X placeholder
// that warc_write_start_record writes in front of each gzip stream.
168 #define EXTRA_GZIP_HEADER_SIZE 12
// Number of bytes read back as the fixed part of the gzip header.
169 #define GZIP_STATIC_HEADER_SIZE 10
// Bit OR-ed into the flags byte of that header to announce an extra field.
170 #define FLG_FEXTRA 0x04
173 /* Starts a new WARC record. Writes the version header.
174 If opt.warc_maxsize is set and the current file is becoming
175 too large, this will open a new WARC file.
177 If compression is enabled, this will start a new
178 gzip stream in the current WARC file.
180 Returns false and set warc_write_ok to false if there
183 warc_write_start_record ()
// Flush first so that ftell reflects everything written so far.
188 fflush (warc_current_file);
189 if (opt.warc_maxsize > 0 && ftell (warc_current_file) >= opt.warc_maxsize)
// The result of warc_start_new_file is not inspected on this line.
190 warc_start_new_file (false);
193 /* Start a GZIP stream, if required. */
194 if (opt.warc_compression_enabled)
196 /* Record the starting offset of the new record. */
197 warc_current_gzfile_offset = ftell (warc_current_file);
199 /* Reserve space for the extra GZIP header field.
200 In warc_write_end_record we will fill this space
201 with information about the uncompressed and
202 compressed size of the record. */
// The placeholder is EXTRA_GZIP_HEADER_SIZE (12) characters long.
203 fprintf (warc_current_file, "XXXXXXXXXXXX");
204 fflush (warc_current_file);
206 /* Start a new GZIP stream. */
// The gzip stream gets its own duplicate of the descriptor, so closing it
// in warc_write_end_record leaves warc_current_file open.
// NOTE(review): the dup result is not checked, and the duplicate looks
// like it would leak when gzdopen fails -- confirm.
207 warc_current_gzfile = gzdopen (dup (fileno (warc_current_file)), "wb9");
208 warc_current_gzfile_uncompressed_size = 0;
210 if (warc_current_gzfile == NULL)
212 logprintf (LOG_NOTQUIET, _("Error opening GZIP stream to WARC file.\n"));
213 warc_write_ok = false;
// Every record begins with the WARC version line.
219 warc_write_string ("WARC/1.0\r\n");
220 return warc_write_ok;
223 /* Writes a WARC header to the current WARC record.
224 This method may be run after warc_write_start_record and
225 before warc_write_block_from_file. */
// The header is emitted as NAME, a colon and space, VALUE, then CRLF, each
// through warc_write_string.  The result is the module-wide warc_write_ok
// flag, not the outcome of these four writes alone.
227 warc_write_header (const char *name, const char *value)
231 warc_write_string (name);
232 warc_write_string (": ");
233 warc_write_string (value);
234 warc_write_string ("\r\n");
236 return warc_write_ok;
239 /* Copies the contents of DATA_IN to the WARC record.
240 Adds a Content-Length header to the WARC record.
241 Run this method after warc_write_header,
242 then run warc_write_end_record. */
244 warc_write_block_from_file (FILE *data_in)
246 /* Add the Content-Length header. */
247 char *content_length;
248 fseek (data_in, 0L, SEEK_END);
249 if (! asprintf (&content_length, "%ld", ftell (data_in)))
251 warc_write_ok = false;
254 warc_write_header ("Content-Length", content_length);
255 free (content_length);
257 /* End of the WARC header section. */
258 warc_write_string ("\r\n");
260 if (fseek (data_in, 0L, SEEK_SET) != 0)
261 warc_write_ok = false;
263 /* Copy the data in the file to the WARC record. */
266 while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0)
268 if (warc_write_buffer (buffer, s) < s)
269 warc_write_ok = false;
272 return warc_write_ok;
275 /* Run this method to close the current WARC record.
277 If compression is enabled, this method closes the
278 current GZIP stream and fills the extra GZIP header
279 with the uncompressed and compressed length of the
282 warc_write_end_record ()
// Record terminator: two CRLF pairs.  The result of this write is not
// checked on this line.
284 warc_write_buffer ("\r\n\r\n", 4);
287 /* We start a new gzip stream for each record. */
288 if (warc_write_ok && warc_current_gzfile)
290 if (gzclose (warc_current_gzfile) != Z_OK)
292 warc_write_ok = false;
// gzclose wrote through a duplicated descriptor, so the stdio stream is
// re-synchronised and positioned at the end of the file.
296 fflush (warc_current_file);
297 fseek (warc_current_file, 0, SEEK_END);
299 /* The WARC standard suggests that we add 'skip length' data in the
300 extra header field of the GZIP stream.
302 In warc_write_start_record we reserved space for this extra header.
303 This extra space starts at warc_current_gzfile_offset and fills
304 EXTRA_GZIP_HEADER_SIZE bytes. The static GZIP header starts at
305 warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE.
307 We need to do three things:
308 1. Move the static GZIP header to warc_current_gzfile_offset;
309 2. Set the FEXTRA flag in the GZIP header;
310 3. Write the extra GZIP header after the static header, that is,
311 starting at warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE.
314 /* Calculate the uncompressed and compressed sizes. */
// NOTE(review): uncompressed_size is derived from file offsets (bytes on
// disk) and compressed_size from the uncompressed byte counter, so the two
// names look swapped -- confirm the intended field order before renaming.
315 size_t current_offset = ftell (warc_current_file);
316 size_t uncompressed_size = current_offset - warc_current_gzfile_offset;
317 size_t compressed_size = warc_current_gzfile_uncompressed_size;
319 /* Go back to the static GZIP header. */
320 fseek (warc_current_file, warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
322 /* Read the header. */
323 char static_header[GZIP_STATIC_HEADER_SIZE];
324 size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
325 if (result != GZIP_STATIC_HEADER_SIZE)
327 warc_write_ok = false;
331 /* Set the FEXTRA flag in the flags byte of the header. */
// OFF_FLG is defined outside this extract; presumably the index of the
// flags byte within the fixed gzip header -- TODO confirm.
332 static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
334 /* Write the header back to the file, but starting at warc_current_gzfile_offset. */
335 fseek (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
// The results of this fwrite and of the one further down are not checked.
336 fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
338 /* Prepare the extra GZIP header. */
339 char extra_header[EXTRA_GZIP_HEADER_SIZE];
340 /* XLEN, the length of the extra header fields. */
// Stored low byte first, and excluding the two XLEN bytes themselves.
341 extra_header[0] = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
342 extra_header[1] = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
343 /* The extra header field identifier for the WARC skip length. */
344 extra_header[2] = 's';
345 extra_header[3] = 'l';
346 /* The size of the uncompressed record. */
// Both sizes are stored as four bytes, least significant byte first; any
// bits above the low 32 are dropped.
347 extra_header[4] = (uncompressed_size & 255);
348 extra_header[5] = (uncompressed_size >> 8) & 255;
349 extra_header[6] = (uncompressed_size >> 16) & 255;
350 extra_header[7] = (uncompressed_size >> 24) & 255;
351 /* The size of the compressed record. */
352 extra_header[8] = (compressed_size & 255);
353 extra_header[9] = (compressed_size >> 8) & 255;
354 extra_header[10] = (compressed_size >> 16) & 255;
355 extra_header[11] = (compressed_size >> 24) & 255;
357 /* Write the extra header after the static header. */
358 fseek (warc_current_file, warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
359 fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
361 /* Done, move back to the end of the file. */
362 fflush (warc_current_file);
363 fseek (warc_current_file, 0, SEEK_END);
365 #endif /* HAVE_LIBZ */
367 return warc_write_ok;
/* Writes the WARC-Date header for the given timestamp to
   the current WARC record.
   If timestamp is NULL, the current time will be used.
   Returns the value of warc_write_header.  */
static bool
warc_write_date_header (char *timestamp)
{
  /* The fallback buffer lives at function scope: TIMESTAMP may be pointed
     at it and is still read by warc_write_header below, so it must not be
     declared inside the if block.  21 bytes is the size that
     warc_timestamp fills.  */
  char current_timestamp[21];

  if (timestamp == NULL)
    {
      warc_timestamp (current_timestamp);
      timestamp = current_timestamp;
    }

  return warc_write_header ("WARC-Date", timestamp);
}
386 /* Writes the WARC-IP-Address header for the given IP to
387 the current WARC record. If IP is NULL, no header will
390 warc_write_ip_header (ip_address *ip)
// print_address supplies the textual form of IP used as the header value.
393 return warc_write_header ("WARC-IP-Address", print_address (ip));
// Path taken when no header is written: report the current write status.
395 return warc_write_ok;
399 /* warc_sha1_stream_with_payload is a modified copy of sha1_stream
400 from gnulib/sha1.c. This version calculates two digests in one go.
402 Compute SHA1 message digests for bytes read from STREAM. The
403 digest of the complete file will be written into the SHA1_DIGEST_SIZE
404 bytes beginning at RES_BLOCK.
406 If payload_offset >= 0, a second digest will be calculated of the
407 portion of the file starting at payload_offset and continuing to
408 the end of the file. The digest number will be written into the
409 SHA1_DIGEST_SIZE bytes beginning at RES_PAYLOAD. */
411 warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, long int payload_offset)
413 #define BLOCKSIZE 32768
// Two independent contexts: one for the whole stream, one for the payload.
415 struct sha1_ctx ctx_block;
416 struct sha1_ctx ctx_payload;
// The buffer is 72 bytes larger than BLOCKSIZE, as in the gnulib original.
420 char *buffer = malloc (BLOCKSIZE + 72);
424 /* Initialize the computation context. */
425 sha1_init_ctx (&ctx_block);
// The payload context is only touched when a payload offset was given.
426 if (payload_offset >= 0)
427 sha1_init_ctx (&ctx_payload);
431 /* Iterate over full file contents. */
434 /* We read the file in blocks of BLOCKSIZE bytes. One call of the
435 computation function processes the whole buffer so that with the
436 next round of the loop another block can be read. */
440 /* Read block. Take care for partial reads. */
// sum counts the bytes already sitting in buffer for the current block.
443 n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);
448 if (sum == BLOCKSIZE)
453 /* Check for the error flag IFF N == 0, so that we don't
454 exit the loop after a partial read due to e.g., EAGAIN
461 goto process_partial_block;
464 /* We've read at least one byte, so ignore errors. But always
465 check for EOF, since feof may be true even though N > 0.
466 Otherwise, we could end up calling fread after EOF. */
468 goto process_partial_block;
471 /* Process buffer with BLOCKSIZE bytes. Note that
474 sha1_process_block (buffer, BLOCKSIZE, &ctx_block);
// pos is maintained on lines outside this extract; from its use here it
// appears to be the stream position just past the bytes now in buffer --
// TODO confirm.
475 if (payload_offset >= 0 && payload_offset < pos)
477 /* At least part of the buffer contains data from payload. */
478 int start_of_payload = payload_offset - (pos - BLOCKSIZE);
479 if (start_of_payload <= 0)
480 /* All bytes in the buffer belong to the payload. */
481 start_of_payload = 0;
483 /* Process the payload part of the buffer.
484 Note: we can't use sha1_process_block here even if we
485 process the complete buffer. Because the payload doesn't
486 have to start with a full block, there may still be some
487 bytes left from the previous buffer. Therefore, we need
488 to continue with sha1_process_bytes. */
489 sha1_process_bytes (buffer + start_of_payload, BLOCKSIZE - start_of_payload, &ctx_payload);
493 process_partial_block:;
495 /* Process any remaining bytes. */
498 sha1_process_bytes (buffer, sum, &ctx_block);
// Same payload split as above, but over the final sum bytes only.
499 if (payload_offset >= 0 && payload_offset < pos)
501 /* At least part of the buffer contains data from payload. */
502 int start_of_payload = payload_offset - (pos - sum);
503 if (start_of_payload <= 0)
504 /* All bytes in the buffer belong to the payload. */
505 start_of_payload = 0;
507 /* Process the payload part of the buffer. */
508 sha1_process_bytes (buffer + start_of_payload, sum - start_of_payload, &ctx_payload);
512 /* Construct result in desired memory. */
513 sha1_finish_ctx (&ctx_block, res_block);
// RES_PAYLOAD is left untouched when no payload offset was given.
514 if (payload_offset >= 0)
515 sha1_finish_ctx (&ctx_payload, res_payload);
522 /* Converts the SHA1 digest to a base32-encoded string.
523 "sha1:DIGEST\0" (Allocates a new string for the response.) */
// SHA1_DIGEST points at SHA1_DIGEST_SIZE raw digest bytes.
525 warc_base32_sha1_digest (char *sha1_digest)
527 // length: "sha1:" + digest + "\0"
// NOTE(review): the malloc result is used without a NULL check.
528 char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 );
// The encoded text is written five bytes in, leaving room for the prefix.
529 base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5, BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1);
530 memcpy (sha1_base32, "sha1:", 5);
531 sha1_base32[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5] = '\0';
536 /* Sets the digest headers of the record.
537 This method will calculate the block digest and, if payload_offset >= 0,
538 will also calculate the payload digest of the payload starting at the
541 warc_write_digest_headers (FILE *file, long payload_offset)
// Nothing is computed or written unless digests are enabled.
543 if (opt.warc_digests_enabled)
545 /* Calculate the block and payload digests. */
546 char sha1_res_block[SHA1_DIGEST_SIZE];
547 char sha1_res_payload[SHA1_DIGEST_SIZE];
// A zero result from the digest helper is the success case handled here.
550 if (warc_sha1_stream_with_payload (file, sha1_res_block, sha1_res_payload, payload_offset) == 0)
// warc_base32_sha1_digest allocates each string; the matching release is
// not visible in this extract.
554 digest = warc_base32_sha1_digest (sha1_res_block);
555 warc_write_header ("WARC-Block-Digest", digest);
558 if (payload_offset >= 0)
560 digest = warc_base32_sha1_digest (sha1_res_payload);
561 warc_write_header ("WARC-Payload-Digest", digest);
569 /* Fills timestamp with the current time and date.
570 The UTC time is formatted following ISO 8601, as required
571 for use in the WARC-Date header.
572 The formatted timestamp is 20 characters long; TIMESTAMP must have room for 21 bytes including the terminating NUL. */
574 warc_timestamp (char *timestamp)
577 struct tm * timeinfo;
// gmtime converts to UTC; its result is used without a NULL check.
579 timeinfo = gmtime (&rawtime);
// 21 is the destination size handed to strftime, terminator included.
580 strftime (timestamp, 21, "%Y-%m-%dT%H:%M:%SZ", timeinfo);
583 /* Fills uuid_str with a UUID based on random numbers.
584 (See RFC 4122, UUID version 4.)
586 Note: this is a fallback method, it is much better to use the
587 methods provided by libuuid.
589 The uuid_str will be 36 characters long. */
591 warc_uuid_random (char *uuid_str)
593 // RFC 4122, a version 4 UUID with only random numbers
595 unsigned char uuid_data[16];
// NOTE(review): if random_number (N) excludes N itself, the byte value 255
// can never be produced here -- confirm against its definition.
598 uuid_data[i] = random_number (255);
600 // Set the four most significant bits (bits 12 through 15) of the
601 // time_hi_and_version field to the 4-bit version number
602 uuid_data[6] = (uuid_data[6] & 0x0F) | 0x40;
604 // Set the two most significant bits (bits 6 and 7) of the
605 // clock_seq_hi_and_reserved to zero and one, respectively.
606 uuid_data[8] = (uuid_data[8] & 0xBF) | 0x80;
// Hex groups of 8-4-4-4-12 digits joined by dashes, 36 characters total.
609 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
610 uuid_data[0], uuid_data[1], uuid_data[2], uuid_data[3], uuid_data[4],
611 uuid_data[5], uuid_data[6], uuid_data[7], uuid_data[8], uuid_data[9],
612 uuid_data[10], uuid_data[11], uuid_data[12], uuid_data[13], uuid_data[14],
616 /* Fills urn_str with a UUID in the format required
617 for the WARC-Record-Id header.
618 The string will be 47 characters long. */
// Callers in this file pass 48-byte buffers: 47 characters plus the NUL.
620 warc_uuid_str (char *urn_str)
// Two alternative generators appear below; the preprocessor lines that
// choose between libuuid and the random fallback are not in this extract.
626 uuid_generate (record_id);
627 uuid_unparse (record_id, uuid_str);
629 warc_uuid_random (uuid_str);
// Wrap the 36-character UUID in the urn:uuid form with angle brackets.
632 sprintf (urn_str, "<urn:uuid:%s>", uuid_str);
635 /* Write a warcinfo record to the current file.
636 Updates warc_current_warcinfo_uuid_str. */
638 warc_write_warcinfo_record (char *filename)
640 /* Write warc-info record as the first record of the file. */
641 /* We add the record id of this info record to the other records in the file. */
// 48 bytes hold the 47-character URN from warc_uuid_str plus the NUL.
// NOTE(review): the malloc result is not checked before use.
642 warc_current_warcinfo_uuid_str = (char *) malloc (48);
643 warc_uuid_str (warc_current_warcinfo_uuid_str);
646 warc_timestamp (timestamp);
648 char *filename_copy, *filename_basename;
// basename is run on a private copy of FILENAME, presumably because it may
// modify its argument; its result is duplicated again before use.
649 filename_copy = strdup (filename);
650 filename_basename = strdup (basename (filename_copy));
652 warc_write_start_record ();
653 warc_write_header ("WARC-Type", "warcinfo");
654 warc_write_header ("Content-Type", "application/warc-fields");
655 warc_write_header ("WARC-Date", timestamp);
656 warc_write_header ("WARC-Record-ID", warc_current_warcinfo_uuid_str);
657 warc_write_header ("WARC-Filename", filename_basename);
659 /* Create content. */
// The record body is assembled in a temporary file so that its length and
// digest can be computed before it is copied into the record.
660 FILE *warc_tmp = warc_tempfile ();
661 if (warc_tmp == NULL)
// Failure path: release both duplicated strings before leaving.
663 free (filename_copy);
664 free (filename_basename);
668 fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE);
669 fprintf (warc_tmp, "format: WARC File Format 1.0\r\n");
670 fprintf (warc_tmp, "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n");
671 fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off"));
672 fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring);
673 /* Add the user headers, if any. */
674 if (opt.warc_user_headers)
// The list is walked until its NULL terminator.
677 for (i = 0; opt.warc_user_headers[i]; i++)
678 fprintf (warc_tmp, "%s\r\n", opt.warc_user_headers[i]);
680 fprintf(warc_tmp, "\r\n");
// A payload offset of -1 requests the block digest only.
682 warc_write_digest_headers (warc_tmp, -1);
683 warc_write_block_from_file (warc_tmp);
684 warc_write_end_record ();
688 logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
691 free (filename_copy);
692 free (filename_basename);
694 return warc_write_ok;
697 /* Opens a new WARC file.
698 If META is true, generates a filename ending with 'meta.warc.gz'.
701 1. close the current WARC file (if there is one);
702 2. increment warc_current_file_number;
703 3. open a new WARC file;
704 4. write the initial warcinfo record.
706 Returns true on success, false otherwise.
709 warc_start_new_file (bool meta)
// Without a base file name there is nothing to open.
711 if (opt.warc_filename == NULL)
// Release everything that belonged to the previous file, if any.
714 if (warc_current_file != NULL)
715 fclose (warc_current_file);
716 if (warc_current_warcinfo_uuid_str)
717 free (warc_current_warcinfo_uuid_str);
718 if (warc_current_filename)
719 free (warc_current_filename);
721 warc_current_file_number++;
723 int base_filename_length = strlen (opt.warc_filename);
724 /* filename format: base + "-" + 5 digit serial number + ".warc.gz" */
// NOTE(review): the allocation budgets exactly five digits, while %05d
// prints more once warc_current_file_number exceeds 99999 -- confirm that
// this cannot happen.  The malloc result is also unchecked.
725 char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
726 warc_current_filename = new_filename;
// Two alternative definitions of extension follow; the preprocessor lines
// that select one of them are not part of this extract.
729 char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc");
731 char *extension = "warc";
734 /* If max size is enabled, we add a serial number to the file names. */
736 sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension);
737 else if (opt.warc_maxsize > 0)
738 sprintf (new_filename, "%s-%05d.%s", opt.warc_filename, warc_current_file_number, extension);
740 sprintf (new_filename, "%s.%s", opt.warc_filename, extension);
742 logprintf (LOG_VERBOSE, _("Opening WARC file %s.\n\n"), quote (new_filename));
744 /* Open the WARC file. */
// Opened for update because warc_write_end_record reads the gzip header
// back from this same stream.
745 warc_current_file = fopen (new_filename, "wb+");
746 if (warc_current_file == NULL)
748 logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"), quote (new_filename));
752 if (! warc_write_warcinfo_record (new_filename))
755 /* Add warcinfo uuid to manifest. */
756 if (warc_manifest_fp)
757 fprintf (warc_manifest_fp, "%s\n", warc_current_warcinfo_uuid_str);
762 /* Opens the CDX file for output. */
764 warc_start_cdx_file ()
// The CDX name is opt.warc_filename with the suffix .cdx appended.
766 int filename_length = strlen (opt.warc_filename);
// Stack allocation: four suffix characters plus the terminating NUL.
767 char *cdx_filename = alloca (filename_length + 4 + 1);
768 memcpy (cdx_filename, opt.warc_filename, filename_length);
// Copying five bytes brings the NUL terminator of the literal along.
769 memcpy (cdx_filename + filename_length, ".cdx", 5);
// Append mode: an existing CDX file is extended, not truncated.
770 warc_current_cdx_file = fopen (cdx_filename, "a+");
771 if (warc_current_cdx_file == NULL)
774 /* Print the CDX header.
780 * k - new style checksum
783 * V - compressed arc file offset
787 fprintf (warc_current_cdx_file, " CDX a b a m s k r M V g u\n");
788 fflush (warc_current_cdx_file);
// Characters treated as separators when a CDX line is tokenised.
793 #define CDX_FIELDSEP " \t\r\n"
795 /* Parse the CDX header and find the field numbers of the original url,
796 checksum and record ID fields. */
// LINEPTR is modified in place by strtok_r.  Each output is -1 unless the
// corresponding column is found in the header.
798 warc_parse_cdx_header (char *lineptr, int *field_num_original_url, int *field_num_checksum, int *field_num_record_id)
800 *field_num_original_url = -1;
801 *field_num_checksum = -1;
802 *field_num_record_id = -1;
806 token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
// Column letters are only examined when the first token is the CDX marker.
808 if (token != NULL && strcmp (token, "CDX") == 0)
811 while (token != NULL)
813 token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
819 *field_num_original_url = field_num;
822 *field_num_checksum = field_num;
825 *field_num_record_id = field_num;
// Nonzero only when all three required columns were located.
833 return *field_num_original_url != -1
834 && *field_num_checksum != -1
835 && *field_num_record_id != -1;
838 /* Parse the CDX record and add it to the warc_cdx_dedup_table hash table. */
// LINEPTR is modified in place by strtok_r.  The three field numbers are
// the column positions found by warc_parse_cdx_header.
840 warc_process_cdx_line (char *lineptr, int field_num_original_url, int field_num_checksum, int field_num_record_id)
842 char *original_url = NULL;
843 char *checksum = NULL;
844 char *record_id = NULL;
848 token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
850 /* Read this line to get the fields we need. */
852 while (token != NULL)
855 if (field_num == field_num_original_url)
857 else if (field_num == field_num_checksum)
859 else if (field_num == field_num_record_id)
// The selected field is kept as a private heap copy of the token.
865 *val = strdup (token);
867 token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
// A line is only usable when all three fields were present.
871 if (original_url != NULL && checksum != NULL && record_id != NULL)
873 /* For some extra efficiency, we decode the base32 encoded
874 checksum value. This should produce exactly SHA1_DIGEST_SIZE
878 base32_decode_alloc (checksum, strlen (checksum), &checksum_v, &checksum_l);
881 if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
883 /* This is a valid line with a valid checksum. */
// NOTE(review): the malloc result is dereferenced without a NULL check.
884 struct warc_cdx_record * rec = malloc (sizeof (struct warc_cdx_record));
// The record takes over the duplicated url and record id strings.
885 rec->url = original_url;
886 rec->uuid = record_id;
887 memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
// The digest stored inside the record doubles as the hash key.
888 hash_table_put (warc_cdx_dedup_table, rec->digest, rec);
894 if (checksum_v != NULL)
901 /* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
902 the warc_cdx_dedup_table. */
904 warc_load_cdx_dedup_file ()
906 FILE *f = fopen (opt.warc_cdx_dedup_filename, "r");
// -1 marks a column that has not been found in the header.
910 int field_num_original_url = -1;
911 int field_num_checksum = -1;
912 int field_num_record_id = -1;
// getline allocates and grows this buffer as needed.
914 char *lineptr = NULL;
918 /* The first line should contain the CDX header.
919 Format: " CDX x x x x x"
920 where x are field type indicators. For our purposes, we only
921 need 'a' (the original url), 'k' (the SHA1 checksum) and
922 'u' (the WARC record id). */
923 line_length = getline (&lineptr, &n, f);
924 if (line_length != -1)
925 warc_parse_cdx_header (lineptr, &field_num_original_url, &field_num_checksum, &field_num_record_id);
927 /* If the file contains all three fields, read the complete file. */
928 if (field_num_original_url == -1
929 || field_num_checksum == -1
930 || field_num_record_id == -1)
// Report each missing column separately.
932 if (field_num_original_url == -1)
933 logprintf (LOG_NOTQUIET, _("CDX file does not list original urls. (Missing column 'a'.)\n"));
934 if (field_num_checksum == -1)
935 logprintf (LOG_NOTQUIET, _("CDX file does not list checksums. (Missing column 'k'.)\n"));
936 if (field_num_record_id == -1)
937 logprintf (LOG_NOTQUIET, _("CDX file does not list record ids. (Missing column 'u'.)\n"));
941 /* Initialize the table. */
// Keys are raw SHA-1 digests, hashed and compared by the two callbacks.
942 warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest, warc_cmp_sha1_digest);
// Remaining lines are fed to warc_process_cdx_line until getline yields -1.
946 line_length = getline (&lineptr, &n, f);
947 if (line_length != -1)
948 warc_process_cdx_line (lineptr, field_num_original_url, field_num_checksum, field_num_record_id);
951 while (line_length != -1);
954 int nrecords = hash_table_count (warc_cdx_dedup_table);
955 logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
956 "Loaded %d records from CDX.\n\n", nrecords),
966 /* Returns the existing duplicate CDX record for the given url and payload
967 digest. Returns NULL if the url is not found or if the payload digest
968 does not match, or if CDX deduplication is disabled. */
969 static struct warc_cdx_record *
970 warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload)
972 if (warc_cdx_dedup_table == NULL)
976 struct warc_cdx_record *rec_existing;
977 hash_table_get_pair (warc_cdx_dedup_table, sha1_digest_payload, &key, &rec_existing);
979 if (rec_existing != NULL && strcmp (rec_existing->url, url) == 0)
985 /* Initializes the WARC writer (if opt.warc_filename is set).
986 This should be called before any WARC record is written. */
// Writing is considered healthy until some warc_write_* step fails.
990 warc_write_ok = true;
992 if (opt.warc_filename != NULL)
// Optional: preload an existing CDX index for deduplication.
994 if (opt.warc_cdx_dedup_filename != NULL)
996 if (! warc_load_cdx_dedup_file ())
998 logprintf (LOG_NOTQUIET,
999 _("Could not read CDX file %s for deduplication.\n"),
1000 quote (opt.warc_cdx_dedup_filename));
// The manifest collects the warcinfo uuid of every WARC file opened.
1005 warc_manifest_fp = warc_tempfile ();
1006 if (warc_manifest_fp == NULL)
1008 logprintf (LOG_NOTQUIET, _("Could not open temporary WARC manifest file.\n"));
1012 if (opt.warc_keep_log)
1014 warc_log_fp = warc_tempfile ();
1015 if (warc_log_fp == NULL)
1017 logprintf (LOG_NOTQUIET, _("Could not open temporary WARC log file.\n"));
// From here on the logger also writes into the temporary log copy.
1020 log_set_warc_log_fp (warc_log_fp);
// Starting at -1 makes the first file opened carry serial number 0.
1023 warc_current_file_number = -1;
1024 if (! warc_start_new_file (false))
1026 logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
1030 if (opt.warc_cdx_enabled)
1032 if (! warc_start_cdx_file ())
1034 logprintf (LOG_NOTQUIET, _("Could not open CDX file for output.\n"));
1041 /* Writes metadata (manifest, configuration, log file) to the WARC file. */
1043 warc_write_metadata ()
1045 /* If there are multiple WARC files, the metadata should be written to a separate file. */
1046 if (opt.warc_maxsize > 0)
1047 warc_start_new_file (true);
// 48 bytes: the 47-character URN from warc_uuid_str plus the NUL.
1049 char manifest_uuid [48];
1050 warc_uuid_str (manifest_uuid);
1052 fflush (warc_manifest_fp);
1053 warc_write_resource_record (manifest_uuid,
1054 "metadata://gnu.org/software/wget/warc/MANIFEST.txt",
1055 NULL, NULL, NULL, "text/plain",
1056 warc_manifest_fp, -1);
1057 /* warc_write_resource_record has closed warc_manifest_fp. */
1059 FILE * warc_tmp_fp = warc_tempfile ();
1060 if (warc_tmp_fp == NULL)
1062 logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n"));
// NOTE(review): the stream is flushed before anything is written to it;
// the intended order was probably fprintf first -- confirm.
1065 fflush (warc_tmp_fp);
1066 fprintf (warc_tmp_fp, "%s\n", program_argstring);
// NOTE(review): manifest_uuid is passed in the same (first) position as for
// the MANIFEST record above, whereas the log record below passes NULL there
// and manifest_uuid as the fourth argument.  If the first argument is the
// record id, two records share one id -- confirm against
// warc_write_resource_record.
1068 warc_write_resource_record (manifest_uuid,
1069 "metadata://gnu.org/software/wget/warc/wget_arguments.txt",
1070 NULL, NULL, NULL, "text/plain",
1072 /* warc_write_resource_record has closed warc_tmp_fp. */
1074 if (warc_log_fp != NULL)
1076 warc_write_resource_record (NULL,
1077 "metadata://gnu.org/software/wget/warc/wget.log",
1078 NULL, manifest_uuid, NULL, "text/plain",
1080 /* warc_write_resource_record has closed warc_log_fp. */
// Detach the logger from the stream that has just been closed.
1083 log_set_warc_log_fp (NULL);
1087 /* Finishes the WARC writing.
1088 This should be called at the end of the program. */
// Nothing to finish when no WARC file was ever opened.
1092 if (warc_current_file != NULL)
// Metadata records are written last, before the file is closed.
1094 warc_write_metadata ();
1095 free (warc_current_warcinfo_uuid_str);
1096 fclose (warc_current_file);
1098 if (warc_current_cdx_file != NULL)
1099 fclose (warc_current_cdx_file);
// Only reached with a non-NULL log stream if warc_write_metadata did not
// already hand it off.
1100 if (warc_log_fp != NULL)
1102 fclose (warc_log_fp);
1103 log_set_warc_log_fp (NULL);
1107 /* Creates a temporary file for writing WARC output.
1108 The temporary file will be created in opt.warc_tempdir.
1109 Returns the pointer to the temporary file, or NULL. */
1114 if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1)
1117 int fd = mkstemp (filename);
1121 if (unlink (filename) < 0)
1124 return fdopen (fd, "wb+");
1128 /* Writes a request record to the WARC file.
1129 url is the target uri of the request,
1130 timestamp_str is the timestamp of the request (generated with warc_timestamp),
1131 record_uuid is the uuid of the request (generated with warc_uuid_str),
1132 body is a pointer to a file containing the request headers and body.
1133 ip is the ip address of the server (or NULL),
1134 Calling this function will close body.
1135 Returns true on success, false on error. */
// payload_offset is forwarded to warc_write_digest_headers; a negative
// value there means that no payload digest is computed.
1137 warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, ip_address *ip, FILE *body, long int payload_offset)
// Individual results are not checked here; a failure in any step shows up
// in warc_write_ok, which is what gets returned.
1139 warc_write_start_record ();
1140 warc_write_header ("WARC-Type", "request");
1141 warc_write_header ("WARC-Target-URI", url);
1142 warc_write_header ("Content-Type", "application/http;msgtype=request");
1143 warc_write_date_header (timestamp_str);
1144 warc_write_header ("WARC-Record-ID", record_uuid);
1145 warc_write_ip_header (ip);
// Links this record to the warcinfo record of the current file.
1146 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1147 warc_write_digest_headers (body, payload_offset);
1148 warc_write_block_from_file (body);
1149 warc_write_end_record ();
1153 return warc_write_ok;
1156 /* Writes a response record to the CDX file.
1157 url is the target uri of the request/response,
1158 timestamp_str is the timestamp of the request that generated this response,
1159 (generated with warc_timestamp),
1160 mime_type is the mime type of the response body (will be printed to CDX),
1161 response_code is the HTTP response code (will be printed to CDX),
1162 payload_digest is the sha1 digest of the payload,
1163 redirect_location is the contents of the Location: header, or NULL (will be printed to CDX),
1164 offset is the position of the WARC record in the WARC file,
1165 warc_filename is the filename of the WARC,
1166 response_uuid is the uuid of the response.
1167 Returns true on success, false on error. */
1169 warc_write_cdx_record (char *url, char *timestamp_str, char *mime_type, int response_code, char *payload_digest, char *redirect_location, size_t offset, char *warc_filename, char *response_uuid)
1171 /* Transform the timestamp. */
1172 char timestamp_str_cdx [15];
1173 memcpy (timestamp_str_cdx , timestamp_str , 4); /* "YYYY" "-" */
1174 memcpy (timestamp_str_cdx + 4, timestamp_str + 5, 2); /* "mm" "-" */
1175 memcpy (timestamp_str_cdx + 6, timestamp_str + 8, 2); /* "dd" "T" */
1176 memcpy (timestamp_str_cdx + 8, timestamp_str + 11, 2); /* "HH" ":" */
1177 memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM" ":" */
1178 memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS" "Z" */
1179 timestamp_str_cdx[14] = '\0';
1181 /* Rewrite the checksum. */
1183 if (payload_digest != NULL)
1184 checksum = payload_digest + 5; /* Skip the "sha1:" */
1188 if (mime_type == NULL || strlen(mime_type) == 0)
1190 if (redirect_location == NULL || strlen(redirect_location) == 0)
1191 redirect_location = "-";
1193 /* Print the CDX line. */
1194 fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %ld %s %s\n", url, timestamp_str_cdx, url, mime_type, response_code, checksum, redirect_location, offset, warc_current_filename, response_uuid);
1195 fflush (warc_current_cdx_file);
1200 /* Writes a revisit record to the WARC file.
1201 url is the target uri of the request/response,
1202 timestamp_str is the timestamp of the request that generated this response
1203 (generated with warc_timestamp),
1204 concurrent_to_uuid is the uuid of the request for that generated this response
1205 (generated with warc_uuid_str),
1206 refers_to_uuid is the uuid of the original response
1207 (generated with warc_uuid_str),
1208 payload_digest is the sha1 digest of the payload,
1209 ip is the ip address of the server (or NULL),
1210 body is a pointer to a file containing the response headers (without payload).
1211 Calling this function will close body.
1212 Returns true on success, false on error. */
1214 warc_write_revisit_record (char *url, char *timestamp_str, char *concurrent_to_uuid, char *payload_digest, char *refers_to, ip_address *ip, FILE *body)
1216 char revisit_uuid [48];
1217 warc_uuid_str (revisit_uuid);
1219 char *block_digest = NULL;
1220 char sha1_res_block[SHA1_DIGEST_SIZE];
1221 sha1_stream (body, sha1_res_block);
1222 block_digest = warc_base32_sha1_digest (sha1_res_block);
1224 warc_write_start_record ();
1225 warc_write_header ("WARC-Type", "revisit");
1226 warc_write_header ("WARC-Record-ID", revisit_uuid);
1227 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1228 warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1229 warc_write_header ("WARC-Refers-To", refers_to);
1230 warc_write_header ("WARC-Profile", "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest");
1231 warc_write_header ("WARC-Truncated", "length");
1232 warc_write_header ("WARC-Target-URI", url);
1233 warc_write_date_header (timestamp_str);
1234 warc_write_ip_header (ip);
1235 warc_write_header ("Content-Type", "application/http;msgtype=response");
1236 warc_write_header ("WARC-Block-Digest", block_digest);
1237 warc_write_header ("WARC-Payload-Digest", payload_digest);
1238 warc_write_block_from_file (body);
1239 warc_write_end_record ();
1242 free (block_digest);
1244 return warc_write_ok;
1247 /* Writes a response record to the WARC file.
1248 url is the target uri of the request/response,
1249 timestamp_str is the timestamp of the request that generated this response
1250 (generated with warc_timestamp),
1251 concurrent_to_uuid is the uuid of the request for that generated this response
1252 (generated with warc_uuid_str),
1253 ip is the ip address of the server (or NULL),
1254 body is a pointer to a file containing the response headers and body.
1255 mime_type is the mime type of the response body (will be printed to CDX),
1256 response_code is the HTTP response code (will be printed to CDX),
1257 redirect_location is the contents of the Location: header, or NULL (will be printed to CDX),
1258 Calling this function will close body.
1259 Returns true on success, false on error. */
1261 warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, long int payload_offset, char *mime_type, int response_code, char *redirect_location)
1263 char *block_digest = NULL;
1264 char *payload_digest = NULL;
1265 char sha1_res_block[SHA1_DIGEST_SIZE];
1266 char sha1_res_payload[SHA1_DIGEST_SIZE];
1268 if (opt.warc_digests_enabled)
1270 /* Calculate the block and payload digests. */
1272 if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload, payload_offset) == 0)
1274 /* Decide (based on url + payload digest) if we have seen this
1276 struct warc_cdx_record *rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload);
1277 if (rec_existing != NULL)
1279 /* Found an existing record. */
1280 logprintf (LOG_VERBOSE, _("Found exact match in CDX file. Saving revisit record to WARC.\n"));
1282 /* Remove the payload from the file. */
1283 if (payload_offset > 0)
1285 if (ftruncate (fileno (body), payload_offset) == -1)
1289 /* Send the original payload digest. */
1290 payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1291 bool result = warc_write_revisit_record (url, timestamp_str, concurrent_to_uuid, payload_digest, rec_existing->uuid, ip, body);
1292 free (payload_digest);
1297 block_digest = warc_base32_sha1_digest (sha1_res_block);
1298 payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1302 /* Not a revisit, just store the record. */
1304 char response_uuid [48];
1305 warc_uuid_str (response_uuid);
1307 fseek (warc_current_file, 0L, SEEK_END);
1308 size_t offset = ftell (warc_current_file);
1310 warc_write_start_record ();
1311 warc_write_header ("WARC-Type", "response");
1312 warc_write_header ("WARC-Record-ID", response_uuid);
1313 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1314 warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1315 warc_write_header ("WARC-Target-URI", url);
1316 warc_write_date_header (timestamp_str);
1317 warc_write_ip_header (ip);
1318 warc_write_header ("WARC-Block-Digest", block_digest);
1319 warc_write_header ("WARC-Payload-Digest", payload_digest);
1320 warc_write_header ("Content-Type", "application/http;msgtype=response");
1321 warc_write_block_from_file (body);
1322 warc_write_end_record ();
1326 if (warc_write_ok && opt.warc_cdx_enabled)
1328 /* Add this record to the CDX. */
1329 warc_write_cdx_record (url, timestamp_str, mime_type, response_code, payload_digest, redirect_location, offset, warc_current_filename, response_uuid);
1333 free (block_digest);
1335 free (payload_digest);
1337 return warc_write_ok;
1340 /* Writes a resource record to the WARC file.
1341 resource_uuid is the uuid of the resource (or NULL),
1342 url is the target uri of the resource,
1343 timestamp_str is the timestamp (generated with warc_timestamp),
1344 concurrent_to_uuid is the uuid of the request for that generated this resource
1345 (generated with warc_uuid_str) or NULL,
1346 ip is the ip address of the server (or NULL),
1347 content_type is the mime type of the body (or NULL),
1348 body is a pointer to a file containing the resource data.
1349 Calling this function will close body.
1350 Returns true on success, false on error. */
1352 warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, long int payload_offset)
1354 if (resource_uuid == NULL)
1356 resource_uuid = alloca (48);
1357 warc_uuid_str (resource_uuid);
1360 if (content_type == NULL)
1361 content_type = "application/octet-stream";
1363 warc_write_start_record ();
1364 warc_write_header ("WARC-Type", "resource");
1365 warc_write_header ("WARC-Record-ID", resource_uuid);
1366 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1367 warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1368 warc_write_header ("WARC-Target-URI", url);
1369 warc_write_date_header (timestamp_str);
1370 warc_write_ip_header (ip);
1371 warc_write_digest_headers (body, payload_offset);
1372 warc_write_header ("Content-Type", content_type);
1373 warc_write_block_from_file (body);
1374 warc_write_end_record ();
1378 return warc_write_ok;