1 /* Utility functions for writing WARC files.
2 Copyright (C) 2011, 2012 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget. If not, see <http://www.gnu.org/licenses/>.
19 Additional permission under GNU GPL version 3 section 7
21 If you modify this program, or any covered work, by linking or
22 combining it with the OpenSSL project's OpenSSL library (or a
23 modified version of that library), containing parts covered by the
24 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
25 grants you additional permission to convey the resulting work.
26 Corresponding Source for a non-source form of such a combination
27 shall include the source code for the parts of OpenSSL used as well
28 as that of the covered work. */
49 #include <uuid/uuid.h>
58 extern char *version_string;
60 /* Set by main in main.c */
61 extern char *program_argstring;
/* The log file (a temporary file that contains a copy
   of the wget log). */
static FILE *warc_log_fp;

/* The manifest file (a temporary file that contains the
   warcinfo uuid of every file in this crawl). */
static FILE *warc_manifest_fp;

/* The current WARC file (or NULL, if WARC is disabled). */
static FILE *warc_current_file;

#ifdef HAVE_LIBZ
/* The gzip stream for the current WARC record
   (or NULL, if WARC or gzip is disabled). */
static gzFile warc_current_gzfile;

/* The offset of the current gzip record in the WARC file. */
static off_t warc_current_gzfile_offset;

/* The uncompressed size (so far) of the current record. */
static off_t warc_current_gzfile_uncompressed_size;
#endif

/* This is true until a warc_write_* method fails. */
static bool warc_write_ok;

/* The current CDX file (or NULL, if CDX is disabled). */
static FILE *warc_current_cdx_file;

/* The record id of the warcinfo record of the current WARC file. */
static char *warc_current_warcinfo_uuid_str;

/* The file name of the current WARC file. */
static char *warc_current_filename;

/* The serial number of the current WARC file.  This number is
   incremented each time a new file is opened and is used in the
   WARC file's filename. */
static int warc_current_file_number;

/* The table of CDX records, if deduplication is enabled.
   Keys are the raw SHA-1 payload digests stored inside the records. */
struct hash_table * warc_cdx_dedup_table;

/* Forward declaration: the record writers may rotate to a new file. */
static bool warc_start_new_file (bool meta);
110 struct warc_cdx_record
114 char digest[SHA1_DIGEST_SIZE];
/* Hash function for the deduplication table.  KEY points to a raw
   SHA-1 digest; the hash is simply its leading bytes, copied into an
   unsigned long.  */
static unsigned long
warc_hash_sha1_digest (const void *key)
{
  unsigned long hash;

  /* The digest is already uniformly distributed, so its first
     sizeof (unsigned long) bytes are a good enough hash value.  */
  memcpy (&hash, key, sizeof hash);
  return hash;
}
127 warc_cmp_sha1_digest (const void *digest1, const void *digest2)
129 return !memcmp (digest1, digest2, SHA1_DIGEST_SIZE);
134 /* Writes SIZE bytes from BUFFER to the current WARC file,
135 through gzwrite if compression is enabled.
136 Returns the number of uncompressed bytes written. */
138 warc_write_buffer (const char *buffer, size_t size)
141 if (warc_current_gzfile)
143 warc_current_gzfile_uncompressed_size += size;
144 return gzwrite (warc_current_gzfile, buffer, size);
148 return fwrite (buffer, 1, size, warc_current_file);
151 /* Writes STR to the current WARC file.
152 Returns false and set warc_write_ok to false if there
155 warc_write_string (const char *str)
160 size_t n = strlen (str);
161 if (n != warc_write_buffer (str, n))
162 warc_write_ok = false;
164 return warc_write_ok;
168 #define EXTRA_GZIP_HEADER_SIZE 12
169 #define GZIP_STATIC_HEADER_SIZE 10
170 #define FLG_FEXTRA 0x04
173 /* Starts a new WARC record. Writes the version header.
174 If opt.warc_maxsize is set and the current file is becoming
175 too large, this will open a new WARC file.
177 If compression is enabled, this will start a new
178 gzip stream in the current WARC file.
180 Returns false and set warc_write_ok to false if there
183 warc_write_start_record (void)
188 fflush (warc_current_file);
189 if (opt.warc_maxsize > 0 && ftello (warc_current_file) >= opt.warc_maxsize)
190 warc_start_new_file (false);
193 /* Start a GZIP stream, if required. */
194 if (opt.warc_compression_enabled)
196 /* Record the starting offset of the new record. */
197 warc_current_gzfile_offset = ftello (warc_current_file);
199 /* Reserve space for the extra GZIP header field.
200 In warc_write_end_record we will fill this space
201 with information about the uncompressed and
202 compressed size of the record. */
203 fprintf (warc_current_file, "XXXXXXXXXXXX");
204 fflush (warc_current_file);
206 /* Start a new GZIP stream. */
207 warc_current_gzfile = gzdopen (dup (fileno (warc_current_file)), "wb9");
208 warc_current_gzfile_uncompressed_size = 0;
210 if (warc_current_gzfile == NULL)
212 logprintf (LOG_NOTQUIET,
213 _("Error opening GZIP stream to WARC file.\n"));
214 warc_write_ok = false;
220 warc_write_string ("WARC/1.0\r\n");
221 return warc_write_ok;
224 /* Writes a WARC header to the current WARC record.
225 This method may be run after warc_write_start_record and
226 before warc_write_block_from_file. */
228 warc_write_header (const char *name, const char *value)
232 warc_write_string (name);
233 warc_write_string (": ");
234 warc_write_string (value);
235 warc_write_string ("\r\n");
237 return warc_write_ok;
240 /* Copies the contents of DATA_IN to the WARC record.
241 Adds a Content-Length header to the WARC record.
242 Run this method after warc_write_header,
243 then run warc_write_end_record. */
245 warc_write_block_from_file (FILE *data_in)
247 /* Add the Content-Length header. */
248 char *content_length;
249 fseeko (data_in, 0L, SEEK_END);
250 if (! asprintf (&content_length, "%ld", ftello (data_in)))
252 warc_write_ok = false;
255 warc_write_header ("Content-Length", content_length);
256 free (content_length);
258 /* End of the WARC header section. */
259 warc_write_string ("\r\n");
261 if (fseeko (data_in, 0L, SEEK_SET) != 0)
262 warc_write_ok = false;
264 /* Copy the data in the file to the WARC record. */
267 while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0)
269 if (warc_write_buffer (buffer, s) < s)
270 warc_write_ok = false;
273 return warc_write_ok;
276 /* Run this method to close the current WARC record.
278 If compression is enabled, this method closes the
279 current GZIP stream and fills the extra GZIP header
280 with the uncompressed and compressed length of the
283 warc_write_end_record (void)
285 warc_write_buffer ("\r\n\r\n", 4);
288 /* We start a new gzip stream for each record. */
289 if (warc_write_ok && warc_current_gzfile)
291 if (gzclose (warc_current_gzfile) != Z_OK)
293 warc_write_ok = false;
297 fflush (warc_current_file);
298 fseeko (warc_current_file, 0, SEEK_END);
300 /* The WARC standard suggests that we add 'skip length' data in the
301 extra header field of the GZIP stream.
303 In warc_write_start_record we reserved space for this extra header.
304 This extra space starts at warc_current_gzfile_offset and fills
305 EXTRA_GZIP_HEADER_SIZE bytes. The static GZIP header starts at
306 warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE.
308 We need to do three things:
309 1. Move the static GZIP header to warc_current_gzfile_offset;
310 2. Set the FEXTRA flag in the GZIP header;
311 3. Write the extra GZIP header after the static header, that is,
312 starting at warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE.
315 /* Calculate the uncompressed and compressed sizes. */
316 off_t current_offset = ftello (warc_current_file);
317 off_t uncompressed_size = current_offset - warc_current_gzfile_offset;
318 off_t compressed_size = warc_current_gzfile_uncompressed_size;
320 /* Go back to the static GZIP header. */
321 fseeko (warc_current_file, warc_current_gzfile_offset
322 + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
324 /* Read the header. */
325 char static_header[GZIP_STATIC_HEADER_SIZE];
326 size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE,
328 if (result != GZIP_STATIC_HEADER_SIZE)
330 warc_write_ok = false;
334 /* Set the FEXTRA flag in the flags byte of the header. */
335 static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
337 /* Write the header back to the file, but starting at
338 warc_current_gzfile_offset. */
339 fseeko (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
340 fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
342 /* Prepare the extra GZIP header. */
343 char extra_header[EXTRA_GZIP_HEADER_SIZE];
344 /* XLEN, the length of the extra header fields. */
345 extra_header[0] = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
346 extra_header[1] = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
347 /* The extra header field identifier for the WARC skip length. */
348 extra_header[2] = 's';
349 extra_header[3] = 'l';
350 /* The size of the uncompressed record. */
351 extra_header[4] = (uncompressed_size & 255);
352 extra_header[5] = (uncompressed_size >> 8) & 255;
353 extra_header[6] = (uncompressed_size >> 16) & 255;
354 extra_header[7] = (uncompressed_size >> 24) & 255;
355 /* The size of the compressed record. */
356 extra_header[8] = (compressed_size & 255);
357 extra_header[9] = (compressed_size >> 8) & 255;
358 extra_header[10] = (compressed_size >> 16) & 255;
359 extra_header[11] = (compressed_size >> 24) & 255;
361 /* Write the extra header after the static header. */
362 fseeko (warc_current_file, warc_current_gzfile_offset
363 + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
364 fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
366 /* Done, move back to the end of the file. */
367 fflush (warc_current_file);
368 fseeko (warc_current_file, 0, SEEK_END);
370 #endif /* HAVE_LIBZ */
372 return warc_write_ok;
/* Writes the WARC-Date header for the given timestamp to
   the current WARC record.
   If timestamp is NULL, the current time will be used.
   Returns the current value of warc_write_ok.  */
static bool
warc_write_date_header (const char *timestamp)
{
  /* This buffer must live until warc_write_header has returned, so it
     is declared at function scope rather than inside the `if'.  */
  char current_timestamp[21];

  if (timestamp == NULL)
    {
      warc_timestamp (current_timestamp);
      timestamp = current_timestamp;
    }

  return warc_write_header ("WARC-Date", timestamp);
}
391 /* Writes the WARC-IP-Address header for the given IP to
392 the current WARC record. If IP is NULL, no header will
395 warc_write_ip_header (ip_address *ip)
398 return warc_write_header ("WARC-IP-Address", print_address (ip));
400 return warc_write_ok;
404 /* warc_sha1_stream_with_payload is a modified copy of sha1_stream
405 from gnulib/sha1.c. This version calculates two digests in one go.
407 Compute SHA1 message digests for bytes read from STREAM. The
408 digest of the complete file will be written into the 16 bytes
409 beginning at RES_BLOCK.
411 If payload_offset >= 0, a second digest will be calculated of the
412 portion of the file starting at payload_offset and continuing to
413 the end of the file. The digest number will be written into the
414 16 bytes beginning ad RES_PAYLOAD. */
416 warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
417 off_t payload_offset)
419 #define BLOCKSIZE 32768
421 struct sha1_ctx ctx_block;
422 struct sha1_ctx ctx_payload;
426 char *buffer = malloc (BLOCKSIZE + 72);
430 /* Initialize the computation context. */
431 sha1_init_ctx (&ctx_block);
432 if (payload_offset >= 0)
433 sha1_init_ctx (&ctx_payload);
437 /* Iterate over full file contents. */
440 /* We read the file in blocks of BLOCKSIZE bytes. One call of the
441 computation function processes the whole buffer so that with the
442 next round of the loop another block can be read. */
446 /* Read block. Take care for partial reads. */
449 n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);
454 if (sum == BLOCKSIZE)
459 /* Check for the error flag IFF N == 0, so that we don't
460 exit the loop after a partial read due to e.g., EAGAIN
467 goto process_partial_block;
470 /* We've read at least one byte, so ignore errors. But always
471 check for EOF, since feof may be true even though N > 0.
472 Otherwise, we could end up calling fread after EOF. */
474 goto process_partial_block;
477 /* Process buffer with BLOCKSIZE bytes. Note that
480 sha1_process_block (buffer, BLOCKSIZE, &ctx_block);
481 if (payload_offset >= 0 && payload_offset < pos)
483 /* At least part of the buffer contains data from payload. */
484 off_t start_of_payload = payload_offset - (pos - BLOCKSIZE);
485 if (start_of_payload <= 0)
486 /* All bytes in the buffer belong to the payload. */
487 start_of_payload = 0;
489 /* Process the payload part of the buffer.
490 Note: we can't use sha1_process_block here even if we
491 process the complete buffer. Because the payload doesn't
492 have to start with a full block, there may still be some
493 bytes left from the previous buffer. Therefore, we need
494 to continue with sha1_process_bytes. */
495 sha1_process_bytes (buffer + start_of_payload,
496 BLOCKSIZE - start_of_payload, &ctx_payload);
500 process_partial_block:;
502 /* Process any remaining bytes. */
505 sha1_process_bytes (buffer, sum, &ctx_block);
506 if (payload_offset >= 0 && payload_offset < pos)
508 /* At least part of the buffer contains data from payload. */
509 off_t start_of_payload = payload_offset - (pos - sum);
510 if (start_of_payload <= 0)
511 /* All bytes in the buffer belong to the payload. */
512 start_of_payload = 0;
514 /* Process the payload part of the buffer. */
515 sha1_process_bytes (buffer + start_of_payload,
516 sum - start_of_payload, &ctx_payload);
520 /* Construct result in desired memory. */
521 sha1_finish_ctx (&ctx_block, res_block);
522 if (payload_offset >= 0)
523 sha1_finish_ctx (&ctx_payload, res_payload);
530 /* Converts the SHA1 digest to a base32-encoded string.
531 "sha1:DIGEST\0" (Allocates a new string for the response.) */
533 warc_base32_sha1_digest (char *sha1_digest)
535 // length: "sha1:" + digest + "\0"
536 char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 );
537 base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5,
538 BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1);
539 memcpy (sha1_base32, "sha1:", 5);
540 sha1_base32[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5] = '\0';
545 /* Sets the digest headers of the record.
546 This method will calculate the block digest and, if payload_offset >= 0,
547 will also calculate the payload digest of the payload starting at the
550 warc_write_digest_headers (FILE *file, long payload_offset)
552 if (opt.warc_digests_enabled)
554 /* Calculate the block and payload digests. */
555 char sha1_res_block[SHA1_DIGEST_SIZE];
556 char sha1_res_payload[SHA1_DIGEST_SIZE];
559 if (warc_sha1_stream_with_payload (file, sha1_res_block,
560 sha1_res_payload, payload_offset) == 0)
564 digest = warc_base32_sha1_digest (sha1_res_block);
565 warc_write_header ("WARC-Block-Digest", digest);
568 if (payload_offset >= 0)
570 digest = warc_base32_sha1_digest (sha1_res_payload);
571 warc_write_header ("WARC-Payload-Digest", digest);
/* Fills timestamp with the current time and date.
   The UTC time is formatted following ISO 8601, as required
   for use in the WARC-Date header ("YYYY-MM-DDTHH:MM:SSZ").
   TIMESTAMP must have room for 21 bytes: 20 characters plus
   the terminating NUL.
   If the time cannot be converted or formatted, TIMESTAMP is
   set to the empty string.  */
void
warc_timestamp (char *timestamp)
{
  time_t now = time (NULL);
  struct tm *utc = gmtime (&now);

  /* gmtime may fail, and strftime leaves the buffer unspecified when
     it returns 0; never hand an unterminated buffer to the caller.  */
  if (utc == NULL
      || strftime (timestamp, 21, "%Y-%m-%dT%H:%M:%SZ", utc) == 0)
    timestamp[0] = '\0';
}
#ifdef HAVE_LIBUUID
/* Fills urn_str with a UUID in the format required
   for the WARC-Record-Id header.
   The string will be 47 characters long, so URN_STR needs
   room for 48 bytes including the terminating NUL.  */
void
warc_uuid_str (char *urn_str)
{
  /* 36 characters of textual UUID plus NUL.  */
  char uuid_str[37];
  uuid_t record_id;

  uuid_generate (record_id);
  uuid_unparse (record_id, uuid_str);

  sprintf (urn_str, "<urn:uuid:%s>", uuid_str);
}
#else
/* Fills urn_str with a UUID based on random numbers in the format
   required for the WARC-Record-Id header.
   (See RFC 4122, UUID version 4.)

   Note: this is a fallback method, it is much better to use the
   methods provided by libuuid.

   The string will be 47 characters long, so URN_STR needs
   room for 48 bytes including the terminating NUL.  */
void
warc_uuid_str (char *urn_str)
{
  /* RFC 4122, a version 4 UUID with only random numbers */
  unsigned char uuid_data[16];
  int i;

  /* NOTE(review): random_number (MAX) is assumed to return a value in
     [0, MAX), so 256 is needed to cover every byte value -- confirm
     against its definition.  */
  for (i = 0; i < 16; i++)
    uuid_data[i] = random_number (256);

  /* Set the four most significant bits (bits 12 through 15) of the
     time_hi_and_version field to the 4-bit version number */
  uuid_data[6] = (uuid_data[6] & 0x0F) | 0x40;

  /* Set the two most significant bits (bits 6 and 7) of the
     clock_seq_hi_and_reserved to zero and one, respectively. */
  uuid_data[8] = (uuid_data[8] & 0xBF) | 0x80;

  sprintf (urn_str,
    "<urn:uuid:%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x>",
    uuid_data[0], uuid_data[1], uuid_data[2], uuid_data[3], uuid_data[4],
    uuid_data[5], uuid_data[6], uuid_data[7], uuid_data[8], uuid_data[9],
    uuid_data[10], uuid_data[11], uuid_data[12], uuid_data[13], uuid_data[14],
    uuid_data[15]);
}
#endif
644 /* Write a warcinfo record to the current file.
645 Updates warc_current_warcinfo_uuid_str. */
647 warc_write_warcinfo_record (char *filename)
649 /* Write warc-info record as the first record of the file. */
650 /* We add the record id of this info record to the other records in the
652 warc_current_warcinfo_uuid_str = (char *) malloc (48);
653 warc_uuid_str (warc_current_warcinfo_uuid_str);
656 warc_timestamp (timestamp);
658 char *filename_copy, *filename_basename;
659 filename_copy = strdup (filename);
660 filename_basename = strdup (basename (filename_copy));
662 warc_write_start_record ();
663 warc_write_header ("WARC-Type", "warcinfo");
664 warc_write_header ("Content-Type", "application/warc-fields");
665 warc_write_header ("WARC-Date", timestamp);
666 warc_write_header ("WARC-Record-ID", warc_current_warcinfo_uuid_str);
667 warc_write_header ("WARC-Filename", filename_basename);
669 /* Create content. */
670 FILE *warc_tmp = warc_tempfile ();
671 if (warc_tmp == NULL)
673 free (filename_copy);
674 free (filename_basename);
678 fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE);
679 fprintf (warc_tmp, "format: WARC File Format 1.0\r\n");
681 "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n");
682 fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off"));
683 fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring);
684 /* Add the user headers, if any. */
685 if (opt.warc_user_headers)
688 for (i = 0; opt.warc_user_headers[i]; i++)
689 fprintf (warc_tmp, "%s\r\n", opt.warc_user_headers[i]);
691 fprintf(warc_tmp, "\r\n");
693 warc_write_digest_headers (warc_tmp, -1);
694 warc_write_block_from_file (warc_tmp);
695 warc_write_end_record ();
698 logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
700 free (filename_copy);
701 free (filename_basename);
703 return warc_write_ok;
706 /* Opens a new WARC file.
707 If META is true, generates a filename ending with 'meta.warc.gz'.
710 1. close the current WARC file (if there is one);
711 2. increment warc_current_file_number;
712 3. open a new WARC file;
713 4. write the initial warcinfo record.
715 Returns true on success, false otherwise.
718 warc_start_new_file (bool meta)
720 if (opt.warc_filename == NULL)
723 if (warc_current_file != NULL)
724 fclose (warc_current_file);
725 if (warc_current_warcinfo_uuid_str)
726 free (warc_current_warcinfo_uuid_str);
727 if (warc_current_filename)
728 free (warc_current_filename);
730 warc_current_file_number++;
732 int base_filename_length = strlen (opt.warc_filename);
733 /* filename format: base + "-" + 5 digit serial number + ".warc.gz" */
734 char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
735 warc_current_filename = new_filename;
738 const char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc");
740 const char *extension = "warc";
743 /* If max size is enabled, we add a serial number to the file names. */
745 sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension);
746 else if (opt.warc_maxsize > 0)
748 sprintf (new_filename, "%s-%05d.%s", opt.warc_filename,
749 warc_current_file_number, extension);
752 sprintf (new_filename, "%s.%s", opt.warc_filename, extension);
754 logprintf (LOG_VERBOSE, _("Opening WARC file %s.\n\n"), quote (new_filename));
756 /* Open the WARC file. */
757 warc_current_file = fopen (new_filename, "wb+");
758 if (warc_current_file == NULL)
760 logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"),
761 quote (new_filename));
765 if (! warc_write_warcinfo_record (new_filename))
768 /* Add warcinfo uuid to manifest. */
769 if (warc_manifest_fp)
770 fprintf (warc_manifest_fp, "%s\n", warc_current_warcinfo_uuid_str);
775 /* Opens the CDX file for output. */
777 warc_start_cdx_file (void)
779 int filename_length = strlen (opt.warc_filename);
780 char *cdx_filename = alloca (filename_length + 4 + 1);
781 memcpy (cdx_filename, opt.warc_filename, filename_length);
782 memcpy (cdx_filename + filename_length, ".cdx", 5);
783 warc_current_cdx_file = fopen (cdx_filename, "a+");
784 if (warc_current_cdx_file == NULL)
787 /* Print the CDX header.
793 * k - new style checksum
796 * V - compressed arc file offset
800 fprintf (warc_current_cdx_file, " CDX a b a m s k r M V g u\n");
801 fflush (warc_current_cdx_file);
/* Characters that separate the fields of a CDX line.  */
#define CDX_FIELDSEP " \t\r\n"

/* Parse the CDX header and find the field numbers of the original url,
   checksum and record ID fields.

   LINEPTR is the header line; it is modified while it is tokenised.
   The header must start with the token "CDX", followed by one field
   letter per column; columns are numbered from 0.  The columns named
   'a', 'k' and 'u' are stored in *FIELD_NUM_ORIGINAL_URL,
   *FIELD_NUM_CHECKSUM and *FIELD_NUM_RECORD_ID; a column that is not
   present is reported as -1.  If a letter occurs more than once, the
   last occurrence wins.

   Returns true if all three columns were found.  */
static bool
warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
                       int *field_num_checksum, int *field_num_record_id)
{
  char *save_ptr;
  char *field;
  int column;

  *field_num_original_url = -1;
  *field_num_checksum = -1;
  *field_num_record_id = -1;

  field = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);

  if (field != NULL && strcmp (field, "CDX") == 0)
    {
      /* Walk over the field letters that follow the "CDX" marker.  */
      for (column = 0;
           (field = strtok_r (NULL, CDX_FIELDSEP, &save_ptr)) != NULL;
           column++)
        {
          if (field[0] == 'a')
            *field_num_original_url = column;
          else if (field[0] == 'k')
            *field_num_checksum = column;
          else if (field[0] == 'u')
            *field_num_record_id = column;
        }
    }

  return *field_num_original_url != -1
         && *field_num_checksum != -1
         && *field_num_record_id != -1;
}
852 /* Parse the CDX record and add it to the warc_cdx_dedup_table hash table. */
854 warc_process_cdx_line (char *lineptr, int field_num_original_url,
855 int field_num_checksum, int field_num_record_id)
857 char *original_url = NULL;
858 char *checksum = NULL;
859 char *record_id = NULL;
863 token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
865 /* Read this line to get the fields we need. */
867 while (token != NULL)
870 if (field_num == field_num_original_url)
872 else if (field_num == field_num_checksum)
874 else if (field_num == field_num_record_id)
880 *val = strdup (token);
882 token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
886 if (original_url != NULL && checksum != NULL && record_id != NULL)
888 /* For some extra efficiency, we decode the base32 encoded
889 checksum value. This should produce exactly SHA1_DIGEST_SIZE
893 base32_decode_alloc (checksum, strlen (checksum), &checksum_v,
897 if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
899 /* This is a valid line with a valid checksum. */
900 struct warc_cdx_record *rec;
901 rec = malloc (sizeof (struct warc_cdx_record));
902 rec->url = original_url;
903 rec->uuid = record_id;
904 memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
905 hash_table_put (warc_cdx_dedup_table, rec->digest, rec);
911 if (checksum_v != NULL)
918 /* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
919 the warc_cdx_dedup_table. */
921 warc_load_cdx_dedup_file (void)
923 FILE *f = fopen (opt.warc_cdx_dedup_filename, "r");
927 int field_num_original_url = -1;
928 int field_num_checksum = -1;
929 int field_num_record_id = -1;
931 char *lineptr = NULL;
935 /* The first line should contain the CDX header.
936 Format: " CDX x x x x x"
937 where x are field type indicators. For our purposes, we only
938 need 'a' (the original url), 'k' (the SHA1 checksum) and
939 'u' (the WARC record id). */
940 line_length = getline (&lineptr, &n, f);
941 if (line_length != -1)
942 warc_parse_cdx_header (lineptr, &field_num_original_url,
943 &field_num_checksum, &field_num_record_id);
945 /* If the file contains all three fields, read the complete file. */
946 if (field_num_original_url == -1
947 || field_num_checksum == -1
948 || field_num_record_id == -1)
950 if (field_num_original_url == -1)
951 logprintf (LOG_NOTQUIET,
952 _("CDX file does not list original urls. (Missing column 'a'.)\n"));
953 if (field_num_checksum == -1)
954 logprintf (LOG_NOTQUIET,
955 _("CDX file does not list checksums. (Missing column 'k'.)\n"));
956 if (field_num_record_id == -1)
957 logprintf (LOG_NOTQUIET,
958 _("CDX file does not list record ids. (Missing column 'u'.)\n"));
962 /* Initialize the table. */
963 warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest,
964 warc_cmp_sha1_digest);
968 line_length = getline (&lineptr, &n, f);
969 if (line_length != -1)
971 warc_process_cdx_line (lineptr, field_num_original_url,
972 field_num_checksum, field_num_record_id);
976 while (line_length != -1);
979 int nrecords = hash_table_count (warc_cdx_dedup_table);
980 logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
981 "Loaded %d records from CDX.\n\n",
993 /* Returns the existing duplicate CDX record for the given url and payload
994 digest. Returns NULL if the url is not found or if the payload digest
995 does not match, or if CDX deduplication is disabled. */
996 static struct warc_cdx_record *
997 warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload)
999 if (warc_cdx_dedup_table == NULL)
1003 struct warc_cdx_record *rec_existing;
1004 hash_table_get_pair (warc_cdx_dedup_table, sha1_digest_payload, &key,
1007 if (rec_existing != NULL && strcmp (rec_existing->url, url) == 0)
1008 return rec_existing;
1013 /* Initializes the WARC writer (if opt.warc_filename is set).
1014 This should be called before any WARC record is written. */
1018 warc_write_ok = true;
1020 if (opt.warc_filename != NULL)
1022 if (opt.warc_cdx_dedup_filename != NULL)
1024 if (! warc_load_cdx_dedup_file ())
1026 logprintf (LOG_NOTQUIET,
1027 _("Could not read CDX file %s for deduplication.\n"),
1028 quote (opt.warc_cdx_dedup_filename));
1033 warc_manifest_fp = warc_tempfile ();
1034 if (warc_manifest_fp == NULL)
1036 logprintf (LOG_NOTQUIET,
1037 _("Could not open temporary WARC manifest file.\n"));
1041 if (opt.warc_keep_log)
1043 warc_log_fp = warc_tempfile ();
1044 if (warc_log_fp == NULL)
1046 logprintf (LOG_NOTQUIET,
1047 _("Could not open temporary WARC log file.\n"));
1050 log_set_warc_log_fp (warc_log_fp);
1053 warc_current_file_number = -1;
1054 if (! warc_start_new_file (false))
1056 logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
1060 if (opt.warc_cdx_enabled)
1062 if (! warc_start_cdx_file ())
1064 logprintf (LOG_NOTQUIET,
1065 _("Could not open CDX file for output.\n"));
1072 /* Writes metadata (manifest, configuration, log file) to the WARC file. */
1074 warc_write_metadata (void)
1076 /* If there are multiple WARC files, the metadata should be written to a separate file. */
1077 if (opt.warc_maxsize > 0)
1078 warc_start_new_file (true);
1080 char manifest_uuid [48];
1081 warc_uuid_str (manifest_uuid);
1083 fflush (warc_manifest_fp);
1084 warc_write_resource_record (manifest_uuid,
1085 "metadata://gnu.org/software/wget/warc/MANIFEST.txt",
1086 NULL, NULL, NULL, "text/plain",
1087 warc_manifest_fp, -1);
1088 /* warc_write_resource_record has closed warc_manifest_fp. */
1090 FILE * warc_tmp_fp = warc_tempfile ();
1091 if (warc_tmp_fp == NULL)
1093 logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n"));
1096 fflush (warc_tmp_fp);
1097 fprintf (warc_tmp_fp, "%s\n", program_argstring);
1099 warc_write_resource_record (manifest_uuid,
1100 "metadata://gnu.org/software/wget/warc/wget_arguments.txt",
1101 NULL, NULL, NULL, "text/plain",
1103 /* warc_write_resource_record has closed warc_tmp_fp. */
1105 if (warc_log_fp != NULL)
1107 warc_write_resource_record (NULL,
1108 "metadata://gnu.org/software/wget/warc/wget.log",
1109 NULL, manifest_uuid, NULL, "text/plain",
1111 /* warc_write_resource_record has closed warc_log_fp. */
1114 log_set_warc_log_fp (NULL);
1118 /* Finishes the WARC writing.
1119 This should be called at the end of the program. */
1123 if (warc_current_file != NULL)
1125 warc_write_metadata ();
1126 free (warc_current_warcinfo_uuid_str);
1127 fclose (warc_current_file);
1129 if (warc_current_cdx_file != NULL)
1130 fclose (warc_current_cdx_file);
1131 if (warc_log_fp != NULL)
1133 fclose (warc_log_fp);
1134 log_set_warc_log_fp (NULL);
1138 /* Creates a temporary file for writing WARC output.
1139 The temporary file will be created in opt.warc_tempdir.
1140 Returns the pointer to the temporary file, or NULL. */
1142 warc_tempfile (void)
1145 if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1)
1148 int fd = mkstemp (filename);
1152 if (unlink (filename) < 0)
1155 return fdopen (fd, "wb+");
1159 /* Writes a request record to the WARC file.
1160 url is the target uri of the request,
1161 timestamp_str is the timestamp of the request (generated with warc_timestamp),
1162 record_uuid is the uuid of the request (generated with warc_uuid_str),
1163 body is a pointer to a file containing the request headers and body.
1164 ip is the ip address of the server (or NULL),
1165 Calling this function will close body.
1166 Returns true on success, false on error. */
1168 warc_write_request_record (char *url, char *timestamp_str, char *record_uuid,
1169 ip_address *ip, FILE *body, off_t payload_offset)
1171 warc_write_start_record ();
1172 warc_write_header ("WARC-Type", "request");
1173 warc_write_header ("WARC-Target-URI", url);
1174 warc_write_header ("Content-Type", "application/http;msgtype=request");
1175 warc_write_date_header (timestamp_str);
1176 warc_write_header ("WARC-Record-ID", record_uuid);
1177 warc_write_ip_header (ip);
1178 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1179 warc_write_digest_headers (body, payload_offset);
1180 warc_write_block_from_file (body);
1181 warc_write_end_record ();
1185 return warc_write_ok;
1188 /* Writes a response record to the CDX file.
1189 url is the target uri of the request/response,
1190 timestamp_str is the timestamp of the request that generated this response,
1191 (generated with warc_timestamp),
1192 mime_type is the mime type of the response body (will be printed to CDX),
1193 response_code is the HTTP response code (will be printed to CDX),
1194 payload_digest is the sha1 digest of the payload,
1195 redirect_location is the contents of the Location: header, or NULL (will be printed to CDX),
1196 offset is the position of the WARC record in the WARC file,
1197 warc_filename is the filename of the WARC,
1198 response_uuid is the uuid of the response.
1199 Returns true on success, false on error. */
1201 warc_write_cdx_record (const char *url, const char *timestamp_str,
1202 const char *mime_type, int response_code,
1203 const char *payload_digest, const char *redirect_location,
1204 off_t offset, const char *warc_filename,
1205 const char *response_uuid)
1207 /* Transform the timestamp. */
1208 char timestamp_str_cdx [15];
1209 memcpy (timestamp_str_cdx , timestamp_str , 4); /* "YYYY" "-" */
1210 memcpy (timestamp_str_cdx + 4, timestamp_str + 5, 2); /* "mm" "-" */
1211 memcpy (timestamp_str_cdx + 6, timestamp_str + 8, 2); /* "dd" "T" */
1212 memcpy (timestamp_str_cdx + 8, timestamp_str + 11, 2); /* "HH" ":" */
1213 memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM" ":" */
1214 memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS" "Z" */
1215 timestamp_str_cdx[14] = '\0';
1217 /* Rewrite the checksum. */
1218 const char *checksum;
1219 if (payload_digest != NULL)
1220 checksum = payload_digest + 5; /* Skip the "sha1:" */
1224 if (mime_type == NULL || strlen(mime_type) == 0)
1226 if (redirect_location == NULL || strlen(redirect_location) == 0)
1227 redirect_location = "-";
1229 /* Print the CDX line. */
1230 fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %ld %s %s\n", url,
1231 timestamp_str_cdx, url, mime_type, response_code, checksum,
1232 redirect_location, offset, warc_current_filename, response_uuid);
1233 fflush (warc_current_cdx_file);
1238 /* Writes a revisit record to the WARC file.
1239 url is the target uri of the request/response,
1240 timestamp_str is the timestamp of the request that generated this response
1241 (generated with warc_timestamp),
1242 concurrent_to_uuid is the uuid of the request for that generated this response
1243 (generated with warc_uuid_str),
1244 refers_to_uuid is the uuid of the original response
1245 (generated with warc_uuid_str),
1246 payload_digest is the sha1 digest of the payload,
1247 ip is the ip address of the server (or NULL),
1248 body is a pointer to a file containing the response headers (without payload).
1249 Calling this function will close body.
1250 Returns true on success, false on error. */
1252 warc_write_revisit_record (char *url, char *timestamp_str,
1253 char *concurrent_to_uuid, char *payload_digest,
1254 char *refers_to, ip_address *ip, FILE *body)
1256 char revisit_uuid [48];
1257 warc_uuid_str (revisit_uuid);
1259 char *block_digest = NULL;
1260 char sha1_res_block[SHA1_DIGEST_SIZE];
1261 sha1_stream (body, sha1_res_block);
1262 block_digest = warc_base32_sha1_digest (sha1_res_block);
1264 warc_write_start_record ();
1265 warc_write_header ("WARC-Type", "revisit");
1266 warc_write_header ("WARC-Record-ID", revisit_uuid);
1267 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1268 warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1269 warc_write_header ("WARC-Refers-To", refers_to);
1270 warc_write_header ("WARC-Profile", "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest");
1271 warc_write_header ("WARC-Truncated", "length");
1272 warc_write_header ("WARC-Target-URI", url);
1273 warc_write_date_header (timestamp_str);
1274 warc_write_ip_header (ip);
1275 warc_write_header ("Content-Type", "application/http;msgtype=response");
1276 warc_write_header ("WARC-Block-Digest", block_digest);
1277 warc_write_header ("WARC-Payload-Digest", payload_digest);
1278 warc_write_block_from_file (body);
1279 warc_write_end_record ();
1282 free (block_digest);
1284 return warc_write_ok;
1287 /* Writes a response record to the WARC file.
1288 url is the target uri of the request/response,
1289 timestamp_str is the timestamp of the request that generated this response
1290 (generated with warc_timestamp),
1291 concurrent_to_uuid is the uuid of the request for that generated this response
1292 (generated with warc_uuid_str),
1293 ip is the ip address of the server (or NULL),
1294 body is a pointer to a file containing the response headers and body.
1295 mime_type is the mime type of the response body (will be printed to CDX),
1296 response_code is the HTTP response code (will be printed to CDX),
1297 redirect_location is the contents of the Location: header, or NULL (will be printed to CDX),
1298 Calling this function will close body.
1299 Returns true on success, false on error. */
1301 warc_write_response_record (char *url, char *timestamp_str,
1302 char *concurrent_to_uuid, ip_address *ip,
1303 FILE *body, off_t payload_offset, char *mime_type,
1304 int response_code, char *redirect_location)
1306 char *block_digest = NULL;
1307 char *payload_digest = NULL;
1308 char sha1_res_block[SHA1_DIGEST_SIZE];
1309 char sha1_res_payload[SHA1_DIGEST_SIZE];
1311 if (opt.warc_digests_enabled)
1313 /* Calculate the block and payload digests. */
1315 if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload,
1316 payload_offset) == 0)
1318 /* Decide (based on url + payload digest) if we have seen this
1320 struct warc_cdx_record *rec_existing;
1321 rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload);
1322 if (rec_existing != NULL)
1326 /* Found an existing record. */
1327 logprintf (LOG_VERBOSE,
1328 _("Found exact match in CDX file. Saving revisit record to WARC.\n"));
1330 /* Remove the payload from the file. */
1331 if (payload_offset > 0)
1333 if (ftruncate (fileno (body), payload_offset) == -1)
1337 /* Send the original payload digest. */
1338 payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1339 result = warc_write_revisit_record (url, timestamp_str,
1340 concurrent_to_uuid, payload_digest, rec_existing->uuid,
1342 free (payload_digest);
1347 block_digest = warc_base32_sha1_digest (sha1_res_block);
1348 payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1352 /* Not a revisit, just store the record. */
1354 char response_uuid [48];
1355 warc_uuid_str (response_uuid);
1357 fseeko (warc_current_file, 0L, SEEK_END);
1358 off_t offset = ftello (warc_current_file);
1360 warc_write_start_record ();
1361 warc_write_header ("WARC-Type", "response");
1362 warc_write_header ("WARC-Record-ID", response_uuid);
1363 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1364 warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1365 warc_write_header ("WARC-Target-URI", url);
1366 warc_write_date_header (timestamp_str);
1367 warc_write_ip_header (ip);
1368 warc_write_header ("WARC-Block-Digest", block_digest);
1369 warc_write_header ("WARC-Payload-Digest", payload_digest);
1370 warc_write_header ("Content-Type", "application/http;msgtype=response");
1371 warc_write_block_from_file (body);
1372 warc_write_end_record ();
1376 if (warc_write_ok && opt.warc_cdx_enabled)
1378 /* Add this record to the CDX. */
1379 warc_write_cdx_record (url, timestamp_str, mime_type, response_code,
1380 payload_digest, redirect_location, offset, warc_current_filename,
1385 free (block_digest);
1387 free (payload_digest);
1389 return warc_write_ok;
1392 /* Writes a resource record to the WARC file.
1393 resource_uuid is the uuid of the resource (or NULL),
1394 url is the target uri of the resource,
1395 timestamp_str is the timestamp (generated with warc_timestamp),
1396 concurrent_to_uuid is the uuid of the request for that generated this
1397 resource (generated with warc_uuid_str) or NULL,
1398 ip is the ip address of the server (or NULL),
1399 content_type is the mime type of the body (or NULL),
1400 body is a pointer to a file containing the resource data.
1401 Calling this function will close body.
1402 Returns true on success, false on error. */
1404 warc_write_resource_record (char *resource_uuid, const char *url,
1405 const char *timestamp_str, const char *concurrent_to_uuid,
1406 ip_address *ip, const char *content_type, FILE *body,
1407 off_t payload_offset)
1409 if (resource_uuid == NULL)
1411 resource_uuid = alloca (48);
1412 warc_uuid_str (resource_uuid);
1415 if (content_type == NULL)
1416 content_type = "application/octet-stream";
1418 warc_write_start_record ();
1419 warc_write_header ("WARC-Type", "resource");
1420 warc_write_header ("WARC-Record-ID", resource_uuid);
1421 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1422 warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1423 warc_write_header ("WARC-Target-URI", url);
1424 warc_write_date_header (timestamp_str);
1425 warc_write_ip_header (ip);
1426 warc_write_digest_headers (body, payload_offset);
1427 warc_write_header ("Content-Type", content_type);
1428 warc_write_block_from_file (body);
1429 warc_write_end_record ();
1433 return warc_write_ok;