1 /* Utility functions for writing WARC files.
2 Copyright (C) 2011, 2012 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget. If not, see <http://www.gnu.org/licenses/>.
19 Additional permission under GNU GPL version 3 section 7
21 If you modify this program, or any covered work, by linking or
22 combining it with the OpenSSL project's OpenSSL library (or a
23 modified version of that library), containing parts covered by the
24 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
25 grants you additional permission to convey the resulting work.
26 Corresponding Source for a non-source form of such a combination
27 shall include the source code for the parts of OpenSSL used as well
28 as that of the covered work. */
49 #include <uuid/uuid.h>
58 extern char *version_string;
60 /* Set by main in main.c */
61 extern char *program_argstring;
/* The log file (a temporary file that contains a copy
   of the wget log). */
66 static FILE *warc_log_fp;
68 /* The manifest file (a temporary file that contains the
69 warcinfo uuid of every file in this crawl). */
70 static FILE *warc_manifest_fp;
72 /* The current WARC file (or NULL, if WARC is disabled). */
73 static FILE *warc_current_file;
76 /* The gzip stream for the current WARC file
77 (or NULL, if WARC or gzip is disabled). */
78 static gzFile warc_current_gzfile;
80 /* The offset of the current gzip record in the WARC file. */
81 static off_t warc_current_gzfile_offset;
83 /* The uncompressed size (so far) of the current record. */
84 static off_t warc_current_gzfile_uncompressed_size;
87 /* This is true until a warc_write_* method fails. */
88 static bool warc_write_ok;
90 /* The current CDX file (or NULL, if CDX is disabled). */
91 static FILE *warc_current_cdx_file;
93 /* The record id of the warcinfo record of the current WARC file. */
94 static char *warc_current_warcinfo_uuid_str;
96 /* The file name of the current WARC file. */
97 static char *warc_current_filename;
99 /* The serial number of the current WARC file. This number is
100 incremented each time a new file is opened and is used in the
101 WARC file's filename. */
102 static int warc_current_file_number;
104 /* The table of CDX records, if deduplication is enabled. */
105 struct hash_table * warc_cdx_dedup_table;
107 static bool warc_start_new_file (bool meta);
110 struct warc_cdx_record
114 char digest[SHA1_DIGEST_SIZE];
/* Hash function for the CDX deduplication table.
   KEY points to a SHA-1 digest; the hash value is simply the first
   sizeof (unsigned long) bytes of that digest. */
static unsigned long
warc_hash_sha1_digest (const void *key)
{
  unsigned long hash = 0;

  /* Copy instead of casting the pointer: KEY need not be aligned for
     an unsigned long. */
  memcpy (&hash, key, sizeof hash);
  return hash;
}
127 warc_cmp_sha1_digest (const void *digest1, const void *digest2)
129 return !memcmp (digest1, digest2, SHA1_DIGEST_SIZE);
134 /* Writes SIZE bytes from BUFFER to the current WARC file,
135 through gzwrite if compression is enabled.
136 Returns the number of uncompressed bytes written. */
138 warc_write_buffer (const char *buffer, size_t size)
141 if (warc_current_gzfile)
143 warc_current_gzfile_uncompressed_size += size;
144 return gzwrite (warc_current_gzfile, buffer, size);
148 return fwrite (buffer, 1, size, warc_current_file);
151 /* Writes STR to the current WARC file.
152 Returns false and set warc_write_ok to false if there
155 warc_write_string (const char *str)
160 size_t n = strlen (str);
161 if (n != warc_write_buffer (str, n))
162 warc_write_ok = false;
164 return warc_write_ok;
/* Layout constants for the GZIP member header (see RFC 1952).  */

/* Bytes reserved for the extra field written by warc_write_end_record:
   2 (XLEN) + 2 (field id "sl") + 2 (field length) + 8 (two 32-bit sizes). */
#define EXTRA_GZIP_HEADER_SIZE 14
/* Size of the fixed part of a GZIP member header. */
#define GZIP_STATIC_HEADER_SIZE  10
/* Bit in the FLG byte announcing that an extra field follows. */
#define FLG_FEXTRA          0x04
/* Offset of the FLG byte inside the fixed header. */
#define OFF_FLG 3
173 /* Starts a new WARC record. Writes the version header.
174 If opt.warc_maxsize is set and the current file is becoming
175 too large, this will open a new WARC file.
177 If compression is enabled, this will start a new
178 gzip stream in the current WARC file.
180 Returns false and set warc_write_ok to false if there
183 warc_write_start_record (void)
188 fflush (warc_current_file);
189 if (opt.warc_maxsize > 0 && ftello (warc_current_file) >= opt.warc_maxsize)
190 warc_start_new_file (false);
193 /* Start a GZIP stream, if required. */
194 if (opt.warc_compression_enabled)
196 /* Record the starting offset of the new record. */
197 warc_current_gzfile_offset = ftello (warc_current_file);
199 /* Reserve space for the extra GZIP header field.
200 In warc_write_end_record we will fill this space
201 with information about the uncompressed and
202 compressed size of the record. */
203 fseek (warc_current_file, EXTRA_GZIP_HEADER_SIZE, SEEK_CUR);
204 fflush (warc_current_file);
206 /* Start a new GZIP stream. */
207 warc_current_gzfile = gzdopen (dup (fileno (warc_current_file)), "wb9");
208 warc_current_gzfile_uncompressed_size = 0;
210 if (warc_current_gzfile == NULL)
212 logprintf (LOG_NOTQUIET,
213 _("Error opening GZIP stream to WARC file.\n"));
214 warc_write_ok = false;
220 warc_write_string ("WARC/1.0\r\n");
221 return warc_write_ok;
224 /* Writes a WARC header to the current WARC record.
225 This method may be run after warc_write_start_record and
226 before warc_write_block_from_file. */
228 warc_write_header (const char *name, const char *value)
232 warc_write_string (name);
233 warc_write_string (": ");
234 warc_write_string (value);
235 warc_write_string ("\r\n");
237 return warc_write_ok;
240 /* Copies the contents of DATA_IN to the WARC record.
241 Adds a Content-Length header to the WARC record.
242 Run this method after warc_write_header,
243 then run warc_write_end_record. */
245 warc_write_block_from_file (FILE *data_in)
247 /* Add the Content-Length header. */
248 char content_length[MAX_INT_TO_STRING_LEN(off_t)];
249 fseeko (data_in, 0L, SEEK_END);
250 number_to_string (content_length, ftello (data_in));
251 warc_write_header ("Content-Length", content_length);
253 /* End of the WARC header section. */
254 warc_write_string ("\r\n");
256 if (fseeko (data_in, 0L, SEEK_SET) != 0)
257 warc_write_ok = false;
259 /* Copy the data in the file to the WARC record. */
262 while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0)
264 if (warc_write_buffer (buffer, s) < s)
265 warc_write_ok = false;
268 return warc_write_ok;
271 /* Run this method to close the current WARC record.
273 If compression is enabled, this method closes the
274 current GZIP stream and fills the extra GZIP header
275 with the uncompressed and compressed length of the
278 warc_write_end_record (void)
280 warc_write_buffer ("\r\n\r\n", 4);
283 /* We start a new gzip stream for each record. */
284 if (warc_write_ok && warc_current_gzfile)
286 if (gzclose (warc_current_gzfile) != Z_OK)
288 warc_write_ok = false;
292 fflush (warc_current_file);
293 fseeko (warc_current_file, 0, SEEK_END);
295 /* The WARC standard suggests that we add 'skip length' data in the
296 extra header field of the GZIP stream.
298 In warc_write_start_record we reserved space for this extra header.
299 This extra space starts at warc_current_gzfile_offset and fills
300 EXTRA_GZIP_HEADER_SIZE bytes. The static GZIP header starts at
301 warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE.
303 We need to do three things:
304 1. Move the static GZIP header to warc_current_gzfile_offset;
305 2. Set the FEXTRA flag in the GZIP header;
306 3. Write the extra GZIP header after the static header, that is,
307 starting at warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE.
310 /* Calculate the uncompressed and compressed sizes. */
311 off_t current_offset = ftello (warc_current_file);
312 off_t uncompressed_size = current_offset - warc_current_gzfile_offset;
313 off_t compressed_size = warc_current_gzfile_uncompressed_size;
315 /* Go back to the static GZIP header. */
316 fseeko (warc_current_file, warc_current_gzfile_offset
317 + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
319 /* Read the header. */
320 char static_header[GZIP_STATIC_HEADER_SIZE];
321 size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE,
323 if (result != GZIP_STATIC_HEADER_SIZE)
325 warc_write_ok = false;
329 /* Set the FEXTRA flag in the flags byte of the header. */
330 static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
332 /* Write the header back to the file, but starting at
333 warc_current_gzfile_offset. */
334 fseeko (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
335 fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
337 /* Prepare the extra GZIP header. */
338 char extra_header[EXTRA_GZIP_HEADER_SIZE];
339 /* XLEN, the length of the extra header fields. */
340 extra_header[0] = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
341 extra_header[1] = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
342 /* The extra header field identifier for the WARC skip length. */
343 extra_header[2] = 's';
344 extra_header[3] = 'l';
345 /* The size of the field value (8 bytes). */
346 extra_header[4] = (8 & 255);
347 extra_header[5] = ((8 >> 8) & 255);
348 /* The size of the uncompressed record. */
349 extra_header[6] = (uncompressed_size & 255);
350 extra_header[7] = (uncompressed_size >> 8) & 255;
351 extra_header[8] = (uncompressed_size >> 16) & 255;
352 extra_header[9] = (uncompressed_size >> 24) & 255;
353 /* The size of the compressed record. */
354 extra_header[10] = (compressed_size & 255);
355 extra_header[11] = (compressed_size >> 8) & 255;
356 extra_header[12] = (compressed_size >> 16) & 255;
357 extra_header[13] = (compressed_size >> 24) & 255;
359 /* Write the extra header after the static header. */
360 fseeko (warc_current_file, warc_current_gzfile_offset
361 + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
362 fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
364 /* Done, move back to the end of the file. */
365 fflush (warc_current_file);
366 fseeko (warc_current_file, 0, SEEK_END);
368 #endif /* HAVE_LIBZ */
370 return warc_write_ok;
/* Writes the WARC-Date header for the given timestamp to
   the current WARC record.
   If timestamp is NULL, the current time will be used.
   Returns the current value of warc_write_ok. */
static bool
warc_write_date_header (const char *timestamp)
{
  /* Must live at function scope: TIMESTAMP may point at it until the
     header has been written.  (Declaring it inside the `if' block left
     a pointer to an object whose lifetime had already ended.) */
  char current_timestamp[21];

  if (timestamp == NULL)
    {
      warc_timestamp (current_timestamp);
      timestamp = current_timestamp;
    }

  return warc_write_header ("WARC-Date", timestamp);
}
389 /* Writes the WARC-IP-Address header for the given IP to
390 the current WARC record. If IP is NULL, no header will
393 warc_write_ip_header (ip_address *ip)
396 return warc_write_header ("WARC-IP-Address", print_address (ip));
398 return warc_write_ok;
402 /* warc_sha1_stream_with_payload is a modified copy of sha1_stream
403 from gnulib/sha1.c. This version calculates two digests in one go.
405 Compute SHA1 message digests for bytes read from STREAM. The
406 digest of the complete file will be written into the 16 bytes
407 beginning at RES_BLOCK.
409 If payload_offset >= 0, a second digest will be calculated of the
410 portion of the file starting at payload_offset and continuing to
411 the end of the file. The digest number will be written into the
412 16 bytes beginning ad RES_PAYLOAD. */
414 warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
415 off_t payload_offset)
417 #define BLOCKSIZE 32768
419 struct sha1_ctx ctx_block;
420 struct sha1_ctx ctx_payload;
424 char *buffer = malloc (BLOCKSIZE + 72);
428 /* Initialize the computation context. */
429 sha1_init_ctx (&ctx_block);
430 if (payload_offset >= 0)
431 sha1_init_ctx (&ctx_payload);
435 /* Iterate over full file contents. */
438 /* We read the file in blocks of BLOCKSIZE bytes. One call of the
439 computation function processes the whole buffer so that with the
440 next round of the loop another block can be read. */
444 /* Read block. Take care for partial reads. */
447 n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);
452 if (sum == BLOCKSIZE)
457 /* Check for the error flag IFF N == 0, so that we don't
458 exit the loop after a partial read due to e.g., EAGAIN
465 goto process_partial_block;
468 /* We've read at least one byte, so ignore errors. But always
469 check for EOF, since feof may be true even though N > 0.
470 Otherwise, we could end up calling fread after EOF. */
472 goto process_partial_block;
475 /* Process buffer with BLOCKSIZE bytes. Note that
478 sha1_process_block (buffer, BLOCKSIZE, &ctx_block);
479 if (payload_offset >= 0 && payload_offset < pos)
481 /* At least part of the buffer contains data from payload. */
482 off_t start_of_payload = payload_offset - (pos - BLOCKSIZE);
483 if (start_of_payload <= 0)
484 /* All bytes in the buffer belong to the payload. */
485 start_of_payload = 0;
487 /* Process the payload part of the buffer.
488 Note: we can't use sha1_process_block here even if we
489 process the complete buffer. Because the payload doesn't
490 have to start with a full block, there may still be some
491 bytes left from the previous buffer. Therefore, we need
492 to continue with sha1_process_bytes. */
493 sha1_process_bytes (buffer + start_of_payload,
494 BLOCKSIZE - start_of_payload, &ctx_payload);
498 process_partial_block:;
500 /* Process any remaining bytes. */
503 sha1_process_bytes (buffer, sum, &ctx_block);
504 if (payload_offset >= 0 && payload_offset < pos)
506 /* At least part of the buffer contains data from payload. */
507 off_t start_of_payload = payload_offset - (pos - sum);
508 if (start_of_payload <= 0)
509 /* All bytes in the buffer belong to the payload. */
510 start_of_payload = 0;
512 /* Process the payload part of the buffer. */
513 sha1_process_bytes (buffer + start_of_payload,
514 sum - start_of_payload, &ctx_payload);
518 /* Construct result in desired memory. */
519 sha1_finish_ctx (&ctx_block, res_block);
520 if (payload_offset >= 0)
521 sha1_finish_ctx (&ctx_payload, res_payload);
528 /* Converts the SHA1 digest to a base32-encoded string.
529 "sha1:DIGEST\0" (Allocates a new string for the response.) */
531 warc_base32_sha1_digest (char *sha1_digest)
533 /* length: "sha1:" + digest + "\0" */
534 char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 );
535 base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5,
536 BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1);
537 memcpy (sha1_base32, "sha1:", 5);
538 sha1_base32[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5] = '\0';
543 /* Sets the digest headers of the record.
544 This method will calculate the block digest and, if payload_offset >= 0,
545 will also calculate the payload digest of the payload starting at the
548 warc_write_digest_headers (FILE *file, long payload_offset)
550 if (opt.warc_digests_enabled)
552 /* Calculate the block and payload digests. */
553 char sha1_res_block[SHA1_DIGEST_SIZE];
554 char sha1_res_payload[SHA1_DIGEST_SIZE];
557 if (warc_sha1_stream_with_payload (file, sha1_res_block,
558 sha1_res_payload, payload_offset) == 0)
562 digest = warc_base32_sha1_digest (sha1_res_block);
563 warc_write_header ("WARC-Block-Digest", digest);
566 if (payload_offset >= 0)
568 digest = warc_base32_sha1_digest (sha1_res_payload);
569 warc_write_header ("WARC-Payload-Digest", digest);
/* Fills timestamp with the current time and date.
   The UTC time is formatted following ISO 8601, as required
   for use in the WARC-Date header.
   TIMESTAMP must have room for 21 bytes: the 20 characters of
   "YYYY-MM-DDTHH:MM:SSZ" plus the terminating NUL.
   If the time cannot be converted, TIMESTAMP is set to "". */
void
warc_timestamp (char *timestamp)
{
  time_t now = time (NULL);
  struct tm *utc = gmtime (&now);

  /* strftime leaves the buffer unspecified when it returns 0, so make
     sure callers always get a valid string. */
  if (utc == NULL
      || strftime (timestamp, 21, "%Y-%m-%dT%H:%M:%SZ", utc) == 0)
    timestamp[0] = '\0';
}
#ifdef HAVE_LIBUUID
/* Fills urn_str with a UUID in the format required
   for the WARC-Record-Id header.
   The string will be 47 characters long, so URN_STR must have room
   for 48 bytes. */
void
warc_uuid_str (char *urn_str)
{
  char uuid_str[37];
  uuid_t record_id;

  uuid_generate (record_id);
  uuid_unparse (record_id, uuid_str);

  sprintf (urn_str, "<urn:uuid:%s>", uuid_str);
}
#else
/* Fills urn_str with a UUID based on random numbers in the format
   required for the WARC-Record-Id header.
   (See RFC 4122, UUID version 4.)

   Note: this is a fallback method, it is much better to use the
   methods provided by libuuid.

   The string will be 47 characters long, so URN_STR must have room
   for 48 bytes. */
void
warc_uuid_str (char *urn_str)
{
  /* RFC 4122, a version 4 UUID with only random numbers */
  unsigned char uuid_data[16];
  int i;

  /* NOTE(review): random_number (MAX) is presumably exclusive of MAX,
     in which case the previous argument of 255 never produced 0xFF;
     256 covers the whole byte range.  Confirm against utils.c. */
  for (i = 0; i < 16; i++)
    uuid_data[i] = random_number (256);

  /* Set the four most significant bits (bits 12 through 15) of the
     time_hi_and_version field to the 4-bit version number */
  uuid_data[6] = (uuid_data[6] & 0x0F) | 0x40;

  /* Set the two most significant bits (bits 6 and 7) of the
     clock_seq_hi_and_reserved to zero and one, respectively. */
  uuid_data[8] = (uuid_data[8] & 0xBF) | 0x80;

  sprintf (urn_str,
    "<urn:uuid:%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x>",
    uuid_data[0], uuid_data[1], uuid_data[2], uuid_data[3], uuid_data[4],
    uuid_data[5], uuid_data[6], uuid_data[7], uuid_data[8], uuid_data[9],
    uuid_data[10], uuid_data[11], uuid_data[12], uuid_data[13], uuid_data[14],
    uuid_data[15]);
}
#endif
642 /* Write a warcinfo record to the current file.
643 Updates warc_current_warcinfo_uuid_str. */
645 warc_write_warcinfo_record (char *filename)
647 /* Write warc-info record as the first record of the file. */
648 /* We add the record id of this info record to the other records in the
650 warc_current_warcinfo_uuid_str = (char *) malloc (48);
651 warc_uuid_str (warc_current_warcinfo_uuid_str);
654 warc_timestamp (timestamp);
656 char *filename_copy, *filename_basename;
657 filename_copy = strdup (filename);
658 filename_basename = strdup (basename (filename_copy));
660 warc_write_start_record ();
661 warc_write_header ("WARC-Type", "warcinfo");
662 warc_write_header ("Content-Type", "application/warc-fields");
663 warc_write_header ("WARC-Date", timestamp);
664 warc_write_header ("WARC-Record-ID", warc_current_warcinfo_uuid_str);
665 warc_write_header ("WARC-Filename", filename_basename);
667 /* Create content. */
668 FILE *warc_tmp = warc_tempfile ();
669 if (warc_tmp == NULL)
671 free (filename_copy);
672 free (filename_basename);
676 fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE);
677 fprintf (warc_tmp, "format: WARC File Format 1.0\r\n");
679 "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n");
680 fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off"));
681 fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring);
682 /* Add the user headers, if any. */
683 if (opt.warc_user_headers)
686 for (i = 0; opt.warc_user_headers[i]; i++)
687 fprintf (warc_tmp, "%s\r\n", opt.warc_user_headers[i]);
689 fprintf(warc_tmp, "\r\n");
691 warc_write_digest_headers (warc_tmp, -1);
692 warc_write_block_from_file (warc_tmp);
693 warc_write_end_record ();
696 logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
698 free (filename_copy);
699 free (filename_basename);
701 return warc_write_ok;
704 /* Opens a new WARC file.
705 If META is true, generates a filename ending with 'meta.warc.gz'.
708 1. close the current WARC file (if there is one);
709 2. increment warc_current_file_number;
710 3. open a new WARC file;
711 4. write the initial warcinfo record.
713 Returns true on success, false otherwise.
716 warc_start_new_file (bool meta)
718 if (opt.warc_filename == NULL)
721 if (warc_current_file != NULL)
722 fclose (warc_current_file);
723 if (warc_current_warcinfo_uuid_str)
724 free (warc_current_warcinfo_uuid_str);
725 if (warc_current_filename)
726 free (warc_current_filename);
728 warc_current_file_number++;
730 int base_filename_length = strlen (opt.warc_filename);
731 /* filename format: base + "-" + 5 digit serial number + ".warc.gz" */
732 char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
733 warc_current_filename = new_filename;
736 const char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc");
738 const char *extension = "warc";
741 /* If max size is enabled, we add a serial number to the file names. */
743 sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension);
744 else if (opt.warc_maxsize > 0)
746 sprintf (new_filename, "%s-%05d.%s", opt.warc_filename,
747 warc_current_file_number, extension);
750 sprintf (new_filename, "%s.%s", opt.warc_filename, extension);
752 logprintf (LOG_VERBOSE, _("Opening WARC file %s.\n\n"), quote (new_filename));
754 /* Open the WARC file. */
755 warc_current_file = fopen (new_filename, "wb+");
756 if (warc_current_file == NULL)
758 logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"),
759 quote (new_filename));
763 if (! warc_write_warcinfo_record (new_filename))
766 /* Add warcinfo uuid to manifest. */
767 if (warc_manifest_fp)
768 fprintf (warc_manifest_fp, "%s\n", warc_current_warcinfo_uuid_str);
773 /* Opens the CDX file for output. */
775 warc_start_cdx_file (void)
777 int filename_length = strlen (opt.warc_filename);
778 char *cdx_filename = alloca (filename_length + 4 + 1);
779 memcpy (cdx_filename, opt.warc_filename, filename_length);
780 memcpy (cdx_filename + filename_length, ".cdx", 5);
781 warc_current_cdx_file = fopen (cdx_filename, "a+");
782 if (warc_current_cdx_file == NULL)
785 /* Print the CDX header.
791 * k - new style checksum
794 * V - compressed arc file offset
798 fprintf (warc_current_cdx_file, " CDX a b a m s k r M V g u\n");
799 fflush (warc_current_cdx_file);
/* Characters that separate the fields of a CDX line. */
#define CDX_FIELDSEP " \t\r\n"

/* Parse the CDX header and find the field numbers of the original url,
   checksum and record ID fields.

   LINEPTR is modified in place.  Its first token must be "CDX"; each
   following token is a field type indicator and fields are numbered
   from 0.  Only the first character of an indicator is examined:
   'a' (original url), 'k' (checksum), 'u' (record id).  When an
   indicator occurs more than once, the last occurrence wins.

   Every output is set to -1 when its field is missing.  Returns true
   iff all three fields were found. */
static bool
warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
                       int *field_num_checksum, int *field_num_record_id)
{
  char *save_ptr;
  char *token;

  *field_num_original_url = -1;
  *field_num_checksum = -1;
  *field_num_record_id = -1;

  token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);

  if (token != NULL && strcmp (token, "CDX") == 0)
    {
      int field_num;

      for (field_num = 0;
           (token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr)) != NULL;
           field_num++)
        {
          switch (token[0])
            {
            case 'a':
              *field_num_original_url = field_num;
              break;
            case 'k':
              *field_num_checksum = field_num;
              break;
            case 'u':
              *field_num_record_id = field_num;
              break;
            default:
              break;
            }
        }
    }

  return *field_num_original_url != -1
         && *field_num_checksum != -1
         && *field_num_record_id != -1;
}
850 /* Parse the CDX record and add it to the warc_cdx_dedup_table hash table. */
852 warc_process_cdx_line (char *lineptr, int field_num_original_url,
853 int field_num_checksum, int field_num_record_id)
855 char *original_url = NULL;
856 char *checksum = NULL;
857 char *record_id = NULL;
861 token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
863 /* Read this line to get the fields we need. */
865 while (token != NULL)
868 if (field_num == field_num_original_url)
870 else if (field_num == field_num_checksum)
872 else if (field_num == field_num_record_id)
878 *val = strdup (token);
880 token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
884 if (original_url != NULL && checksum != NULL && record_id != NULL)
886 /* For some extra efficiency, we decode the base32 encoded
887 checksum value. This should produce exactly SHA1_DIGEST_SIZE
891 base32_decode_alloc (checksum, strlen (checksum), &checksum_v,
895 if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
897 /* This is a valid line with a valid checksum. */
898 struct warc_cdx_record *rec;
899 rec = malloc (sizeof (struct warc_cdx_record));
900 rec->url = original_url;
901 rec->uuid = record_id;
902 memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
903 hash_table_put (warc_cdx_dedup_table, rec->digest, rec);
909 if (checksum_v != NULL)
916 xfree_null(checksum);
917 xfree_null(original_url);
918 xfree_null(record_id);
922 /* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
923 the warc_cdx_dedup_table. */
925 warc_load_cdx_dedup_file (void)
927 FILE *f = fopen (opt.warc_cdx_dedup_filename, "r");
931 int field_num_original_url = -1;
932 int field_num_checksum = -1;
933 int field_num_record_id = -1;
935 char *lineptr = NULL;
939 /* The first line should contain the CDX header.
940 Format: " CDX x x x x x"
941 where x are field type indicators. For our purposes, we only
942 need 'a' (the original url), 'k' (the SHA1 checksum) and
943 'u' (the WARC record id). */
944 line_length = getline (&lineptr, &n, f);
945 if (line_length != -1)
946 warc_parse_cdx_header (lineptr, &field_num_original_url,
947 &field_num_checksum, &field_num_record_id);
949 /* If the file contains all three fields, read the complete file. */
950 if (field_num_original_url == -1
951 || field_num_checksum == -1
952 || field_num_record_id == -1)
954 if (field_num_original_url == -1)
955 logprintf (LOG_NOTQUIET,
956 _("CDX file does not list original urls. (Missing column 'a'.)\n"));
957 if (field_num_checksum == -1)
958 logprintf (LOG_NOTQUIET,
959 _("CDX file does not list checksums. (Missing column 'k'.)\n"));
960 if (field_num_record_id == -1)
961 logprintf (LOG_NOTQUIET,
962 _("CDX file does not list record ids. (Missing column 'u'.)\n"));
966 /* Initialize the table. */
967 warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest,
968 warc_cmp_sha1_digest);
972 line_length = getline (&lineptr, &n, f);
973 if (line_length != -1)
975 warc_process_cdx_line (lineptr, field_num_original_url,
976 field_num_checksum, field_num_record_id);
980 while (line_length != -1);
983 int nrecords = hash_table_count (warc_cdx_dedup_table);
984 logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
985 "Loaded %d records from CDX.\n\n",
997 /* Returns the existing duplicate CDX record for the given url and payload
998 digest. Returns NULL if the url is not found or if the payload digest
999 does not match, or if CDX deduplication is disabled. */
1000 static struct warc_cdx_record *
1001 warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload)
1003 if (warc_cdx_dedup_table == NULL)
1006 struct warc_cdx_record *rec_existing
1007 = hash_table_get (warc_cdx_dedup_table, sha1_digest_payload);
1009 if (rec_existing && strcmp (rec_existing->url, url) == 0)
1010 return rec_existing;
1015 /* Initializes the WARC writer (if opt.warc_filename is set).
1016 This should be called before any WARC record is written. */
1020 warc_write_ok = true;
1022 if (opt.warc_filename != NULL)
1024 if (opt.warc_cdx_dedup_filename != NULL)
1026 if (! warc_load_cdx_dedup_file ())
1028 logprintf (LOG_NOTQUIET,
1029 _("Could not read CDX file %s for deduplication.\n"),
1030 quote (opt.warc_cdx_dedup_filename));
1035 warc_manifest_fp = warc_tempfile ();
1036 if (warc_manifest_fp == NULL)
1038 logprintf (LOG_NOTQUIET,
1039 _("Could not open temporary WARC manifest file.\n"));
1043 if (opt.warc_keep_log)
1045 warc_log_fp = warc_tempfile ();
1046 if (warc_log_fp == NULL)
1048 logprintf (LOG_NOTQUIET,
1049 _("Could not open temporary WARC log file.\n"));
1052 log_set_warc_log_fp (warc_log_fp);
1055 warc_current_file_number = -1;
1056 if (! warc_start_new_file (false))
1058 logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
1062 if (opt.warc_cdx_enabled)
1064 if (! warc_start_cdx_file ())
1066 logprintf (LOG_NOTQUIET,
1067 _("Could not open CDX file for output.\n"));
1074 /* Writes metadata (manifest, configuration, log file) to the WARC file. */
1076 warc_write_metadata (void)
1078 /* If there are multiple WARC files, the metadata should be written to a separate file. */
1079 if (opt.warc_maxsize > 0)
1080 warc_start_new_file (true);
1082 char manifest_uuid [48];
1083 warc_uuid_str (manifest_uuid);
1085 fflush (warc_manifest_fp);
1086 warc_write_metadata_record (manifest_uuid,
1087 "metadata://gnu.org/software/wget/warc/MANIFEST.txt",
1088 NULL, NULL, NULL, "text/plain",
1089 warc_manifest_fp, -1);
1090 /* warc_write_resource_record has closed warc_manifest_fp. */
1092 FILE * warc_tmp_fp = warc_tempfile ();
1093 if (warc_tmp_fp == NULL)
1095 logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n"));
1098 fflush (warc_tmp_fp);
1099 fprintf (warc_tmp_fp, "%s\n", program_argstring);
1101 warc_write_resource_record (NULL,
1102 "metadata://gnu.org/software/wget/warc/wget_arguments.txt",
1103 NULL, manifest_uuid, NULL, "text/plain",
1105 /* warc_write_resource_record has closed warc_tmp_fp. */
1107 if (warc_log_fp != NULL)
1109 warc_write_resource_record (NULL,
1110 "metadata://gnu.org/software/wget/warc/wget.log",
1111 NULL, manifest_uuid, NULL, "text/plain",
1113 /* warc_write_resource_record has closed warc_log_fp. */
1116 log_set_warc_log_fp (NULL);
1120 /* Finishes the WARC writing.
1121 This should be called at the end of the program. */
1125 if (warc_current_file != NULL)
1127 warc_write_metadata ();
1128 free (warc_current_warcinfo_uuid_str);
1129 fclose (warc_current_file);
1131 if (warc_current_cdx_file != NULL)
1132 fclose (warc_current_cdx_file);
1133 if (warc_log_fp != NULL)
1135 fclose (warc_log_fp);
1136 log_set_warc_log_fp (NULL);
1140 /* Creates a temporary file for writing WARC output.
1141 The temporary file will be created in opt.warc_tempdir.
1142 Returns the pointer to the temporary file, or NULL. */
1144 warc_tempfile (void)
1147 if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1)
1150 int fd = mkstemp (filename);
1154 if (unlink (filename) < 0)
1157 return fdopen (fd, "wb+");
1161 /* Writes a request record to the WARC file.
1162 url is the target uri of the request,
1163 timestamp_str is the timestamp of the request (generated with warc_timestamp),
1164 record_uuid is the uuid of the request (generated with warc_uuid_str),
1165 body is a pointer to a file containing the request headers and body.
1166 ip is the ip address of the server (or NULL),
1167 Calling this function will close body.
1168 Returns true on success, false on error. */
1170 warc_write_request_record (char *url, char *timestamp_str, char *record_uuid,
1171 ip_address *ip, FILE *body, off_t payload_offset)
1173 warc_write_start_record ();
1174 warc_write_header ("WARC-Type", "request");
1175 warc_write_header ("WARC-Target-URI", url);
1176 warc_write_header ("Content-Type", "application/http;msgtype=request");
1177 warc_write_date_header (timestamp_str);
1178 warc_write_header ("WARC-Record-ID", record_uuid);
1179 warc_write_ip_header (ip);
1180 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1181 warc_write_digest_headers (body, payload_offset);
1182 warc_write_block_from_file (body);
1183 warc_write_end_record ();
1187 return warc_write_ok;
1190 /* Writes a response record to the CDX file.
1191 url is the target uri of the request/response (printed twice on the CDX line),
1192 timestamp_str is the timestamp of the request that generated this response
1193 (generated with warc_timestamp; its first 19 characters are read without a length check),
1194 mime_type is the mime type of the response body (will be printed to CDX),
1195 response_code is the HTTP response code (will be printed to CDX),
1196 payload_digest is the sha1 digest of the payload with a leading "sha1:", or NULL,
1197 redirect_location is the contents of the Location: header, or NULL (will be printed to CDX),
1198 offset is the position of the WARC record in the WARC file,
1199 warc_filename is the filename of the WARC,
1200 response_uuid is the uuid of the response.
1201 Returns true on success, false on error. */
1203 warc_write_cdx_record (const char *url, const char *timestamp_str,
1204 const char *mime_type, int response_code,
1205 const char *payload_digest, const char *redirect_location,
1206 off_t offset, const char *warc_filename,
1207 const char *response_uuid)
1209 /* Transform the timestamp: copy the digit groups of "YYYY-mm-ddTHH:MM:SSZ" and drop the separators, giving "YYYYmmddHHMMSS". */
1210 char timestamp_str_cdx [15]; /* 14 digits plus the terminating NUL */
1211 memcpy (timestamp_str_cdx , timestamp_str , 4); /* "YYYY" "-" */
1212 memcpy (timestamp_str_cdx + 4, timestamp_str + 5, 2); /* "mm" "-" */
1213 memcpy (timestamp_str_cdx + 6, timestamp_str + 8, 2); /* "dd" "T" */
1214 memcpy (timestamp_str_cdx + 8, timestamp_str + 11, 2); /* "HH" ":" */
1215 memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM" ":" */
1216 memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS" "Z" */
1217 timestamp_str_cdx[14] = '\0';
1219 /* Rewrite the checksum: the CDX line carries the digest without its "sha1:" label. */
1220 const char *checksum;
1221 if (payload_digest != NULL)
1222 checksum = payload_digest + 5; /* Skip the "sha1:" */
1226 if (mime_type == NULL || strlen(mime_type) == 0)
1228 if (redirect_location == NULL || strlen(redirect_location) == 0)
1229 redirect_location = "-"; /* "-" is printed in place of a missing or empty Location: value */
1231 char offset_string[MAX_INT_TO_STRING_LEN(off_t)];
1232 number_to_string (offset_string, offset); /* text form of offset, printed with %s below */
1234 /* Print the CDX line. NOTE(review): the file name field is filled from warc_current_filename, not from the warc_filename parameter -- confirm this is intended. */
1235 fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %s %s %s\n", url,
1236 timestamp_str_cdx, url, mime_type, response_code, checksum,
1237 redirect_location, offset_string, warc_current_filename,
1239 fflush (warc_current_cdx_file); /* push the buffered line out to the CDX file */
1244 /* Writes a revisit record to the WARC file.
1245 url is the target uri of the request/response,
1246 timestamp_str is the timestamp of the request that generated this response
1247 (generated with warc_timestamp),
1248 concurrent_to_uuid is the uuid of the request that generated this response
1249 (generated with warc_uuid_str),
1250 refers_to is the uuid of the original response
1251 (generated with warc_uuid_str),
1252 payload_digest is the sha1 digest of the payload (written as WARC-Payload-Digest),
1253 ip is the ip address of the server (or NULL),
1254 body is a pointer to a file containing the response headers (without payload).
1255 Calling this function will close body.
1256 Returns true on success, false on error. */
1258 warc_write_revisit_record (char *url, char *timestamp_str,
1259 char *concurrent_to_uuid, char *payload_digest,
1260 char *refers_to, ip_address *ip, FILE *body)
1262 char revisit_uuid [48];
1263 warc_uuid_str (revisit_uuid); /* id for this record, written as WARC-Record-ID */
1265 char *block_digest = NULL;
1266 char sha1_res_block[SHA1_DIGEST_SIZE];
1267 sha1_stream (body, sha1_res_block); /* NOTE(review): the return value is not checked here */
1268 block_digest = warc_base32_sha1_digest (sha1_res_block); /* released with free once the record is written */
1270 warc_write_start_record ();
1271 warc_write_header ("WARC-Type", "revisit");
1272 warc_write_header ("WARC-Record-ID", revisit_uuid);
1273 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1274 warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1275 warc_write_header ("WARC-Refers-To", refers_to);
1276 warc_write_header ("WARC-Profile", "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest");
1277 warc_write_header ("WARC-Truncated", "length");
1278 warc_write_header ("WARC-Target-URI", url);
1279 warc_write_date_header (timestamp_str);
1280 warc_write_ip_header (ip);
1281 warc_write_header ("Content-Type", "application/http;msgtype=response");
1282 warc_write_header ("WARC-Block-Digest", block_digest);
1283 warc_write_header ("WARC-Payload-Digest", payload_digest);
1284 warc_write_block_from_file (body);
1285 warc_write_end_record ();
1288 free (block_digest);
1290 return warc_write_ok; /* status variable defined outside this function -- presumably updated by the warc_write_* calls; verify */
1293 /* Writes a response record to the WARC file, or a revisit record when digests are enabled and the CDX lookup finds the same url and payload digest.
1294 url is the target uri of the request/response,
1295 timestamp_str is the timestamp of the request that generated this response
1296 (generated with warc_timestamp),
1297 concurrent_to_uuid is the uuid of the request that generated this response
1298 (generated with warc_uuid_str),
1299 ip is the ip address of the server (or NULL),
1300 body is a pointer to a file containing the response headers and body,
1301 mime_type is the mime type of the response body (will be printed to CDX),
1302 response_code is the HTTP response code (will be printed to CDX),
1303 redirect_location is the contents of the Location: header, or NULL (will be printed to CDX),
1304 payload_offset is the offset in body at which the payload starts. Calling this function will close body.
1305 Returns true on success, false on error. */
1307 warc_write_response_record (char *url, char *timestamp_str,
1308 char *concurrent_to_uuid, ip_address *ip,
1309 FILE *body, off_t payload_offset, char *mime_type,
1310 int response_code, char *redirect_location)
1312 char *block_digest = NULL;
1313 char *payload_digest = NULL;
1314 char sha1_res_block[SHA1_DIGEST_SIZE];
1315 char sha1_res_payload[SHA1_DIGEST_SIZE];
1317 if (opt.warc_digests_enabled)
1319 /* Calculate the block and payload digests. */
1321 if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload,
1322 payload_offset) == 0)
1324 /* Decide (based on url + payload digest) if we have seen this
1326 struct warc_cdx_record *rec_existing;
1327 rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload);
1328 if (rec_existing != NULL)
1332 /* Found an existing record. */
1333 logprintf (LOG_VERBOSE,
1334 _("Found exact match in CDX file. Saving revisit record to WARC.\n"));
1336 /* Remove the payload from the file, keeping only the first payload_offset bytes. */
1337 if (payload_offset > 0)
1339 if (ftruncate (fileno (body), payload_offset) == -1)
1343 /* Send the original payload digest. */
1344 payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1345 result = warc_write_revisit_record (url, timestamp_str,
1346 concurrent_to_uuid, payload_digest, rec_existing->uuid,
1348 free (payload_digest);
1353 block_digest = warc_base32_sha1_digest (sha1_res_block); /* text form for the WARC-Block-Digest header */
1354 payload_digest = warc_base32_sha1_digest (sha1_res_payload); /* text form for WARC-Payload-Digest and the CDX line */
1358 /* Not a revisit, just store the record. */
1360 char response_uuid [48];
1361 warc_uuid_str (response_uuid);
1363 fseeko (warc_current_file, 0L, SEEK_END);
1364 off_t offset = ftello (warc_current_file); /* position at the end of the WARC file, where this record begins; passed to the CDX record */
1366 warc_write_start_record ();
1367 warc_write_header ("WARC-Type", "response");
1368 warc_write_header ("WARC-Record-ID", response_uuid);
1369 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1370 warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1371 warc_write_header ("WARC-Target-URI", url);
1372 warc_write_date_header (timestamp_str);
1373 warc_write_ip_header (ip);
1374 warc_write_header ("WARC-Block-Digest", block_digest);
1375 warc_write_header ("WARC-Payload-Digest", payload_digest);
1376 warc_write_header ("Content-Type", "application/http;msgtype=response");
1377 warc_write_block_from_file (body);
1378 warc_write_end_record ();
1382 if (warc_write_ok && opt.warc_cdx_enabled)
1384 /* Add this record to the CDX, but only if the WARC record was written without error. */
1385 warc_write_cdx_record (url, timestamp_str, mime_type, response_code,
1386 payload_digest, redirect_location, offset, warc_current_filename,
1391 free (block_digest);
1393 free (payload_digest);
1395 return warc_write_ok;
1398 /* Writes a resource or metadata record to the WARC file.
1399 record_type is either "resource" or "metadata" (written as the WARC-Type header),
1400 resource_uuid is the uuid of the resource (or NULL to have one generated with warc_uuid_str),
1401 url is the target uri of the resource,
1402 timestamp_str is the timestamp (generated with warc_timestamp),
1403 concurrent_to_uuid is the uuid of the record that generated this
1404 resource (generated with warc_uuid_str) or NULL,
1405 ip is the ip address of the server (or NULL),
1406 content_type is the mime type of the body (or NULL for "application/octet-stream"),
1407 body is a pointer to a file containing the resource data,
1408 payload_offset is passed on to warc_write_digest_headers together with body. Calling this function will close body.
1409 Returns true on success, false on error. */
1411 warc_write_record (const char *record_type, char *resource_uuid,
1412 const char *url, const char *timestamp_str,
1413 const char *concurrent_to_uuid,
1414 ip_address *ip, const char *content_type, FILE *body,
1415 off_t payload_offset)
1417 if (resource_uuid == NULL)
1419 resource_uuid = alloca (48); /* stack storage for the generated uuid; only valid inside this call */
1420 warc_uuid_str (resource_uuid);
1423 if (content_type == NULL)
1424 content_type = "application/octet-stream";
1426 warc_write_start_record ();
1427 warc_write_header ("WARC-Type", record_type);
1428 warc_write_header ("WARC-Record-ID", resource_uuid);
1429 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1430 warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1431 warc_write_header ("WARC-Target-URI", url);
1432 warc_write_date_header (timestamp_str);
1433 warc_write_ip_header (ip);
1434 warc_write_digest_headers (body, payload_offset);
1435 warc_write_header ("Content-Type", content_type);
1436 warc_write_block_from_file (body);
1437 warc_write_end_record ();
1441 return warc_write_ok;
1444 /* Writes a resource record to the WARC file by calling warc_write_record with the record type "resource".
1445 resource_uuid is the uuid of the resource (or NULL),
1446 url is the target uri of the resource,
1447 timestamp_str is the timestamp (generated with warc_timestamp),
1448 concurrent_to_uuid is the uuid of the record that generated this
1449 resource (generated with warc_uuid_str) or NULL,
1450 ip is the ip address of the server (or NULL),
1451 content_type is the mime type of the body (or NULL),
1452 body is a pointer to a file containing the resource data,
1453 payload_offset is passed through to warc_write_record unchanged. Calling this function will close body.
1454 Returns true on success, false on error. */
1456 warc_write_resource_record (char *resource_uuid, const char *url,
1457 const char *timestamp_str, const char *concurrent_to_uuid,
1458 ip_address *ip, const char *content_type, FILE *body,
1459 off_t payload_offset)
1461 return warc_write_record ("resource",
1462 resource_uuid, url, timestamp_str, concurrent_to_uuid,
1463 ip, content_type, body, payload_offset);
1466 /* Writes a metadata record to the WARC file by calling warc_write_record with the record type "metadata".
1467 record_uuid is the uuid of the record (or NULL),
1468 url is the target uri of the record,
1469 timestamp_str is the timestamp (generated with warc_timestamp),
1470 concurrent_to_uuid is the uuid of the record that generated this
1471 record (generated with warc_uuid_str) or NULL,
1472 ip is the ip address of the server (or NULL),
1473 content_type is the mime type of the body (or NULL),
1474 body is a pointer to a file containing the record data,
1475 payload_offset is passed through to warc_write_record unchanged. Calling this function will close body.
1476 Returns true on success, false on error. */
1478 warc_write_metadata_record (char *record_uuid, const char *url,
1479 const char *timestamp_str, const char *concurrent_to_uuid,
1480 ip_address *ip, const char *content_type, FILE *body,
1481 off_t payload_offset)
1483 return warc_write_record ("metadata",
1484 record_uuid, url, timestamp_str, concurrent_to_uuid,
1485 ip, content_type, body, payload_offset);