1 /* Utility functions for writing WARC files. */
19 #include <uuid/uuid.h>
28 extern char *version_string;
30 /* Set by main in main.c */
31 extern char *program_argstring;
34 /* The log file (a temporary file that contains a copy
36 static FILE *warc_log_fp;
38 /* The manifest file (a temporary file that contains the
39 warcinfo uuid of every file in this crawl). */
40 static FILE *warc_manifest_fp;
42 /* The current WARC file (or NULL, if WARC is disabled). */
43 static FILE *warc_current_file;
45 /* The gzip stream for the current WARC file
46 (or NULL, if WARC or gzip is disabled). */
47 static gzFile *warc_current_gzfile;
49 /* The offset of the current gzip record in the WARC file. */
50 static size_t warc_current_gzfile_offset;
52 /* The uncompressed size (so far) of the current record. */
53 static size_t warc_current_gzfile_uncompressed_size;
55 /* This is true until a warc_write_* method fails. */
56 static bool warc_write_ok;
58 /* The current CDX file (or NULL, if CDX is disabled). */
59 static FILE *warc_current_cdx_file;
61 /* The record id of the warcinfo record of the current WARC file. */
62 static char *warc_current_warcinfo_uuid_str;
64 /* The file name of the current WARC file. */
65 static char *warc_current_filename;
67 /* The serial number of the current WARC file. This number is
68 incremented each time a new file is opened and is used in the
69 WARC file's filename. */
70 static int warc_current_file_number;
72 /* The table of CDX records, if deduplication is enabled. */
73 struct hash_table * warc_cdx_dedup_table;
75 static bool warc_start_new_file (bool meta);
78 struct warc_cdx_record
82 char digest[SHA1_DIGEST_SIZE];
86 warc_hash_sha1_digest (const void *key)
88 /* We just use some of the first bytes of the digest. */
90 memcpy (&v, key, sizeof (unsigned long));
95 warc_cmp_sha1_digest (const void *digest1, const void *digest2)
97 return !memcmp (digest1, digest2, SHA1_DIGEST_SIZE);
102 /* Writes SIZE bytes from BUFFER to the current WARC file,
103 through gzwrite if compression is enabled.
104 Returns the number of uncompressed bytes written. */
106 warc_write_buffer (const char *buffer, size_t size)
108 if (warc_current_gzfile)
110 warc_current_gzfile_uncompressed_size += size;
111 return gzwrite (warc_current_gzfile, buffer, size);
114 return fwrite (buffer, 1, size, warc_current_file);
117 /* Writes STR to the current WARC file.
118 Returns false and sets warc_write_ok to false if there
121 warc_write_string (const char *str)
126 size_t n = strlen (str);
127 if (n != warc_write_buffer (str, n))
128 warc_write_ok = false;
130 return warc_write_ok;
134 #define EXTRA_GZIP_HEADER_SIZE 12
135 #define GZIP_STATIC_HEADER_SIZE 10
136 #define FLG_FEXTRA 0x04
139 /* Starts a new WARC record. Writes the version header.
140 If opt.warc_maxsize is set and the current file is becoming
141 too large, this will open a new WARC file.
143 If compression is enabled, this will start a new
144 gzip stream in the current WARC file.
146 Returns false and sets warc_write_ok to false if there
149 warc_write_start_record ()
154 fflush (warc_current_file);
155 if (opt.warc_maxsize > 0 && ftell (warc_current_file) >= opt.warc_maxsize)
156 warc_start_new_file (false);
158 /* Start a GZIP stream, if required. */
159 if (opt.warc_compression_enabled)
161 /* Record the starting offset of the new record. */
162 warc_current_gzfile_offset = ftell (warc_current_file);
164 /* Reserve space for the extra GZIP header field.
165 In warc_write_end_record we will fill this space
166 with information about the uncompressed and
167 compressed size of the record. */
168 fprintf (warc_current_file, "XXXXXXXXXXXX");
169 fflush (warc_current_file);
171 /* Start a new GZIP stream. */
172 warc_current_gzfile = gzdopen (dup (fileno (warc_current_file)), "wb9");
173 warc_current_gzfile_uncompressed_size = 0;
175 if (warc_current_gzfile == NULL)
177 logprintf (LOG_NOTQUIET, _("Error opening GZIP stream to WARC file.\n"));
178 warc_write_ok = false;
183 warc_write_string ("WARC/1.0\r\n");
184 return warc_write_ok;
187 /* Writes a WARC header to the current WARC record.
188 This method may be run after warc_write_start_record and
189 before warc_write_block_from_file. */
191 warc_write_header (const char *name, const char *value)
195 warc_write_string (name);
196 warc_write_string (": ");
197 warc_write_string (value);
198 warc_write_string ("\r\n");
200 return warc_write_ok;
203 /* Copies the contents of DATA_IN to the WARC record.
204 Adds a Content-Length header to the WARC record.
205 Run this method after warc_write_header,
206 then run warc_write_end_record. */
208 warc_write_block_from_file (FILE *data_in)
210 /* Add the Content-Length header. */
211 char *content_length;
212 fseek (data_in, 0L, SEEK_END);
213 if (! asprintf (&content_length, "%ld", ftell (data_in)))
215 warc_write_ok = false;
218 warc_write_header ("Content-Length", content_length);
219 free (content_length);
221 /* End of the WARC header section. */
222 warc_write_string ("\r\n");
224 if (fseek (data_in, 0L, SEEK_SET) != 0)
225 warc_write_ok = false;
227 /* Copy the data in the file to the WARC record. */
230 while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0)
232 if (warc_write_buffer (buffer, s) < s)
233 warc_write_ok = false;
236 return warc_write_ok;
239 /* Run this method to close the current WARC record.
241 If compression is enabled, this method closes the
242 current GZIP stream and fills the extra GZIP header
243 with the uncompressed and compressed length of the
246 warc_write_end_record ()
/* Every record ends with two CRLF pairs; they go through the same
   (possibly compressed) path as the rest of the record.  The result of
   this write is not checked here.  */
248 warc_write_buffer ("\r\n\r\n", 4);
250 /* We start a new gzip stream for each record. */
251 if (warc_write_ok && warc_current_gzfile)
253 if (gzclose (warc_current_gzfile) != Z_OK)
255 warc_write_ok = false;
259 fflush (warc_current_file);
260 fseek (warc_current_file, 0, SEEK_END);
262 /* The WARC standard suggests that we add 'skip length' data in the
263 extra header field of the GZIP stream.
265 In warc_write_start_record we reserved space for this extra header.
266 This extra space starts at warc_current_gzfile_offset and fills
267 EXTRA_GZIP_HEADER_SIZE bytes. The static GZIP header starts at
268 warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE.
270 We need to do three things:
271 1. Move the static GZIP header to warc_current_gzfile_offset;
272 2. Set the FEXTRA flag in the GZIP header;
273 3. Write the extra GZIP header after the static header, that is,
274 starting at warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE.
277 /* Calculate the uncompressed and compressed sizes. */
/* NOTE(review): the two size names below look swapped.  current_offset
   minus warc_current_gzfile_offset is the number of bytes this record
   occupies in the WARC file (the reserved extra-header bytes plus the
   gzip stream), yet it is stored in uncompressed_size.
   warc_current_gzfile_uncompressed_size is the running total of bytes
   handed to gzwrite by warc_write_buffer, yet it is stored in
   compressed_size.  The values written to extra_header[4..7] and
   extra_header[8..11] are therefore the opposite of what the comments
   beside them say.  Confirm which order readers of the 'sl' field expect
   before renaming or swapping anything -- TODO confirm.  */
278 size_t current_offset = ftell (warc_current_file);
279 size_t uncompressed_size = current_offset - warc_current_gzfile_offset;
280 size_t compressed_size = warc_current_gzfile_uncompressed_size;
282 /* Go back to the static GZIP header. */
283 fseek (warc_current_file, warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
285 /* Read the header. */
286 char static_header[GZIP_STATIC_HEADER_SIZE];
287 size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
288 if (result != GZIP_STATIC_HEADER_SIZE)
290 warc_write_ok = false;
/* NOTE(review): this fread is the only positioning or I/O call in the
   header rewrite whose result is checked; the fseek and fwrite calls
   around it are not.  */
294 /* Set the FEXTRA flag in the flags byte of the header. */
295 static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
297 /* Write the header back to the file, but starting at warc_current_gzfile_offset. */
298 fseek (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
299 fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
301 /* Prepare the extra GZIP header. */
302 char extra_header[EXTRA_GZIP_HEADER_SIZE];
303 /* XLEN, the length of the extra header fields. */
/* XLEN is stored little-endian and excludes its own two bytes, hence
   EXTRA_GZIP_HEADER_SIZE - 2.  */
304 extra_header[0] = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
305 extra_header[1] = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
306 /* The extra header field identifier for the WARC skip length. */
307 extra_header[2] = 's';
308 extra_header[3] = 'l';
/* Both sizes are stored as four little-endian bytes, so only the low
   32 bits of each size_t value are kept.  */
309 /* The size of the uncompressed record. */
310 extra_header[4] = (uncompressed_size & 255);
311 extra_header[5] = (uncompressed_size >> 8) & 255;
312 extra_header[6] = (uncompressed_size >> 16) & 255;
313 extra_header[7] = (uncompressed_size >> 24) & 255;
314 /* The size of the compressed record. */
315 extra_header[8] = (compressed_size & 255);
316 extra_header[9] = (compressed_size >> 8) & 255;
317 extra_header[10] = (compressed_size >> 16) & 255;
318 extra_header[11] = (compressed_size >> 24) & 255;
320 /* Write the extra header after the static header. */
321 fseek (warc_current_file, warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
322 fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
324 /* Done, move back to the end of the file. */
325 fflush (warc_current_file);
326 fseek (warc_current_file, 0, SEEK_END);
329 return warc_write_ok;
333 /* Writes the WARC-Date header for the given timestamp to
334 the current WARC record.
335 If timestamp is NULL, the current time will be used. */
337 warc_write_date_header (char *timestamp)
339 if (timestamp == NULL)
341 char current_timestamp[21];
342 warc_timestamp (current_timestamp);
343 timestamp = current_timestamp;
345 return warc_write_header ("WARC-Date", timestamp);
348 /* Writes the WARC-IP-Address header for the given IP to
349 the current WARC record. If IP is NULL, no header will
352 warc_write_ip_header (ip_address *ip)
355 return warc_write_header ("WARC-IP-Address", print_address (ip));
357 return warc_write_ok;
361 /* warc_sha1_stream_with_payload is a modified copy of sha1_stream
362 from gnulib/sha1.c. This version calculates two digests in one go.
364 Compute SHA1 message digests for bytes read from STREAM. The
365 digest of the complete file will be written into the 20 bytes
366 beginning at RES_BLOCK.
368 If payload_offset >= 0, a second digest will be calculated of the
369 portion of the file starting at payload_offset and continuing to
370 the end of the file. The payload digest will be written into the
371 20 bytes beginning at RES_PAYLOAD. */
373 warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, long int payload_offset)
375 #define BLOCKSIZE 32768
377 struct sha1_ctx ctx_block;
378 struct sha1_ctx ctx_payload;
382 char *buffer = malloc (BLOCKSIZE + 72);
386 /* Initialize the computation context. */
387 sha1_init_ctx (&ctx_block);
388 if (payload_offset >= 0)
389 sha1_init_ctx (&ctx_payload);
393 /* Iterate over full file contents. */
396 /* We read the file in blocks of BLOCKSIZE bytes. One call of the
397 computation function processes the whole buffer so that with the
398 next round of the loop another block can be read. */
402 /* Read block. Take care for partial reads. */
405 n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);
410 if (sum == BLOCKSIZE)
415 /* Check for the error flag IFF N == 0, so that we don't
416 exit the loop after a partial read due to e.g., EAGAIN
423 goto process_partial_block;
426 /* We've read at least one byte, so ignore errors. But always
427 check for EOF, since feof may be true even though N > 0.
428 Otherwise, we could end up calling fread after EOF. */
430 goto process_partial_block;
433 /* Process buffer with BLOCKSIZE bytes. Note that
436 sha1_process_block (buffer, BLOCKSIZE, &ctx_block);
437 if (payload_offset >= 0 && payload_offset < pos)
439 /* At least part of the buffer contains data from payload. */
440 int start_of_payload = payload_offset - (pos - BLOCKSIZE);
441 if (start_of_payload <= 0)
442 /* All bytes in the buffer belong to the payload. */
443 start_of_payload = 0;
445 /* Process the payload part of the buffer.
446 Note: we can't use sha1_process_block here even if we
447 process the complete buffer. Because the payload doesn't
448 have to start with a full block, there may still be some
449 bytes left from the previous buffer. Therefore, we need
450 to continue with sha1_process_bytes. */
451 sha1_process_bytes (buffer + start_of_payload, BLOCKSIZE - start_of_payload, &ctx_payload);
455 process_partial_block:;
457 /* Process any remaining bytes. */
460 sha1_process_bytes (buffer, sum, &ctx_block);
461 if (payload_offset >= 0 && payload_offset < pos)
463 /* At least part of the buffer contains data from payload. */
464 int start_of_payload = payload_offset - (pos - sum);
465 if (start_of_payload <= 0)
466 /* All bytes in the buffer belong to the payload. */
467 start_of_payload = 0;
469 /* Process the payload part of the buffer. */
470 sha1_process_bytes (buffer + start_of_payload, sum - start_of_payload, &ctx_payload);
474 /* Construct result in desired memory. */
475 sha1_finish_ctx (&ctx_block, res_block);
476 if (payload_offset >= 0)
477 sha1_finish_ctx (&ctx_payload, res_payload);
484 /* Converts the SHA1 digest to a base32-encoded string.
485 "sha1:DIGEST\0" (Allocates a new string for the response.) */
487 warc_base32_sha1_digest (char *sha1_digest)
489 // length: "sha1:" + digest + "\0"
490 char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 );
491 base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5, BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1);
492 memcpy (sha1_base32, "sha1:", 5);
493 sha1_base32[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5] = '\0';
498 /* Sets the digest headers of the record.
499 This method will calculate the block digest and, if payload_offset >= 0,
500 will also calculate the payload digest of the payload starting at the
503 warc_write_digest_headers (FILE *file, long payload_offset)
505 if (opt.warc_digests_enabled)
507 /* Calculate the block and payload digests. */
508 char sha1_res_block[SHA1_DIGEST_SIZE];
509 char sha1_res_payload[SHA1_DIGEST_SIZE];
512 if (warc_sha1_stream_with_payload (file, sha1_res_block, sha1_res_payload, payload_offset) == 0)
516 digest = warc_base32_sha1_digest (sha1_res_block);
517 warc_write_header ("WARC-Block-Digest", digest);
520 if (payload_offset >= 0)
522 digest = warc_base32_sha1_digest (sha1_res_payload);
523 warc_write_header ("WARC-Payload-Digest", digest);
531 /* Fills timestamp with the current time and date.
532 The UTC time is formatted following ISO 8601, as required
533 for use in the WARC-Date header.
534 The timestamp buffer must hold 21 bytes: 20 characters plus the terminating NUL. */
536 warc_timestamp (char *timestamp)
539 struct tm * timeinfo;
541 timeinfo = gmtime (&rawtime);
542 strftime (timestamp, 21, "%Y-%m-%dT%H:%M:%SZ", timeinfo);
545 /* Fills uuid_str with a UUID based on random numbers.
546 (See RFC 4122, UUID version 4.)
548 Note: this is a fallback method, it is much better to use the
549 methods provided by libuuid.
551 The uuid_str will be 36 characters long. */
553 warc_uuid_random (char *uuid_str)
555 // RFC 4122, a version 4 UUID with only random numbers
557 unsigned char uuid_data[16];
560 uuid_data[i] = random_number (255);
562 // Set the four most significant bits (bits 12 through 15) of the
563 // time_hi_and_version field to the 4-bit version number
564 uuid_data[6] = (uuid_data[6] & 0x0F) | 0x40;
566 // Set the two most significant bits (bits 6 and 7) of the
567 // clock_seq_hi_and_reserved to zero and one, respectively.
568 uuid_data[8] = (uuid_data[8] & 0xBF) | 0x80;
571 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
572 uuid_data[0], uuid_data[1], uuid_data[2], uuid_data[3], uuid_data[4],
573 uuid_data[5], uuid_data[6], uuid_data[7], uuid_data[8], uuid_data[9],
574 uuid_data[10], uuid_data[11], uuid_data[12], uuid_data[13], uuid_data[14],
578 /* Fills urn_str with a UUID in the format required
579 for the WARC-Record-Id header.
580 The string will be 47 characters long. */
582 warc_uuid_str (char *urn_str)
588 uuid_generate (record_id);
589 uuid_unparse (record_id, uuid_str);
591 warc_uuid_random (uuid_str);
594 sprintf (urn_str, "<urn:uuid:%s>", uuid_str);
597 /* Write a warcinfo record to the current file.
598 Updates warc_current_warcinfo_uuid_str. */
600 warc_write_warcinfo_record (char *filename)
602 /* Write warc-info record as the first record of the file. */
603 /* We add the record id of this info record to the other records in the file. */
604 warc_current_warcinfo_uuid_str = (char *) malloc (48);
605 warc_uuid_str (warc_current_warcinfo_uuid_str);
608 warc_timestamp (timestamp);
610 char *filename_copy, *filename_basename;
611 filename_copy = strdup (filename);
612 filename_basename = strdup (basename (filename_copy));
614 warc_write_start_record ();
615 warc_write_header ("WARC-Type", "warcinfo");
616 warc_write_header ("Content-Type", "application/warc-fields");
617 warc_write_header ("WARC-Date", timestamp);
618 warc_write_header ("WARC-Record-ID", warc_current_warcinfo_uuid_str);
619 warc_write_header ("WARC-Filename", filename_basename);
621 /* Create content. */
622 FILE *warc_tmp = warc_tempfile ();
623 if (warc_tmp == NULL)
625 free (filename_copy);
626 free (filename_basename);
630 fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE);
631 fprintf (warc_tmp, "format: WARC File Format 1.0\r\n");
632 fprintf (warc_tmp, "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n");
633 fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off"));
634 fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring);
635 /* Add the user headers, if any. */
636 if (opt.warc_user_headers)
639 for (i = 0; opt.warc_user_headers[i]; i++)
640 fprintf (warc_tmp, "%s\r\n", opt.warc_user_headers[i]);
642 fprintf(warc_tmp, "\r\n");
644 warc_write_digest_headers (warc_tmp, -1);
645 warc_write_block_from_file (warc_tmp);
646 warc_write_end_record ();
650 logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
653 free (filename_copy);
654 free (filename_basename);
656 return warc_write_ok;
659 /* Opens a new WARC file.
660 If META is true, generates a filename ending with '-meta.warc.gz' (or '-meta.warc' if compression is disabled).
663 1. close the current WARC file (if there is one);
664 2. increment warc_current_file_number;
665 3. open a new WARC file;
666 4. write the initial warcinfo record.
668 Returns true on success, false otherwise.
671 warc_start_new_file (bool meta)
/* Without a base file name WARC output is not configured.  */
673 if (opt.warc_filename == NULL)
/* Release everything that belongs to the previous WARC file.  */
676 if (warc_current_file != NULL)
677 fclose (warc_current_file);
678 if (warc_current_warcinfo_uuid_str)
679 free (warc_current_warcinfo_uuid_str);
680 if (warc_current_filename)
681 free (warc_current_filename);
683 warc_current_file_number++;
685 int base_filename_length = strlen (opt.warc_filename);
686 /* filename format: base + "-" + 5 digit serial number + ".warc.gz" */
/* NOTE(review): the size below allows for exactly five serial digits.
   "%05d" is a minimum width, not a maximum, so once
   warc_current_file_number reaches 100000 the sprintf further down
   writes one byte past the end of this buffer.  No NULL check on the
   malloc result is visible before the buffer is used either.  */
687 char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
/* The buffer is owned by warc_current_filename from here on; it is
   freed at the top of the next call.  */
688 warc_current_filename = new_filename;
690 char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc");
692 /* If max size is enabled, we add a serial number to the file names. */
694 sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension);
695 else if (opt.warc_maxsize > 0)
696 sprintf (new_filename, "%s-%05d.%s", opt.warc_filename, warc_current_file_number, extension);
698 sprintf (new_filename, "%s.%s", opt.warc_filename, extension);
700 logprintf (LOG_VERBOSE, _("Opening WARC file %s.\n\n"), quote (new_filename));
702 /* Open the WARC file. */
/* Opened for reading as well as writing: warc_write_end_record reads
   the gzip header back from this stream before rewriting it.  */
703 warc_current_file = fopen (new_filename, "wb+");
704 if (warc_current_file == NULL)
706 logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"), quote (new_filename));
/* The warcinfo record is the first record of every new file; writing it
   also sets warc_current_warcinfo_uuid_str.  */
710 if (! warc_write_warcinfo_record (new_filename))
713 /* Add warcinfo uuid to manifest. */
714 if (warc_manifest_fp)
715 fprintf (warc_manifest_fp, "%s\n", warc_current_warcinfo_uuid_str);
720 /* Opens the CDX file for output. */
722 warc_start_cdx_file ()
724 int filename_length = strlen (opt.warc_filename);
725 char *cdx_filename = alloca (filename_length + 4 + 1);
726 memcpy (cdx_filename, opt.warc_filename, filename_length);
727 memcpy (cdx_filename + filename_length, ".cdx", 5);
728 warc_current_cdx_file = fopen (cdx_filename, "a+");
729 if (warc_current_cdx_file == NULL)
732 /* Print the CDX header.
738 * k - new style checksum
741 * V - compressed arc file offset
745 fprintf (warc_current_cdx_file, " CDX a b a m s k r M V g u\n");
746 fflush (warc_current_cdx_file);
751 #define CDX_FIELDSEP " \t\r\n"
753 /* Parse the CDX header and find the field numbers of the original url,
754 checksum and record ID fields. */
756 warc_parse_cdx_header (char *lineptr, int *field_num_original_url, int *field_num_checksum, int *field_num_record_id)
758 *field_num_original_url = -1;
759 *field_num_checksum = -1;
760 *field_num_record_id = -1;
764 token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
766 if (token != NULL && strcmp (token, "CDX") == 0)
769 while (token != NULL)
771 token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
777 *field_num_original_url = field_num;
780 *field_num_checksum = field_num;
783 *field_num_record_id = field_num;
791 return *field_num_original_url != -1
792 && *field_num_checksum != -1
793 && *field_num_record_id != -1;
796 /* Parse the CDX record and add it to the warc_cdx_dedup_table hash table. */
798 warc_process_cdx_line (char *lineptr, int field_num_original_url, int field_num_checksum, int field_num_record_id)
800 char *original_url = NULL;
801 char *checksum = NULL;
802 char *record_id = NULL;
806 token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
808 /* Read this line to get the fields we need. */
810 while (token != NULL)
813 if (field_num == field_num_original_url)
815 else if (field_num == field_num_checksum)
817 else if (field_num == field_num_record_id)
823 *val = strdup (token);
825 token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
829 if (original_url != NULL && checksum != NULL && record_id != NULL)
831 /* For some extra efficiency, we decode the base32 encoded
832 checksum value. This should produce exactly SHA1_DIGEST_SIZE
836 base32_decode_alloc (checksum, strlen (checksum), &checksum_v, &checksum_l);
839 if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
841 /* This is a valid line with a valid checksum. */
842 struct warc_cdx_record * rec = malloc (sizeof (struct warc_cdx_record));
843 rec->url = original_url;
844 rec->uuid = record_id;
845 memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
846 hash_table_put (warc_cdx_dedup_table, rec->digest, rec);
852 if (checksum_v != NULL)
859 /* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
860 the warc_cdx_dedup_table. */
862 warc_load_cdx_dedup_file ()
864 FILE *f = fopen (opt.warc_cdx_dedup_filename, "r");
868 int field_num_original_url = -1;
869 int field_num_checksum = -1;
870 int field_num_record_id = -1;
872 char *lineptr = NULL;
876 /* The first line should contain the CDX header.
877 Format: " CDX x x x x x"
878 where x are field type indicators. For our purposes, we only
879 need 'a' (the original url), 'k' (the SHA1 checksum) and
880 'u' (the WARC record id). */
881 line_length = getline (&lineptr, &n, f);
882 if (line_length != -1)
883 warc_parse_cdx_header (lineptr, &field_num_original_url, &field_num_checksum, &field_num_record_id);
885 /* If the file contains all three fields, read the complete file. */
886 if (field_num_original_url == -1
887 || field_num_checksum == -1
888 || field_num_record_id == -1)
890 if (field_num_original_url == -1)
891 logprintf (LOG_NOTQUIET, _("CDX file does not list original urls. (Missing column 'a'.)\n"));
892 if (field_num_checksum == -1)
893 logprintf (LOG_NOTQUIET, _("CDX file does not list checksums. (Missing column 'k'.)\n"));
894 if (field_num_record_id == -1)
895 logprintf (LOG_NOTQUIET, _("CDX file does not list record ids. (Missing column 'u'.)\n"));
899 /* Initialize the table. */
900 warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest, warc_cmp_sha1_digest);
904 line_length = getline (&lineptr, &n, f);
905 if (line_length != -1)
906 warc_process_cdx_line (lineptr, field_num_original_url, field_num_checksum, field_num_record_id);
909 while (line_length != -1);
912 int nrecords = hash_table_count (warc_cdx_dedup_table);
913 logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
914 "Loaded %d records from CDX.\n\n", nrecords),
924 /* Returns the existing duplicate CDX record for the given url and payload
925 digest. Returns NULL if the url is not found or if the payload digest
926 does not match, or if CDX deduplication is disabled. */
927 static struct warc_cdx_record *
928 warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload)
930 if (warc_cdx_dedup_table == NULL)
934 struct warc_cdx_record *rec_existing;
935 hash_table_get_pair (warc_cdx_dedup_table, sha1_digest_payload, &key, &rec_existing);
937 if (rec_existing != NULL && strcmp (rec_existing->url, url) == 0)
943 /* Initializes the WARC writer (if opt.warc_filename is set).
944 This should be called before any WARC record is written. */
948 warc_write_ok = true;
950 if (opt.warc_filename != NULL)
952 if (opt.warc_cdx_dedup_filename != NULL)
954 if (! warc_load_cdx_dedup_file ())
956 logprintf (LOG_NOTQUIET,
957 _("Could not read CDX file %s for deduplication.\n"),
958 quote (opt.warc_cdx_dedup_filename));
963 warc_manifest_fp = warc_tempfile ();
964 if (warc_manifest_fp == NULL)
966 logprintf (LOG_NOTQUIET, _("Could not open temporary WARC manifest file.\n"));
970 if (opt.warc_keep_log)
972 warc_log_fp = warc_tempfile ();
973 if (warc_log_fp == NULL)
975 logprintf (LOG_NOTQUIET, _("Could not open temporary WARC log file.\n"));
978 log_set_warc_log_fp (warc_log_fp);
981 warc_current_file_number = -1;
982 if (! warc_start_new_file (false))
984 logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
988 if (opt.warc_cdx_enabled)
990 if (! warc_start_cdx_file ())
992 logprintf (LOG_NOTQUIET, _("Could not open CDX file for output.\n"));
999 /* Writes metadata (manifest, configuration, log file) to the WARC file. */
1001 warc_write_metadata ()
1003 /* If there are multiple WARC files, the metadata should be written to a separate file. */
1004 if (opt.warc_maxsize > 0)
1005 warc_start_new_file (true);
1007 char manifest_uuid [48];
1008 warc_uuid_str (manifest_uuid);
1010 fflush (warc_manifest_fp);
1011 warc_write_resource_record (manifest_uuid,
1012 "metadata://gnu.org/software/wget/warc/MANIFEST.txt",
1013 NULL, NULL, NULL, "text/plain",
1014 warc_manifest_fp, -1);
1015 /* warc_write_resource_record has closed warc_manifest_fp. */
1017 FILE * warc_tmp_fp = warc_tempfile ();
1018 if (warc_tmp_fp == NULL)
1020 logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n"));
1023 fflush (warc_tmp_fp);
1024 fprintf (warc_tmp_fp, "%s\n", program_argstring);
1026 warc_write_resource_record (manifest_uuid,
1027 "metadata://gnu.org/software/wget/warc/wget_arguments.txt",
1028 NULL, NULL, NULL, "text/plain",
1030 /* warc_write_resource_record has closed warc_tmp_fp. */
1032 if (warc_log_fp != NULL)
1034 warc_write_resource_record (NULL,
1035 "metadata://gnu.org/software/wget/warc/wget.log",
1036 NULL, manifest_uuid, NULL, "text/plain",
1038 /* warc_write_resource_record has closed warc_log_fp. */
1041 log_set_warc_log_fp (NULL);
1045 /* Finishes the WARC writing.
1046 This should be called at the end of the program. */
1050 if (warc_current_file != NULL)
1052 warc_write_metadata ();
1053 free (warc_current_warcinfo_uuid_str);
1054 fclose (warc_current_file);
1056 if (warc_current_cdx_file != NULL)
1057 fclose (warc_current_cdx_file);
1058 if (warc_log_fp != NULL)
1060 fclose (warc_log_fp);
1061 log_set_warc_log_fp (NULL);
1065 /* Creates a temporary file for writing WARC output.
1066 The temporary file will be created in opt.warc_tempdir.
1067 Returns the pointer to the temporary file, or NULL. */
1072 if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1)
1075 int fd = mkstemp (filename);
1079 if (unlink (filename) < 0)
1082 return fdopen (fd, "wb+");
1086 /* Writes a request record to the WARC file.
1087 url is the target uri of the request,
1088 timestamp_str is the timestamp of the request (generated with warc_timestamp),
1089 record_uuid is the uuid of the request (generated with warc_uuid_str),
1090 body is a pointer to a file containing the request headers and body,
1091 ip is the ip address of the server (or NULL).
1092 Calling this function will close body.
1093 Returns true on success, false on error. */
1095 warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, ip_address *ip, FILE *body, long int payload_offset)
1097 warc_write_start_record ();
1098 warc_write_header ("WARC-Type", "request");
1099 warc_write_header ("WARC-Target-URI", url);
1100 warc_write_header ("Content-Type", "application/http;msgtype=request");
1101 warc_write_date_header (timestamp_str);
1102 warc_write_header ("WARC-Record-ID", record_uuid);
1103 warc_write_ip_header (ip);
1104 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1105 warc_write_digest_headers (body, payload_offset);
1106 warc_write_block_from_file (body);
1107 warc_write_end_record ();
1111 return warc_write_ok;
1114 /* Writes a response record to the CDX file.
1115 url is the target uri of the request/response,
1116 timestamp_str is the timestamp of the request that generated this response,
1117 (generated with warc_timestamp),
1118 mime_type is the mime type of the response body (will be printed to CDX),
1119 response_code is the HTTP response code (will be printed to CDX),
1120 payload_digest is the sha1 digest of the payload,
1121 redirect_location is the contents of the Location: header, or NULL (will be printed to CDX),
1122 offset is the position of the WARC record in the WARC file,
1123 warc_filename is the filename of the WARC,
1124 response_uuid is the uuid of the response.
1125 Returns true on success, false on error. */
1127 warc_write_cdx_record (char *url, char *timestamp_str, char *mime_type, int response_code, char *payload_digest, char *redirect_location, size_t offset, char *warc_filename, char *response_uuid)
1129 /* Transform the timestamp. */
1130 char timestamp_str_cdx [15];
1131 memcpy (timestamp_str_cdx , timestamp_str , 4); /* "YYYY" "-" */
1132 memcpy (timestamp_str_cdx + 4, timestamp_str + 5, 2); /* "mm" "-" */
1133 memcpy (timestamp_str_cdx + 6, timestamp_str + 8, 2); /* "dd" "T" */
1134 memcpy (timestamp_str_cdx + 8, timestamp_str + 11, 2); /* "HH" ":" */
1135 memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM" ":" */
1136 memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS" "Z" */
1137 timestamp_str_cdx[14] = '\0';
1139 /* Rewrite the checksum. */
1141 if (payload_digest != NULL)
1142 checksum = payload_digest + 5; /* Skip the "sha1:" */
1146 if (mime_type == NULL || strlen(mime_type) == 0)
1148 if (redirect_location == NULL || strlen(redirect_location) == 0)
1149 redirect_location = "-";
1151 /* Print the CDX line. */
1152 fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %ld %s %s\n", url, timestamp_str_cdx, url, mime_type, response_code, checksum, redirect_location, offset, warc_current_filename, response_uuid);
1153 fflush (warc_current_cdx_file);
1158 /* Writes a revisit record to the WARC file.
1159 url is the target uri of the request/response,
1160 timestamp_str is the timestamp of the request that generated this response
1161 (generated with warc_timestamp),
1162 concurrent_to_uuid is the uuid of the request that generated this response
1163 (generated with warc_uuid_str),
1164 refers_to is the uuid of the original response
1165 (generated with warc_uuid_str),
1166 payload_digest is the sha1 digest of the payload,
1167 ip is the ip address of the server (or NULL),
1168 body is a pointer to a file containing the response headers (without payload).
1169 Calling this function will close body.
1170 Returns true on success, false on error. */
1172 warc_write_revisit_record (char *url, char *timestamp_str, char *concurrent_to_uuid, char *payload_digest, char *refers_to, ip_address *ip, FILE *body)
1174 char revisit_uuid [48];
1175 warc_uuid_str (revisit_uuid);
1177 char *block_digest = NULL;
1178 char sha1_res_block[SHA1_DIGEST_SIZE];
1179 sha1_stream (body, sha1_res_block);
1180 block_digest = warc_base32_sha1_digest (sha1_res_block);
1182 warc_write_start_record ();
1183 warc_write_header ("WARC-Type", "revisit");
1184 warc_write_header ("WARC-Record-ID", revisit_uuid);
1185 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1186 warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1187 warc_write_header ("WARC-Refers-To", refers_to);
1188 warc_write_header ("WARC-Profile", "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest");
1189 warc_write_header ("WARC-Truncated", "length");
1190 warc_write_header ("WARC-Target-URI", url);
1191 warc_write_date_header (timestamp_str);
1192 warc_write_ip_header (ip);
1193 warc_write_header ("Content-Type", "application/http;msgtype=response");
1194 warc_write_header ("WARC-Block-Digest", block_digest);
1195 warc_write_header ("WARC-Payload-Digest", payload_digest);
1196 warc_write_block_from_file (body);
1197 warc_write_end_record ();
1200 free (block_digest);
1202 return warc_write_ok;
1205 /* Writes a response record to the WARC file.
1206 url is the target uri of the request/response,
1207 timestamp_str is the timestamp of the request that generated this response
1208 (generated with warc_timestamp),
1209 concurrent_to_uuid is the uuid of the request that generated this response
1210 (generated with warc_uuid_str),
1211 ip is the ip address of the server (or NULL),
1212 body is a pointer to a file containing the response headers and body.
1213 mime_type is the mime type of the response body (will be printed to CDX),
1214 response_code is the HTTP response code (will be printed to CDX),
1215 redirect_location is the contents of the Location: header, or NULL (will be printed to CDX).
1216 Calling this function will close body.
1217 Returns true on success, false on error. */
/* payload_offset is handed to warc_sha1_stream_with_payload and, when a
   duplicate is found, is the length body is truncated to; presumably it is
   the offset in body at which the payload starts -- confirm with callers.
   The final return statement yields the file-level warc_write_ok flag.
   NOTE(review): the line numbers embedded in this listing skip (e.g. 1218,
   1220, 1244-1246, 1251-1254, 1281-1283), so the return type, the braces
   and possibly some short statements are not shown; in particular no
   fclose (body) is shown even though the contract above promises to close
   body -- confirm against the complete file. */
1219 warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, long int payload_offset, char *mime_type, int response_code, char *redirect_location)
/* Both digest strings start out NULL and are only assigned by the digest
   code below. */
1221 char *block_digest = NULL;
1222 char *payload_digest = NULL;
1223 char sha1_res_block[SHA1_DIGEST_SIZE];
1224 char sha1_res_payload[SHA1_DIGEST_SIZE];
1226 if (opt.warc_digests_enabled)
1228 /* Calculate the block and payload digests. */
/* A return value of 0 is the case in which the computed digests are used
   below. */
1230 if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload, payload_offset) == 0)
/* NOTE(review): the comment that starts on the next line has no closing
   delimiter of its own; as written it runs on to the end of the
   "Found an existing record." comment and swallows the two statements in
   between -- confirm against the complete file. */
1232 /* Decide (based on url + payload digest) if we have seen this
1234 struct warc_cdx_record *rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload);
1235 if (rec_existing != NULL)
1237 /* Found an existing record. */
1238 logprintf (LOG_VERBOSE, _("Found exact match in CDX file. Saving revisit record to WARC.\n"));
1240 /* Remove the payload from the file. */
1241 if (payload_offset > 0)
/* NOTE(review): the statement taken when ftruncate fails is not shown
   here. */
1243 if (ftruncate (fileno (body), payload_offset) == -1)
1247 /* Send the original payload digest. */
/* The digest string is freed as soon as the revisit record has been
   written; the uuid of the earlier record is passed as refers_to.
   NOTE(review): result is not referenced by any later statement shown
   here; presumably it is returned at this point -- confirm. */
1248 payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1249 bool result = warc_write_revisit_record (url, timestamp_str, concurrent_to_uuid, payload_digest, rec_existing->uuid, ip, body);
1250 free (payload_digest);
/* String forms of both digests (via warc_base32_sha1_digest) for the
   WARC-Block-Digest and WARC-Payload-Digest headers written below. */
1255 block_digest = warc_base32_sha1_digest (sha1_res_block);
1256 payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1260 /* Not a revisit, just store the record. */
1262 char response_uuid [48];
1263 warc_uuid_str (response_uuid);
/* Position at the end of the current WARC file; offset is later passed to
   warc_write_cdx_record.
   NOTE(review): neither fseek nor ftell is checked, and ftell returns a
   long (-1L on failure) that is stored in a size_t here. */
1265 fseek (warc_current_file, 0L, SEEK_END);
1266 size_t offset = ftell (warc_current_file);
/* Emit the record: start marker, headers, the block taken from body, end
   marker.  block_digest and payload_digest are still NULL here if they
   were not assigned above. */
1268 warc_write_start_record ();
1269 warc_write_header ("WARC-Type", "response");
1270 warc_write_header ("WARC-Record-ID", response_uuid);
1271 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1272 warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1273 warc_write_header ("WARC-Target-URI", url);
1274 warc_write_date_header (timestamp_str);
1275 warc_write_ip_header (ip);
1276 warc_write_header ("WARC-Block-Digest", block_digest);
1277 warc_write_header ("WARC-Payload-Digest", payload_digest);
1278 warc_write_header ("Content-Type", "application/http;msgtype=response");
1279 warc_write_block_from_file (body);
1280 warc_write_end_record ();
/* The CDX line is only written when nothing has failed so far and CDX
   output is enabled. */
1284 if (warc_write_ok && opt.warc_cdx_enabled)
1286 /* Add this record to the CDX. */
1287 warc_write_cdx_record (url, timestamp_str, mime_type, response_code, payload_digest, redirect_location, offset, warc_current_filename, response_uuid);
/* free (NULL) is a no-op, so these calls are safe when a digest string was
   never assigned. */
1291 free (block_digest);
1293 free (payload_digest);
1295 return warc_write_ok;
1298 /* Writes a resource record to the WARC file.
1299 resource_uuid is the uuid of the resource (or NULL, in which case a new one is generated),
1300 url is the target uri of the resource,
1301 timestamp_str is the timestamp (generated with warc_timestamp),
1302 concurrent_to_uuid is the uuid of the request that generated this resource
1303 (generated with warc_uuid_str) or NULL,
1304 ip is the ip address of the server (or NULL),
1305 content_type is the mime type of the body (or NULL),
1306 body is a pointer to a file containing the resource data.
1307 Calling this function will close body.
1308 Returns true on success, false on error. */
/* payload_offset is passed on, together with body, to
   warc_write_digest_headers.  The value returned is the file-level
   warc_write_ok flag; no return value of the individual warc_write_* calls
   below is inspected here.
   NOTE(review): the line numbers embedded in this listing skip (e.g. 1309,
   1311, 1313, 1333-1335), so the return type, the braces and possibly some
   short statements are not shown; in particular no fclose (body) is shown
   even though the contract above promises to close body -- confirm against
   the complete file. */
1310 warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, long int payload_offset)
/* No uuid supplied: generate one into a 48-byte buffer obtained with
   alloca, which stays valid until this function returns. */
1312 if (resource_uuid == NULL)
1314 resource_uuid = alloca (48);
1315 warc_uuid_str (resource_uuid);
/* Fall back to a generic binary type when the caller gives none. */
1318 if (content_type == NULL)
1319 content_type = "application/octet-stream";
/* Emit the record: start marker, headers, the block taken from body, end
   marker. */
1321 warc_write_start_record ();
1322 warc_write_header ("WARC-Type", "resource");
1323 warc_write_header ("WARC-Record-ID", resource_uuid);
1324 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1325 warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1326 warc_write_header ("WARC-Target-URI", url);
1327 warc_write_date_header (timestamp_str);
1328 warc_write_ip_header (ip);
/* warc_write_digest_headers is defined outside this excerpt; presumably it
   emits the block and payload digest headers for body -- confirm. */
1329 warc_write_digest_headers (body, payload_offset);
1330 warc_write_header ("Content-Type", content_type);
1331 warc_write_block_from_file (body);
1332 warc_write_end_record ();
/* warc_write_ok is documented at its definition as true until a
   warc_write_* method fails. */
1336 return warc_write_ok;