1 /* Utility functions for writing WARC files. */
19 #include <uuid/uuid.h>
24 extern char *version_string;
26 /* Set by main in main.c */
27 extern char *program_argstring;
30 /* The log file (a temporary file that contains a copy
32 static FILE *warc_log_fp;
34 /* The manifest file (a temporary file that contains the
35 warcinfo uuid of every file in this crawl). */
36 static FILE *warc_manifest_fp;
38 /* The current WARC file (or NULL, if WARC is disabled). */
39 static FILE *warc_current_file;
41 /* The gzip stream for the current WARC file
42 (or NULL, if WARC or gzip is disabled). */
43 static gzFile *warc_current_gzfile;
45 /* The offset of the current gzip record in the WARC file. */
46 static size_t warc_current_gzfile_offset;
48 /* The uncompressed size (so far) of the current record. */
49 static size_t warc_current_gzfile_uncompressed_size;
51 /* This is true until a warc_write_* method fails. */
52 static bool warc_write_ok;
54 /* The current CDX file (or NULL, if CDX is disabled). */
55 static FILE *warc_current_cdx_file;
57 /* The record id of the warcinfo record of the current WARC file. */
58 static char *warc_current_warcinfo_uuid_str;
60 /* The file name of the current WARC file. */
61 static char *warc_current_filename;
63 /* The serial number of the current WARC file. This number is
64 incremented each time a new file is opened and is used in the
65 WARC file's filename. */
66 static int warc_current_file_number;
68 /* The table of CDX records, if deduplication is enabled. */
69 struct hash_table * warc_cdx_dedup_table;
71 static bool warc_start_new_file (bool meta);
74 struct warc_cdx_record
78 char digest[SHA1_DIGEST_SIZE];
/* Hash function for the CDX deduplication table.
   KEY points to a SHA-1 digest; the hash is simply the first
   sizeof (unsigned long) bytes of that digest.  */
static unsigned long
warc_hash_sha1_digest (const void *key)
{
  unsigned long hash = 0;

  /* We just use some of the first bytes of the digest. */
  memcpy (&hash, key, sizeof hash);
  return hash;
}
91 warc_cmp_sha1_digest (const void *digest1, const void *digest2)
93 return !memcmp (digest1, digest2, SHA1_DIGEST_SIZE);
98 /* Writes SIZE bytes from BUFFER to the current WARC file,
99 through gzwrite if compression is enabled.
100 Returns the number of uncompressed bytes written. */
102 warc_write_buffer (const char *buffer, size_t size)
104 if (warc_current_gzfile)
106 warc_current_gzfile_uncompressed_size += size;
107 return gzwrite (warc_current_gzfile, buffer, size);
110 return fwrite (buffer, 1, size, warc_current_file);
113 /* Writes STR to the current WARC file.
114 Returns false and set warc_write_ok to false if there
117 warc_write_string (const char *str)
122 size_t n = strlen (str);
123 if (n != warc_write_buffer (str, n))
124 warc_write_ok = false;
126 return warc_write_ok;
130 #define EXTRA_GZIP_HEADER_SIZE 12
131 #define GZIP_STATIC_HEADER_SIZE 10
132 #define FLG_FEXTRA 0x04
135 /* Starts a new WARC record. Writes the version header.
136 If opt.warc_maxsize is set and the current file is becoming
137 too large, this will open a new WARC file.
139 If compression is enabled, this will start a new
140 gzip stream in the current WARC file.
142 Returns false and set warc_write_ok to false if there
145 warc_write_start_record ()
150 fflush (warc_current_file);
151 if (opt.warc_maxsize > 0 && ftell (warc_current_file) >= opt.warc_maxsize)
152 warc_start_new_file (false);
154 /* Start a GZIP stream, if required. */
155 if (opt.warc_compression_enabled)
157 /* Record the starting offset of the new record. */
158 warc_current_gzfile_offset = ftell (warc_current_file);
160 /* Reserve space for the extra GZIP header field.
161 In warc_write_end_record we will fill this space
162 with information about the uncompressed and
163 compressed size of the record. */
164 fprintf (warc_current_file, "XXXXXXXXXXXX");
165 fflush (warc_current_file);
167 /* Start a new GZIP stream. */
168 warc_current_gzfile = gzdopen (dup (fileno (warc_current_file)), "wb+9");
169 warc_current_gzfile_uncompressed_size = 0;
171 if (warc_current_gzfile == NULL)
173 logprintf (LOG_NOTQUIET, _("Error opening GZIP stream to WARC file.\n"));
174 warc_write_ok = false;
179 warc_write_string ("WARC/1.0\r\n");
180 return warc_write_ok;
183 /* Writes a WARC header to the current WARC record.
184 This method may be run after warc_write_start_record and
185 before warc_write_block_from_file. */
187 warc_write_header (const char *name, const char *value)
191 warc_write_string (name);
192 warc_write_string (": ");
193 warc_write_string (value);
194 warc_write_string ("\r\n");
196 return warc_write_ok;
199 /* Copies the contents of DATA_IN to the WARC record.
200 Adds a Content-Length header to the WARC record.
201 Run this method after warc_write_header,
202 then run warc_write_end_record. */
204 warc_write_block_from_file (FILE *data_in)
206 /* Add the Content-Length header. */
207 char *content_length;
208 fseek (data_in, 0L, SEEK_END);
209 if (! asprintf (&content_length, "%ld", ftell (data_in)))
211 warc_write_ok = false;
214 warc_write_header ("Content-Length", content_length);
215 free (content_length);
217 /* End of the WARC header section. */
218 warc_write_string ("\r\n");
220 if (fseek (data_in, 0L, SEEK_SET) != 0)
221 warc_write_ok = false;
223 /* Copy the data in the file to the WARC record. */
226 while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0)
228 if (warc_write_buffer (buffer, s) < s)
229 warc_write_ok = false;
232 return warc_write_ok;
235 /* Run this method to close the current WARC record.
237 If compression is enabled, this method closes the
238 current GZIP stream and fills the extra GZIP header
239 with the uncompressed and compressed length of the
242 warc_write_end_record ()
244 warc_write_buffer ("\r\n\r\n", 4);
246 /* We start a new gzip stream for each record. */
247 if (warc_write_ok && warc_current_gzfile)
249 if (gzclose (warc_current_gzfile) != Z_OK)
251 warc_write_ok = false;
255 fflush (warc_current_file);
256 fseek (warc_current_file, 0, SEEK_END);
258 /* The WARC standard suggests that we add 'skip length' data in the
259 extra header field of the GZIP stream.
261 In warc_write_start_record we reserved space for this extra header.
262 This extra space starts at warc_current_gzfile_offset and fills
263 EXTRA_GZIP_HEADER_SIZE bytes. The static GZIP header starts at
264 warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE.
266 We need to do three things:
267 1. Move the static GZIP header to warc_current_gzfile_offset;
268 2. Set the FEXTRA flag in the GZIP header;
269 3. Write the extra GZIP header after the static header, that is,
270 starting at warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE.
273 /* Calculate the uncompressed and compressed sizes. */
274 size_t current_offset = ftell (warc_current_file);
275 size_t uncompressed_size = current_offset - warc_current_gzfile_offset;
276 size_t compressed_size = warc_current_gzfile_uncompressed_size;
278 /* Go back to the static GZIP header. */
279 fseek (warc_current_file, warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
281 /* Read the header. */
282 char static_header[GZIP_STATIC_HEADER_SIZE];
283 size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
284 if (result != GZIP_STATIC_HEADER_SIZE)
286 warc_write_ok = false;
290 /* Set the FEXTRA flag in the flags byte of the header. */
291 static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
293 /* Write the header back to the file, but starting at warc_current_gzfile_offset. */
294 fseek (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
295 fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
297 /* Prepare the extra GZIP header. */
298 char extra_header[EXTRA_GZIP_HEADER_SIZE];
299 /* XLEN, the length of the extra header fields. */
300 extra_header[0] = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
301 extra_header[1] = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
302 /* The extra header field identifier for the WARC skip length. */
303 extra_header[2] = 's';
304 extra_header[3] = 'l';
305 /* The size of the uncompressed record. */
306 extra_header[4] = (uncompressed_size & 255);
307 extra_header[5] = (uncompressed_size >> 8) & 255;
308 extra_header[6] = (uncompressed_size >> 16) & 255;
309 extra_header[7] = (uncompressed_size >> 24) & 255;
310 /* The size of the compressed record. */
311 extra_header[8] = (compressed_size & 255);
312 extra_header[9] = (compressed_size >> 8) & 255;
313 extra_header[10] = (compressed_size >> 16) & 255;
314 extra_header[11] = (compressed_size >> 24) & 255;
316 /* Write the extra header after the static header. */
317 fseek (warc_current_file, warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
318 fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
320 /* Done, move back to the end of the file. */
321 fflush (warc_current_file);
322 fseek (warc_current_file, 0, SEEK_END);
325 return warc_write_ok;
/* Writes the WARC-Date header for the given timestamp to
   the current WARC record.
   If timestamp is NULL, the current time will be used.
   Returns the result of warc_write_header.  */
static bool
warc_write_date_header (char *timestamp)
{
  /* This buffer must live at function scope: TIMESTAMP may point into
     it when warc_write_header is called below, so it must not be a
     local of the inner block.  */
  char current_timestamp[21];

  if (timestamp == NULL)
    {
      warc_timestamp (current_timestamp);
      timestamp = current_timestamp;
    }
  return warc_write_header ("WARC-Date", timestamp);
}
344 /* Writes the WARC-IP-Address header for the given IP to
345 the current WARC record. If IP is NULL, no header will
348 warc_write_ip_header (ip_address *ip)
351 return warc_write_header ("WARC-IP-Address", print_address (ip));
353 return warc_write_ok;
357 /* warc_sha1_stream_with_payload is a modified copy of sha1_stream
358 from gnulib/sha1.c. This version calculates two digests in one go.
360 Compute SHA1 message digests for bytes read from STREAM. The
361 digest of the complete file will be written into the SHA1_DIGEST_SIZE
362 bytes beginning at RES_BLOCK.
364 If payload_offset >= 0, a second digest will be calculated of the
365 portion of the file starting at payload_offset and continuing to
366 the end of the file. The payload digest will be written into the
367 SHA1_DIGEST_SIZE bytes beginning at RES_PAYLOAD. */
369 warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, long int payload_offset)
371 #define BLOCKSIZE 32768
373 struct sha1_ctx ctx_block;
374 struct sha1_ctx ctx_payload;
378 char *buffer = malloc (BLOCKSIZE + 72);
382 /* Initialize the computation context. */
383 sha1_init_ctx (&ctx_block);
384 if (payload_offset >= 0)
385 sha1_init_ctx (&ctx_payload);
389 /* Iterate over full file contents. */
392 /* We read the file in blocks of BLOCKSIZE bytes. One call of the
393 computation function processes the whole buffer so that with the
394 next round of the loop another block can be read. */
398 /* Read block. Take care for partial reads. */
401 n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);
406 if (sum == BLOCKSIZE)
411 /* Check for the error flag IFF N == 0, so that we don't
412 exit the loop after a partial read due to e.g., EAGAIN
419 goto process_partial_block;
422 /* We've read at least one byte, so ignore errors. But always
423 check for EOF, since feof may be true even though N > 0.
424 Otherwise, we could end up calling fread after EOF. */
426 goto process_partial_block;
429 /* Process buffer with BLOCKSIZE bytes. Note that
432 sha1_process_block (buffer, BLOCKSIZE, &ctx_block);
433 if (payload_offset >= 0 && payload_offset < pos)
435 /* At least part of the buffer contains data from payload. */
436 int start_of_payload = payload_offset - (pos - BLOCKSIZE);
437 if (start_of_payload <= 0)
438 /* All bytes in the buffer belong to the payload. */
439 start_of_payload = 0;
441 /* Process the payload part of the buffer.
442 Note: we can't use sha1_process_block here even if we
443 process the complete buffer. Because the payload doesn't
444 have to start with a full block, there may still be some
445 bytes left from the previous buffer. Therefore, we need
446 to continue with sha1_process_bytes. */
447 sha1_process_bytes (buffer + start_of_payload, BLOCKSIZE - start_of_payload, &ctx_payload);
451 process_partial_block:;
453 /* Process any remaining bytes. */
456 sha1_process_bytes (buffer, sum, &ctx_block);
457 if (payload_offset >= 0 && payload_offset < pos)
459 /* At least part of the buffer contains data from payload. */
460 int start_of_payload = payload_offset - (pos - sum);
461 if (start_of_payload <= 0)
462 /* All bytes in the buffer belong to the payload. */
463 start_of_payload = 0;
465 /* Process the payload part of the buffer. */
466 sha1_process_bytes (buffer + start_of_payload, sum - start_of_payload, &ctx_payload);
470 /* Construct result in desired memory. */
471 sha1_finish_ctx (&ctx_block, res_block);
472 if (payload_offset >= 0)
473 sha1_finish_ctx (&ctx_payload, res_payload);
480 /* Converts the SHA1 digest to a base32-encoded string.
481 "sha1:DIGEST\0" (Allocates a new string for the response.) */
483 warc_base32_sha1_digest (char *sha1_digest)
485 // length: "sha1:" + digest + "\0"
486 char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 );
487 base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5, BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1);
488 memcpy (sha1_base32, "sha1:", 5);
489 sha1_base32[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5] = '\0';
494 /* Sets the digest headers of the record.
495 This method will calculate the block digest and, if payload_offset >= 0,
496 will also calculate the payload digest of the payload starting at the
499 warc_write_digest_headers (FILE *file, long payload_offset)
501 if (opt.warc_digests_enabled)
503 /* Calculate the block and payload digests. */
504 char sha1_res_block[SHA1_DIGEST_SIZE];
505 char sha1_res_payload[SHA1_DIGEST_SIZE];
508 if (warc_sha1_stream_with_payload (file, sha1_res_block, sha1_res_payload, payload_offset) == 0)
512 digest = warc_base32_sha1_digest (sha1_res_block);
513 warc_write_header ("WARC-Block-Digest", digest);
516 if (payload_offset >= 0)
518 digest = warc_base32_sha1_digest (sha1_res_payload);
519 warc_write_header ("WARC-Payload-Digest", digest);
/* Fills timestamp with the current time and date.
   The UTC time is formatted as "YYYY-MM-DDTHH:MM:SSZ" (ISO 8601),
   as required for use in the WARC-Date header.
   TIMESTAMP must have room for 21 bytes: for a four-digit year the
   text is 20 characters long, followed by the terminating NUL.  */
void
warc_timestamp (char *timestamp)
{
  time_t now = time (NULL);
  struct tm *utc = gmtime (&now);

  strftime (timestamp, 21, "%Y-%m-%dT%H:%M:%SZ", utc);
}
541 /* Fills uuid_str with a UUID based on random numbers.
542 (See RFC 4122, UUID version 4.)
544 Note: this is a fallback method, it is much better to use the
545 methods provided by libuuid.
547 The uuid_str will be 36 characters long. */
549 warc_uuid_random (char *uuid_str)
551 // RFC 4122, a version 4 UUID with only random numbers
553 unsigned char uuid_data[16];
556 uuid_data[i] = random_number (255);
558 // Set the four most significant bits (bits 12 through 15) of the
559 // time_hi_and_version field to the 4-bit version number
560 uuid_data[6] = (uuid_data[6] & 0x0F) | 0x40;
562 // Set the two most significant bits (bits 6 and 7) of the
563 // clock_seq_hi_and_reserved to zero and one, respectively.
564 uuid_data[8] = (uuid_data[8] & 0xBF) | 0x80;
567 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
568 uuid_data[0], uuid_data[1], uuid_data[2], uuid_data[3], uuid_data[4],
569 uuid_data[5], uuid_data[6], uuid_data[7], uuid_data[8], uuid_data[9],
570 uuid_data[10], uuid_data[11], uuid_data[12], uuid_data[13], uuid_data[14],
574 /* Fills urn_str with a UUID in the format required
575 for the WARC-Record-Id header.
576 The string will be 47 characters long. */
578 warc_uuid_str (char *urn_str)
584 uuid_generate (record_id);
585 uuid_unparse (record_id, uuid_str);
587 warc_uuid_random (uuid_str);
590 sprintf (urn_str, "<urn:uuid:%s>", uuid_str);
593 /* Write a warcinfo record to the current file.
594 Updates warc_current_warcinfo_uuid_str. */
596 warc_write_warcinfo_record (char *filename)
598 /* Write warc-info record as the first record of the file. */
599 /* We add the record id of this info record to the other records in the file. */
600 warc_current_warcinfo_uuid_str = (char *) malloc (48);
601 warc_uuid_str (warc_current_warcinfo_uuid_str);
604 warc_timestamp (timestamp);
606 char *filename_copy, *filename_basename;
607 filename_copy = strdup (filename);
608 filename_basename = basename (filename_copy);
610 warc_write_start_record ();
611 warc_write_header ("WARC-Type", "warcinfo");
612 warc_write_header ("Content-Type", "application/warc-fields");
613 warc_write_header ("WARC-Date", timestamp);
614 warc_write_header ("WARC-Record-ID", warc_current_warcinfo_uuid_str);
615 warc_write_header ("WARC-Filename", filename_basename);
617 /* Create content. */
618 FILE *warc_tmp = warc_tempfile ();
619 if (warc_tmp == NULL)
621 free (filename_copy);
625 fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE);
626 fprintf (warc_tmp, "format: WARC File Format 1.0\r\n");
627 fprintf (warc_tmp, "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n");
628 fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off"));
629 fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring);
630 /* Add the user headers, if any. */
631 if (opt.warc_user_headers)
634 for (i = 0; opt.warc_user_headers[i]; i++)
635 fprintf (warc_tmp, "%s\r\n", opt.warc_user_headers[i]);
637 fprintf(warc_tmp, "\r\n");
639 warc_write_digest_headers (warc_tmp, -1);
640 warc_write_block_from_file (warc_tmp);
641 warc_write_end_record ();
645 logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
648 free (filename_copy);
650 return warc_write_ok;
653 /* Opens a new WARC file.
654 If META is true, generates a filename ending with 'meta.warc.gz'.
657 1. close the current WARC file (if there is one);
658 2. increment warc_current_file_number;
659 3. open a new WARC file;
660 4. write the initial warcinfo record.
662 Returns true on success, false otherwise.
665 warc_start_new_file (bool meta)
667 if (opt.warc_filename == NULL)
670 if (warc_current_file != NULL)
671 fclose (warc_current_file);
672 if (warc_current_warcinfo_uuid_str)
673 free (warc_current_warcinfo_uuid_str);
674 if (warc_current_filename)
675 free (warc_current_filename);
677 warc_current_file_number++;
679 int base_filename_length = strlen (opt.warc_filename);
680 /* filename format: base + "-" + 5 digit serial number + ".warc.gz" */
681 char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
682 warc_current_filename = new_filename;
684 char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc");
686 /* If max size is enabled, we add a serial number to the file names. */
688 sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension);
689 else if (opt.warc_maxsize > 0)
690 sprintf (new_filename, "%s-%05d.%s", opt.warc_filename, warc_current_file_number, extension);
692 sprintf (new_filename, "%s.%s", opt.warc_filename, extension);
694 logprintf (LOG_VERBOSE, _("Opening WARC file %s.\n\n"), quote (new_filename));
696 /* Open the WARC file. */
697 warc_current_file = fopen (new_filename, "wb+");
698 if (warc_current_file == NULL)
700 logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"), quote (new_filename));
704 if (! warc_write_warcinfo_record (new_filename))
707 /* Add warcinfo uuid to manifest. */
708 if (warc_manifest_fp)
709 fprintf (warc_manifest_fp, "%s\n", warc_current_warcinfo_uuid_str);
714 /* Opens the CDX file for output. */
716 warc_start_cdx_file ()
718 int filename_length = strlen (opt.warc_filename);
719 char *cdx_filename = alloca (filename_length + 4 + 1);
720 memcpy (cdx_filename, opt.warc_filename, filename_length);
721 memcpy (cdx_filename + filename_length, ".cdx", 5);
722 warc_current_cdx_file = fopen (cdx_filename, "a+");
723 if (warc_current_cdx_file == NULL)
726 /* Print the CDX header.
732 * k - new style checksum
735 * V - compressed arc file offset
739 fprintf (warc_current_cdx_file, " CDX a b a m s k r M V g u\n");
740 fflush (warc_current_cdx_file);
745 #define CDX_FIELDSEP " \t\r\n"
747 /* Parse the CDX header and find the field numbers of the original url,
748 checksum and record ID fields. */
750 warc_parse_cdx_header (char *lineptr, int *field_num_original_url, int *field_num_checksum, int *field_num_record_id)
752 *field_num_original_url = -1;
753 *field_num_checksum = -1;
754 *field_num_record_id = -1;
758 token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
760 if (token != NULL && strcmp (token, "CDX") == 0)
763 while (token != NULL)
765 token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
771 *field_num_original_url = field_num;
774 *field_num_checksum = field_num;
777 *field_num_record_id = field_num;
785 return *field_num_original_url != -1
786 && *field_num_checksum != -1
787 && *field_num_record_id != -1;
790 /* Parse the CDX record and add it to the warc_cdx_dedup_table hash table. */
792 warc_process_cdx_line (char *lineptr, int field_num_original_url, int field_num_checksum, int field_num_record_id)
794 char *original_url = NULL;
795 char *checksum = NULL;
796 char *record_id = NULL;
800 token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
802 /* Read this line to get the fields we need. */
804 while (token != NULL)
807 if (field_num == field_num_original_url)
809 else if (field_num == field_num_checksum)
811 else if (field_num == field_num_record_id)
817 *val = strdup (token);
819 token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
823 if (original_url != NULL && checksum != NULL && record_id != NULL)
825 /* For some extra efficiency, we decode the base32 encoded
826 checksum value. This should produce exactly SHA1_DIGEST_SIZE
830 base32_decode_alloc (checksum, strlen (checksum), &checksum_v, &checksum_l);
833 if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
835 /* This is a valid line with a valid checksum. */
836 struct warc_cdx_record * rec = malloc (sizeof (struct warc_cdx_record));
837 rec->url = original_url;
838 rec->uuid = record_id;
839 memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
840 hash_table_put (warc_cdx_dedup_table, rec->digest, rec);
846 if (checksum_v != NULL)
853 /* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
854 the warc_cdx_dedup_table. */
856 warc_load_cdx_dedup_file ()
858 FILE *f = fopen (opt.warc_cdx_dedup_filename, "r");
862 int field_num_original_url = -1;
863 int field_num_checksum = -1;
864 int field_num_record_id = -1;
866 char *lineptr = NULL;
870 /* The first line should contain the CDX header.
871 Format: " CDX x x x x x"
872 where x are field type indicators. For our purposes, we only
873 need 'a' (the original url), 'k' (the SHA1 checksum) and
874 'u' (the WARC record id). */
875 line_length = getline (&lineptr, &n, f);
876 if (line_length != -1)
877 warc_parse_cdx_header (lineptr, &field_num_original_url, &field_num_checksum, &field_num_record_id);
879 /* If the file contains all three fields, read the complete file. */
880 if (field_num_original_url == -1
881 || field_num_checksum == -1
882 || field_num_record_id == -1)
884 if (field_num_original_url == -1)
885 logprintf (LOG_NOTQUIET, _("CDX file does not list original urls. (Missing column 'a'.)\n"));
886 if (field_num_checksum == -1)
887 logprintf (LOG_NOTQUIET, _("CDX file does not list checksums. (Missing column 'k'.)\n"));
888 if (field_num_record_id == -1)
889 logprintf (LOG_NOTQUIET, _("CDX file does not list record ids. (Missing column 'u'.)\n"));
893 /* Initialize the table. */
894 warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest, warc_cmp_sha1_digest);
898 line_length = getline (&lineptr, &n, f);
899 if (line_length != -1)
900 warc_process_cdx_line (lineptr, field_num_original_url, field_num_checksum, field_num_record_id);
903 while (line_length != -1);
906 int nrecords = hash_table_count (warc_cdx_dedup_table);
907 logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
908 "Loaded %d records from CDX.\n\n", nrecords),
918 /* Returns the existing duplicate CDX record for the given url and payload
919 digest. Returns NULL if the url is not found or if the payload digest
920 does not match, or if CDX deduplication is disabled. */
921 static struct warc_cdx_record *
922 warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload)
924 if (warc_cdx_dedup_table == NULL)
928 struct warc_cdx_record *rec_existing;
929 hash_table_get_pair (warc_cdx_dedup_table, sha1_digest_payload, &key, &rec_existing);
931 if (rec_existing != NULL && strcmp (rec_existing->url, url) == 0)
937 /* Initializes the WARC writer (if opt.warc_filename is set).
938 This should be called before any WARC record is written. */
942 warc_write_ok = true;
944 if (opt.warc_filename != NULL)
946 if (opt.warc_cdx_dedup_filename != NULL)
948 if (! warc_load_cdx_dedup_file ())
950 logprintf (LOG_NOTQUIET,
951 _("Could not read CDX file %s for deduplication.\n"),
952 quote (opt.warc_cdx_dedup_filename));
957 warc_manifest_fp = warc_tempfile ();
958 if (warc_manifest_fp == NULL)
960 logprintf (LOG_NOTQUIET, _("Could not open temporary WARC manifest file.\n"));
964 if (opt.warc_keep_log)
966 warc_log_fp = warc_tempfile ();
967 if (warc_log_fp == NULL)
969 logprintf (LOG_NOTQUIET, _("Could not open temporary WARC log file.\n"));
972 log_set_warc_log_fp (warc_log_fp);
975 warc_current_file_number = -1;
976 if (! warc_start_new_file (false))
978 logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
982 if (opt.warc_cdx_enabled)
984 if (! warc_start_cdx_file ())
986 logprintf (LOG_NOTQUIET, _("Could not open CDX file for output.\n"));
993 /* Writes metadata (manifest, configuration, log file) to the WARC file. */
995 warc_write_metadata ()
997 /* If there are multiple WARC files, the metadata should be written to a separate file. */
998 if (opt.warc_maxsize > 0)
999 warc_start_new_file (true);
1001 char manifest_uuid [48];
1002 warc_uuid_str (manifest_uuid);
1004 fflush (warc_manifest_fp);
1005 warc_write_resource_record (manifest_uuid,
1006 "metadata://gnu.org/software/wget/warc/MANIFEST.txt",
1007 NULL, NULL, NULL, "text/plain",
1008 warc_manifest_fp, -1);
1009 /* warc_write_resource_record has closed warc_manifest_fp. */
1011 FILE * warc_tmp_fp = warc_tempfile ();
1012 if (warc_tmp_fp == NULL)
1014 logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n"));
1017 fflush (warc_tmp_fp);
1018 fprintf (warc_tmp_fp, "%s\n", program_argstring);
1020 warc_write_resource_record (manifest_uuid,
1021 "metadata://gnu.org/software/wget/warc/wget_arguments.txt",
1022 NULL, NULL, NULL, "text/plain",
1024 /* warc_write_resource_record has closed warc_tmp_fp. */
1026 if (warc_log_fp != NULL)
1028 warc_write_resource_record (NULL,
1029 "metadata://gnu.org/software/wget/warc/wget.log",
1030 NULL, manifest_uuid, NULL, "text/plain",
1032 /* warc_write_resource_record has closed warc_log_fp. */
1035 log_set_warc_log_fp (NULL);
1039 /* Finishes the WARC writing.
1040 This should be called at the end of the program. */
1044 if (warc_current_file != NULL)
1046 warc_write_metadata ();
1047 free (warc_current_warcinfo_uuid_str);
1048 fclose (warc_current_file);
1050 if (warc_current_cdx_file != NULL)
1051 fclose (warc_current_cdx_file);
1052 if (warc_log_fp != NULL)
1054 fclose (warc_log_fp);
1055 log_set_warc_log_fp (NULL);
1059 /* Creates a temporary file for writing WARC output.
1060 The temporary file will be created in opt.warc_tempdir.
1061 Returns the pointer to the temporary file, or NULL. */
1066 if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1)
1069 int fd = mkstemp (filename);
1073 if (unlink (filename) < 0)
1076 return fdopen (fd, "wb+");
1080 /* Writes a request record to the WARC file.
1081 url is the target uri of the request,
1082 timestamp_str is the timestamp of the request (generated with warc_timestamp),
1083 record_uuid is the uuid of the request (generated with warc_uuid_str),
1084 body is a pointer to a file containing the request headers and body.
1085 ip is the ip address of the server (or NULL),
1086 Calling this function will close body.
1087 Returns true on success, false on error. */
1089 warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, ip_address *ip, FILE *body, long int payload_offset)
1091 warc_write_start_record ();
1092 warc_write_header ("WARC-Type", "request");
1093 warc_write_header ("WARC-Target-URI", url);
1094 warc_write_header ("Content-Type", "application/http;msgtype=request");
1095 warc_write_date_header (timestamp_str);
1096 warc_write_header ("WARC-Record-ID", record_uuid);
1097 warc_write_ip_header (ip);
1098 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1099 warc_write_digest_headers (body, payload_offset);
1100 warc_write_block_from_file (body);
1101 warc_write_end_record ();
1105 return warc_write_ok;
1108 /* Writes a response record to the CDX file.
1109 url is the target uri of the request/response,
1110 timestamp_str is the timestamp of the request that generated this response,
1111 (generated with warc_timestamp),
1112 mime_type is the mime type of the response body (will be printed to CDX),
1113 response_code is the HTTP response code (will be printed to CDX),
1114 payload_digest is the sha1 digest of the payload,
1115 redirect_location is the contents of the Location: header, or NULL (will be printed to CDX),
1116 offset is the position of the WARC record in the WARC file,
1117 warc_filename is the filename of the WARC,
1118 response_uuid is the uuid of the response.
1119 Returns true on success, false on error. */
1121 warc_write_cdx_record (char *url, char *timestamp_str, char *mime_type, int response_code, char *payload_digest, char *redirect_location, size_t offset, char *warc_filename, char *response_uuid)
1123 /* Transform the timestamp. */
1124 char timestamp_str_cdx [15];
1125 memcpy (timestamp_str_cdx , timestamp_str , 4); /* "YYYY" "-" */
1126 memcpy (timestamp_str_cdx + 4, timestamp_str + 5, 2); /* "mm" "-" */
1127 memcpy (timestamp_str_cdx + 6, timestamp_str + 8, 2); /* "dd" "T" */
1128 memcpy (timestamp_str_cdx + 8, timestamp_str + 11, 2); /* "HH" ":" */
1129 memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM" ":" */
1130 memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS" "Z" */
1131 timestamp_str_cdx[14] = '\0';
1133 /* Rewrite the checksum. */
1135 if (payload_digest != NULL)
1136 checksum = payload_digest + 5; /* Skip the "sha1:" */
1140 if (mime_type == NULL || strlen(mime_type) == 0)
1142 if (redirect_location == NULL || strlen(redirect_location) == 0)
1143 redirect_location = "-";
1145 /* Print the CDX line. */
1146 fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %ld %s %s\n", url, timestamp_str_cdx, url, mime_type, response_code, checksum, redirect_location, offset, warc_current_filename, response_uuid);
1147 fflush (warc_current_cdx_file);
1152 /* Writes a revisit record to the WARC file.
1153 url is the target uri of the request/response,
1154 timestamp_str is the timestamp of the request that generated this response
1155 (generated with warc_timestamp),
1156 concurrent_to_uuid is the uuid of the request that generated this response
1157 (generated with warc_uuid_str),
1158 refers_to_uuid is the uuid of the original response
1159 (generated with warc_uuid_str),
1160 payload_digest is the sha1 digest of the payload,
1161 ip is the ip address of the server (or NULL),
1162 body is a pointer to a file containing the response headers (without payload).
1163 Calling this function will close body.
1164 Returns true on success, false on error. */
1166 warc_write_revisit_record (char *url, char *timestamp_str, char *concurrent_to_uuid, char *payload_digest, char *refers_to, ip_address *ip, FILE *body)
1168 char revisit_uuid [48];
1169 warc_uuid_str (revisit_uuid);
1171 char *block_digest = NULL;
1172 char sha1_res_block[SHA1_DIGEST_SIZE];
1173 sha1_stream (body, sha1_res_block);
1174 block_digest = warc_base32_sha1_digest (sha1_res_block);
1176 warc_write_start_record ();
1177 warc_write_header ("WARC-Type", "revisit");
1178 warc_write_header ("WARC-Record-ID", revisit_uuid);
1179 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1180 warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1181 warc_write_header ("WARC-Refers-To", refers_to);
1182 warc_write_header ("WARC-Profile", "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest");
1183 warc_write_header ("WARC-Truncated", "length");
1184 warc_write_header ("WARC-Target-URI", url);
1185 warc_write_date_header (timestamp_str);
1186 warc_write_ip_header (ip);
1187 warc_write_header ("Content-Type", "application/http;msgtype=response");
1188 warc_write_header ("WARC-Block-Digest", block_digest);
1189 warc_write_header ("WARC-Payload-Digest", payload_digest);
1190 warc_write_block_from_file (body);
1191 warc_write_end_record ();
1194 free (block_digest);
1196 return warc_write_ok;
1199 /* Writes a response record to the WARC file.
1200 url is the target uri of the request/response,
1201 timestamp_str is the timestamp of the request that generated this response
1202 (generated with warc_timestamp),
1203 concurrent_to_uuid is the uuid of the request for that generated this response
1204 (generated with warc_uuid_str),
1205 ip is the ip address of the server (or NULL),
1206 body is a pointer to a file containing the response headers and body.
1207 mime_type is the mime type of the response body (will be printed to CDX),
1208 response_code is the HTTP response code (will be printed to CDX),
1209 redirect_location is the contents of the Location: header, or NULL (will be printed to CDX),
1210 Calling this function will close body.
1211 Returns true on success, false on error. */
1213 warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, long int payload_offset, char *mime_type, int response_code, char *redirect_location)
1215 char *block_digest = NULL;
1216 char *payload_digest = NULL;
1217 char sha1_res_block[SHA1_DIGEST_SIZE];
1218 char sha1_res_payload[SHA1_DIGEST_SIZE];
1220 if (opt.warc_digests_enabled)
1222 /* Calculate the block and payload digests. */
1224 if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload, payload_offset) == 0)
1226 /* Decide (based on url + payload digest) if we have seen this
1228 struct warc_cdx_record *rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload);
1229 if (rec_existing != NULL)
1231 /* Found an existing record. */
1232 logprintf (LOG_VERBOSE, _("Found exact match in CDX file. Saving revisit record to WARC.\n"));
1234 /* Remove the payload from the file. */
1235 if (payload_offset > 0)
1237 if (ftruncate (fileno (body), payload_offset) == -1)
1241 /* Send the original payload digest. */
1242 payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1243 bool result = warc_write_revisit_record (url, timestamp_str, concurrent_to_uuid, payload_digest, rec_existing->uuid, ip, body);
1244 free (payload_digest);
1249 block_digest = warc_base32_sha1_digest (sha1_res_block);
1250 payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1254 /* Not a revisit, just store the record. */
1256 char response_uuid [48];
1257 warc_uuid_str (response_uuid);
1259 fseek (warc_current_file, 0L, SEEK_END);
1260 size_t offset = ftell (warc_current_file);
1262 warc_write_start_record ();
1263 warc_write_header ("WARC-Type", "response");
1264 warc_write_header ("WARC-Record-ID", response_uuid);
1265 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1266 warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1267 warc_write_header ("WARC-Target-URI", url);
1268 warc_write_date_header (timestamp_str);
1269 warc_write_ip_header (ip);
1270 warc_write_header ("WARC-Block-Digest", block_digest);
1271 warc_write_header ("WARC-Payload-Digest", payload_digest);
1272 warc_write_header ("Content-Type", "application/http;msgtype=response");
1273 warc_write_block_from_file (body);
1274 warc_write_end_record ();
1278 if (warc_write_ok && opt.warc_cdx_enabled)
1280 /* Add this record to the CDX. */
1281 warc_write_cdx_record (url, timestamp_str, mime_type, response_code, payload_digest, redirect_location, offset, warc_current_filename, response_uuid);
1285 free (block_digest);
1287 free (payload_digest);
1289 return warc_write_ok;
1292 /* Writes a resource record to the WARC file.
1293 resource_uuid is the uuid of the resource (or NULL),
1294 url is the target uri of the resource,
1295 timestamp_str is the timestamp (generated with warc_timestamp),
1296 concurrent_to_uuid is the uuid of the request for that generated this resource
1297 (generated with warc_uuid_str) or NULL,
1298 ip is the ip address of the server (or NULL),
1299 content_type is the mime type of the body (or NULL),
1300 body is a pointer to a file containing the resource data.
1301 Calling this function will close body.
1302 Returns true on success, false on error. */
1304 warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, long int payload_offset)
1306 if (resource_uuid == NULL)
1308 resource_uuid = alloca (48);
1309 warc_uuid_str (resource_uuid);
1312 if (content_type == NULL)
1313 content_type = "application/octet-stream";
1315 warc_write_start_record ();
1316 warc_write_header ("WARC-Type", "resource");
1317 warc_write_header ("WARC-Record-ID", resource_uuid);
1318 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1319 warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1320 warc_write_header ("WARC-Target-URI", url);
1321 warc_write_date_header (timestamp_str);
1322 warc_write_ip_header (ip);
1323 warc_write_digest_headers (body, payload_offset);
1324 warc_write_header ("Content-Type", content_type);
1325 warc_write_block_from_file (body);
1326 warc_write_end_record ();
1330 return warc_write_ok;