1 /* Utility functions for writing WARC files. */
21 #include <uuid/uuid.h>
30 extern char *version_string;
32 /* Set by main in main.c */
33 extern char *program_argstring;
36 /* The log file (a temporary file that contains a copy
38 static FILE *warc_log_fp;
40 /* The manifest file (a temporary file that contains the
41 warcinfo uuid of every file in this crawl). */
42 static FILE *warc_manifest_fp;
44 /* The current WARC file (or NULL, if WARC is disabled). */
45 static FILE *warc_current_file;
48 /* The gzip stream for the current WARC file
49 (or NULL, if WARC or gzip is disabled). */
// NOTE(review): zlib declares gzFile as an opaque pointer typedef, and
// gzdopen(), gzwrite() and gzclose() return or accept a plain gzFile.
// This variable is assigned the result of gzdopen() and is passed directly
// to gzwrite() and gzclose() elsewhere in this file, so the extra level of
// indirection looks unintended; `static gzFile warc_current_gzfile;` is
// probably what was meant -- confirm against the zlib headers in use.
50 static gzFile *warc_current_gzfile;
52 /* The offset of the current gzip record in the WARC file. */
53 static size_t warc_current_gzfile_offset;
55 /* The uncompressed size (so far) of the current record. */
56 static size_t warc_current_gzfile_uncompressed_size;
59 /* This is true until a warc_write_* method fails. */
60 static bool warc_write_ok;
62 /* The current CDX file (or NULL, if CDX is disabled). */
63 static FILE *warc_current_cdx_file;
65 /* The record id of the warcinfo record of the current WARC file. */
66 static char *warc_current_warcinfo_uuid_str;
68 /* The file name of the current WARC file. */
69 static char *warc_current_filename;
71 /* The serial number of the current WARC file. This number is
72 incremented each time a new file is opened and is used in the
73 WARC file's filename. */
74 static int warc_current_file_number;
76 /* The table of CDX records, if deduplication is enabled. */
77 struct hash_table * warc_cdx_dedup_table;
79 static bool warc_start_new_file (bool meta);
82 struct warc_cdx_record
86 char digest[SHA1_DIGEST_SIZE];
90 warc_hash_sha1_digest (const void *key)
92 /* We just use some of the first bytes of the digest. */
94 memcpy (&v, key, sizeof (unsigned long));
99 warc_cmp_sha1_digest (const void *digest1, const void *digest2)
101 return !memcmp (digest1, digest2, SHA1_DIGEST_SIZE);
106 /* Writes SIZE bytes from BUFFER to the current WARC file,
107 through gzwrite if compression is enabled.
108 Returns the number of uncompressed bytes written. */
// Returns whatever byte count gzwrite (gzip path) or fwrite (plain path)
// reports.  warc_write_string and warc_write_block_from_file compare that
// count with the requested size to detect a short write.
110 warc_write_buffer (const char *buffer, size_t size)
113 if (warc_current_gzfile)
// NOTE(review): the running uncompressed-size counter is advanced by the
// requested SIZE before gzwrite is called, so it is also advanced when
// gzwrite fails or accepts fewer bytes.
115 warc_current_gzfile_uncompressed_size += size;
116 return gzwrite (warc_current_gzfile, buffer, size);
// Direct write to warc_current_file (the non-gzip path described in the
// header comment).
120 return fwrite (buffer, 1, size, warc_current_file);
123 /* Writes STR to the current WARC file.
124 Returns false and set warc_write_ok to false if there
127 warc_write_string (const char *str)
132 size_t n = strlen (str);
133 if (n != warc_write_buffer (str, n))
134 warc_write_ok = false;
136 return warc_write_ok;
140 #define EXTRA_GZIP_HEADER_SIZE 12
141 #define GZIP_STATIC_HEADER_SIZE 10
142 #define FLG_FEXTRA 0x04
145 /* Starts a new WARC record. Writes the version header.
146 If opt.warc_maxsize is set and the current file is becoming
147 too large, this will open a new WARC file.
149 If compression is enabled, this will start a new
150 gzip stream in the current WARC file.
152 Returns false and set warc_write_ok to false if there
155 warc_write_start_record ()
160 fflush (warc_current_file);
161 if (opt.warc_maxsize > 0 && ftell (warc_current_file) >= opt.warc_maxsize)
// NOTE(review): the result of warc_start_new_file is not examined on this
// line; confirm that a failed rollover is caught before writing continues.
162 warc_start_new_file (false);
165 /* Start a GZIP stream, if required. */
166 if (opt.warc_compression_enabled)
168 /* Record the starting offset of the new record. */
// NOTE(review): ftell returns a long and -1 on failure; the value is stored
// unchecked into a size_t, where -1 would become a very large offset.
169 warc_current_gzfile_offset = ftell (warc_current_file);
171 /* Reserve space for the extra GZIP header field.
172 In warc_write_end_record we will fill this space
173 with information about the uncompressed and
174 compressed size of the record. */
// The placeholder length has to stay equal to EXTRA_GZIP_HEADER_SIZE,
// because warc_write_end_record seeks relative to that constant.
175 fprintf (warc_current_file, "XXXXXXXXXXXX");
176 fflush (warc_current_file);
178 /* Start a new GZIP stream. */
// NOTE(review): the descriptor returned by dup is passed to gzdopen without
// a check, and no close of that descriptor is visible on the failure branch
// below -- verify that it is not leaked when gzdopen returns NULL.
179 warc_current_gzfile = gzdopen (dup (fileno (warc_current_file)), "wb9");
180 warc_current_gzfile_uncompressed_size = 0;
182 if (warc_current_gzfile == NULL)
184 logprintf (LOG_NOTQUIET, _("Error opening GZIP stream to WARC file.\n"));
185 warc_write_ok = false;
// Every record starts with the WARC version line.
191 warc_write_string ("WARC/1.0\r\n");
192 return warc_write_ok;
195 /* Writes a WARC header to the current WARC record.
196 This method may be run after warc_write_start_record and
197 before warc_write_block_from_file. */
199 warc_write_header (const char *name, const char *value)
203 warc_write_string (name);
204 warc_write_string (": ");
205 warc_write_string (value);
206 warc_write_string ("\r\n");
208 return warc_write_ok;
211 /* Copies the contents of DATA_IN to the WARC record.
212 Adds a Content-Length header to the WARC record.
213 Run this method after warc_write_header,
214 then run warc_write_end_record. */
216 warc_write_block_from_file (FILE *data_in)
218 /* Add the Content-Length header. */
219 char *content_length;
// NOTE(review): this fseek is unchecked (the rewind further down is
// checked), and ftell can return -1, which would be formatted as "-1".
220 fseek (data_in, 0L, SEEK_END);
// NOTE(review): asprintf returns the number of bytes printed, or -1 on
// failure.  `! asprintf (...)` is therefore true only for a return of 0,
// which "%ld" never produces; a real failure (-1) passes this test and
// leaves content_length undefined.  The test should be `< 0`.
221 if (! asprintf (&content_length, "%ld", ftell (data_in)))
223 warc_write_ok = false;
226 warc_write_header ("Content-Length", content_length);
227 free (content_length);
229 /* End of the WARC header section. */
230 warc_write_string ("\r\n");
// Rewind so that the copy loop below starts at the first byte of DATA_IN.
232 if (fseek (data_in, 0L, SEEK_SET) != 0)
233 warc_write_ok = false;
235 /* Copy the data in the file to the WARC record. */
// The loop stops at end of input, on a read error (fread returns 0), or as
// soon as a short write clears warc_write_ok.
238 while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0)
240 if (warc_write_buffer (buffer, s) < s)
241 warc_write_ok = false;
244 return warc_write_ok;
247 /* Run this method to close the current WARC record.
249 If compression is enabled, this method closes the
250 current GZIP stream and fills the extra GZIP header
251 with the uncompressed and compressed length of the
// Terminates the current record with two CRLF pairs, closes the per-record
// gzip stream if there is one, and then rewrites the start of the record so
// that the gzip header carries the reserved extra field.
254 warc_write_end_record ()
256 warc_write_buffer ("\r\n\r\n", 4);
259 /* We start a new gzip stream for each record. */
// NOTE(review): gzclose is only reached when warc_write_ok is still true;
// after an earlier write failure the open gzip stream does not appear to be
// closed here -- confirm it is released elsewhere.
260 if (warc_write_ok && warc_current_gzfile)
262 if (gzclose (warc_current_gzfile) != Z_OK)
264 warc_write_ok = false;
268 fflush (warc_current_file);
269 fseek (warc_current_file, 0, SEEK_END);
271 /* The WARC standard suggests that we add 'skip length' data in the
272 extra header field of the GZIP stream.
274 In warc_write_start_record we reserved space for this extra header.
275 This extra space starts at warc_current_gzfile_offset and fills
276 EXTRA_GZIP_HEADER_SIZE bytes. The static GZIP header starts at
277 warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE.
279 We need to do three things:
280 1. Move the static GZIP header to warc_current_gzfile_offset;
281 2. Set the FEXTRA flag in the GZIP header;
282 3. Write the extra GZIP header after the static header, that is,
283 starting at warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE.
286 /* Calculate the uncompressed and compressed sizes. */
287 size_t current_offset = ftell (warc_current_file);
// NOTE(review): these two names look swapped.  The difference between the
// current file offset and the record's start offset is the number of bytes
// the record occupies in the file (its compressed size), whereas
// warc_current_gzfile_uncompressed_size counts the bytes handed to gzwrite.
// As written, header bytes 4-7 (documented below as the uncompressed size)
// receive the on-disk size and bytes 8-11 receive the uncompressed count.
288 size_t uncompressed_size = current_offset - warc_current_gzfile_offset;
289 size_t compressed_size = warc_current_gzfile_uncompressed_size;
291 /* Go back to the static GZIP header. */
292 fseek (warc_current_file, warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
294 /* Read the header. */
295 char static_header[GZIP_STATIC_HEADER_SIZE];
296 size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
297 if (result != GZIP_STATIC_HEADER_SIZE)
299 warc_write_ok = false;
303 /* Set the FEXTRA flag in the flags byte of the header. */
304 static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
306 /* Write the header back to the file, but starting at warc_current_gzfile_offset. */
307 fseek (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
// NOTE(review): the result of this fwrite (and of the one further down) is
// not checked, so a failed header rewrite does not clear warc_write_ok.
308 fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
310 /* Prepare the extra GZIP header. */
311 char extra_header[EXTRA_GZIP_HEADER_SIZE];
312 /* XLEN, the length of the extra header fields. */
313 extra_header[0] = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
314 extra_header[1] = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
315 /* The extra header field identifier for the WARC skip length. */
316 extra_header[2] = 's';
317 extra_header[3] = 'l';
// Both sizes are stored as four little-endian bytes, so only the low
// 32 bits of each size_t value are kept.
318 /* The size of the uncompressed record. */
319 extra_header[4] = (uncompressed_size & 255);
320 extra_header[5] = (uncompressed_size >> 8) & 255;
321 extra_header[6] = (uncompressed_size >> 16) & 255;
322 extra_header[7] = (uncompressed_size >> 24) & 255;
323 /* The size of the compressed record. */
324 extra_header[8] = (compressed_size & 255);
325 extra_header[9] = (compressed_size >> 8) & 255;
326 extra_header[10] = (compressed_size >> 16) & 255;
327 extra_header[11] = (compressed_size >> 24) & 255;
329 /* Write the extra header after the static header. */
330 fseek (warc_current_file, warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
331 fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
333 /* Done, move back to the end of the file. */
334 fflush (warc_current_file);
335 fseek (warc_current_file, 0, SEEK_END);
337 #endif /* HAVE_LIBZ */
339 return warc_write_ok;
343 /* Writes the WARC-Date header for the given timestamp to
344 the current WARC record.
345 If timestamp is NULL, the current time will be used. */
347 warc_write_date_header (char *timestamp)
349 if (timestamp == NULL)
351 char current_timestamp[21];
352 warc_timestamp (current_timestamp);
353 timestamp = current_timestamp;
355 return warc_write_header ("WARC-Date", timestamp);
358 /* Writes the WARC-IP-Address header for the given IP to
359 the current WARC record. If IP is NULL, no header will
362 warc_write_ip_header (ip_address *ip)
365 return warc_write_header ("WARC-IP-Address", print_address (ip));
367 return warc_write_ok;
371 /* warc_sha1_stream_with_payload is a modified copy of sha1_stream
372 from gnulib/sha1.c. This version calculates two digests in one go.
374 Compute SHA1 message digests for bytes read from STREAM. The
375 digest of the complete file will be written into the 20 bytes
376 beginning at RES_BLOCK.
378 If payload_offset >= 0, a second digest will be calculated of the
379 portion of the file starting at payload_offset and continuing to
380 the end of the file. The digest number will be written into the
381 20 bytes beginning at RES_PAYLOAD. */
383 warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, long int payload_offset)
385 #define BLOCKSIZE 32768
387 struct sha1_ctx ctx_block;
388 struct sha1_ctx ctx_payload;
392 char *buffer = malloc (BLOCKSIZE + 72);
396 /* Initialize the computation context. */
397 sha1_init_ctx (&ctx_block);
398 if (payload_offset >= 0)
399 sha1_init_ctx (&ctx_payload);
403 /* Iterate over full file contents. */
406 /* We read the file in blocks of BLOCKSIZE bytes. One call of the
407 computation function processes the whole buffer so that with the
408 next round of the loop another block can be read. */
412 /* Read block. Take care for partial reads. */
415 n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);
420 if (sum == BLOCKSIZE)
425 /* Check for the error flag IFF N == 0, so that we don't
426 exit the loop after a partial read due to e.g., EAGAIN
433 goto process_partial_block;
436 /* We've read at least one byte, so ignore errors. But always
437 check for EOF, since feof may be true even though N > 0.
438 Otherwise, we could end up calling fread after EOF. */
440 goto process_partial_block;
443 /* Process buffer with BLOCKSIZE bytes. Note that
446 sha1_process_block (buffer, BLOCKSIZE, &ctx_block);
447 if (payload_offset >= 0 && payload_offset < pos)
449 /* At least part of the buffer contains data from payload. */
450 int start_of_payload = payload_offset - (pos - BLOCKSIZE);
451 if (start_of_payload <= 0)
452 /* All bytes in the buffer belong to the payload. */
453 start_of_payload = 0;
455 /* Process the payload part of the buffer.
456 Note: we can't use sha1_process_block here even if we
457 process the complete buffer. Because the payload doesn't
458 have to start with a full block, there may still be some
459 bytes left from the previous buffer. Therefore, we need
460 to continue with sha1_process_bytes. */
461 sha1_process_bytes (buffer + start_of_payload, BLOCKSIZE - start_of_payload, &ctx_payload);
465 process_partial_block:;
467 /* Process any remaining bytes. */
470 sha1_process_bytes (buffer, sum, &ctx_block);
471 if (payload_offset >= 0 && payload_offset < pos)
473 /* At least part of the buffer contains data from payload. */
474 int start_of_payload = payload_offset - (pos - sum);
475 if (start_of_payload <= 0)
476 /* All bytes in the buffer belong to the payload. */
477 start_of_payload = 0;
479 /* Process the payload part of the buffer. */
480 sha1_process_bytes (buffer + start_of_payload, sum - start_of_payload, &ctx_payload);
484 /* Construct result in desired memory. */
485 sha1_finish_ctx (&ctx_block, res_block);
486 if (payload_offset >= 0)
487 sha1_finish_ctx (&ctx_payload, res_payload);
494 /* Converts the SHA1 digest to a base32-encoded string.
495 "sha1:DIGEST\0" (Allocates a new string for the response.) */
// Builds "sha1:" followed by the base32 text of SHA1_DIGEST in a freshly
// malloc'ed buffer.  warc_write_revisit_record passes the result to free,
// which indicates that the caller owns the returned string.
497 warc_base32_sha1_digest (char *sha1_digest)
499 // length: "sha1:" + digest + "\0"
// NOTE(review): the malloc result is used on the next line without a NULL
// check.
500 char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 );
// The encoded digest is written after a 5-byte gap that the memcpy below
// fills with the "sha1:" prefix.
501 base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5, BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1);
502 memcpy (sha1_base32, "sha1:", 5);
// Explicit terminator at the last byte of the allocation.
503 sha1_base32[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5] = '\0';
508 /* Sets the digest headers of the record.
509 This method will calculate the block digest and, if payload_offset >= 0,
510 will also calculate the payload digest of the payload starting at the
513 warc_write_digest_headers (FILE *file, long payload_offset)
515 if (opt.warc_digests_enabled)
517 /* Calculate the block and payload digests. */
518 char sha1_res_block[SHA1_DIGEST_SIZE];
519 char sha1_res_payload[SHA1_DIGEST_SIZE];
522 if (warc_sha1_stream_with_payload (file, sha1_res_block, sha1_res_payload, payload_offset) == 0)
526 digest = warc_base32_sha1_digest (sha1_res_block);
527 warc_write_header ("WARC-Block-Digest", digest);
530 if (payload_offset >= 0)
532 digest = warc_base32_sha1_digest (sha1_res_payload);
533 warc_write_header ("WARC-Payload-Digest", digest);
541 /* Fills timestamp with the current time and date.
542 The UTC time is formatted following ISO 8601, as required
543 for use in the WARC-Date header.
544 The timestamp will be 20 characters long (21 bytes with the terminating NUL). */
546 warc_timestamp (char *timestamp)
549 struct tm * timeinfo;
551 timeinfo = gmtime (&rawtime);
552 strftime (timestamp, 21, "%Y-%m-%dT%H:%M:%SZ", timeinfo);
555 /* Fills uuid_str with a UUID based on random numbers.
556 (See RFC 4122, UUID version 4.)
558 Note: this is a fallback method, it is much better to use the
559 methods provided by libuuid.
561 The uuid_str will be 36 characters long. */
563 warc_uuid_random (char *uuid_str)
565 // RFC 4122, a version 4 UUID with only random numbers
567 unsigned char uuid_data[16];
570 uuid_data[i] = random_number (255);
572 // Set the four most significant bits (bits 12 through 15) of the
573 // time_hi_and_version field to the 4-bit version number
574 uuid_data[6] = (uuid_data[6] & 0x0F) | 0x40;
576 // Set the two most significant bits (bits 6 and 7) of the
577 // clock_seq_hi_and_reserved to zero and one, respectively.
578 uuid_data[8] = (uuid_data[8] & 0xBF) | 0x80;
581 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
582 uuid_data[0], uuid_data[1], uuid_data[2], uuid_data[3], uuid_data[4],
583 uuid_data[5], uuid_data[6], uuid_data[7], uuid_data[8], uuid_data[9],
584 uuid_data[10], uuid_data[11], uuid_data[12], uuid_data[13], uuid_data[14],
588 /* Fills urn_str with a UUID in the format required
589 for the WARC-Record-Id header.
590 The string will be 47 characters long. */
592 warc_uuid_str (char *urn_str)
598 uuid_generate (record_id);
599 uuid_unparse (record_id, uuid_str);
601 warc_uuid_random (uuid_str);
604 sprintf (urn_str, "<urn:uuid:%s>", uuid_str);
607 /* Write a warcinfo record to the current file.
608 Updates warc_current_warcinfo_uuid_str. */
610 warc_write_warcinfo_record (char *filename)
612 /* Write warc-info record as the first record of the file. */
613 /* We add the record id of this info record to the other records in the file. */
614 warc_current_warcinfo_uuid_str = (char *) malloc (48);
615 warc_uuid_str (warc_current_warcinfo_uuid_str);
618 warc_timestamp (timestamp);
620 char *filename_copy, *filename_basename;
621 filename_copy = strdup (filename);
622 filename_basename = strdup (basename (filename_copy));
624 warc_write_start_record ();
625 warc_write_header ("WARC-Type", "warcinfo");
626 warc_write_header ("Content-Type", "application/warc-fields");
627 warc_write_header ("WARC-Date", timestamp);
628 warc_write_header ("WARC-Record-ID", warc_current_warcinfo_uuid_str);
629 warc_write_header ("WARC-Filename", filename_basename);
631 /* Create content. */
632 FILE *warc_tmp = warc_tempfile ();
633 if (warc_tmp == NULL)
635 free (filename_copy);
636 free (filename_basename);
640 fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE);
641 fprintf (warc_tmp, "format: WARC File Format 1.0\r\n");
642 fprintf (warc_tmp, "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n");
643 fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off"));
644 fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring);
645 /* Add the user headers, if any. */
646 if (opt.warc_user_headers)
649 for (i = 0; opt.warc_user_headers[i]; i++)
650 fprintf (warc_tmp, "%s\r\n", opt.warc_user_headers[i]);
652 fprintf(warc_tmp, "\r\n");
654 warc_write_digest_headers (warc_tmp, -1);
655 warc_write_block_from_file (warc_tmp);
656 warc_write_end_record ();
660 logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
663 free (filename_copy);
664 free (filename_basename);
666 return warc_write_ok;
669 /* Opens a new WARC file.
670 If META is true, generates a filename ending with 'meta.warc.gz'.
673 1. close the current WARC file (if there is one);
674 2. increment warc_current_file_number;
675 3. open a new WARC file;
676 4. write the initial warcinfo record.
678 Returns true on success, false otherwise.
681 warc_start_new_file (bool meta)
683 if (opt.warc_filename == NULL)
686 if (warc_current_file != NULL)
687 fclose (warc_current_file);
688 if (warc_current_warcinfo_uuid_str)
689 free (warc_current_warcinfo_uuid_str);
690 if (warc_current_filename)
691 free (warc_current_filename);
693 warc_current_file_number++;
695 int base_filename_length = strlen (opt.warc_filename);
696 /* filename format: base + "-" + 5 digit serial number + ".warc.gz" */
697 char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
698 warc_current_filename = new_filename;
701 char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc");
703 char *extension = "warc";
706 /* If max size is enabled, we add a serial number to the file names. */
708 sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension);
709 else if (opt.warc_maxsize > 0)
710 sprintf (new_filename, "%s-%05d.%s", opt.warc_filename, warc_current_file_number, extension);
712 sprintf (new_filename, "%s.%s", opt.warc_filename, extension);
714 logprintf (LOG_VERBOSE, _("Opening WARC file %s.\n\n"), quote (new_filename));
716 /* Open the WARC file. */
717 warc_current_file = fopen (new_filename, "wb+");
718 if (warc_current_file == NULL)
720 logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"), quote (new_filename));
724 if (! warc_write_warcinfo_record (new_filename))
727 /* Add warcinfo uuid to manifest. */
728 if (warc_manifest_fp)
729 fprintf (warc_manifest_fp, "%s\n", warc_current_warcinfo_uuid_str);
734 /* Opens the CDX file for output. */
736 warc_start_cdx_file ()
738 int filename_length = strlen (opt.warc_filename);
739 char *cdx_filename = alloca (filename_length + 4 + 1);
740 memcpy (cdx_filename, opt.warc_filename, filename_length);
741 memcpy (cdx_filename + filename_length, ".cdx", 5);
742 warc_current_cdx_file = fopen (cdx_filename, "a+");
743 if (warc_current_cdx_file == NULL)
746 /* Print the CDX header.
752 * k - new style checksum
755 * V - compressed arc file offset
759 fprintf (warc_current_cdx_file, " CDX a b a m s k r M V g u\n");
760 fflush (warc_current_cdx_file);
765 #define CDX_FIELDSEP " \t\r\n"
767 /* Parse the CDX header and find the field numbers of the original url,
768 checksum and record ID fields. */
770 warc_parse_cdx_header (char *lineptr, int *field_num_original_url, int *field_num_checksum, int *field_num_record_id)
772 *field_num_original_url = -1;
773 *field_num_checksum = -1;
774 *field_num_record_id = -1;
778 token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
780 if (token != NULL && strcmp (token, "CDX") == 0)
783 while (token != NULL)
785 token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
791 *field_num_original_url = field_num;
794 *field_num_checksum = field_num;
797 *field_num_record_id = field_num;
805 return *field_num_original_url != -1
806 && *field_num_checksum != -1
807 && *field_num_record_id != -1;
810 /* Parse the CDX record and add it to the warc_cdx_dedup_table hash table. */
812 warc_process_cdx_line (char *lineptr, int field_num_original_url, int field_num_checksum, int field_num_record_id)
814 char *original_url = NULL;
815 char *checksum = NULL;
816 char *record_id = NULL;
820 token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
822 /* Read this line to get the fields we need. */
824 while (token != NULL)
827 if (field_num == field_num_original_url)
829 else if (field_num == field_num_checksum)
831 else if (field_num == field_num_record_id)
837 *val = strdup (token);
839 token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
843 if (original_url != NULL && checksum != NULL && record_id != NULL)
845 /* For some extra efficiency, we decode the base32 encoded
846 checksum value. This should produce exactly SHA1_DIGEST_SIZE
850 base32_decode_alloc (checksum, strlen (checksum), &checksum_v, &checksum_l);
853 if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
855 /* This is a valid line with a valid checksum. */
856 struct warc_cdx_record * rec = malloc (sizeof (struct warc_cdx_record));
857 rec->url = original_url;
858 rec->uuid = record_id;
859 memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
860 hash_table_put (warc_cdx_dedup_table, rec->digest, rec);
866 if (checksum_v != NULL)
873 /* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
874 the warc_cdx_dedup_table. */
876 warc_load_cdx_dedup_file ()
878 FILE *f = fopen (opt.warc_cdx_dedup_filename, "r");
882 int field_num_original_url = -1;
883 int field_num_checksum = -1;
884 int field_num_record_id = -1;
886 char *lineptr = NULL;
890 /* The first line should contain the CDX header.
891 Format: " CDX x x x x x"
892 where x are field type indicators. For our purposes, we only
893 need 'a' (the original url), 'k' (the SHA1 checksum) and
894 'u' (the WARC record id). */
895 line_length = getline (&lineptr, &n, f);
896 if (line_length != -1)
897 warc_parse_cdx_header (lineptr, &field_num_original_url, &field_num_checksum, &field_num_record_id);
899 /* If the file contains all three fields, read the complete file. */
900 if (field_num_original_url == -1
901 || field_num_checksum == -1
902 || field_num_record_id == -1)
904 if (field_num_original_url == -1)
905 logprintf (LOG_NOTQUIET, _("CDX file does not list original urls. (Missing column 'a'.)\n"));
906 if (field_num_checksum == -1)
907 logprintf (LOG_NOTQUIET, _("CDX file does not list checksums. (Missing column 'k'.)\n"));
908 if (field_num_record_id == -1)
909 logprintf (LOG_NOTQUIET, _("CDX file does not list record ids. (Missing column 'u'.)\n"));
913 /* Initialize the table. */
914 warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest, warc_cmp_sha1_digest);
918 line_length = getline (&lineptr, &n, f);
919 if (line_length != -1)
920 warc_process_cdx_line (lineptr, field_num_original_url, field_num_checksum, field_num_record_id);
923 while (line_length != -1);
926 int nrecords = hash_table_count (warc_cdx_dedup_table);
927 logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
928 "Loaded %d records from CDX.\n\n", nrecords),
938 /* Returns the existing duplicate CDX record for the given url and payload
939 digest. Returns NULL if the url is not found or if the payload digest
940 does not match, or if CDX deduplication is disabled. */
// Looks up SHA1_DIGEST_PAYLOAD in warc_cdx_dedup_table and accepts the stored
// record only when its url equals URL.
941 static struct warc_cdx_record *
942 warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload)
944 if (warc_cdx_dedup_table == NULL)
// NOTE(review): rec_existing has no initializer and the return value of
// hash_table_get_pair is ignored.  If the lookup leaves its output
// arguments untouched on a miss, the test below reads an indeterminate
// pointer.  Initialise rec_existing to NULL or branch on the lookup result
// -- confirm against the hash table implementation.
948 struct warc_cdx_record *rec_existing;
949 hash_table_get_pair (warc_cdx_dedup_table, sha1_digest_payload, &key, &rec_existing);
951 if (rec_existing != NULL && strcmp (rec_existing->url, url) == 0)
957 /* Initializes the WARC writer (if opt.warc_filename is set).
958 This should be called before any WARC record is written. */
962 warc_write_ok = true;
964 if (opt.warc_filename != NULL)
966 if (opt.warc_cdx_dedup_filename != NULL)
968 if (! warc_load_cdx_dedup_file ())
970 logprintf (LOG_NOTQUIET,
971 _("Could not read CDX file %s for deduplication.\n"),
972 quote (opt.warc_cdx_dedup_filename));
977 warc_manifest_fp = warc_tempfile ();
978 if (warc_manifest_fp == NULL)
980 logprintf (LOG_NOTQUIET, _("Could not open temporary WARC manifest file.\n"));
984 if (opt.warc_keep_log)
986 warc_log_fp = warc_tempfile ();
987 if (warc_log_fp == NULL)
989 logprintf (LOG_NOTQUIET, _("Could not open temporary WARC log file.\n"));
992 log_set_warc_log_fp (warc_log_fp);
995 warc_current_file_number = -1;
996 if (! warc_start_new_file (false))
998 logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
1002 if (opt.warc_cdx_enabled)
1004 if (! warc_start_cdx_file ())
1006 logprintf (LOG_NOTQUIET, _("Could not open CDX file for output.\n"));
1013 /* Writes metadata (manifest, configuration, log file) to the WARC file. */
// Emits up to three resource records: the manifest, the wget argument string
// and, when a WARC log file is open, the log.
1015 warc_write_metadata ()
1017 /* If there are multiple WARC files, the metadata should be written to a separate file. */
1018 if (opt.warc_maxsize > 0)
// NOTE(review): the result of warc_start_new_file is not examined on this
// line.
1019 warc_start_new_file (true);
1021 char manifest_uuid [48];
1022 warc_uuid_str (manifest_uuid);
1024 fflush (warc_manifest_fp);
1025 warc_write_resource_record (manifest_uuid,
1026 "metadata://gnu.org/software/wget/warc/MANIFEST.txt",
1027 NULL, NULL, NULL, "text/plain",
1028 warc_manifest_fp, -1);
1029 /* warc_write_resource_record has closed warc_manifest_fp. */
1031 FILE * warc_tmp_fp = warc_tempfile ();
1032 if (warc_tmp_fp == NULL)
1034 logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n"));
// NOTE(review): the flush runs before anything has been written to the
// temporary file; the fflush and fprintf lines look inverted.
1037 fflush (warc_tmp_fp);
1038 fprintf (warc_tmp_fp, "%s\n", program_argstring);
// NOTE(review): this call passes manifest_uuid as its first argument, the
// same value the manifest record received above, while the log record
// below passes NULL first and manifest_uuid fourth.  Confirm whether the
// arguments record is meant to reuse the manifest's identifier or should
// follow the log record's pattern.
1040 warc_write_resource_record (manifest_uuid,
1041 "metadata://gnu.org/software/wget/warc/wget_arguments.txt",
1042 NULL, NULL, NULL, "text/plain",
1044 /* warc_write_resource_record has closed warc_tmp_fp. */
1046 if (warc_log_fp != NULL)
1048 warc_write_resource_record (NULL,
1049 "metadata://gnu.org/software/wget/warc/wget.log",
1050 NULL, manifest_uuid, NULL, "text/plain",
1052 /* warc_write_resource_record has closed warc_log_fp. */
// Detach the logger from the WARC log stream.
1055 log_set_warc_log_fp (NULL);
1059 /* Finishes the WARC writing.
1060 This should be called at the end of the program. */
1064 if (warc_current_file != NULL)
1066 warc_write_metadata ();
1067 free (warc_current_warcinfo_uuid_str);
1068 fclose (warc_current_file);
1070 if (warc_current_cdx_file != NULL)
1071 fclose (warc_current_cdx_file);
1072 if (warc_log_fp != NULL)
1074 fclose (warc_log_fp);
1075 log_set_warc_log_fp (NULL);
1079 /* Creates a temporary file for writing WARC output.
1080 The temporary file will be created in opt.warc_tempdir.
1081 Returns the pointer to the temporary file, or NULL. */
1086 if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1)
1089 int fd = mkstemp (filename);
1093 if (unlink (filename) < 0)
1096 return fdopen (fd, "wb+");
1100 /* Writes a request record to the WARC file.
1101 url is the target uri of the request,
1102 timestamp_str is the timestamp of the request (generated with warc_timestamp),
1103 record_uuid is the uuid of the request (generated with warc_uuid_str),
1104 body is a pointer to a file containing the request headers and body.
1105 ip is the ip address of the server (or NULL),
1106 Calling this function will close body.
1107 Returns true on success, false on error. */
1109 warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, ip_address *ip, FILE *body, long int payload_offset)
1111 warc_write_start_record ();
1112 warc_write_header ("WARC-Type", "request");
1113 warc_write_header ("WARC-Target-URI", url);
1114 warc_write_header ("Content-Type", "application/http;msgtype=request");
1115 warc_write_date_header (timestamp_str);
1116 warc_write_header ("WARC-Record-ID", record_uuid);
1117 warc_write_ip_header (ip);
1118 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1119 warc_write_digest_headers (body, payload_offset);
1120 warc_write_block_from_file (body);
1121 warc_write_end_record ();
1125 return warc_write_ok;
1128 /* Writes a response record to the CDX file.
1129 url is the target uri of the request/response,
1130 timestamp_str is the timestamp of the request that generated this response,
1131 (generated with warc_timestamp),
1132 mime_type is the mime type of the response body (will be printed to CDX),
1133 response_code is the HTTP response code (will be printed to CDX),
1134 payload_digest is the sha1 digest of the payload,
1135 redirect_location is the contents of the Location: header, or NULL (will be printed to CDX),
1136 offset is the position of the WARC record in the WARC file,
1137 warc_filename is the filename of the WARC,
1138 response_uuid is the uuid of the response.
1139 Returns true on success, false on error. */
// NOTE(review): the warc_filename parameter is not referenced on any visible
// line of this function; the CDX line prints the file-level
// warc_current_filename instead.
1141 warc_write_cdx_record (char *url, char *timestamp_str, char *mime_type, int response_code, char *payload_digest, char *redirect_location, size_t offset, char *warc_filename, char *response_uuid)
1143 /* Transform the timestamp. */
// The copies below read timestamp_str at fixed positions up to index 18, so
// the input must use the "YYYY-mm-ddTHH:MM:SSZ" layout that warc_timestamp
// produces.  The result is the 14-digit form "YYYYmmddHHMMSS".
1144 char timestamp_str_cdx [15];
1145 memcpy (timestamp_str_cdx , timestamp_str , 4); /* "YYYY" "-" */
1146 memcpy (timestamp_str_cdx + 4, timestamp_str + 5, 2); /* "mm" "-" */
1147 memcpy (timestamp_str_cdx + 6, timestamp_str + 8, 2); /* "dd" "T" */
1148 memcpy (timestamp_str_cdx + 8, timestamp_str + 11, 2); /* "HH" ":" */
1149 memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM" ":" */
1150 memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS" "Z" */
1151 timestamp_str_cdx[14] = '\0';
1153 /* Rewrite the checksum. */
1155 if (payload_digest != NULL)
1156 checksum = payload_digest + 5; /* Skip the "sha1:" */
1160 if (mime_type == NULL || strlen(mime_type) == 0)
1162 if (redirect_location == NULL || strlen(redirect_location) == 0)
1163 redirect_location = "-";
1165 /* Print the CDX line. */
// The eleven fields follow the column order of the header written by
// warc_start_cdx_file (a b a m s k r M V g u); the M column is always "-".
// NOTE(review): offset is a size_t but is printed with %ld; the conversion
// should be %zu, or the argument should be cast to long.
1166 fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %ld %s %s\n", url, timestamp_str_cdx, url, mime_type, response_code, checksum, redirect_location, offset, warc_current_filename, response_uuid);
1167 fflush (warc_current_cdx_file);
1172 /* Writes a revisit record to the WARC file.
1173 url is the target uri of the request/response,
1174 timestamp_str is the timestamp of the request that generated this response
1175 (generated with warc_timestamp),
1176 concurrent_to_uuid is the uuid of the request that generated this response
1177 (generated with warc_uuid_str),
1178 refers_to is the uuid of the original response
1179 (generated with warc_uuid_str),
1180 payload_digest is the sha1 digest of the payload,
1181 ip is the ip address of the server (or NULL),
1182 body is a pointer to a file containing the response headers (without payload).
1183 Calling this function will close body.
1184 Returns true on success, false on error. */
1186 warc_write_revisit_record (char *url, char *timestamp_str, char *concurrent_to_uuid, char *payload_digest, char *refers_to, ip_address *ip, FILE *body)
1188 char revisit_uuid [48];
1189 warc_uuid_str (revisit_uuid);
1191 char *block_digest = NULL;
1192 char sha1_res_block[SHA1_DIGEST_SIZE];
1193 sha1_stream (body, sha1_res_block);
1194 block_digest = warc_base32_sha1_digest (sha1_res_block);
1196 warc_write_start_record ();
1197 warc_write_header ("WARC-Type", "revisit");
1198 warc_write_header ("WARC-Record-ID", revisit_uuid);
1199 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1200 warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1201 warc_write_header ("WARC-Refers-To", refers_to);
1202 warc_write_header ("WARC-Profile", "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest");
1203 warc_write_header ("WARC-Truncated", "length");
1204 warc_write_header ("WARC-Target-URI", url);
1205 warc_write_date_header (timestamp_str);
1206 warc_write_ip_header (ip);
1207 warc_write_header ("Content-Type", "application/http;msgtype=response");
1208 warc_write_header ("WARC-Block-Digest", block_digest);
1209 warc_write_header ("WARC-Payload-Digest", payload_digest);
1210 warc_write_block_from_file (body);
1211 warc_write_end_record ();
1214 free (block_digest);
1216 return warc_write_ok;
1219 /* Writes a response record to the WARC file.
1220 url is the target uri of the request/response,
1221 timestamp_str is the timestamp of the request that generated this response
1222 (generated with warc_timestamp),
1223 concurrent_to_uuid is the uuid of the request that generated this response
1224 (generated with warc_uuid_str),
1225 ip is the ip address of the server (or NULL),
1226 body is a pointer to a file containing the response headers and body,
payload_offset is the offset in body at which the payload starts (body is cut back to this length when a revisit record is written instead),
1227 mime_type is the mime type of the response body (will be printed to CDX),
1228 response_code is the HTTP response code (will be printed to CDX),
1229 redirect_location is the contents of the Location: header, or NULL (will be printed to CDX).
If digests are enabled and a record with the same url and payload digest is found, a revisit record is written in place of the response record.
1230 Calling this function will close body.
1231 Returns true on success, false on error. */
1233 warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, long int payload_offset, char *mime_type, int response_code, char *redirect_location)
/* Both digest strings stay NULL unless they are computed below; they
   are released with free at the end of the function. */
1235 char *block_digest = NULL;
1236 char *payload_digest = NULL;
1237 char sha1_res_block[SHA1_DIGEST_SIZE];
1238 char sha1_res_payload[SHA1_DIGEST_SIZE];
1240 if (opt.warc_digests_enabled)
1242 /* Calculate the block and payload digests. */
/* Presumably a return value of 0 means both digests were computed;
   only then are the duplicate lookup and the digest strings used. */
1244 if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload, payload_offset) == 0)
/* NOTE(review): in this listing the comment opened on the next line is
   not terminated until the "Found an existing record" comment, which
   swallows the duplicate lookup and its NULL check -- confirm against
   the complete file. */
1246 /* Decide (based on url + payload digest) if we have seen this
1248 struct warc_cdx_record *rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload);
1249 if (rec_existing != NULL)
1251 /* Found an existing record. */
1252 logprintf (LOG_VERBOSE, _("Found exact match in CDX file. Saving revisit record to WARC.\n"));
1254 /* Remove the payload from the file. */
1255 if (payload_offset > 0)
/* NOTE(review): the statement that runs when ftruncate fails is not
   visible in this listing; if it returns early, confirm that body
   still gets closed as the header comment promises. */
1257 if (ftruncate (fileno (body), payload_offset) == -1)
1261 /* Send the original payload digest. */
1262 payload_digest = warc_base32_sha1_digest (sha1_res_payload);
/* Hand over to the revisit writer; the uuid of the existing record
   becomes its refers_to argument. */
1263 bool result = warc_write_revisit_record (url, timestamp_str, concurrent_to_uuid, payload_digest, rec_existing->uuid, ip, body);
1264 free (payload_digest);
/* String forms of both digests, written as headers below and passed
   on to the CDX writer. */
1269 block_digest = warc_base32_sha1_digest (sha1_res_block);
1270 payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1274 /* Not a revisit, just store the record. */
1276 char response_uuid [48];
1277 warc_uuid_str (response_uuid);
/* Remember the end of the WARC file before the record is written;
   this offset is handed to warc_write_cdx_record below.
   NOTE(review): the result of fseek is not checked, and ftell returns
   a long that is -1 on failure; storing it in a size_t hides such a
   failure -- confirm. */
1279 fseek (warc_current_file, 0L, SEEK_END);
1280 size_t offset = ftell (warc_current_file);
1282 warc_write_start_record ();
1283 warc_write_header ("WARC-Type", "response");
1284 warc_write_header ("WARC-Record-ID", response_uuid);
1285 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1286 warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1287 warc_write_header ("WARC-Target-URI", url);
1288 warc_write_date_header (timestamp_str);
1289 warc_write_ip_header (ip);
1290 warc_write_header ("WARC-Block-Digest", block_digest);
1291 warc_write_header ("WARC-Payload-Digest", payload_digest);
1292 warc_write_header ("Content-Type", "application/http;msgtype=response");
1293 warc_write_block_from_file (body);
1294 warc_write_end_record ();
/* The CDX line is written only when every write so far has succeeded
   and CDX output is enabled. */
1298 if (warc_write_ok && opt.warc_cdx_enabled)
1300 /* Add this record to the CDX. */
1301 warc_write_cdx_record (url, timestamp_str, mime_type, response_code, payload_digest, redirect_location, offset, warc_current_filename, response_uuid);
1305 free (block_digest);
1307 free (payload_digest);
/* warc_write_ok is the file-wide flag that stays true until one of the
   warc_write_* functions fails. */
1309 return warc_write_ok;
1312 /* Writes a resource record to the WARC file.
1313 resource_uuid is the uuid of the resource (or NULL, in which case a new uuid is generated),
1314 url is the target uri of the resource,
1315 timestamp_str is the timestamp (generated with warc_timestamp),
1316 concurrent_to_uuid is the uuid of the request that generated this resource
1317 (generated with warc_uuid_str) or NULL,
1318 ip is the ip address of the server (or NULL),
1319 content_type is the mime type of the body (or NULL, in which case "application/octet-stream" is used),
1320 body is a pointer to a file containing the resource data,
payload_offset is passed on to warc_write_digest_headers together with body (presumably the offset of the payload within body -- TODO confirm).
1321 Calling this function will close body.
1322 Returns true on success, false on error (the value of the file-wide warc_write_ok flag). */
1324 warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, long int payload_offset)
1326 if (resource_uuid == NULL)
/* No uuid supplied: generate one into a 48-byte buffer obtained from
   alloca (the same size as the uuid arrays in the revisit and response
   writers).
   NOTE(review): alloca is non-standard; a function-scope char [48]
   array would do the same job -- consider switching. */
1328 resource_uuid = alloca (48);
1329 warc_uuid_str (resource_uuid);
/* Fall back to a generic binary type when the caller gives none. */
1332 if (content_type == NULL)
1333 content_type = "application/octet-stream";
1335 warc_write_start_record ();
1336 warc_write_header ("WARC-Type", "resource");
1337 warc_write_header ("WARC-Record-ID", resource_uuid);
1338 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1339 warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1340 warc_write_header ("WARC-Target-URI", url);
1341 warc_write_date_header (timestamp_str);
1342 warc_write_ip_header (ip);
/* Unlike the response writer, digest headers are delegated to
   warc_write_digest_headers instead of being computed inline. */
1343 warc_write_digest_headers (body, payload_offset);
1344 warc_write_header ("Content-Type", content_type);
1345 warc_write_block_from_file (body);
1346 warc_write_end_record ();
/* warc_write_ok is the file-wide flag that stays true until one of the
   warc_write_* functions fails. */
1350 return warc_write_ok;