1 /* Utility functions for writing WARC files.
2 Copyright (C) 2011, 2012 Free Software Foundation, Inc.
4 This file is part of GNU Wget.
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or (at
9 your option) any later version.
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Wget. If not, see <http://www.gnu.org/licenses/>.
19 Additional permission under GNU GPL version 3 section 7
21 If you modify this program, or any covered work, by linking or
22 combining it with the OpenSSL project's OpenSSL library (or a
23 modified version of that library), containing parts covered by the
24 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
25 grants you additional permission to convey the resulting work.
26 Corresponding Source for a non-source form of such a combination
27 shall include the source code for the parts of OpenSSL used as well
28 as that of the covered work. */
49 #include <uuid/uuid.h>
64 extern char *version_string;
66 /* Set by main in main.c */
67 extern char *program_argstring;
70 /* The log file (a temporary file that contains a copy of the log output). */
72 static FILE *warc_log_fp;
74 /* The manifest file (a temporary file that contains the
75 warcinfo uuid of every file in this crawl). */
76 static FILE *warc_manifest_fp;
78 /* The current WARC file (or NULL, if WARC is disabled). */
79 static FILE *warc_current_file;
82 /* The gzip stream for the current WARC file
83 (or NULL, if WARC or gzip is disabled). */
84 static gzFile warc_current_gzfile;
86 /* The offset of the current gzip record in the WARC file. */
87 static off_t warc_current_gzfile_offset;
89 /* The uncompressed size (so far) of the current record. */
90 static off_t warc_current_gzfile_uncompressed_size;
93 /* This is true until a warc_write_* method fails. */
94 static bool warc_write_ok;
96 /* The current CDX file (or NULL, if CDX is disabled). */
97 static FILE *warc_current_cdx_file;
99 /* The record id of the warcinfo record of the current WARC file. */
100 static char *warc_current_warcinfo_uuid_str;
102 /* The file name of the current WARC file. */
103 static char *warc_current_filename;
105 /* The serial number of the current WARC file. This number is
106 incremented each time a new file is opened and is used in the
107 WARC file's filename. */
108 static int warc_current_file_number;
110 /* The table of CDX records, if deduplication is enabled. */
111 struct hash_table * warc_cdx_dedup_table;
113 static bool warc_start_new_file (bool meta);
116 struct warc_cdx_record
120 char digest[SHA1_DIGEST_SIZE];
124 warc_hash_sha1_digest (const void *key)
126 /* We just use some of the first bytes of the digest. */
128 memcpy (&v, key, sizeof (unsigned long));
133 warc_cmp_sha1_digest (const void *digest1, const void *digest2)
135 return !memcmp (digest1, digest2, SHA1_DIGEST_SIZE);
140 /* Writes SIZE bytes from BUFFER to the current WARC file,
141 through gzwrite if compression is enabled.
142 Returns the number of uncompressed bytes written. */
144 warc_write_buffer (const char *buffer, size_t size)
147 if (warc_current_gzfile)
149 warc_current_gzfile_uncompressed_size += size;
150 return gzwrite (warc_current_gzfile, buffer, size);
154 return fwrite (buffer, 1, size, warc_current_file);
157 /* Writes STR to the current WARC file.
158 Returns false and sets warc_write_ok to false if there is an error. */
161 warc_write_string (const char *str)
166 size_t n = strlen (str);
167 if (n != warc_write_buffer (str, n))
168 warc_write_ok = false;
170 return warc_write_ok;
174 #define EXTRA_GZIP_HEADER_SIZE 14
175 #define GZIP_STATIC_HEADER_SIZE 10
176 #define FLG_FEXTRA 0x04
179 /* Starts a new WARC record. Writes the version header.
180 If opt.warc_maxsize is set and the current file is becoming
181 too large, this will open a new WARC file.
183 If compression is enabled, this will start a new
184 gzip stream in the current WARC file.
186 Returns false and sets warc_write_ok to false if there is an error. */
189 warc_write_start_record (void)
194 fflush (warc_current_file);
195 if (opt.warc_maxsize > 0 && ftello (warc_current_file) >= opt.warc_maxsize)
196 warc_start_new_file (false);
199 /* Start a GZIP stream, if required. */
200 if (opt.warc_compression_enabled)
202 /* Record the starting offset of the new record. */
203 warc_current_gzfile_offset = ftello (warc_current_file);
205 /* Reserve space for the extra GZIP header field.
206 In warc_write_end_record we will fill this space
207 with information about the uncompressed and
208 compressed size of the record. */
209 fseek (warc_current_file, EXTRA_GZIP_HEADER_SIZE, SEEK_CUR);
210 fflush (warc_current_file);
212 /* Start a new GZIP stream. */
213 warc_current_gzfile = gzdopen (dup (fileno (warc_current_file)), "wb9");
214 warc_current_gzfile_uncompressed_size = 0;
216 if (warc_current_gzfile == NULL)
218 logprintf (LOG_NOTQUIET,
219 _("Error opening GZIP stream to WARC file.\n"));
220 warc_write_ok = false;
226 warc_write_string ("WARC/1.0\r\n");
227 return warc_write_ok;
230 /* Writes a WARC header to the current WARC record.
231 This method may be run after warc_write_start_record and
232 before warc_write_block_from_file. */
234 warc_write_header (const char *name, const char *value)
238 warc_write_string (name);
239 warc_write_string (": ");
240 warc_write_string (value);
241 warc_write_string ("\r\n");
243 return warc_write_ok;
246 /* Copies the contents of DATA_IN to the WARC record.
247 Adds a Content-Length header to the WARC record.
248 Run this method after warc_write_header,
249 then run warc_write_end_record. */
251 warc_write_block_from_file (FILE *data_in)
253 /* Add the Content-Length header. */
254 char content_length[MAX_INT_TO_STRING_LEN(off_t)];
255 fseeko (data_in, 0L, SEEK_END);
256 number_to_string (content_length, ftello (data_in));
257 warc_write_header ("Content-Length", content_length);
259 /* End of the WARC header section. */
260 warc_write_string ("\r\n");
262 if (fseeko (data_in, 0L, SEEK_SET) != 0)
263 warc_write_ok = false;
265 /* Copy the data in the file to the WARC record. */
268 while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0)
270 if (warc_write_buffer (buffer, s) < s)
271 warc_write_ok = false;
274 return warc_write_ok;
277 /* Run this method to close the current WARC record.
279 If compression is enabled, this method closes the
280 current GZIP stream and fills the extra GZIP header
281 with the uncompressed and compressed length of the record. */
284 warc_write_end_record (void)
/* Terminate the record with two CRLF pairs.  The byte count returned by
   warc_write_buffer is not checked on this line. */
286 warc_write_buffer ("\r\n\r\n", 4);
289 /* We start a new gzip stream for each record. */
290 if (warc_write_ok && warc_current_gzfile)
292 if (gzclose (warc_current_gzfile) != Z_OK)
294 warc_write_ok = false;
298 fflush (warc_current_file);
299 fseeko (warc_current_file, 0, SEEK_END);
301 /* The WARC standard suggests that we add 'skip length' data in the
302 extra header field of the GZIP stream.
304 In warc_write_start_record we reserved space for this extra header.
305 This extra space starts at warc_current_gzfile_offset and fills
306 EXTRA_GZIP_HEADER_SIZE bytes. The static GZIP header starts at
307 warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE.
309 We need to do three things:
310 1. Move the static GZIP header to warc_current_gzfile_offset;
311 2. Set the FEXTRA flag in the GZIP header;
312 3. Write the extra GZIP header after the static header, that is,
313 starting at warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE.
316 /* Calculate the uncompressed and compressed sizes. */
/* NOTE(review): the two names below look swapped relative to what they
   hold.  uncompressed_size is a difference of file offsets, i.e. the
   number of bytes this record occupies in the WARC file, while
   compressed_size is copied from warc_current_gzfile_uncompressed_size,
   which warc_write_buffer increments by the size it hands to gzwrite.
   Confirm the intended field order of the 'sl' extra field before
   renaming anything. */
317 off_t current_offset = ftello (warc_current_file);
318 off_t uncompressed_size = current_offset - warc_current_gzfile_offset;
319 off_t compressed_size = warc_current_gzfile_uncompressed_size;
321 /* Go back to the static GZIP header. */
322 fseeko (warc_current_file, warc_current_gzfile_offset
323 + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
325 /* Read the header. */
326 char static_header[GZIP_STATIC_HEADER_SIZE];
327 size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE,
329 if (result != GZIP_STATIC_HEADER_SIZE)
331 warc_write_ok = false;
335 /* Set the FEXTRA flag in the flags byte of the header. */
/* OFF_FLG is not among the macros visible in this view; presumably it is
   the index of the FLG byte inside the static header -- TODO confirm. */
336 static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
338 /* Write the header back to the file, but starting at
339 warc_current_gzfile_offset. */
/* NOTE(review): in the visible lines the results of the fseeko and fwrite
   calls from here to the end of the function are not checked, so a failed
   header rewrite does not clear warc_write_ok. */
340 fseeko (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
341 fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
343 /* Prepare the extra GZIP header. */
344 char extra_header[EXTRA_GZIP_HEADER_SIZE];
345 /* XLEN, the length of the extra header fields. */
346 extra_header[0] = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
347 extra_header[1] = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
348 /* The extra header field identifier for the WARC skip length. */
349 extra_header[2] = 's';
350 extra_header[3] = 'l';
351 /* The size of the field value (8 bytes). */
352 extra_header[4] = (8 & 255);
353 extra_header[5] = ((8 >> 8) & 255);
/* Each size below is stored as four little-endian bytes, so only the low
   32 bits of the off_t value are kept; larger values are truncated. */
354 /* The size of the uncompressed record. */
355 extra_header[6] = (uncompressed_size & 255);
356 extra_header[7] = (uncompressed_size >> 8) & 255;
357 extra_header[8] = (uncompressed_size >> 16) & 255;
358 extra_header[9] = (uncompressed_size >> 24) & 255;
359 /* The size of the compressed record. */
360 extra_header[10] = (compressed_size & 255);
361 extra_header[11] = (compressed_size >> 8) & 255;
362 extra_header[12] = (compressed_size >> 16) & 255;
363 extra_header[13] = (compressed_size >> 24) & 255;
365 /* Write the extra header after the static header. */
366 fseeko (warc_current_file, warc_current_gzfile_offset
367 + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
368 fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
370 /* Done, move back to the end of the file. */
371 fflush (warc_current_file);
372 fseeko (warc_current_file, 0, SEEK_END);
374 #endif /* HAVE_LIBZ */
376 return warc_write_ok;
380 /* Writes the WARC-Date header for the given timestamp to
381 the current WARC record.
382 If timestamp is NULL, the current time will be used. */
384 warc_write_date_header (const char *timestamp)
386 if (timestamp == NULL)
388 char current_timestamp[21];
389 warc_timestamp (current_timestamp);
390 timestamp = current_timestamp;
392 return warc_write_header ("WARC-Date", timestamp);
395 /* Writes the WARC-IP-Address header for the given IP to
396 the current WARC record. If IP is NULL, no header will be written. */
399 warc_write_ip_header (ip_address *ip)
402 return warc_write_header ("WARC-IP-Address", print_address (ip));
404 return warc_write_ok;
408 /* warc_sha1_stream_with_payload is a modified copy of sha1_stream
409 from gnulib/sha1.c. This version calculates two digests in one go.
411 Compute SHA1 message digests for bytes read from STREAM. The
412 digest of the complete file will be written into the
413 SHA1_DIGEST_SIZE (20) bytes beginning at RES_BLOCK.
415 If payload_offset >= 0, a second digest will be calculated of the
416 portion of the file starting at payload_offset and continuing to
417 the end of the file. The second digest will be written into the
418 SHA1_DIGEST_SIZE (20) bytes beginning at RES_PAYLOAD. */
420 warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
421 off_t payload_offset)
423 #define BLOCKSIZE 32768
425 struct sha1_ctx ctx_block;
426 struct sha1_ctx ctx_payload;
430 char *buffer = malloc (BLOCKSIZE + 72);
434 /* Initialize the computation context. */
435 sha1_init_ctx (&ctx_block);
436 if (payload_offset >= 0)
437 sha1_init_ctx (&ctx_payload);
441 /* Iterate over full file contents. */
444 /* We read the file in blocks of BLOCKSIZE bytes. One call of the
445 computation function processes the whole buffer so that with the
446 next round of the loop another block can be read. */
450 /* Read block. Take care for partial reads. */
453 n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);
458 if (sum == BLOCKSIZE)
463 /* Check for the error flag IFF N == 0, so that we don't
464 exit the loop after a partial read due to e.g., EAGAIN
471 goto process_partial_block;
474 /* We've read at least one byte, so ignore errors. But always
475 check for EOF, since feof may be true even though N > 0.
476 Otherwise, we could end up calling fread after EOF. */
478 goto process_partial_block;
481 /* Process buffer with BLOCKSIZE bytes. Note that
484 sha1_process_block (buffer, BLOCKSIZE, &ctx_block);
485 if (payload_offset >= 0 && payload_offset < pos)
487 /* At least part of the buffer contains data from payload. */
488 off_t start_of_payload = payload_offset - (pos - BLOCKSIZE);
489 if (start_of_payload <= 0)
490 /* All bytes in the buffer belong to the payload. */
491 start_of_payload = 0;
493 /* Process the payload part of the buffer.
494 Note: we can't use sha1_process_block here even if we
495 process the complete buffer. Because the payload doesn't
496 have to start with a full block, there may still be some
497 bytes left from the previous buffer. Therefore, we need
498 to continue with sha1_process_bytes. */
499 sha1_process_bytes (buffer + start_of_payload,
500 BLOCKSIZE - start_of_payload, &ctx_payload);
504 process_partial_block:;
506 /* Process any remaining bytes. */
509 sha1_process_bytes (buffer, sum, &ctx_block);
510 if (payload_offset >= 0 && payload_offset < pos)
512 /* At least part of the buffer contains data from payload. */
513 off_t start_of_payload = payload_offset - (pos - sum);
514 if (start_of_payload <= 0)
515 /* All bytes in the buffer belong to the payload. */
516 start_of_payload = 0;
518 /* Process the payload part of the buffer. */
519 sha1_process_bytes (buffer + start_of_payload,
520 sum - start_of_payload, &ctx_payload);
524 /* Construct result in desired memory. */
525 sha1_finish_ctx (&ctx_block, res_block);
526 if (payload_offset >= 0)
527 sha1_finish_ctx (&ctx_payload, res_payload);
534 /* Converts the SHA1 digest to a base32-encoded string of the form
535 "sha1:DIGEST" (NUL-terminated). Allocates a new string for the result. */
537 warc_base32_sha1_digest (char *sha1_digest)
539 /* length: "sha1:" + digest + "\0" */
540 char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 );
541 base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5,
542 BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1);
543 memcpy (sha1_base32, "sha1:", 5);
544 sha1_base32[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5] = '\0';
549 /* Sets the digest headers of the record.
550 This method will calculate the block digest and, if payload_offset >= 0,
551 will also calculate the payload digest of the payload starting at the given offset. */
554 warc_write_digest_headers (FILE *file, long payload_offset)
556 if (opt.warc_digests_enabled)
558 /* Calculate the block and payload digests. */
559 char sha1_res_block[SHA1_DIGEST_SIZE];
560 char sha1_res_payload[SHA1_DIGEST_SIZE];
563 if (warc_sha1_stream_with_payload (file, sha1_res_block,
564 sha1_res_payload, payload_offset) == 0)
568 digest = warc_base32_sha1_digest (sha1_res_block);
569 warc_write_header ("WARC-Block-Digest", digest);
572 if (payload_offset >= 0)
574 digest = warc_base32_sha1_digest (sha1_res_payload);
575 warc_write_header ("WARC-Payload-Digest", digest);
583 /* Fills timestamp with the current time and date.
584 The UTC time is formatted following ISO 8601, as required
585 for use in the WARC-Date header.
586 The timestamp is 20 characters long; TIMESTAMP needs room for 21 bytes, including the terminating NUL. */
588 warc_timestamp (char *timestamp)
591 struct tm * timeinfo;
593 timeinfo = gmtime (&rawtime);
594 strftime (timestamp, 21, "%Y-%m-%dT%H:%M:%SZ", timeinfo);
598 /* Fills urn_str with a UUID in the format required
599 for the WARC-Record-Id header.
600 The string will be 47 characters long. */
602 warc_uuid_str (char *urn_str)
607 uuid_generate (record_id);
608 uuid_unparse (record_id, uuid_str);
610 sprintf (urn_str, "<urn:uuid:%s>", uuid_str);
613 /* Fills urn_str with a UUID based on random numbers in the format
614 required for the WARC-Record-Id header.
615 (See RFC 4122, UUID version 4.)
617 Note: this is a fallback method, it is much better to use the
618 methods provided by libuuid.
620 The string will be 47 characters long. */
622 warc_uuid_str (char *urn_str)
624 // RFC 4122, a version 4 UUID with only random numbers
626 unsigned char uuid_data[16];
/* NOTE(review): random_number is defined outside this view.  If
   random_number (N) yields values strictly below N, the byte value 0xFF
   can never be produced here and 256 would be needed to cover the full
   byte range -- confirm against its contract. */
629 uuid_data[i] = random_number (255);
631 // Set the four most significant bits (bits 12 through 15) of the
632 // time_hi_and_version field to the 4-bit version number
/* Keeps the low nibble of byte 6 and forces the high nibble to 4. */
633 uuid_data[6] = (uuid_data[6] & 0x0F) | 0x40;
635 // Set the two most significant bits (bits 6 and 7) of the
636 // clock_seq_hi_and_reserved to zero and one, respectively.
/* 0xBF clears bit 6 and 0x80 sets bit 7 of byte 8. */
637 uuid_data[8] = (uuid_data[8] & 0xBF) | 0x80;
/* The 16 bytes are printed as lowercase hex in 4-2-2-2-6 byte groups
   inside "<urn:uuid:...>".  The call that consumes this format string is
   not visible in this view. */
640 "<urn:uuid:%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x>",
641 uuid_data[0], uuid_data[1], uuid_data[2], uuid_data[3], uuid_data[4],
642 uuid_data[5], uuid_data[6], uuid_data[7], uuid_data[8], uuid_data[9],
643 uuid_data[10], uuid_data[11], uuid_data[12], uuid_data[13], uuid_data[14],
648 /* Write a warcinfo record to the current file.
649 Updates warc_current_warcinfo_uuid_str. */
651 warc_write_warcinfo_record (char *filename)
653 /* Write warc-info record as the first record of the file. */
654 /* We add the record id of this info record to the other records in the
656 warc_current_warcinfo_uuid_str = (char *) malloc (48);
657 warc_uuid_str (warc_current_warcinfo_uuid_str);
660 warc_timestamp (timestamp);
662 char *filename_copy, *filename_basename;
663 filename_copy = strdup (filename);
664 filename_basename = strdup (basename (filename_copy));
666 warc_write_start_record ();
667 warc_write_header ("WARC-Type", "warcinfo");
668 warc_write_header ("Content-Type", "application/warc-fields");
669 warc_write_header ("WARC-Date", timestamp);
670 warc_write_header ("WARC-Record-ID", warc_current_warcinfo_uuid_str);
671 warc_write_header ("WARC-Filename", filename_basename);
673 /* Create content. */
674 FILE *warc_tmp = warc_tempfile ();
675 if (warc_tmp == NULL)
677 free (filename_copy);
678 free (filename_basename);
682 fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE);
683 fprintf (warc_tmp, "format: WARC File Format 1.0\r\n");
685 "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n");
686 fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off"));
687 fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring);
688 /* Add the user headers, if any. */
689 if (opt.warc_user_headers)
692 for (i = 0; opt.warc_user_headers[i]; i++)
693 fprintf (warc_tmp, "%s\r\n", opt.warc_user_headers[i]);
695 fprintf(warc_tmp, "\r\n");
697 warc_write_digest_headers (warc_tmp, -1);
698 warc_write_block_from_file (warc_tmp);
699 warc_write_end_record ();
702 logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
704 free (filename_copy);
705 free (filename_basename);
707 return warc_write_ok;
710 /* Opens a new WARC file.
711 If META is true, generates a filename ending with '-meta.warc', or '-meta.warc.gz' when compression is enabled ('-meta.warc-gz' on VMS). This method will:
714 1. close the current WARC file (if there is one);
715 2. increment warc_current_file_number;
716 3. open a new WARC file;
717 4. write the initial warcinfo record.
719 Returns true on success, false otherwise. */
722 warc_start_new_file (bool meta)
724 if (opt.warc_filename == NULL)
727 if (warc_current_file != NULL)
728 fclose (warc_current_file);
729 if (warc_current_warcinfo_uuid_str)
730 free (warc_current_warcinfo_uuid_str);
731 if (warc_current_filename)
732 free (warc_current_filename);
734 warc_current_file_number++;
736 int base_filename_length = strlen (opt.warc_filename);
737 /* filename format: base + "-" + 5 digit serial number + ".warc.gz" */
738 char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
739 warc_current_filename = new_filename;
742 # define WARC_GZ "warc-gz"
743 #else /* def __VMS */
744 # define WARC_GZ "warc.gz"
745 #endif /* def __VMS [else] */
748 const char *extension = (opt.warc_compression_enabled ? WARC_GZ : "warc");
750 const char *extension = "warc";
753 /* If max size is enabled, we add a serial number to the file names. */
755 sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension);
756 else if (opt.warc_maxsize > 0)
758 sprintf (new_filename, "%s-%05d.%s", opt.warc_filename,
759 warc_current_file_number, extension);
762 sprintf (new_filename, "%s.%s", opt.warc_filename, extension);
764 logprintf (LOG_VERBOSE, _("Opening WARC file %s.\n\n"), quote (new_filename));
766 /* Open the WARC file. */
767 warc_current_file = fopen (new_filename, "wb+");
768 if (warc_current_file == NULL)
770 logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"),
771 quote (new_filename));
775 if (! warc_write_warcinfo_record (new_filename))
778 /* Add warcinfo uuid to manifest. */
779 if (warc_manifest_fp)
780 fprintf (warc_manifest_fp, "%s\n", warc_current_warcinfo_uuid_str);
785 /* Opens the CDX file for output. */
787 warc_start_cdx_file (void)
789 int filename_length = strlen (opt.warc_filename);
790 char *cdx_filename = alloca (filename_length + 4 + 1);
791 memcpy (cdx_filename, opt.warc_filename, filename_length);
792 memcpy (cdx_filename + filename_length, ".cdx", 5);
793 warc_current_cdx_file = fopen (cdx_filename, "a+");
794 if (warc_current_cdx_file == NULL)
797 /* Print the CDX header.
803 * k - new style checksum
806 * V - compressed arc file offset
810 fprintf (warc_current_cdx_file, " CDX a b a m s k r M V g u\n");
811 fflush (warc_current_cdx_file);
816 #define CDX_FIELDSEP " \t\r\n"
818 /* Parse the CDX header and find the field numbers of the original url,
819 checksum and record ID fields. */
821 warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
822 int *field_num_checksum, int *field_num_record_id)
824 *field_num_original_url = -1;
825 *field_num_checksum = -1;
826 *field_num_record_id = -1;
830 token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
832 if (token != NULL && strcmp (token, "CDX") == 0)
835 while (token != NULL)
837 token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
843 *field_num_original_url = field_num;
846 *field_num_checksum = field_num;
849 *field_num_record_id = field_num;
857 return *field_num_original_url != -1
858 && *field_num_checksum != -1
859 && *field_num_record_id != -1;
862 /* Parse the CDX record and add it to the warc_cdx_dedup_table hash table. */
864 warc_process_cdx_line (char *lineptr, int field_num_original_url,
865 int field_num_checksum, int field_num_record_id)
867 char *original_url = NULL;
868 char *checksum = NULL;
869 char *record_id = NULL;
873 token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
875 /* Read this line to get the fields we need. */
877 while (token != NULL)
880 if (field_num == field_num_original_url)
882 else if (field_num == field_num_checksum)
884 else if (field_num == field_num_record_id)
890 *val = strdup (token);
892 token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
896 if (original_url != NULL && checksum != NULL && record_id != NULL)
898 /* For some extra efficiency, we decode the base32 encoded
899 checksum value. This should produce exactly SHA1_DIGEST_SIZE
903 base32_decode_alloc (checksum, strlen (checksum), &checksum_v,
907 if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
909 /* This is a valid line with a valid checksum. */
910 struct warc_cdx_record *rec;
911 rec = malloc (sizeof (struct warc_cdx_record));
912 rec->url = original_url;
913 rec->uuid = record_id;
914 memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
915 hash_table_put (warc_cdx_dedup_table, rec->digest, rec);
921 if (checksum_v != NULL)
928 xfree_null(checksum);
929 xfree_null(original_url);
930 xfree_null(record_id);
934 /* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
935 the warc_cdx_dedup_table. */
937 warc_load_cdx_dedup_file (void)
939 FILE *f = fopen (opt.warc_cdx_dedup_filename, "r");
943 int field_num_original_url = -1;
944 int field_num_checksum = -1;
945 int field_num_record_id = -1;
947 char *lineptr = NULL;
951 /* The first line should contain the CDX header.
952 Format: " CDX x x x x x"
953 where x are field type indicators. For our purposes, we only
954 need 'a' (the original url), 'k' (the SHA1 checksum) and
955 'u' (the WARC record id). */
956 line_length = getline (&lineptr, &n, f);
957 if (line_length != -1)
958 warc_parse_cdx_header (lineptr, &field_num_original_url,
959 &field_num_checksum, &field_num_record_id);
961 /* If the file contains all three fields, read the complete file. */
962 if (field_num_original_url == -1
963 || field_num_checksum == -1
964 || field_num_record_id == -1)
966 if (field_num_original_url == -1)
967 logprintf (LOG_NOTQUIET,
968 _("CDX file does not list original urls. (Missing column 'a'.)\n"));
969 if (field_num_checksum == -1)
970 logprintf (LOG_NOTQUIET,
971 _("CDX file does not list checksums. (Missing column 'k'.)\n"));
972 if (field_num_record_id == -1)
973 logprintf (LOG_NOTQUIET,
974 _("CDX file does not list record ids. (Missing column 'u'.)\n"));
978 /* Initialize the table. */
979 warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest,
980 warc_cmp_sha1_digest);
984 line_length = getline (&lineptr, &n, f);
985 if (line_length != -1)
987 warc_process_cdx_line (lineptr, field_num_original_url,
988 field_num_checksum, field_num_record_id);
992 while (line_length != -1);
995 int nrecords = hash_table_count (warc_cdx_dedup_table);
996 logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
997 "Loaded %d records from CDX.\n\n",
1009 /* Returns the existing duplicate CDX record for the given url and payload
1010 digest. Returns NULL if the url is not found or if the payload digest
1011 does not match, or if CDX deduplication is disabled. */
1012 static struct warc_cdx_record *
1013 warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload)
1015 if (warc_cdx_dedup_table == NULL)
1018 struct warc_cdx_record *rec_existing
1019 = hash_table_get (warc_cdx_dedup_table, sha1_digest_payload);
1021 if (rec_existing && strcmp (rec_existing->url, url) == 0)
1022 return rec_existing;
1027 /* Initializes the WARC writer (if opt.warc_filename is set).
1028 This should be called before any WARC record is written. */
1032 warc_write_ok = true;
1034 if (opt.warc_filename != NULL)
1036 if (opt.warc_cdx_dedup_filename != NULL)
1038 if (! warc_load_cdx_dedup_file ())
1040 logprintf (LOG_NOTQUIET,
1041 _("Could not read CDX file %s for deduplication.\n"),
1042 quote (opt.warc_cdx_dedup_filename));
1047 warc_manifest_fp = warc_tempfile ();
1048 if (warc_manifest_fp == NULL)
1050 logprintf (LOG_NOTQUIET,
1051 _("Could not open temporary WARC manifest file.\n"));
1055 if (opt.warc_keep_log)
1057 warc_log_fp = warc_tempfile ();
1058 if (warc_log_fp == NULL)
1060 logprintf (LOG_NOTQUIET,
1061 _("Could not open temporary WARC log file.\n"));
1064 log_set_warc_log_fp (warc_log_fp);
1067 warc_current_file_number = -1;
1068 if (! warc_start_new_file (false))
1070 logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
1074 if (opt.warc_cdx_enabled)
1076 if (! warc_start_cdx_file ())
1078 logprintf (LOG_NOTQUIET,
1079 _("Could not open CDX file for output.\n"));
1086 /* Writes metadata (manifest, configuration, log file) to the WARC file. */
1088 warc_write_metadata (void)
/* Writes up to three records to the current WARC file: the manifest (via
   warc_write_metadata_record), the wget argument string and, when
   warc_log_fp is set, the log (both via warc_write_resource_record). */
1090 /* If there are multiple WARC files, the metadata should be written to a separate file. */
/* NOTE(review): the result of warc_start_new_file is not checked here. */
1091 if (opt.warc_maxsize > 0)
1092 warc_start_new_file (true);
/* 48 bytes: the 47-character record id plus the terminating NUL. */
1094 char manifest_uuid [48];
1095 warc_uuid_str (manifest_uuid);
1097 fflush (warc_manifest_fp);
1098 warc_write_metadata_record (manifest_uuid,
1099 "metadata://gnu.org/software/wget/warc/MANIFEST.txt",
1100 NULL, NULL, NULL, "text/plain",
1101 warc_manifest_fp, -1);
1102 /* warc_write_metadata_record (the function called above) is expected to have closed warc_manifest_fp; its definition is outside this view -- TODO confirm. */
1104 FILE * warc_tmp_fp = warc_tempfile ();
1105 if (warc_tmp_fp == NULL)
1107 logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n"));
/* NOTE(review): this fflush runs before anything has been written to
   warc_tmp_fp; the argument string printed on the next line is not
   flushed in the visible lines.  The order looks reversed -- confirm. */
1110 fflush (warc_tmp_fp);
1111 fprintf (warc_tmp_fp, "%s\n", program_argstring);
/* The manifest record id is passed along with the two resource records
   below. */
1113 warc_write_resource_record (NULL,
1114 "metadata://gnu.org/software/wget/warc/wget_arguments.txt",
1115 NULL, manifest_uuid, NULL, "text/plain",
1117 /* warc_write_resource_record has closed warc_tmp_fp. */
1119 if (warc_log_fp != NULL)
1121 warc_write_resource_record (NULL,
1122 "metadata://gnu.org/software/wget/warc/wget.log",
1123 NULL, manifest_uuid, NULL, "text/plain",
1125 /* warc_write_resource_record has closed warc_log_fp. */
1128 log_set_warc_log_fp (NULL);
1132 /* Finishes the WARC writing.
1133 This should be called at the end of the program. */
1137 if (warc_current_file != NULL)
1139 warc_write_metadata ();
1140 free (warc_current_warcinfo_uuid_str);
1141 fclose (warc_current_file);
1143 if (warc_current_cdx_file != NULL)
1144 fclose (warc_current_cdx_file);
1145 if (warc_log_fp != NULL)
1147 fclose (warc_log_fp);
1148 log_set_warc_log_fp (NULL);
1152 /* Creates a temporary file for writing WARC output.
1153 The temporary file will be created in opt.warc_tempdir.
1154 Returns the pointer to the temporary file, or NULL. */
1156 warc_tempfile (void)
1159 if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1)
1164 * mkostemp()+unlink()+fdopen() scheme causes trouble on VMS, so use
1165 * mktemp() to uniquify the (VMS-style) name, and then use a normal
1166 * fopen() with a "create temp file marked for delete" option.
1171 tfn = mktemp (filename); /* Get unique name from template. */
1174 return fopen (tfn, "w+", "fop=tmd"); /* Create auto-delete temp file. */
1176 #else /* def __VMS */
1177 int fd = mkostemp (filename, O_TEMPORARY);
1182 if (unlink (filename) < 0)
1186 return fdopen (fd, "wb+");
1187 #endif /* def __VMS [else] */
1191 /* Writes a request record to the WARC file.
1192 url is the target uri of the request,
1193 timestamp_str is the timestamp of the request (generated with warc_timestamp),
1194 record_uuid is the uuid of the request (generated with warc_uuid_str),
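1195 body is a pointer to a file containing the request headers and body,
1196 ip is the ip address of the server (or NULL),
1197 payload_offset is the offset of the payload in body, passed on to warc_write_digest_headers (negative: no payload digest). Calling this function will close body.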
1198 Returns true on success, false on error. */
1200 warc_write_request_record (char *url, char *timestamp_str, char *record_uuid,
1201 ip_address *ip, FILE *body, off_t payload_offset)
1203 warc_write_start_record ();
1204 warc_write_header ("WARC-Type", "request");
1205 warc_write_header ("WARC-Target-URI", url);
1206 warc_write_header ("Content-Type", "application/http;msgtype=request");
1207 warc_write_date_header (timestamp_str);
1208 warc_write_header ("WARC-Record-ID", record_uuid);
1209 warc_write_ip_header (ip);
1210 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1211 warc_write_digest_headers (body, payload_offset);
1212 warc_write_block_from_file (body);
1213 warc_write_end_record ();
1217 return warc_write_ok;
1220 /* Writes a response record to the CDX file.
1221 url is the target uri of the request/response,
1222 timestamp_str is the timestamp of the request that generated this response,
1223 (generated with warc_timestamp),
1224 mime_type is the mime type of the response body (will be printed to CDX),
1225 response_code is the HTTP response code (will be printed to CDX),
1226 payload_digest is the sha1 digest of the payload,
1227 redirect_location is the contents of the Location: header, or NULL (will be printed to CDX),
1228 offset is the position of the WARC record in the WARC file,
1229 warc_filename is the filename of the WARC,
1230 response_uuid is the uuid of the response.
1231 Returns true on success, false on error. */
1233 warc_write_cdx_record (const char *url, const char *timestamp_str,
1234 const char *mime_type, int response_code,
1235 const char *payload_digest, const char *redirect_location,
1236 off_t offset, const char *warc_filename,
1237 const char *response_uuid)
/* Appends one line to warc_current_cdx_file and flushes it.
   NOTE(review): the warc_filename parameter is not referenced in the
   visible lines; the fprintf below prints the file-level
   warc_current_filename instead.  Confirm which one is intended. */
1239 /* Transform the timestamp. */
/* Copies the six numeric fields of a "YYYY-mm-ddTHH:MM:SSZ" timestamp
   into a 14-digit string, skipping the separators.  Reads timestamp_str
   up to index 18, so the caller must pass at least 19 characters. */
1240 char timestamp_str_cdx [15];
1241 memcpy (timestamp_str_cdx , timestamp_str , 4); /* "YYYY" "-" */
1242 memcpy (timestamp_str_cdx + 4, timestamp_str + 5, 2); /* "mm" "-" */
1243 memcpy (timestamp_str_cdx + 6, timestamp_str + 8, 2); /* "dd" "T" */
1244 memcpy (timestamp_str_cdx + 8, timestamp_str + 11, 2); /* "HH" ":" */
1245 memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM" ":" */
1246 memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS" "Z" */
1247 timestamp_str_cdx[14] = '\0';
1249 /* Rewrite the checksum. */
1250 const char *checksum;
1251 if (payload_digest != NULL)
1252 checksum = payload_digest + 5; /* Skip the "sha1:" */
/* Empty or missing optional fields are replaced before printing; the
   redirect location becomes "-" below. */
1256 if (mime_type == NULL || strlen(mime_type) == 0)
1258 if (redirect_location == NULL || strlen(redirect_location) == 0)
1259 redirect_location = "-";
1261 char offset_string[MAX_INT_TO_STRING_LEN(off_t)];
1262 number_to_string (offset_string, offset);
1264 /* Print the CDX line. */
/* url is printed twice (first and third field); the eighth field is a
   literal "-". */
1265 fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %s %s %s\n", url,
1266 timestamp_str_cdx, url, mime_type, response_code, checksum,
1267 redirect_location, offset_string, warc_current_filename,
1269 fflush (warc_current_cdx_file);
1274 /* Writes a revisit record to the WARC file.
1275 url is the target uri of the request/response,
1276 timestamp_str is the timestamp of the request that generated this response
1277 (generated with warc_timestamp),
1278 concurrent_to_uuid is the uuid of the request for that generated this response
1279 (generated with warc_uuid_str),
1280 refers_to_uuid is the uuid of the original response
1281 (generated with warc_uuid_str),
1282 payload_digest is the sha1 digest of the payload,
1283 ip is the ip address of the server (or NULL),
1284 body is a pointer to a file containing the response headers (without payload).
1285 Calling this function will close body.
1286 Returns true on success, false on error. */
1288 warc_write_revisit_record (char *url, char *timestamp_str,
1289 char *concurrent_to_uuid, char *payload_digest,
1290 char *refers_to, ip_address *ip, FILE *body)
1292 char revisit_uuid [48];
1293 warc_uuid_str (revisit_uuid);
1295 char *block_digest = NULL;
1296 char sha1_res_block[SHA1_DIGEST_SIZE];
1297 sha1_stream (body, sha1_res_block);
1298 block_digest = warc_base32_sha1_digest (sha1_res_block);
1300 warc_write_start_record ();
1301 warc_write_header ("WARC-Type", "revisit");
1302 warc_write_header ("WARC-Record-ID", revisit_uuid);
1303 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1304 warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1305 warc_write_header ("WARC-Refers-To", refers_to);
1306 warc_write_header ("WARC-Profile", "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest");
1307 warc_write_header ("WARC-Truncated", "length");
1308 warc_write_header ("WARC-Target-URI", url);
1309 warc_write_date_header (timestamp_str);
1310 warc_write_ip_header (ip);
1311 warc_write_header ("Content-Type", "application/http;msgtype=response");
1312 warc_write_header ("WARC-Block-Digest", block_digest);
1313 warc_write_header ("WARC-Payload-Digest", payload_digest);
1314 warc_write_block_from_file (body);
1315 warc_write_end_record ();
1318 free (block_digest);
1320 return warc_write_ok;
1323 /* Writes a response record to the WARC file.
1324 url is the target uri of the request/response,
1325 timestamp_str is the timestamp of the request that generated this response
1326 (generated with warc_timestamp),
1327 concurrent_to_uuid is the uuid of the request for that generated this response
1328 (generated with warc_uuid_str),
1329 ip is the ip address of the server (or NULL),
1330 body is a pointer to a file containing the response headers and body.
1331 mime_type is the mime type of the response body (will be printed to CDX),
1332 response_code is the HTTP response code (will be printed to CDX),
1333 redirect_location is the contents of the Location: header, or NULL (will be printed to CDX),
1334 Calling this function will close body.
1335 Returns true on success, false on error. */
1337 warc_write_response_record (char *url, char *timestamp_str,
1338 char *concurrent_to_uuid, ip_address *ip,
1339 FILE *body, off_t payload_offset, char *mime_type,
1340 int response_code, char *redirect_location)
1342 char *block_digest = NULL;
1343 char *payload_digest = NULL;
1344 char sha1_res_block[SHA1_DIGEST_SIZE];
1345 char sha1_res_payload[SHA1_DIGEST_SIZE];
1347 if (opt.warc_digests_enabled)
1349 /* Calculate the block and payload digests. */
1351 if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload,
1352 payload_offset) == 0)
1354 /* Decide (based on url + payload digest) if we have seen this
1356 struct warc_cdx_record *rec_existing;
1357 rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload);
1358 if (rec_existing != NULL)
1362 /* Found an existing record. */
1363 logprintf (LOG_VERBOSE,
1364 _("Found exact match in CDX file. Saving revisit record to WARC.\n"));
1366 /* Remove the payload from the file. */
1367 if (payload_offset > 0)
1369 if (ftruncate (fileno (body), payload_offset) == -1)
1373 /* Send the original payload digest. */
1374 payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1375 result = warc_write_revisit_record (url, timestamp_str,
1376 concurrent_to_uuid, payload_digest, rec_existing->uuid,
1378 free (payload_digest);
1383 block_digest = warc_base32_sha1_digest (sha1_res_block);
1384 payload_digest = warc_base32_sha1_digest (sha1_res_payload);
1388 /* Not a revisit, just store the record. */
1390 char response_uuid [48];
1391 warc_uuid_str (response_uuid);
1393 fseeko (warc_current_file, 0L, SEEK_END);
1394 off_t offset = ftello (warc_current_file);
1396 warc_write_start_record ();
1397 warc_write_header ("WARC-Type", "response");
1398 warc_write_header ("WARC-Record-ID", response_uuid);
1399 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1400 warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1401 warc_write_header ("WARC-Target-URI", url);
1402 warc_write_date_header (timestamp_str);
1403 warc_write_ip_header (ip);
1404 warc_write_header ("WARC-Block-Digest", block_digest);
1405 warc_write_header ("WARC-Payload-Digest", payload_digest);
1406 warc_write_header ("Content-Type", "application/http;msgtype=response");
1407 warc_write_block_from_file (body);
1408 warc_write_end_record ();
1412 if (warc_write_ok && opt.warc_cdx_enabled)
1414 /* Add this record to the CDX. */
1415 warc_write_cdx_record (url, timestamp_str, mime_type, response_code,
1416 payload_digest, redirect_location, offset, warc_current_filename,
1421 free (block_digest);
1423 free (payload_digest);
1425 return warc_write_ok;
1428 /* Writes a resource or metadata record to the WARC file.
1429 warc_type is either "resource" or "metadata",
1430 resource_uuid is the uuid of the resource (or NULL),
1431 url is the target uri of the resource,
1432 timestamp_str is the timestamp (generated with warc_timestamp),
1433 concurrent_to_uuid is the uuid of the record that generated this,
1434 resource (generated with warc_uuid_str) or NULL,
1435 ip is the ip address of the server (or NULL),
1436 content_type is the mime type of the body (or NULL),
1437 body is a pointer to a file containing the resource data.
1438 Calling this function will close body.
1439 Returns true on success, false on error. */
1441 warc_write_record (const char *record_type, char *resource_uuid,
1442 const char *url, const char *timestamp_str,
1443 const char *concurrent_to_uuid,
1444 ip_address *ip, const char *content_type, FILE *body,
1445 off_t payload_offset)
1447 if (resource_uuid == NULL)
1449 resource_uuid = alloca (48);
1450 warc_uuid_str (resource_uuid);
1453 if (content_type == NULL)
1454 content_type = "application/octet-stream";
1456 warc_write_start_record ();
1457 warc_write_header ("WARC-Type", record_type);
1458 warc_write_header ("WARC-Record-ID", resource_uuid);
1459 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1460 warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1461 warc_write_header ("WARC-Target-URI", url);
1462 warc_write_date_header (timestamp_str);
1463 warc_write_ip_header (ip);
1464 warc_write_digest_headers (body, payload_offset);
1465 warc_write_header ("Content-Type", content_type);
1466 warc_write_block_from_file (body);
1467 warc_write_end_record ();
1471 return warc_write_ok;
1474 /* Writes a resource record to the WARC file.
1475 resource_uuid is the uuid of the resource (or NULL),
1476 url is the target uri of the resource,
1477 timestamp_str is the timestamp (generated with warc_timestamp),
1478 concurrent_to_uuid is the uuid of the record that generated this,
1479 resource (generated with warc_uuid_str) or NULL,
1480 ip is the ip address of the server (or NULL),
1481 content_type is the mime type of the body (or NULL),
1482 body is a pointer to a file containing the resource data.
1483 Calling this function will close body.
1484 Returns true on success, false on error. */
1486 warc_write_resource_record (char *resource_uuid, const char *url,
1487 const char *timestamp_str, const char *concurrent_to_uuid,
1488 ip_address *ip, const char *content_type, FILE *body,
1489 off_t payload_offset)
1491 return warc_write_record ("resource",
1492 resource_uuid, url, timestamp_str, concurrent_to_uuid,
1493 ip, content_type, body, payload_offset);
1496 /* Writes a metadata record to the WARC file.
1497 record_uuid is the uuid of the record (or NULL),
1498 url is the target uri of the record,
1499 timestamp_str is the timestamp (generated with warc_timestamp),
1500 concurrent_to_uuid is the uuid of the record that generated this,
1501 record (generated with warc_uuid_str) or NULL,
1502 ip is the ip address of the server (or NULL),
1503 content_type is the mime type of the body (or NULL),
1504 body is a pointer to a file containing the record data.
1505 Calling this function will close body.
1506 Returns true on success, false on error. */
1508 warc_write_metadata_record (char *record_uuid, const char *url,
1509 const char *timestamp_str, const char *concurrent_to_uuid,
1510 ip_address *ip, const char *content_type, FILE *body,
1511 off_t payload_offset)
1513 return warc_write_record ("metadata",
1514 record_uuid, url, timestamp_str, concurrent_to_uuid,
1515 ip, content_type, body, payload_offset);