#ifndef WINDOWS
#include <libgen.h>
+#else
+#include <fcntl.h>
#endif
#include "warc.h"
+#ifndef O_TEMPORARY
+#define O_TEMPORARY 0
+#endif
+
extern char *version_string;
/* Set by main in main.c */
#ifdef HAVE_LIBZ
/* The gzip stream for the current WARC file
(or NULL, if WARC or gzip is disabled). */
-static gzFile *warc_current_gzfile;
+static gzFile warc_current_gzfile;
/* The offset of the current gzip record in the WARC file. */
static off_t warc_current_gzfile_offset;
}
-#define EXTRA_GZIP_HEADER_SIZE 12
+#define EXTRA_GZIP_HEADER_SIZE 14
#define GZIP_STATIC_HEADER_SIZE 10
#define FLG_FEXTRA 0x04
#define OFF_FLG 3
Returns false and sets warc_write_ok to false if there
is an error. */
static bool
-warc_write_start_record ()
+warc_write_start_record (void)
{
if (!warc_write_ok)
return false;
In warc_write_end_record we will fill this space
with information about the uncompressed and
compressed size of the record. */
- fprintf (warc_current_file, "XXXXXXXXXXXX");
+ fseek (warc_current_file, EXTRA_GZIP_HEADER_SIZE, SEEK_CUR);
fflush (warc_current_file);
/* Start a new GZIP stream. */
if (warc_current_gzfile == NULL)
{
- logprintf (LOG_NOTQUIET, _("Error opening GZIP stream to WARC file.\n"));
+ logprintf (LOG_NOTQUIET,
+_("Error opening GZIP stream to WARC file.\n"));
warc_write_ok = false;
return false;
}
warc_write_block_from_file (FILE *data_in)
{
/* Add the Content-Length header. */
- char *content_length;
+ char content_length[MAX_INT_TO_STRING_LEN(off_t)];
fseeko (data_in, 0L, SEEK_END);
- if (! asprintf (&content_length, "%ld", ftello (data_in)))
- {
- warc_write_ok = false;
- return false;
- }
+ number_to_string (content_length, ftello (data_in));
warc_write_header ("Content-Length", content_length);
- free (content_length);
/* End of the WARC header section. */
warc_write_string ("\r\n");
with the uncompressed and compressed length of the
record. */
static bool
-warc_write_end_record ()
+warc_write_end_record (void)
{
warc_write_buffer ("\r\n\r\n", 4);
/* The WARC standard suggests that we add 'skip length' data in the
extra header field of the GZIP stream.
-
+
In warc_write_start_record we reserved space for this extra header.
This extra space starts at warc_current_gzfile_offset and fills
EXTRA_GZIP_HEADER_SIZE bytes. The static GZIP header starts at
warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE.
-
+
We need to do three things:
1. Move the static GZIP header to warc_current_gzfile_offset;
2. Set the FEXTRA flag in the GZIP header;
off_t compressed_size = warc_current_gzfile_uncompressed_size;
/* Go back to the static GZIP header. */
- fseeko (warc_current_file, warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
+ fseeko (warc_current_file, warc_current_gzfile_offset
+ + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
/* Read the header. */
char static_header[GZIP_STATIC_HEADER_SIZE];
- size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
+ size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE,
+ warc_current_file);
if (result != GZIP_STATIC_HEADER_SIZE)
{
warc_write_ok = false;
/* Set the FEXTRA flag in the flags byte of the header. */
static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
- /* Write the header back to the file, but starting at warc_current_gzfile_offset. */
+ /* Write the header back to the file, but starting at
+ warc_current_gzfile_offset. */
fseeko (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
/* The extra header field identifier for the WARC skip length. */
extra_header[2] = 's';
extra_header[3] = 'l';
+ /* The size of the field value (8 bytes). */
+ extra_header[4] = (8 & 255);
+ extra_header[5] = ((8 >> 8) & 255);
/* The size of the uncompressed record. */
- extra_header[4] = (uncompressed_size & 255);
- extra_header[5] = (uncompressed_size >> 8) & 255;
- extra_header[6] = (uncompressed_size >> 16) & 255;
- extra_header[7] = (uncompressed_size >> 24) & 255;
+ extra_header[6] = (uncompressed_size & 255);
+ extra_header[7] = (uncompressed_size >> 8) & 255;
+ extra_header[8] = (uncompressed_size >> 16) & 255;
+ extra_header[9] = (uncompressed_size >> 24) & 255;
/* The size of the compressed record. */
- extra_header[8] = (compressed_size & 255);
- extra_header[9] = (compressed_size >> 8) & 255;
- extra_header[10] = (compressed_size >> 16) & 255;
- extra_header[11] = (compressed_size >> 24) & 255;
+ extra_header[10] = (compressed_size & 255);
+ extra_header[11] = (compressed_size >> 8) & 255;
+ extra_header[12] = (compressed_size >> 16) & 255;
+ extra_header[13] = (compressed_size >> 24) & 255;
/* Write the extra header after the static header. */
- fseeko (warc_current_file, warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
+ fseeko (warc_current_file, warc_current_gzfile_offset
+ + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
/* Done, move back to the end of the file. */
the current WARC record.
If timestamp is NULL, the current time will be used. */
static bool
-warc_write_date_header (char *timestamp)
+warc_write_date_header (const char *timestamp)
{
if (timestamp == NULL)
{
Compute SHA1 message digests for bytes read from STREAM. The
digest of the complete file will be written into the SHA1_DIGEST_SIZE
bytes beginning at RES_BLOCK.
-
+
If payload_offset >= 0, a second digest will be calculated of the
portion of the file starting at payload_offset and continuing to
the end of the file. The digest number will be written into the
SHA1_DIGEST_SIZE bytes beginning at RES_PAYLOAD. */
static int
-warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, off_t payload_offset)
+warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
+ off_t payload_offset)
{
#define BLOCKSIZE 32768
have to start with a full block, there may still be some
bytes left from the previous buffer. Therefore, we need
to continue with sha1_process_bytes. */
- sha1_process_bytes (buffer + start_of_payload, BLOCKSIZE - start_of_payload, &ctx_payload);
+ sha1_process_bytes (buffer + start_of_payload,
+ BLOCKSIZE - start_of_payload, &ctx_payload);
}
}
start_of_payload = 0;
/* Process the payload part of the buffer. */
- sha1_process_bytes (buffer + start_of_payload, sum - start_of_payload, &ctx_payload);
+ sha1_process_bytes (buffer + start_of_payload,
+ sum - start_of_payload, &ctx_payload);
}
}
static char *
warc_base32_sha1_digest (char *sha1_digest)
{
- // length: "sha1:" + digest + "\0"
+ /* length: "sha1:" + digest + "\0" */
char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 );
- base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5, BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1);
+ base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5,
+ BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1);
memcpy (sha1_base32, "sha1:", 5);
sha1_base32[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5] = '\0';
return sha1_base32;
char sha1_res_payload[SHA1_DIGEST_SIZE];
rewind (file);
- if (warc_sha1_stream_with_payload (file, sha1_res_block, sha1_res_payload, payload_offset) == 0)
+ if (warc_sha1_stream_with_payload (file, sha1_res_block,
+ sha1_res_payload, payload_offset) == 0)
{
char *digest;
/* Write a warcinfo record to the current file.
Updates warc_current_warcinfo_uuid_str. */
-bool
+static bool
warc_write_warcinfo_record (char *filename)
{
/* Write warc-info record as the first record of the file. */
- /* We add the record id of this info record to the other records in the file. */
+ /* We add the record id of this info record to the other records in the
+ file. */
warc_current_warcinfo_uuid_str = (char *) malloc (48);
warc_uuid_str (warc_current_warcinfo_uuid_str);
fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE);
fprintf (warc_tmp, "format: WARC File Format 1.0\r\n");
- fprintf (warc_tmp, "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n");
+ fprintf (warc_tmp,
+"conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n");
fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off"));
fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring);
/* Add the user headers, if any. */
warc_write_end_record ();
if (! warc_write_ok)
- {
- logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
- }
+ logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
free (filename_copy);
free (filename_basename);
/* Opens a new WARC file.
If META is true, generates a filename ending with 'meta.warc.gz'.
-
+
This method will:
1. close the current WARC file (if there is one);
2. increment warc_current_file_number;
if (warc_current_file != NULL)
fclose (warc_current_file);
- if (warc_current_warcinfo_uuid_str)
- free (warc_current_warcinfo_uuid_str);
- if (warc_current_filename)
- free (warc_current_filename);
+
+ free (warc_current_warcinfo_uuid_str);
+ free (warc_current_filename);
warc_current_file_number++;
char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
warc_current_filename = new_filename;
+#ifdef __VMS
+# define WARC_GZ "warc-gz"
+#else /* def __VMS */
+# define WARC_GZ "warc.gz"
+#endif /* def __VMS [else] */
+
#ifdef HAVE_LIBZ
- char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc");
+ const char *extension = (opt.warc_compression_enabled ? WARC_GZ : "warc");
#else
- char *extension = "warc";
+ const char *extension = "warc";
#endif
/* If max size is enabled, we add a serial number to the file names. */
if (meta)
sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension);
else if (opt.warc_maxsize > 0)
- sprintf (new_filename, "%s-%05d.%s", opt.warc_filename, warc_current_file_number, extension);
+ {
+ sprintf (new_filename, "%s-%05d.%s", opt.warc_filename,
+ warc_current_file_number, extension);
+ }
else
sprintf (new_filename, "%s.%s", opt.warc_filename, extension);
warc_current_file = fopen (new_filename, "wb+");
if (warc_current_file == NULL)
{
- logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"), quote (new_filename));
+ logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"),
+ quote (new_filename));
return false;
}
/* Opens the CDX file for output. */
static bool
-warc_start_cdx_file ()
+warc_start_cdx_file (void)
{
int filename_length = strlen (opt.warc_filename);
char *cdx_filename = alloca (filename_length + 4 + 1);
/* Parse the CDX header and find the field numbers of the original url,
checksum and record ID fields. */
static bool
-warc_parse_cdx_header (char *lineptr, int *field_num_original_url, int *field_num_checksum, int *field_num_record_id)
+warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
+ int *field_num_checksum, int *field_num_record_id)
{
*field_num_original_url = -1;
*field_num_checksum = -1;
char *token;
char *save_ptr;
token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
-
+
if (token != NULL && strcmp (token, "CDX") == 0)
{
int field_num = 0;
/* Parse the CDX record and add it to the warc_cdx_dedup_table hash table. */
static void
-warc_process_cdx_line (char *lineptr, int field_num_original_url, int field_num_checksum, int field_num_record_id)
+warc_process_cdx_line (char *lineptr, int field_num_original_url,
+ int field_num_checksum, int field_num_record_id)
{
char *original_url = NULL;
char *checksum = NULL;
bytes. */
size_t checksum_l;
char * checksum_v;
- base32_decode_alloc (checksum, strlen (checksum), &checksum_v, &checksum_l);
+ base32_decode_alloc (checksum, strlen (checksum), &checksum_v,
+ &checksum_l);
free (checksum);
if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
{
/* This is a valid line with a valid checksum. */
- struct warc_cdx_record * rec = malloc (sizeof (struct warc_cdx_record));
+ struct warc_cdx_record *rec;
+ rec = malloc (sizeof (struct warc_cdx_record));
rec->url = original_url;
rec->uuid = record_id;
memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
else
{
free (original_url);
- if (checksum_v != NULL)
- free (checksum_v);
+ free (checksum_v);
free (record_id);
}
}
+ else
+ {
+ xfree_null(checksum);
+ xfree_null(original_url);
+ xfree_null(record_id);
+ }
}
/* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
the warc_cdx_dedup_table. */
-bool
-warc_load_cdx_dedup_file ()
+static bool
+warc_load_cdx_dedup_file (void)
{
FILE *f = fopen (opt.warc_cdx_dedup_filename, "r");
if (f == NULL)
char *lineptr = NULL;
size_t n = 0;
- size_t line_length;
+ ssize_t line_length;
/* The first line should contain the CDX header.
Format: " CDX x x x x x"
'u' (the WARC record id). */
line_length = getline (&lineptr, &n, f);
if (line_length != -1)
- warc_parse_cdx_header (lineptr, &field_num_original_url, &field_num_checksum, &field_num_record_id);
+ warc_parse_cdx_header (lineptr, &field_num_original_url,
+ &field_num_checksum, &field_num_record_id);
/* All three columns (original url 'a', checksum 'k', record id 'u')
must have been found in the header. If any one is missing, report
which ones instead of reading the records. */
if (field_num_original_url == -1
|| field_num_checksum == -1
|| field_num_record_id == -1)
{
if (field_num_original_url == -1)
- logprintf (LOG_NOTQUIET, _("CDX file does not list original urls. (Missing column 'a'.)\n"));
+ logprintf (LOG_NOTQUIET,
+_("CDX file does not list original urls. (Missing column 'a'.)\n"));
if (field_num_checksum == -1)
- logprintf (LOG_NOTQUIET, _("CDX file does not list checksums. (Missing column 'k'.)\n"));
+ logprintf (LOG_NOTQUIET,
+_("CDX file does not list checksums. (Missing column 'k'.)\n"));
if (field_num_record_id == -1)
- logprintf (LOG_NOTQUIET, _("CDX file does not list record ids. (Missing column 'u'.)\n"));
+ logprintf (LOG_NOTQUIET,
+_("CDX file does not list record ids. (Missing column 'u'.)\n"));
}
else
{
/* Initialize the table. */
- warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest, warc_cmp_sha1_digest);
+ warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest,
+ warc_cmp_sha1_digest);
do
{
line_length = getline (&lineptr, &n, f);
if (line_length != -1)
- warc_process_cdx_line (lineptr, field_num_original_url, field_num_checksum, field_num_record_id);
+ {
+ warc_process_cdx_line (lineptr, field_num_original_url,
+ field_num_checksum, field_num_record_id);
+ }
}
while (line_length != -1);
/* Print results. */
int nrecords = hash_table_count (warc_cdx_dedup_table);
logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
- "Loaded %d records from CDX.\n\n", nrecords),
+ "Loaded %d records from CDX.\n\n",
+ nrecords),
nrecords);
}
+ free (lineptr);
fclose (f);
return true;
if (warc_cdx_dedup_table == NULL)
return NULL;
- char *key;
- struct warc_cdx_record *rec_existing;
- hash_table_get_pair (warc_cdx_dedup_table, sha1_digest_payload, &key, &rec_existing);
+ struct warc_cdx_record *rec_existing
+ = hash_table_get (warc_cdx_dedup_table, sha1_digest_payload);
- if (rec_existing != NULL && strcmp (rec_existing->url, url) == 0)
+ if (rec_existing && strcmp (rec_existing->url, url) == 0)
return rec_existing;
else
return NULL;
/* Initializes the WARC writer (if opt.warc_filename is set).
This should be called before any WARC record is written. */
void
-warc_init ()
+warc_init (void)
{
warc_write_ok = true;
warc_manifest_fp = warc_tempfile ();
if (warc_manifest_fp == NULL)
{
- logprintf (LOG_NOTQUIET, _("Could not open temporary WARC manifest file.\n"));
+ logprintf (LOG_NOTQUIET,
+ _("Could not open temporary WARC manifest file.\n"));
exit(1);
}
warc_log_fp = warc_tempfile ();
if (warc_log_fp == NULL)
{
- logprintf (LOG_NOTQUIET, _("Could not open temporary WARC log file.\n"));
+ logprintf (LOG_NOTQUIET,
+ _("Could not open temporary WARC log file.\n"));
exit(1);
}
log_set_warc_log_fp (warc_log_fp);
{
if (! warc_start_cdx_file ())
{
- logprintf (LOG_NOTQUIET, _("Could not open CDX file for output.\n"));
+ logprintf (LOG_NOTQUIET,
+ _("Could not open CDX file for output.\n"));
exit(1);
}
}
}
/* Writes metadata (manifest, configuration, log file) to the WARC file. */
-void
-warc_write_metadata ()
+static void
+warc_write_metadata (void)
{
/* If there are multiple WARC files, the metadata should be written to a separate file. */
if (opt.warc_maxsize > 0)
warc_uuid_str (manifest_uuid);
fflush (warc_manifest_fp);
- warc_write_resource_record (manifest_uuid,
+ warc_write_metadata_record (manifest_uuid,
"metadata://gnu.org/software/wget/warc/MANIFEST.txt",
NULL, NULL, NULL, "text/plain",
warc_manifest_fp, -1);
fflush (warc_tmp_fp);
fprintf (warc_tmp_fp, "%s\n", program_argstring);
- warc_write_resource_record (manifest_uuid,
- "metadata://gnu.org/software/wget/warc/wget_arguments.txt",
- NULL, NULL, NULL, "text/plain",
+ warc_write_resource_record (NULL,
+ "metadata://gnu.org/software/wget/warc/wget_arguments.txt",
+ NULL, manifest_uuid, NULL, "text/plain",
warc_tmp_fp, -1);
/* warc_write_resource_record has closed warc_tmp_fp. */
if (warc_log_fp != NULL)
{
warc_write_resource_record (NULL,
- "metadata://gnu.org/software/wget/warc/wget.log",
+ "metadata://gnu.org/software/wget/warc/wget.log",
NULL, manifest_uuid, NULL, "text/plain",
warc_log_fp, -1);
/* warc_write_resource_record has closed warc_log_fp. */
/* Finishes the WARC writing.
This should be called at the end of the program. */
void
-warc_close ()
+warc_close (void)
{
if (warc_current_file != NULL)
{
The temporary file will be created in opt.warc_tempdir.
Returns the pointer to the temporary file, or NULL. */
FILE *
-warc_tempfile ()
+warc_tempfile (void)
{
char filename[100];
if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1)
return NULL;
- int fd = mkstemp (filename);
+#ifdef __VMS
+ /* 2013-07-12 SMS.
+ * mkostemp()+unlink()+fdopen() scheme causes trouble on VMS, so use
+ * mktemp() to uniquify the (VMS-style) name, and then use a normal
+ * fopen() with a "create temp file marked for delete" option.
+ */
+ {
+ char *tfn;
+
+ tfn = mktemp (filename); /* Get unique name from template. */
+ if (tfn == NULL)
+ return NULL;
+ return fopen (tfn, "w+", "fop=tmd"); /* Create auto-delete temp file. */
+ }
+#else /* def __VMS */
+ int fd = mkostemp (filename, O_TEMPORARY); /* O_TEMPORARY is 0 unless the
+ system headers define it (see the fallback #define added by this patch).
+ Presumably a non-zero value makes the file delete itself on close;
+ confirm against the platform's open() documentation. */
if (fd < 0)
return NULL;
+#if !O_TEMPORARY /* No O_TEMPORARY flag was passed: remove the name by hand. */
if (unlink (filename) < 0)
return NULL;
+#endif /* NOTE(review): neither the failed-unlink return above nor a NULL
+ result from fdopen below closes fd. */
return fdopen (fd, "wb+");
+#endif /* def __VMS [else] */
}
Calling this function will close body.
Returns true on success, false on error. */
bool
-warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, ip_address *ip, FILE *body, off_t payload_offset)
+warc_write_request_record (char *url, char *timestamp_str, char *record_uuid,
+ ip_address *ip, FILE *body, off_t payload_offset)
{
warc_write_start_record ();
warc_write_header ("WARC-Type", "request");
warc_write_digest_headers (body, payload_offset);
warc_write_block_from_file (body);
warc_write_end_record ();
-
+
fclose (body);
return warc_write_ok;
response_uuid is the uuid of the response.
Returns true on success, false on error. */
static bool
-warc_write_cdx_record (char *url, char *timestamp_str, char *mime_type, int response_code, char *payload_digest, char *redirect_location, off_t offset, char *warc_filename, char *response_uuid)
+warc_write_cdx_record (const char *url, const char *timestamp_str,
+ const char *mime_type, int response_code,
+ const char *payload_digest, const char *redirect_location,
+ off_t offset, const char *warc_filename,
+ const char *response_uuid)
{
/* Transform the timestamp. */
char timestamp_str_cdx [15];
memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM" ":" */
memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS" "Z" */
timestamp_str_cdx[14] = '\0';
-
+
/* Rewrite the checksum. */
- char *checksum;
+ const char *checksum;
if (payload_digest != NULL)
checksum = payload_digest + 5; /* Skip the "sha1:" */
else
if (redirect_location == NULL || strlen(redirect_location) == 0)
redirect_location = "-";
+ char offset_string[MAX_INT_TO_STRING_LEN(off_t)];
+ number_to_string (offset_string, offset);
+
/* Print the CDX line. */
- fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %ld %s %s\n", url, timestamp_str_cdx, url, mime_type, response_code, checksum, redirect_location, offset, warc_current_filename, response_uuid);
+ fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %s %s %s\n", url,
+ timestamp_str_cdx, url, mime_type, response_code, checksum,
+ redirect_location, offset_string, warc_current_filename,
+ response_uuid);
fflush (warc_current_cdx_file);
return true;
Calling this function will close body.
Returns true on success, false on error. */
static bool
-warc_write_revisit_record (char *url, char *timestamp_str, char *concurrent_to_uuid, char *payload_digest, char *refers_to, ip_address *ip, FILE *body)
+warc_write_revisit_record (char *url, char *timestamp_str,
+ char *concurrent_to_uuid, char *payload_digest,
+ char *refers_to, ip_address *ip, FILE *body)
{
char revisit_uuid [48];
warc_uuid_str (revisit_uuid);
warc_write_header ("WARC-Payload-Digest", payload_digest);
warc_write_block_from_file (body);
warc_write_end_record ();
-
+
fclose (body);
free (block_digest);
Calling this function will close body.
Returns true on success, false on error. */
bool
-warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, off_t payload_offset, char *mime_type, int response_code, char *redirect_location)
+warc_write_response_record (char *url, char *timestamp_str,
+ char *concurrent_to_uuid, ip_address *ip,
+ FILE *body, off_t payload_offset, char *mime_type,
+ int response_code, char *redirect_location)
{
char *block_digest = NULL;
char *payload_digest = NULL;
{
/* Calculate the block and payload digests. */
rewind (body);
- if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload, payload_offset) == 0)
+ if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload,
+ payload_offset) == 0)
{
/* Decide (based on url + payload digest) if we have seen this
data before. */
- struct warc_cdx_record *rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload);
+ struct warc_cdx_record *rec_existing;
+ rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload);
if (rec_existing != NULL)
{
+ bool result;
+
/* Found an existing record. */
- logprintf (LOG_VERBOSE, _("Found exact match in CDX file. Saving revisit record to WARC.\n"));
+ logprintf (LOG_VERBOSE,
+ _("Found exact match in CDX file. Saving revisit record to WARC.\n"));
/* Remove the payload from the file. */
if (payload_offset > 0)
/* Send the original payload digest. */
payload_digest = warc_base32_sha1_digest (sha1_res_payload);
- bool result = warc_write_revisit_record (url, timestamp_str, concurrent_to_uuid, payload_digest, rec_existing->uuid, ip, body);
+ result = warc_write_revisit_record (url, timestamp_str,
+ concurrent_to_uuid, payload_digest, rec_existing->uuid,
+ ip, body);
free (payload_digest);
return result;
if (warc_write_ok && opt.warc_cdx_enabled)
{
/* Add this record to the CDX. */
- warc_write_cdx_record (url, timestamp_str, mime_type, response_code, payload_digest, redirect_location, offset, warc_current_filename, response_uuid);
+ warc_write_cdx_record (url, timestamp_str, mime_type, response_code,
+ payload_digest, redirect_location, offset, warc_current_filename,
+ response_uuid);
}
- if (block_digest)
- free (block_digest);
- if (payload_digest)
- free (payload_digest);
+ free (block_digest);
+ free (payload_digest);
return warc_write_ok;
}
-/* Writes a resource record to the WARC file.
+/* Writes a resource or metadata record to the WARC file.
+ record_type is either "resource" or "metadata",
resource_uuid is the uuid of the resource (or NULL),
url is the target uri of the resource,
timestamp_str is the timestamp (generated with warc_timestamp),
- concurrent_to_uuid is the uuid of the request for that generated this resource
- (generated with warc_uuid_str) or NULL,
+ concurrent_to_uuid is the uuid of the record that generated this
+ resource (generated with warc_uuid_str) or NULL,
ip is the ip address of the server (or NULL),
content_type is the mime type of the body (or NULL),
body is a pointer to a file containing the resource data.
Calling this function will close body.
Returns true on success, false on error. */
-bool
-warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, off_t payload_offset)
+static bool
+warc_write_record (const char *record_type, char *resource_uuid,
+ const char *url, const char *timestamp_str,
+ const char *concurrent_to_uuid,
+ ip_address *ip, const char *content_type, FILE *body,
+ off_t payload_offset)
{
if (resource_uuid == NULL)
{
content_type = "application/octet-stream";
warc_write_start_record ();
- warc_write_header ("WARC-Type", "resource");
+ warc_write_header ("WARC-Type", record_type);
warc_write_header ("WARC-Record-ID", resource_uuid);
warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
warc_write_header ("Content-Type", content_type);
warc_write_block_from_file (body);
warc_write_end_record ();
-
+
fclose (body);
return warc_write_ok;
}
+/* Writes a resource record to the WARC file.
+ This is a thin wrapper: every argument is forwarded to
+ warc_write_record with the record type fixed to "resource".
+ resource_uuid is the uuid of the resource (or NULL),
+ url is the target uri of the resource,
+ timestamp_str is the timestamp (generated with warc_timestamp),
+ concurrent_to_uuid is the uuid of the record that generated this
+ resource (generated with warc_uuid_str) or NULL,
+ ip is the ip address of the server (or NULL),
+ content_type is the mime type of the body (or NULL),
+ body is a pointer to a file containing the resource data,
+ payload_offset is handed to warc_write_record unchanged.
+ Calling this function will close body.
+ Returns true on success, false on error. */
+bool
+warc_write_resource_record (char *resource_uuid, const char *url,
+ const char *timestamp_str, const char *concurrent_to_uuid,
+ ip_address *ip, const char *content_type, FILE *body,
+ off_t payload_offset)
+{
+ return warc_write_record ("resource",
+ resource_uuid, url, timestamp_str, concurrent_to_uuid,
+ ip, content_type, body, payload_offset);
+}
+
+/* Writes a metadata record to the WARC file.
+ This is a thin wrapper: every argument is forwarded to
+ warc_write_record with the record type fixed to "metadata".
+ record_uuid is the uuid of the record (or NULL),
+ url is the target uri of the record,
+ timestamp_str is the timestamp (generated with warc_timestamp),
+ concurrent_to_uuid is the uuid of the record that generated this
+ record (generated with warc_uuid_str) or NULL,
+ ip is the ip address of the server (or NULL),
+ content_type is the mime type of the body (or NULL),
+ body is a pointer to a file containing the record data,
+ payload_offset is handed to warc_write_record unchanged.
+ Calling this function will close body.
+ Returns true on success, false on error. */
+bool
+warc_write_metadata_record (char *record_uuid, const char *url,
+ const char *timestamp_str, const char *concurrent_to_uuid,
+ ip_address *ip, const char *content_type, FILE *body,
+ off_t payload_offset)
+{
+ return warc_write_record ("metadata",
+ record_uuid, url, timestamp_str, concurrent_to_uuid,
+ ip, content_type, body, payload_offset);
+}