#ifndef WINDOWS
#include <libgen.h>
+#else
+#include <fcntl.h>
#endif
#include "warc.h"
+#ifndef O_TEMPORARY
+#define O_TEMPORARY 0
+#endif
+
extern char *version_string;
/* Set by main in main.c */
#ifdef HAVE_LIBZ
/* The gzip stream for the current WARC file
(or NULL, if WARC or gzip is disabled). */
-static gzFile *warc_current_gzfile;
+static gzFile warc_current_gzfile;
/* The offset of the current gzip record in the WARC file. */
static off_t warc_current_gzfile_offset;
static int warc_current_file_number;
/* The table of CDX records, if deduplication is enabled. */
-struct hash_table * warc_cdx_dedup_table;
+static struct hash_table * warc_cdx_dedup_table;
static bool warc_start_new_file (bool meta);
}
-#define EXTRA_GZIP_HEADER_SIZE 12
+#define EXTRA_GZIP_HEADER_SIZE 14
#define GZIP_STATIC_HEADER_SIZE 10
#define FLG_FEXTRA 0x04
#define OFF_FLG 3
In warc_write_end_record we will fill this space
with information about the uncompressed and
compressed size of the record. */
- fprintf (warc_current_file, "XXXXXXXXXXXX");
+ fseek (warc_current_file, EXTRA_GZIP_HEADER_SIZE, SEEK_CUR);
fflush (warc_current_file);
/* Start a new GZIP stream. */
warc_write_block_from_file (FILE *data_in)
{
/* Add the Content-Length header. */
- char *content_length;
+ char content_length[MAX_INT_TO_STRING_LEN(off_t)];
fseeko (data_in, 0L, SEEK_END);
- if (! asprintf (&content_length, "%ld", ftello (data_in)))
- {
- warc_write_ok = false;
- return false;
- }
+ number_to_string (content_length, ftello (data_in));
warc_write_header ("Content-Length", content_length);
- free (content_length);
/* End of the WARC header section. */
warc_write_string ("\r\n");
/* The extra header field identifier for the WARC skip length. */
extra_header[2] = 's';
extra_header[3] = 'l';
+ /* The size of the field value (8 bytes). */
+ extra_header[4] = (8 & 255);
+ extra_header[5] = ((8 >> 8) & 255);
/* The size of the uncompressed record. */
- extra_header[4] = (uncompressed_size & 255);
- extra_header[5] = (uncompressed_size >> 8) & 255;
- extra_header[6] = (uncompressed_size >> 16) & 255;
- extra_header[7] = (uncompressed_size >> 24) & 255;
+ extra_header[6] = (uncompressed_size & 255);
+ extra_header[7] = (uncompressed_size >> 8) & 255;
+ extra_header[8] = (uncompressed_size >> 16) & 255;
+ extra_header[9] = (uncompressed_size >> 24) & 255;
/* The size of the compressed record. */
- extra_header[8] = (compressed_size & 255);
- extra_header[9] = (compressed_size >> 8) & 255;
- extra_header[10] = (compressed_size >> 16) & 255;
- extra_header[11] = (compressed_size >> 24) & 255;
+ extra_header[10] = (compressed_size & 255);
+ extra_header[11] = (compressed_size >> 8) & 255;
+ extra_header[12] = (compressed_size >> 16) & 255;
+ extra_header[13] = (compressed_size >> 24) & 255;
/* Write the extra header after the static header. */
fseeko (warc_current_file, warc_current_gzfile_offset
static char *
warc_base32_sha1_digest (char *sha1_digest)
{
- // length: "sha1:" + digest + "\0"
+ /* length: "sha1:" + digest + "\0" */
char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 );
base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5,
BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1);
if (warc_current_file != NULL)
fclose (warc_current_file);
- if (warc_current_warcinfo_uuid_str)
- free (warc_current_warcinfo_uuid_str);
- if (warc_current_filename)
- free (warc_current_filename);
+
+ free (warc_current_warcinfo_uuid_str);
+ free (warc_current_filename);
warc_current_file_number++;
char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
warc_current_filename = new_filename;
+#ifdef __VMS
+# define WARC_GZ "warc-gz"
+#else /* def __VMS */
+# define WARC_GZ "warc.gz"
+#endif /* def __VMS [else] */
+
#ifdef HAVE_LIBZ
- const char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc");
+ const char *extension = (opt.warc_compression_enabled ? WARC_GZ : "warc");
#else
const char *extension = "warc";
#endif
else
{
free (original_url);
- if (checksum_v != NULL)
- free (checksum_v);
+ free (checksum_v);
free (record_id);
}
}
+ else
+ {
+ xfree_null(checksum);
+ xfree_null(original_url);
+ xfree_null(record_id);
+ }
}
/* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
char *lineptr = NULL;
size_t n = 0;
- size_t line_length;
+ ssize_t line_length;
/* The first line should contain the CDX header.
Format: " CDX x x x x x"
if (warc_cdx_dedup_table == NULL)
return NULL;
- char *key;
- struct warc_cdx_record *rec_existing;
- hash_table_get_pair (warc_cdx_dedup_table, sha1_digest_payload, &key,
- &rec_existing);
+ struct warc_cdx_record *rec_existing
+ = hash_table_get (warc_cdx_dedup_table, sha1_digest_payload);
- if (rec_existing != NULL && strcmp (rec_existing->url, url) == 0)
+ if (rec_existing && strcmp (rec_existing->url, url) == 0)
return rec_existing;
else
return NULL;
warc_uuid_str (manifest_uuid);
fflush (warc_manifest_fp);
- warc_write_resource_record (manifest_uuid,
+ warc_write_metadata_record (manifest_uuid,
"metadata://gnu.org/software/wget/warc/MANIFEST.txt",
NULL, NULL, NULL, "text/plain",
warc_manifest_fp, -1);
fflush (warc_tmp_fp);
fprintf (warc_tmp_fp, "%s\n", program_argstring);
- warc_write_resource_record (manifest_uuid,
+ warc_write_resource_record (NULL,
"metadata://gnu.org/software/wget/warc/wget_arguments.txt",
- NULL, NULL, NULL, "text/plain",
+ NULL, manifest_uuid, NULL, "text/plain",
warc_tmp_fp, -1);
/* warc_write_resource_record has closed warc_tmp_fp. */
if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1)
return NULL;
- int fd = mkstemp (filename);
+#ifdef __VMS
+ /* 2013-07-12 SMS.
+ * mkostemp()+unlink()+fdopen() scheme causes trouble on VMS, so use
+ * mktemp() to uniquify the (VMS-style) name, and then use a normal
+ * fopen() with a "create temp file marked for delete" option.
+ */
+ {
+ char *tfn;
+
+ tfn = mktemp (filename); /* Get unique name from template. */
+ if (tfn == NULL)
+ return NULL;
+ return fopen (tfn, "w+", "fop=tmd"); /* Create auto-delete temp file. */
+ }
+#else /* def __VMS */
+ int fd = mkostemp (filename, O_TEMPORARY);
if (fd < 0)
return NULL;
+#if !O_TEMPORARY
if (unlink (filename) < 0)
return NULL;
+#endif
return fdopen (fd, "wb+");
+#endif /* def __VMS [else] */
}
warc_write_cdx_record (const char *url, const char *timestamp_str,
const char *mime_type, int response_code,
const char *payload_digest, const char *redirect_location,
- off_t offset, const char *warc_filename,
+ off_t offset, const char *warc_filename _GL_UNUSED,
const char *response_uuid)
{
/* Transform the timestamp. */
if (redirect_location == NULL || strlen(redirect_location) == 0)
redirect_location = "-";
+ char offset_string[MAX_INT_TO_STRING_LEN(off_t)];
+ number_to_string (offset_string, offset);
+
/* Print the CDX line. */
- fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %ld %s %s\n", url,
+ fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %s %s %s\n", url,
timestamp_str_cdx, url, mime_type, response_code, checksum,
- redirect_location, offset, warc_current_filename, response_uuid);
+ redirect_location, offset_string, warc_current_filename,
+ response_uuid);
fflush (warc_current_cdx_file);
return true;
response_uuid);
}
- if (block_digest)
- free (block_digest);
- if (payload_digest)
- free (payload_digest);
+ free (block_digest);
+ free (payload_digest);
return warc_write_ok;
}
-/* Writes a resource record to the WARC file.
+/* Writes a resource or metadata record to the WARC file.
+ record_type is either "resource" or "metadata",
resource_uuid is the uuid of the resource (or NULL),
url is the target uri of the resource,
timestamp_str is the timestamp (generated with warc_timestamp),
- concurrent_to_uuid is the uuid of the request for that generated this
+ concurrent_to_uuid is the uuid of the record that generated this
resource (generated with warc_uuid_str) or NULL,
ip is the ip address of the server (or NULL),
content_type is the mime type of the body (or NULL),
body is a pointer to a file containing the resource data.
Calling this function will close body.
Returns true on success, false on error. */
-bool
-warc_write_resource_record (char *resource_uuid, const char *url,
- const char *timestamp_str, const char *concurrent_to_uuid,
+static bool
+warc_write_record (const char *record_type, char *resource_uuid,
+ const char *url, const char *timestamp_str,
+ const char *concurrent_to_uuid,
ip_address *ip, const char *content_type, FILE *body,
off_t payload_offset)
{
content_type = "application/octet-stream";
warc_write_start_record ();
- warc_write_header ("WARC-Type", "resource");
+ warc_write_header ("WARC-Type", record_type);
warc_write_header ("WARC-Record-ID", resource_uuid);
warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
return warc_write_ok;
}
+
+/* Writes a resource record to the WARC file.
+ This is a thin wrapper: it calls warc_write_record with the record
+ type "resource" and forwards every argument unchanged.
+ resource_uuid is the uuid of the resource (or NULL),
+ url is the target uri of the resource,
+ timestamp_str is the timestamp (generated with warc_timestamp),
+ concurrent_to_uuid is the uuid of the record that generated this
+ resource (generated with warc_uuid_str) or NULL,
+ ip is the ip address of the server (or NULL),
+ content_type is the mime type of the body (or NULL),
+ body is a pointer to a file containing the resource data,
+ payload_offset is handed to warc_write_record as-is
+ (NOTE(review): its meaning is not visible in this wrapper; the
+ callers visible nearby pass -1 -- confirm against warc_write_record).
+ Calling this function will close body.
+ Returns the value returned by warc_write_record:
+ true on success, false on error. */
+bool
+warc_write_resource_record (char *resource_uuid, const char *url,
+ const char *timestamp_str, const char *concurrent_to_uuid,
+ ip_address *ip, const char *content_type, FILE *body,
+ off_t payload_offset)
+{
+ return warc_write_record ("resource",
+ resource_uuid, url, timestamp_str, concurrent_to_uuid,
+ ip, content_type, body, payload_offset);
+}
+
+/* Writes a metadata record to the WARC file.
+ This is a thin wrapper: it calls warc_write_record with the record
+ type "metadata" and forwards every argument unchanged.
+ record_uuid is the uuid of the record (or NULL),
+ url is the target uri of the record,
+ timestamp_str is the timestamp (generated with warc_timestamp),
+ concurrent_to_uuid is the uuid of the record that generated this
+ record (generated with warc_uuid_str) or NULL,
+ ip is the ip address of the server (or NULL),
+ content_type is the mime type of the body (or NULL),
+ body is a pointer to a file containing the record data,
+ payload_offset is handed to warc_write_record as-is
+ (NOTE(review): its meaning is not visible in this wrapper; the
+ callers visible nearby pass -1 -- confirm against warc_write_record).
+ Calling this function will close body.
+ Returns the value returned by warc_write_record:
+ true on success, false on error. */
+bool
+warc_write_metadata_record (char *record_uuid, const char *url,
+ const char *timestamp_str, const char *concurrent_to_uuid,
+ ip_address *ip, const char *content_type, FILE *body,
+ off_t payload_offset)
+{
+ return warc_write_record ("metadata",
+ record_uuid, url, timestamp_str, concurrent_to_uuid,
+ ip, content_type, body, payload_offset);
+}