}
-#define EXTRA_GZIP_HEADER_SIZE 12
+#define EXTRA_GZIP_HEADER_SIZE 14
#define GZIP_STATIC_HEADER_SIZE 10
#define FLG_FEXTRA 0x04
#define OFF_FLG 3
In warc_write_end_record we will fill this space
with information about the uncompressed and
compressed size of the record. */
- fprintf (warc_current_file, "XXXXXXXXXXXX");
+ fseek (warc_current_file, EXTRA_GZIP_HEADER_SIZE, SEEK_CUR);
fflush (warc_current_file);
/* Start a new GZIP stream. */
warc_write_block_from_file (FILE *data_in)
{
/* Add the Content-Length header. */
- char *content_length;
+ char content_length[MAX_INT_TO_STRING_LEN(off_t)];
fseeko (data_in, 0L, SEEK_END);
- if (! asprintf (&content_length, "%ld", ftello (data_in)))
- {
- warc_write_ok = false;
- return false;
- }
+ number_to_string (content_length, ftello (data_in));
warc_write_header ("Content-Length", content_length);
- free (content_length);
/* End of the WARC header section. */
warc_write_string ("\r\n");
/* The extra header field identifier for the WARC skip length. */
extra_header[2] = 's';
extra_header[3] = 'l';
+ /* The size of the field value (8 bytes). */
+ extra_header[4] = (8 & 255);
+ extra_header[5] = ((8 >> 8) & 255);
/* The size of the uncompressed record. */
- extra_header[4] = (uncompressed_size & 255);
- extra_header[5] = (uncompressed_size >> 8) & 255;
- extra_header[6] = (uncompressed_size >> 16) & 255;
- extra_header[7] = (uncompressed_size >> 24) & 255;
+ extra_header[6] = (uncompressed_size & 255);
+ extra_header[7] = (uncompressed_size >> 8) & 255;
+ extra_header[8] = (uncompressed_size >> 16) & 255;
+ extra_header[9] = (uncompressed_size >> 24) & 255;
/* The size of the compressed record. */
- extra_header[8] = (compressed_size & 255);
- extra_header[9] = (compressed_size >> 8) & 255;
- extra_header[10] = (compressed_size >> 16) & 255;
- extra_header[11] = (compressed_size >> 24) & 255;
+ extra_header[10] = (compressed_size & 255);
+ extra_header[11] = (compressed_size >> 8) & 255;
+ extra_header[12] = (compressed_size >> 16) & 255;
+ extra_header[13] = (compressed_size >> 24) & 255;
/* Write the extra header after the static header. */
fseeko (warc_current_file, warc_current_gzfile_offset
static char *
warc_base32_sha1_digest (char *sha1_digest)
{
- // length: "sha1:" + digest + "\0"
+ /* length: "sha1:" + digest + "\0" */
char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 );
base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5,
BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1);
if (warc_cdx_dedup_table == NULL)
return NULL;
- char *key;
- struct warc_cdx_record *rec_existing;
- int found = hash_table_get_pair (warc_cdx_dedup_table, sha1_digest_payload,
- &key, &rec_existing);
+ struct warc_cdx_record *rec_existing
+ = hash_table_get (warc_cdx_dedup_table, sha1_digest_payload);
- if (found && strcmp (rec_existing->url, url) == 0)
+ if (rec_existing && strcmp (rec_existing->url, url) == 0)
return rec_existing;
else
return NULL;
if (redirect_location == NULL || strlen(redirect_location) == 0)
redirect_location = "-";
+ char offset_string[MAX_INT_TO_STRING_LEN(off_t)];
+ number_to_string (offset_string, offset);
+
/* Print the CDX line. */
- fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %ld %s %s\n", url,
+ fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %s %s %s\n", url,
timestamp_str_cdx, url, mime_type, response_code, checksum,
- redirect_location, offset, warc_current_filename, response_uuid);
+ redirect_location, offset_string, warc_current_filename,
+ response_uuid);
fflush (warc_current_cdx_file);
return true;