X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Fwarc.c;h=38ef3a1b0172f733d17da6edbf79c9dde8185c93;hp=911cebd7ee1fbbc2cee11b8b256676717a098901;hb=38a7829dcb4eb5dba28dbf0f05c6a80fea9217f8;hpb=bd4f1e60423c07475db39c979bb4c0c7b7acd22d diff --git a/src/warc.c b/src/warc.c index 911cebd7..38ef3a1b 100644 --- a/src/warc.c +++ b/src/warc.c @@ -51,10 +51,16 @@ as that of the covered work. */ #ifndef WINDOWS #include +#else +#include #endif #include "warc.h" +#ifndef O_TEMPORARY +#define O_TEMPORARY 0 +#endif + extern char *version_string; /* Set by main in main.c */ @@ -75,7 +81,7 @@ static FILE *warc_current_file; #ifdef HAVE_LIBZ /* The gzip stream for the current WARC file (or NULL, if WARC or gzip is disabled). */ -static gzFile *warc_current_gzfile; +static gzFile warc_current_gzfile; /* The offset of the current gzip record in the WARC file. */ static off_t warc_current_gzfile_offset; @@ -102,7 +108,7 @@ static char *warc_current_filename; static int warc_current_file_number; /* The table of CDX records, if deduplication is enabled. */ -struct hash_table * warc_cdx_dedup_table; +static struct hash_table * warc_cdx_dedup_table; static bool warc_start_new_file (bool meta); @@ -165,7 +171,7 @@ warc_write_string (const char *str) } -#define EXTRA_GZIP_HEADER_SIZE 12 +#define EXTRA_GZIP_HEADER_SIZE 14 #define GZIP_STATIC_HEADER_SIZE 10 #define FLG_FEXTRA 0x04 #define OFF_FLG 3 @@ -180,7 +186,7 @@ warc_write_string (const char *str) Returns false and set warc_write_ok to false if there is an error. */ static bool -warc_write_start_record () +warc_write_start_record (void) { if (!warc_write_ok) return false; @@ -200,7 +206,7 @@ warc_write_start_record () In warc_write_end_record we will fill this space with information about the uncompressed and compressed size of the record. */ - fprintf (warc_current_file, "XXXXXXXXXXXX"); + fseek (warc_current_file, EXTRA_GZIP_HEADER_SIZE, SEEK_CUR); fflush (warc_current_file); /* Start a new GZIP stream. */ @@ -209,7 +215,8 @@ warc_write_start_record () if (warc_current_gzfile == NULL) { - logprintf (LOG_NOTQUIET, _("Error opening GZIP stream to WARC file.\n")); + logprintf (LOG_NOTQUIET, +_("Error opening GZIP stream to WARC file.\n")); warc_write_ok = false; return false; } @@ -244,15 +251,10 @@ static bool warc_write_block_from_file (FILE *data_in) { /* Add the Content-Length header. */ - char *content_length; + char content_length[MAX_INT_TO_STRING_LEN(off_t)]; fseeko (data_in, 0L, SEEK_END); - if (! asprintf (&content_length, "%ld", ftello (data_in))) - { - warc_write_ok = false; - return false; - } + number_to_string (content_length, ftello (data_in)); warc_write_header ("Content-Length", content_length); - free (content_length); /* End of the WARC header section. */ warc_write_string ("\r\n"); @@ -279,7 +281,7 @@ warc_write_block_from_file (FILE *data_in) with the uncompressed and compressed length of the record. */ static bool -warc_write_end_record () +warc_write_end_record (void) { warc_write_buffer ("\r\n\r\n", 4); @@ -298,12 +300,12 @@ warc_write_end_record () /* The WARC standard suggests that we add 'skip length' data in the extra header field of the GZIP stream. - + In warc_write_start_record we reserved space for this extra header. This extra space starts at warc_current_gzfile_offset and fills EXTRA_GZIP_HEADER_SIZE bytes. The static GZIP header starts at warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE. - + We need to do three things: 1. Move the static GZIP header to warc_current_gzfile_offset; 2. Set the FEXTRA flag in the GZIP header; @@ -317,11 +319,13 @@ warc_write_end_record () off_t compressed_size = warc_current_gzfile_uncompressed_size; /* Go back to the static GZIP header. */ - fseeko (warc_current_file, warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE, SEEK_SET); + fseeko (warc_current_file, warc_current_gzfile_offset + + EXTRA_GZIP_HEADER_SIZE, SEEK_SET); /* Read the header. */ char static_header[GZIP_STATIC_HEADER_SIZE]; - size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file); + size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE, + warc_current_file); if (result != GZIP_STATIC_HEADER_SIZE) { warc_write_ok = false; @@ -331,7 +335,8 @@ warc_write_end_record () /* Set the FEXTRA flag in the flags byte of the header. */ static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA; - /* Write the header back to the file, but starting at warc_current_gzfile_offset. */ + /* Write the header back to the file, but starting at + warc_current_gzfile_offset. */ fseeko (warc_current_file, warc_current_gzfile_offset, SEEK_SET); fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file); @@ -343,19 +348,23 @@ warc_write_end_record () /* The extra header field identifier for the WARC skip length. */ extra_header[2] = 's'; extra_header[3] = 'l'; + /* The size of the field value (8 bytes). */ + extra_header[4] = (8 & 255); + extra_header[5] = ((8 >> 8) & 255); /* The size of the uncompressed record. */ - extra_header[4] = (uncompressed_size & 255); - extra_header[5] = (uncompressed_size >> 8) & 255; - extra_header[6] = (uncompressed_size >> 16) & 255; - extra_header[7] = (uncompressed_size >> 24) & 255; + extra_header[6] = (uncompressed_size & 255); + extra_header[7] = (uncompressed_size >> 8) & 255; + extra_header[8] = (uncompressed_size >> 16) & 255; + extra_header[9] = (uncompressed_size >> 24) & 255; /* The size of the compressed record. */ - extra_header[8] = (compressed_size & 255); - extra_header[9] = (compressed_size >> 8) & 255; - extra_header[10] = (compressed_size >> 16) & 255; - extra_header[11] = (compressed_size >> 24) & 255; + extra_header[10] = (compressed_size & 255); + extra_header[11] = (compressed_size >> 8) & 255; + extra_header[12] = (compressed_size >> 16) & 255; + extra_header[13] = (compressed_size >> 24) & 255; /* Write the extra header after the static header. */ - fseeko (warc_current_file, warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE, SEEK_SET); + fseeko (warc_current_file, warc_current_gzfile_offset + + GZIP_STATIC_HEADER_SIZE, SEEK_SET); fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file); /* Done, move back to the end of the file. */ @@ -372,7 +381,7 @@ warc_write_end_record () the current WARC record. If timestamp is NULL, the current time will be used. */ static bool -warc_write_date_header (char *timestamp) +warc_write_date_header (const char *timestamp) { if (timestamp == NULL) { @@ -402,13 +411,14 @@ warc_write_ip_header (ip_address *ip) Compute SHA1 message digests for bytes read from STREAM. The digest of the complete file will be written into the 16 bytes beginning at RES_BLOCK. - + If payload_offset >= 0, a second digest will be calculated of the portion of the file starting at payload_offset and continuing to the end of the file. The digest number will be written into the 16 bytes beginning ad RES_PAYLOAD. */ static int -warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, off_t payload_offset) +warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, + off_t payload_offset) { #define BLOCKSIZE 32768 @@ -486,7 +496,8 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, have to start with a full block, there may still be some bytes left from the previous buffer. Therefore, we need to continue with sha1_process_bytes. */ - sha1_process_bytes (buffer + start_of_payload, BLOCKSIZE - start_of_payload, &ctx_payload); + sha1_process_bytes (buffer + start_of_payload, + BLOCKSIZE - start_of_payload, &ctx_payload); } } @@ -505,7 +516,8 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, start_of_payload = 0; /* Process the payload part of the buffer. */ - sha1_process_bytes (buffer + start_of_payload, sum - start_of_payload, &ctx_payload); + sha1_process_bytes (buffer + start_of_payload, + sum - start_of_payload, &ctx_payload); } } @@ -524,9 +536,10 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, static char * warc_base32_sha1_digest (char *sha1_digest) { - // length: "sha1:" + digest + "\0" + /* length: "sha1:" + digest + "\0" */ char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 ); - base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5, BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1); + base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5, + BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1); memcpy (sha1_base32, "sha1:", 5); sha1_base32[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5] = '\0'; return sha1_base32; @@ -547,7 +560,8 @@ warc_write_digest_headers (FILE *file, long payload_offset) char sha1_res_payload[SHA1_DIGEST_SIZE]; rewind (file); - if (warc_sha1_stream_with_payload (file, sha1_res_block, sha1_res_payload, payload_offset) == 0) + if (warc_sha1_stream_with_payload (file, sha1_res_block, + sha1_res_payload, payload_offset) == 0) { char *digest; @@ -633,11 +647,12 @@ warc_uuid_str (char *urn_str) /* Write a warcinfo record to the current file. Updates warc_current_warcinfo_uuid_str. */ -bool +static bool warc_write_warcinfo_record (char *filename) { /* Write warc-info record as the first record of the file. */ - /* We add the record id of this info record to the other records in the file. */ + /* We add the record id of this info record to the other records in the + file. */ warc_current_warcinfo_uuid_str = (char *) malloc (48); warc_uuid_str (warc_current_warcinfo_uuid_str); @@ -666,7 +681,8 @@ warc_write_warcinfo_record (char *filename) fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE); fprintf (warc_tmp, "format: WARC File Format 1.0\r\n"); - fprintf (warc_tmp, "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n"); + fprintf (warc_tmp, +"conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n"); fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off")); fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring); /* Add the user headers, if any. */ @@ -683,9 +699,7 @@ warc_write_warcinfo_record (char *filename) warc_write_end_record (); if (! warc_write_ok) - { - logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n")); - } + logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n")); free (filename_copy); free (filename_basename); @@ -695,7 +709,7 @@ warc_write_warcinfo_record (char *filename) /* Opens a new WARC file. If META is true, generates a filename ending with 'meta.warc.gz'. - + This method will: 1. close the current WARC file (if there is one); 2. increment warc_current_file_number; @@ -712,10 +726,9 @@ warc_start_new_file (bool meta) if (warc_current_file != NULL) fclose (warc_current_file); - if (warc_current_warcinfo_uuid_str) - free (warc_current_warcinfo_uuid_str); - if (warc_current_filename) - free (warc_current_filename); + + free (warc_current_warcinfo_uuid_str); + free (warc_current_filename); warc_current_file_number++; @@ -724,17 +737,26 @@ warc_start_new_file (bool meta) char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1); warc_current_filename = new_filename; +#ifdef __VMS +# define WARC_GZ "warc-gz" +#else /* def __VMS */ +# define WARC_GZ "warc.gz" +#endif /* def __VMS [else] */ + #ifdef HAVE_LIBZ - char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc"); + const char *extension = (opt.warc_compression_enabled ? WARC_GZ : "warc"); #else - char *extension = "warc"; + const char *extension = "warc"; #endif /* If max size is enabled, we add a serial number to the file names. */ if (meta) sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension); else if (opt.warc_maxsize > 0) - sprintf (new_filename, "%s-%05d.%s", opt.warc_filename, warc_current_file_number, extension); + { + sprintf (new_filename, "%s-%05d.%s", opt.warc_filename, + warc_current_file_number, extension); + } else sprintf (new_filename, "%s.%s", opt.warc_filename, extension); @@ -744,7 +766,8 @@ warc_start_new_file (bool meta) warc_current_file = fopen (new_filename, "wb+"); if (warc_current_file == NULL) { - logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"), quote (new_filename)); + logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"), + quote (new_filename)); return false; } @@ -760,7 +783,7 @@ warc_start_new_file (bool meta) /* Opens the CDX file for output. */ static bool -warc_start_cdx_file () +warc_start_cdx_file (void) { int filename_length = strlen (opt.warc_filename); char *cdx_filename = alloca (filename_length + 4 + 1); @@ -794,7 +817,8 @@ warc_start_cdx_file () /* Parse the CDX header and find the field numbers of the original url, checksum and record ID fields. */ static bool -warc_parse_cdx_header (char *lineptr, int *field_num_original_url, int *field_num_checksum, int *field_num_record_id) +warc_parse_cdx_header (char *lineptr, int *field_num_original_url, + int *field_num_checksum, int *field_num_record_id) { *field_num_original_url = -1; *field_num_checksum = -1; @@ -803,7 +827,7 @@ warc_parse_cdx_header (char *lineptr, int *field_num_original_url, int *field_nu char *token; char *save_ptr; token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr); - + if (token != NULL && strcmp (token, "CDX") == 0) { int field_num = 0; @@ -836,7 +860,8 @@ warc_parse_cdx_header (char *lineptr, int *field_num_original_url, int *field_nu /* Parse the CDX record and add it to the warc_cdx_dedup_table hash table. */ static void -warc_process_cdx_line (char *lineptr, int field_num_original_url, int field_num_checksum, int field_num_record_id) +warc_process_cdx_line (char *lineptr, int field_num_original_url, + int field_num_checksum, int field_num_record_id) { char *original_url = NULL; char *checksum = NULL; @@ -874,13 +899,15 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url, int field_num_ bytes. */ size_t checksum_l; char * checksum_v; - base32_decode_alloc (checksum, strlen (checksum), &checksum_v, &checksum_l); + base32_decode_alloc (checksum, strlen (checksum), &checksum_v, + &checksum_l); free (checksum); if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE) { /* This is a valid line with a valid checksum. */ - struct warc_cdx_record * rec = malloc (sizeof (struct warc_cdx_record)); + struct warc_cdx_record *rec; + rec = malloc (sizeof (struct warc_cdx_record)); rec->url = original_url; rec->uuid = record_id; memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE); @@ -890,17 +917,22 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url, int field_num_ else { free (original_url); - if (checksum_v != NULL) - free (checksum_v); + free (checksum_v); free (record_id); } } + else + { + xfree_null(checksum); + xfree_null(original_url); + xfree_null(record_id); + } } /* Loads the CDX file from opt.warc_cdx_dedup_filename and fills the warc_cdx_dedup_table. */ -bool -warc_load_cdx_dedup_file () +static bool +warc_load_cdx_dedup_file (void) { FILE *f = fopen (opt.warc_cdx_dedup_filename, "r"); if (f == NULL) @@ -912,7 +944,7 @@ warc_load_cdx_dedup_file () char *lineptr = NULL; size_t n = 0; - size_t line_length; + ssize_t line_length; /* The first line should contain the CDX header. Format: " CDX x x x x x" @@ -921,7 +953,8 @@ warc_load_cdx_dedup_file () 'u' (the WARC record id). */ line_length = getline (&lineptr, &n, f); if (line_length != -1) - warc_parse_cdx_header (lineptr, &field_num_original_url, &field_num_checksum, &field_num_record_id); + warc_parse_cdx_header (lineptr, &field_num_original_url, + &field_num_checksum, &field_num_record_id); /* If the file contains all three fields, read the complete file. */ if (field_num_original_url == -1 @@ -929,22 +962,29 @@ warc_load_cdx_dedup_file () || field_num_record_id == -1) { if (field_num_original_url == -1) - logprintf (LOG_NOTQUIET, _("CDX file does not list original urls. (Missing column 'a'.)\n")); + logprintf (LOG_NOTQUIET, +_("CDX file does not list original urls. (Missing column 'a'.)\n")); if (field_num_checksum == -1) - logprintf (LOG_NOTQUIET, _("CDX file does not list checksums. (Missing column 'k'.)\n")); + logprintf (LOG_NOTQUIET, +_("CDX file does not list checksums. (Missing column 'k'.)\n")); if (field_num_record_id == -1) - logprintf (LOG_NOTQUIET, _("CDX file does not list record ids. (Missing column 'u'.)\n")); + logprintf (LOG_NOTQUIET, +_("CDX file does not list record ids. (Missing column 'u'.)\n")); } else { /* Initialize the table. */ - warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest, warc_cmp_sha1_digest); + warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest, + warc_cmp_sha1_digest); do { line_length = getline (&lineptr, &n, f); if (line_length != -1) - warc_process_cdx_line (lineptr, field_num_original_url, field_num_checksum, field_num_record_id); + { + warc_process_cdx_line (lineptr, field_num_original_url, + field_num_checksum, field_num_record_id); + } } while (line_length != -1); @@ -952,7 +992,8 @@ warc_load_cdx_dedup_file () /* Print results. */ int nrecords = hash_table_count (warc_cdx_dedup_table); logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n", - "Loaded %d records from CDX.\n\n", nrecords), + "Loaded %d records from CDX.\n\n", + nrecords), nrecords); } @@ -972,11 +1013,10 @@ warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload) if (warc_cdx_dedup_table == NULL) return NULL; - char *key; - struct warc_cdx_record *rec_existing; - hash_table_get_pair (warc_cdx_dedup_table, sha1_digest_payload, &key, &rec_existing); + struct warc_cdx_record *rec_existing + = hash_table_get (warc_cdx_dedup_table, sha1_digest_payload); - if (rec_existing != NULL && strcmp (rec_existing->url, url) == 0) + if (rec_existing && strcmp (rec_existing->url, url) == 0) return rec_existing; else return NULL; @@ -985,7 +1025,7 @@ warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload) /* Initializes the WARC writer (if opt.warc_filename is set). This should be called before any WARC record is written. */ void -warc_init () +warc_init (void) { warc_write_ok = true; @@ -1005,7 +1045,8 @@ warc_init () warc_manifest_fp = warc_tempfile (); if (warc_manifest_fp == NULL) { - logprintf (LOG_NOTQUIET, _("Could not open temporary WARC manifest file.\n")); + logprintf (LOG_NOTQUIET, + _("Could not open temporary WARC manifest file.\n")); exit(1); } @@ -1014,7 +1055,8 @@ warc_init () warc_log_fp = warc_tempfile (); if (warc_log_fp == NULL) { - logprintf (LOG_NOTQUIET, _("Could not open temporary WARC log file.\n")); + logprintf (LOG_NOTQUIET, + _("Could not open temporary WARC log file.\n")); exit(1); } log_set_warc_log_fp (warc_log_fp); @@ -1031,7 +1073,8 @@ warc_init () { if (! warc_start_cdx_file ()) { - logprintf (LOG_NOTQUIET, _("Could not open CDX file for output.\n")); + logprintf (LOG_NOTQUIET, + _("Could not open CDX file for output.\n")); exit(1); } } @@ -1039,8 +1082,8 @@ warc_init () } /* Writes metadata (manifest, configuration, log file) to the WARC file. */ -void -warc_write_metadata () +static void +warc_write_metadata (void) { /* If there are multiple WARC files, the metadata should be written to a separate file. */ if (opt.warc_maxsize > 0) @@ -1050,7 +1093,7 @@ warc_write_metadata () warc_uuid_str (manifest_uuid); fflush (warc_manifest_fp); - warc_write_resource_record (manifest_uuid, + warc_write_metadata_record (manifest_uuid, "metadata://gnu.org/software/wget/warc/MANIFEST.txt", NULL, NULL, NULL, "text/plain", warc_manifest_fp, -1); @@ -1065,16 +1108,16 @@ warc_write_metadata () fflush (warc_tmp_fp); fprintf (warc_tmp_fp, "%s\n", program_argstring); - warc_write_resource_record (manifest_uuid, - "metadata://gnu.org/software/wget/warc/wget_arguments.txt", - NULL, NULL, NULL, "text/plain", + warc_write_resource_record (NULL, + "metadata://gnu.org/software/wget/warc/wget_arguments.txt", + NULL, manifest_uuid, NULL, "text/plain", warc_tmp_fp, -1); /* warc_write_resource_record has closed warc_tmp_fp. */ if (warc_log_fp != NULL) { warc_write_resource_record (NULL, - "metadata://gnu.org/software/wget/warc/wget.log", + "metadata://gnu.org/software/wget/warc/wget.log", NULL, manifest_uuid, NULL, "text/plain", warc_log_fp, -1); /* warc_write_resource_record has closed warc_log_fp. */ @@ -1087,7 +1130,7 @@ warc_write_metadata () /* Finishes the WARC writing. This should be called at the end of the program. */ void -warc_close () +warc_close (void) { if (warc_current_file != NULL) { @@ -1108,20 +1151,38 @@ warc_close () The temporary file will be created in opt.warc_tempdir. Returns the pointer to the temporary file, or NULL. */ FILE * -warc_tempfile () +warc_tempfile (void) { char filename[100]; if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1) return NULL; - int fd = mkstemp (filename); +#ifdef __VMS + /* 2013-07-12 SMS. + * mkostemp()+unlink()+fdopen() scheme causes trouble on VMS, so use + * mktemp() to uniquify the (VMS-style) name, and then use a normal + * fopen() with a "create temp file marked for delete" option. + */ + { + char *tfn; + + tfn = mktemp (filename); /* Get unique name from template. */ + if (tfn == NULL) + return NULL; + return fopen (tfn, "w+", "fop=tmd"); /* Create auto-delete temp file. */ + } +#else /* def __VMS */ + int fd = mkostemp (filename, O_TEMPORARY); if (fd < 0) return NULL; +#if !O_TEMPORARY if (unlink (filename) < 0) return NULL; +#endif return fdopen (fd, "wb+"); +#endif /* def __VMS [else] */ } @@ -1134,7 +1195,8 @@ warc_tempfile () Calling this function will close body. Returns true on success, false on error. */ bool -warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, ip_address *ip, FILE *body, off_t payload_offset) +warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, + ip_address *ip, FILE *body, off_t payload_offset) { warc_write_start_record (); warc_write_header ("WARC-Type", "request"); @@ -1147,7 +1209,7 @@ warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, ip warc_write_digest_headers (body, payload_offset); warc_write_block_from_file (body); warc_write_end_record (); - + fclose (body); return warc_write_ok; @@ -1166,7 +1228,11 @@ warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, ip response_uuid is the uuid of the response. Returns true on success, false on error. */ static bool -warc_write_cdx_record (char *url, char *timestamp_str, char *mime_type, int response_code, char *payload_digest, char *redirect_location, off_t offset, char *warc_filename, char *response_uuid) +warc_write_cdx_record (const char *url, const char *timestamp_str, + const char *mime_type, int response_code, + const char *payload_digest, const char *redirect_location, + off_t offset, const char *warc_filename _GL_UNUSED, + const char *response_uuid) { /* Transform the timestamp. */ char timestamp_str_cdx [15]; @@ -1177,9 +1243,9 @@ warc_write_cdx_record (char *url, char *timestamp_str, char *mime_type, int resp memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM" ":" */ memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS" "Z" */ timestamp_str_cdx[14] = '\0'; - + /* Rewrite the checksum. */ - char *checksum; + const char *checksum; if (payload_digest != NULL) checksum = payload_digest + 5; /* Skip the "sha1:" */ else @@ -1190,8 +1256,14 @@ warc_write_cdx_record (char *url, char *timestamp_str, char *mime_type, int resp if (redirect_location == NULL || strlen(redirect_location) == 0) redirect_location = "-"; + char offset_string[MAX_INT_TO_STRING_LEN(off_t)]; + number_to_string (offset_string, offset); + /* Print the CDX line. */ - fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %ld %s %s\n", url, timestamp_str_cdx, url, mime_type, response_code, checksum, redirect_location, offset, warc_current_filename, response_uuid); + fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %s %s %s\n", url, + timestamp_str_cdx, url, mime_type, response_code, checksum, + redirect_location, offset_string, warc_current_filename, + response_uuid); fflush (warc_current_cdx_file); return true; @@ -1211,7 +1283,9 @@ warc_write_cdx_record (char *url, char *timestamp_str, char *mime_type, int resp Calling this function will close body. Returns true on success, false on error. */ static bool -warc_write_revisit_record (char *url, char *timestamp_str, char *concurrent_to_uuid, char *payload_digest, char *refers_to, ip_address *ip, FILE *body) +warc_write_revisit_record (char *url, char *timestamp_str, + char *concurrent_to_uuid, char *payload_digest, + char *refers_to, ip_address *ip, FILE *body) { char revisit_uuid [48]; warc_uuid_str (revisit_uuid); @@ -1237,7 +1311,7 @@ warc_write_revisit_record (char *url, char *timestamp_str, char *concurrent_to_u warc_write_header ("WARC-Payload-Digest", payload_digest); warc_write_block_from_file (body); warc_write_end_record (); - + fclose (body); free (block_digest); @@ -1258,7 +1332,10 @@ warc_write_revisit_record (char *url, char *timestamp_str, char *concurrent_to_u Calling this function will close body. Returns true on success, false on error. */ bool -warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, off_t payload_offset, char *mime_type, int response_code, char *redirect_location) +warc_write_response_record (char *url, char *timestamp_str, + char *concurrent_to_uuid, ip_address *ip, + FILE *body, off_t payload_offset, char *mime_type, + int response_code, char *redirect_location) { char *block_digest = NULL; char *payload_digest = NULL; @@ -1269,15 +1346,20 @@ warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_ { /* Calculate the block and payload digests. */ rewind (body); - if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload, payload_offset) == 0) + if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload, + payload_offset) == 0) { /* Decide (based on url + payload digest) if we have seen this data before. */ - struct warc_cdx_record *rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload); + struct warc_cdx_record *rec_existing; + rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload); if (rec_existing != NULL) { + bool result; + /* Found an existing record. */ - logprintf (LOG_VERBOSE, _("Found exact match in CDX file. Saving revisit record to WARC.\n")); + logprintf (LOG_VERBOSE, + _("Found exact match in CDX file. Saving revisit record to WARC.\n")); /* Remove the payload from the file. */ if (payload_offset > 0) @@ -1288,7 +1370,9 @@ warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_ /* Send the original payload digest. */ payload_digest = warc_base32_sha1_digest (sha1_res_payload); - bool result = warc_write_revisit_record (url, timestamp_str, concurrent_to_uuid, payload_digest, rec_existing->uuid, ip, body); + result = warc_write_revisit_record (url, timestamp_str, + concurrent_to_uuid, payload_digest, rec_existing->uuid, + ip, body); free (payload_digest); return result; @@ -1326,30 +1410,35 @@ warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_ if (warc_write_ok && opt.warc_cdx_enabled) { /* Add this record to the CDX. */ - warc_write_cdx_record (url, timestamp_str, mime_type, response_code, payload_digest, redirect_location, offset, warc_current_filename, response_uuid); + warc_write_cdx_record (url, timestamp_str, mime_type, response_code, + payload_digest, redirect_location, offset, warc_current_filename, + response_uuid); } - if (block_digest) - free (block_digest); - if (payload_digest) - free (payload_digest); + free (block_digest); + free (payload_digest); return warc_write_ok; } -/* Writes a resource record to the WARC file. +/* Writes a resource or metadata record to the WARC file. + warc_type is either "resource" or "metadata", resource_uuid is the uuid of the resource (or NULL), url is the target uri of the resource, timestamp_str is the timestamp (generated with warc_timestamp), - concurrent_to_uuid is the uuid of the request for that generated this resource - (generated with warc_uuid_str) or NULL, + concurrent_to_uuid is the uuid of the record that generated this, + resource (generated with warc_uuid_str) or NULL, ip is the ip address of the server (or NULL), content_type is the mime type of the body (or NULL), body is a pointer to a file containing the resource data. Calling this function will close body. Returns true on success, false on error. */ -bool -warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, off_t payload_offset) +static bool +warc_write_record (const char *record_type, char *resource_uuid, + const char *url, const char *timestamp_str, + const char *concurrent_to_uuid, + ip_address *ip, const char *content_type, FILE *body, + off_t payload_offset) { if (resource_uuid == NULL) { @@ -1361,7 +1450,7 @@ warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, content_type = "application/octet-stream"; warc_write_start_record (); - warc_write_header ("WARC-Type", "resource"); + warc_write_header ("WARC-Type", record_type); warc_write_header ("WARC-Record-ID", resource_uuid); warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str); warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid); @@ -1372,9 +1461,52 @@ warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, warc_write_header ("Content-Type", content_type); warc_write_block_from_file (body); warc_write_end_record (); - + fclose (body); return warc_write_ok; } +/* Writes a resource record to the WARC file. + resource_uuid is the uuid of the resource (or NULL), + url is the target uri of the resource, + timestamp_str is the timestamp (generated with warc_timestamp), + concurrent_to_uuid is the uuid of the record that generated this, + resource (generated with warc_uuid_str) or NULL, + ip is the ip address of the server (or NULL), + content_type is the mime type of the body (or NULL), + body is a pointer to a file containing the resource data. + Calling this function will close body. + Returns true on success, false on error. */ +bool +warc_write_resource_record (char *resource_uuid, const char *url, + const char *timestamp_str, const char *concurrent_to_uuid, + ip_address *ip, const char *content_type, FILE *body, + off_t payload_offset) +{ + return warc_write_record ("resource", + resource_uuid, url, timestamp_str, concurrent_to_uuid, + ip, content_type, body, payload_offset); +} + +/* Writes a metadata record to the WARC file. + record_uuid is the uuid of the record (or NULL), + url is the target uri of the record, + timestamp_str is the timestamp (generated with warc_timestamp), + concurrent_to_uuid is the uuid of the record that generated this, + record (generated with warc_uuid_str) or NULL, + ip is the ip address of the server (or NULL), + content_type is the mime type of the body (or NULL), + body is a pointer to a file containing the record data. + Calling this function will close body. + Returns true on success, false on error. */ +bool +warc_write_metadata_record (char *record_uuid, const char *url, + const char *timestamp_str, const char *concurrent_to_uuid, + ip_address *ip, const char *content_type, FILE *body, + off_t payload_offset) +{ + return warc_write_record ("metadata", + record_uuid, url, timestamp_str, concurrent_to_uuid, + ip, content_type, body, payload_offset); +}