X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Fwarc.c;h=38ef3a1b0172f733d17da6edbf79c9dde8185c93;hp=92a49ef8a4ec98eef2bba955e8f4007212ca2462;hb=38a7829dcb4eb5dba28dbf0f05c6a80fea9217f8;hpb=1d14c18d7f81c03fdf79358055ed909c6d65caa1 diff --git a/src/warc.c b/src/warc.c index 92a49ef8..38ef3a1b 100644 --- a/src/warc.c +++ b/src/warc.c @@ -51,10 +51,16 @@ as that of the covered work. */ #ifndef WINDOWS #include +#else +#include #endif #include "warc.h" +#ifndef O_TEMPORARY +#define O_TEMPORARY 0 +#endif + extern char *version_string; /* Set by main in main.c */ @@ -102,7 +108,7 @@ static char *warc_current_filename; static int warc_current_file_number; /* The table of CDX records, if deduplication is enabled. */ -struct hash_table * warc_cdx_dedup_table; +static struct hash_table * warc_cdx_dedup_table; static bool warc_start_new_file (bool meta); @@ -165,7 +171,7 @@ warc_write_string (const char *str) } -#define EXTRA_GZIP_HEADER_SIZE 12 +#define EXTRA_GZIP_HEADER_SIZE 14 #define GZIP_STATIC_HEADER_SIZE 10 #define FLG_FEXTRA 0x04 #define OFF_FLG 3 @@ -200,7 +206,7 @@ warc_write_start_record (void) In warc_write_end_record we will fill this space with information about the uncompressed and compressed size of the record. */ - fprintf (warc_current_file, "XXXXXXXXXXXX"); + fseek (warc_current_file, EXTRA_GZIP_HEADER_SIZE, SEEK_CUR); fflush (warc_current_file); /* Start a new GZIP stream. */ @@ -245,15 +251,10 @@ static bool warc_write_block_from_file (FILE *data_in) { /* Add the Content-Length header. */ - char *content_length; + char content_length[MAX_INT_TO_STRING_LEN(off_t)]; fseeko (data_in, 0L, SEEK_END); - if (! asprintf (&content_length, "%ld", ftello (data_in))) - { - warc_write_ok = false; - return false; - } + number_to_string (content_length, ftello (data_in)); warc_write_header ("Content-Length", content_length); - free (content_length); /* End of the WARC header section. */ warc_write_string ("\r\n"); @@ -347,16 +348,19 @@ warc_write_end_record (void) /* The extra header field identifier for the WARC skip length. */ extra_header[2] = 's'; extra_header[3] = 'l'; + /* The size of the field value (8 bytes). */ + extra_header[4] = (8 & 255); + extra_header[5] = ((8 >> 8) & 255); /* The size of the uncompressed record. */ - extra_header[4] = (uncompressed_size & 255); - extra_header[5] = (uncompressed_size >> 8) & 255; - extra_header[6] = (uncompressed_size >> 16) & 255; - extra_header[7] = (uncompressed_size >> 24) & 255; + extra_header[6] = (uncompressed_size & 255); + extra_header[7] = (uncompressed_size >> 8) & 255; + extra_header[8] = (uncompressed_size >> 16) & 255; + extra_header[9] = (uncompressed_size >> 24) & 255; /* The size of the compressed record. */ - extra_header[8] = (compressed_size & 255); - extra_header[9] = (compressed_size >> 8) & 255; - extra_header[10] = (compressed_size >> 16) & 255; - extra_header[11] = (compressed_size >> 24) & 255; + extra_header[10] = (compressed_size & 255); + extra_header[11] = (compressed_size >> 8) & 255; + extra_header[12] = (compressed_size >> 16) & 255; + extra_header[13] = (compressed_size >> 24) & 255; /* Write the extra header after the static header. */ fseeko (warc_current_file, warc_current_gzfile_offset @@ -532,7 +536,7 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, static char * warc_base32_sha1_digest (char *sha1_digest) { - // length: "sha1:" + digest + "\0" + /* length: "sha1:" + digest + "\0" */ char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 ); base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5, BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1); @@ -722,10 +726,9 @@ warc_start_new_file (bool meta) if (warc_current_file != NULL) fclose (warc_current_file); - if (warc_current_warcinfo_uuid_str) - free (warc_current_warcinfo_uuid_str); - if (warc_current_filename) - free (warc_current_filename); + + free (warc_current_warcinfo_uuid_str); + free (warc_current_filename); warc_current_file_number++; @@ -734,8 +737,14 @@ warc_start_new_file (bool meta) char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1); warc_current_filename = new_filename; +#ifdef __VMS +# define WARC_GZ "warc-gz" +#else /* def __VMS */ +# define WARC_GZ "warc.gz" +#endif /* def __VMS [else] */ + #ifdef HAVE_LIBZ - const char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc"); + const char *extension = (opt.warc_compression_enabled ? WARC_GZ : "warc"); #else const char *extension = "warc"; #endif @@ -908,11 +917,16 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url, else { free (original_url); - if (checksum_v != NULL) - free (checksum_v); + free (checksum_v); free (record_id); } } + else + { + xfree_null(checksum); + xfree_null(original_url); + xfree_null(record_id); + } } /* Loads the CDX file from opt.warc_cdx_dedup_filename and fills @@ -999,12 +1013,10 @@ warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload) if (warc_cdx_dedup_table == NULL) return NULL; - char *key; - struct warc_cdx_record *rec_existing; - int found = hash_table_get_pair (warc_cdx_dedup_table, sha1_digest_payload, - &key, &rec_existing); + struct warc_cdx_record *rec_existing + = hash_table_get (warc_cdx_dedup_table, sha1_digest_payload); - if (found && strcmp (rec_existing->url, url) == 0) + if (rec_existing && strcmp (rec_existing->url, url) == 0) return rec_existing; else return NULL; @@ -1081,7 +1093,7 @@ warc_write_metadata (void) warc_uuid_str (manifest_uuid); fflush (warc_manifest_fp); - warc_write_resource_record (manifest_uuid, + warc_write_metadata_record (manifest_uuid, "metadata://gnu.org/software/wget/warc/MANIFEST.txt", NULL, NULL, NULL, "text/plain", warc_manifest_fp, -1); @@ -1096,9 +1108,9 @@ warc_write_metadata (void) fflush (warc_tmp_fp); fprintf (warc_tmp_fp, "%s\n", program_argstring); - warc_write_resource_record (manifest_uuid, + warc_write_resource_record (NULL, "metadata://gnu.org/software/wget/warc/wget_arguments.txt", - NULL, NULL, NULL, "text/plain", + NULL, manifest_uuid, NULL, "text/plain", warc_tmp_fp, -1); /* warc_write_resource_record has closed warc_tmp_fp. */ @@ -1145,14 +1157,32 @@ warc_tempfile (void) if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1) return NULL; - int fd = mkstemp (filename); +#ifdef __VMS + /* 2013-07-12 SMS. + * mkostemp()+unlink()+fdopen() scheme causes trouble on VMS, so use + * mktemp() to uniquify the (VMS-style) name, and then use a normal + * fopen() with a "create temp file marked for delete" option. + */ + { + char *tfn; + + tfn = mktemp (filename); /* Get unique name from template. */ + if (tfn == NULL) + return NULL; + return fopen (tfn, "w+", "fop=tmd"); /* Create auto-delete temp file. */ + } +#else /* def __VMS */ + int fd = mkostemp (filename, O_TEMPORARY); if (fd < 0) return NULL; +#if !O_TEMPORARY if (unlink (filename) < 0) return NULL; +#endif return fdopen (fd, "wb+"); +#endif /* def __VMS [else] */ } @@ -1201,7 +1231,7 @@ static bool warc_write_cdx_record (const char *url, const char *timestamp_str, const char *mime_type, int response_code, const char *payload_digest, const char *redirect_location, - off_t offset, const char *warc_filename, + off_t offset, const char *warc_filename _GL_UNUSED, const char *response_uuid) { /* Transform the timestamp. */ @@ -1226,10 +1256,14 @@ warc_write_cdx_record (const char *url, const char *timestamp_str, if (redirect_location == NULL || strlen(redirect_location) == 0) redirect_location = "-"; + char offset_string[MAX_INT_TO_STRING_LEN(off_t)]; + number_to_string (offset_string, offset); + /* Print the CDX line. */ - fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %ld %s %s\n", url, + fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %s %s %s\n", url, timestamp_str_cdx, url, mime_type, response_code, checksum, - redirect_location, offset, warc_current_filename, response_uuid); + redirect_location, offset_string, warc_current_filename, + response_uuid); fflush (warc_current_cdx_file); return true; @@ -1381,28 +1415,28 @@ warc_write_response_record (char *url, char *timestamp_str, response_uuid); } - if (block_digest) - free (block_digest); - if (payload_digest) - free (payload_digest); + free (block_digest); + free (payload_digest); return warc_write_ok; } -/* Writes a resource record to the WARC file. +/* Writes a resource or metadata record to the WARC file. + warc_type is either "resource" or "metadata", resource_uuid is the uuid of the resource (or NULL), url is the target uri of the resource, timestamp_str is the timestamp (generated with warc_timestamp), - concurrent_to_uuid is the uuid of the request for that generated this + concurrent_to_uuid is the uuid of the record that generated this, resource (generated with warc_uuid_str) or NULL, ip is the ip address of the server (or NULL), content_type is the mime type of the body (or NULL), body is a pointer to a file containing the resource data. Calling this function will close body. Returns true on success, false on error. */ -bool -warc_write_resource_record (char *resource_uuid, const char *url, - const char *timestamp_str, const char *concurrent_to_uuid, +static bool +warc_write_record (const char *record_type, char *resource_uuid, + const char *url, const char *timestamp_str, + const char *concurrent_to_uuid, ip_address *ip, const char *content_type, FILE *body, off_t payload_offset) { @@ -1416,7 +1450,7 @@ warc_write_resource_record (char *resource_uuid, const char *url, content_type = "application/octet-stream"; warc_write_start_record (); - warc_write_header ("WARC-Type", "resource"); + warc_write_header ("WARC-Type", record_type); warc_write_header ("WARC-Record-ID", resource_uuid); warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str); warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid); @@ -1432,3 +1466,47 @@ warc_write_resource_record (char *resource_uuid, const char *url, return warc_write_ok; } + +/* Writes a resource record to the WARC file. + resource_uuid is the uuid of the resource (or NULL), + url is the target uri of the resource, + timestamp_str is the timestamp (generated with warc_timestamp), + concurrent_to_uuid is the uuid of the record that generated this, + resource (generated with warc_uuid_str) or NULL, + ip is the ip address of the server (or NULL), + content_type is the mime type of the body (or NULL), + body is a pointer to a file containing the resource data. + Calling this function will close body. + Returns true on success, false on error. */ +bool +warc_write_resource_record (char *resource_uuid, const char *url, + const char *timestamp_str, const char *concurrent_to_uuid, + ip_address *ip, const char *content_type, FILE *body, + off_t payload_offset) +{ + return warc_write_record ("resource", + resource_uuid, url, timestamp_str, concurrent_to_uuid, + ip, content_type, body, payload_offset); +} + +/* Writes a metadata record to the WARC file. + record_uuid is the uuid of the record (or NULL), + url is the target uri of the record, + timestamp_str is the timestamp (generated with warc_timestamp), + concurrent_to_uuid is the uuid of the record that generated this, + record (generated with warc_uuid_str) or NULL, + ip is the ip address of the server (or NULL), + content_type is the mime type of the body (or NULL), + body is a pointer to a file containing the record data. + Calling this function will close body. + Returns true on success, false on error. */ +bool +warc_write_metadata_record (char *record_uuid, const char *url, + const char *timestamp_str, const char *concurrent_to_uuid, + ip_address *ip, const char *content_type, FILE *body, + off_t payload_offset) +{ + return warc_write_record ("metadata", + record_uuid, url, timestamp_str, concurrent_to_uuid, + ip, content_type, body, payload_offset); +}