/* Conversion of links to local files.
   Copyright (C) 2003-2005 Free Software Foundation, Inc.

   This file is part of GNU Wget.

   GNU Wget is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   GNU Wget is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with Wget; if not, write to the Free Software Foundation, Inc.,
   51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.

   In addition, as a special exception, the Free Software Foundation
   gives permission to link the code of its release of Wget with the
   OpenSSL project's "OpenSSL" library (or with modified versions of it
   that use the same license as the "OpenSSL" library), and distribute
   the linked executables.  You must obey the GNU General Public License
   in all respects for all of the code used other than "OpenSSL".  If you
   modify this file, you may extend this exception to your version of the
   file, but you are not obligated to do so.  If you do not wish to do
   so, delete this exception statement from your version.  */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
static struct hash_table *dl_file_url_map;
struct hash_table *dl_url_file_map;

/* Set of HTML files downloaded in this Wget run, used for link
   conversion after Wget is done.  */
struct hash_table *downloaded_html_set;

static struct hash_table *nonexisting_urls_hash;

static void convert_links (const char *, struct urlpos *);
/* This function is called when the retrieval is done to convert the
   links that have been downloaded.  It has to be called at the end of
   the retrieval, because only then does Wget know conclusively which
   URLs have been downloaded, and which not, so it can tell which
   direction to convert to.

   The "direction" means that the URLs to the files that have been
   downloaded get converted to the relative URL which will point to
   that file.  And the other URLs get converted to the remote URL on
   the server.

   All the downloaded HTML files are kept in downloaded_html_set, and
   downloaded URLs in dl_url_file_map.  All the information is
   extracted from these two structures.  */
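
/* Illustrative example (not part of the original source; the file
   names are hypothetical): suppose "http://host/a/page.html" was
   saved as "host/a/page.html" in this run, while
   "http://host/a/other.html" was not fetched.  A link to the first,
   found in another downloaded page in the same directory, is
   rewritten as the relative reference "page.html"
   (CO_CONVERT_TO_RELATIVE); a link to the second is expanded to the
   full remote URL "http://host/a/other.html" (CO_CONVERT_TO_COMPLETE)
   so it keeps working when the mirror is browsed locally.  */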
convert_all_links (void)
  struct ptimer *timer = ptimer_new ();

  if (downloaded_html_set)
    cnt = hash_table_count (downloaded_html_set);
  file_array = alloca_array (char *, cnt);
  string_set_to_array (downloaded_html_set, file_array);

  for (i = 0; i < cnt; i++)
      struct urlpos *urls, *cur_url;

      char *file = file_array[i];
      /* Determine the URL of the HTML file.  get_urls_html will need
         it.  */
      url = hash_table_get (dl_file_url_map, file);
          DEBUGP (("Apparently %s has been removed.\n", file));

      DEBUGP (("Scanning %s (from %s)\n", file, url));

      /* Parse the HTML file...  */
      urls = get_urls_html (file, url, NULL);

      /* We don't respect meta_disallow_follow here because, even if
         the file is not followed, we might still want to convert the
         links that have been followed from other files.  */

      for (cur_url = urls; cur_url; cur_url = cur_url->next)
          struct url *u = cur_url->url;

          if (cur_url->link_base_p)
              /* Base references have been resolved by our parser, so
                 we turn the base URL into an empty string.  (Perhaps
                 we should remove the tag entirely?)  */
              cur_url->convert = CO_NULLIFY_BASE;

          /* We decide the direction of conversion according to whether
             a URL was downloaded.  Downloaded URLs will be converted
             ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
          local_name = hash_table_get (dl_url_file_map, u->url);

          /* Decide on the conversion type.  */
              /* We've downloaded this URL.  Convert it to relative
                 form.  We do this even if the URL already is in
                 relative form, because our directory structure may
                 not be identical to that on the server (think `-nd',
                 `--cut-dirs', etc.)  */
              cur_url->convert = CO_CONVERT_TO_RELATIVE;
              cur_url->local_name = xstrdup (local_name);
              DEBUGP (("will convert url %s to local %s\n", u->url, local_name));

              /* We haven't downloaded this URL.  If it's not already
                 complete (including a full host name), convert it to
                 that form, so it can be reached while browsing this
                 HTML locally.  */
              if (!cur_url->link_complete_p)
                cur_url->convert = CO_CONVERT_TO_COMPLETE;
              cur_url->local_name = NULL;
              DEBUGP (("will convert url %s to complete\n", u->url));

      /* Convert the links in the file.  */
      convert_links (file, urls);

  secs = ptimer_measure (timer);
  ptimer_destroy (timer);
  logprintf (LOG_VERBOSE, _("Converted %d files in %s seconds.\n"),
             file_count, print_decimal (secs));
static void write_backup_file (const char *, downloaded_file_t);
static const char *replace_attr (const char *, int, FILE *, const char *);
static const char *replace_attr_refresh_hack (const char *, int, FILE *,
                                              const char *, int);
static char *local_quote_string (const char *);
static char *construct_relative (const char *, const char *);
/* Change the links in one HTML file.  LINKS is a list of links in the
   document, along with their positions and the desired direction of
   the conversion.  */
convert_links (const char *file, struct urlpos *links)
  struct file_memory *fm;
  downloaded_file_t downloaded_file_return;

  int to_url_count = 0, to_file_count = 0;

  logprintf (LOG_VERBOSE, _("Converting %s... "), file);

  /* First we do a "dry run": go through the list L and see whether
     any URL needs to be converted in the first place.  If not, just
     leave the file alone.  */
  for (dry = links; dry; dry = dry->next)
    if (dry->convert != CO_NOCONVERT)

      logputs (LOG_VERBOSE, _("nothing to do.\n"));

  fm = read_file (file);
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));

  downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);
  if (opt.backup_converted && downloaded_file_return)
    write_backup_file (file, downloaded_file_return);

  /* Before opening the file for writing, unlink the file.  This is
     important if the data in FM is mmaped.  In such case, nulling the
     file, which is what fopen() below does, would make us read all
     zeroes from the mmaped region.  */
  if (unlink (file) < 0 && errno != ENOENT)
      logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),
                 file, strerror (errno));

  /* Now open the file for writing.  */
  fp = fopen (file, "wb");
      logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),
                 file, strerror (errno));

  /* Here we loop through all the URLs in file, replacing those of
     them that are downloaded with relative references.  */
  for (link = links; link; link = link->next)
      char *url_start = fm->content + link->pos;

      if (link->pos >= fm->length)
          DEBUGP (("Something strange is going on.  Please investigate."));

      /* If the URL is not to be converted, skip it.  */
      if (link->convert == CO_NOCONVERT)
          DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));

      /* Echo the file contents, up to the offending URL's opening
         quote, to the outfile.  */
      fwrite (p, 1, url_start - p, fp);

      switch (link->convert)
        case CO_CONVERT_TO_RELATIVE:
          /* Convert absolute URL to relative.  */
            char *newname = construct_relative (file, link->local_name);
            char *quoted_newname = local_quote_string (newname);

            if (!link->link_refresh_p)
              p = replace_attr (p, link->size, fp, quoted_newname);
            else
              p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
                                             link->refresh_timeout);

            DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
                     link->url->url, newname, link->pos, file));

            xfree (quoted_newname);

        case CO_CONVERT_TO_COMPLETE:
          /* Convert the link to absolute URL.  */
            char *newlink = link->url->url;
            char *quoted_newlink = html_quote_string (newlink);

            if (!link->link_refresh_p)
              p = replace_attr (p, link->size, fp, quoted_newlink);
            else
              p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
                                             link->refresh_timeout);

            DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
                     newlink, link->pos, file));
            xfree (quoted_newlink);

        case CO_NULLIFY_BASE:
          /* Change the base href to "".  */
          p = replace_attr (p, link->size, fp, "");
  /* Output the rest of the file.  */
  if (p - fm->content < fm->length)
    fwrite (p, 1, fm->length - (p - fm->content), fp);

  logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
/* Construct and return a link that points from BASEFILE to LINKFILE.
   Both files should be local file names, BASEFILE of the referring
   file, and LINKFILE of the referred file.  Examples:

   cr("foo", "bar")         -> "bar"
   cr("A/foo", "A/bar")     -> "bar"
   cr("A/foo", "A/B/bar")   -> "B/bar"
   cr("A/X/foo", "A/Y/bar") -> "../Y/bar"
   cr("X/", "Y/bar")        -> "../Y/bar" (trailing slash does matter in BASE)

   Both files should be absolute or relative, otherwise strange
   results might ensue.  The function makes no special efforts to
   handle "." and ".." in links, so make sure they're not there
   (e.g. using path_simplify).  */
construct_relative (const char *basefile, const char *linkfile)
  /* First, skip the initial directory components common to both
     files.  */
  for (b = basefile, l = linkfile; *b == *l && *b != '\0'; ++b, ++l)
        start = (b - basefile) + 1;

  /* With common directories out of the way, the situation we have is
     as follows:

       b - b1/b2/[...]/bfile
       l - l1/l2/[...]/lfile

     The link we're constructing needs to be:

       lnk - ../../l1/l2/[...]/lfile

     Where the number of ".."'s equals the number of bN directory
     names in B.  */

  /* Count the directory components in B.  */
  for (b = basefile; *b; b++)

  /* Construct LINK as explained above.  */
  link = xmalloc (3 * basedirs + strlen (linkfile) + 1);
  for (i = 0; i < basedirs; i++)
    memcpy (link + 3 * i, "../", 3);
  strcpy (link + 3 * i, linkfile);
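
/* Worked example (added for illustration, tracing the code above):
   with BASEFILE "A/X/foo" and LINKFILE "A/Y/bar", the common prefix
   "A/" is skipped, leaving "X/foo" and "Y/bar".  The remaining part
   of BASEFILE contains one '/', so basedirs is 1, one "../" is
   prepended, and the result is "../Y/bar" -- the fourth example in
   the comment above.  */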
/* Used by write_backup_file to remember which files have been
   written.  */
static struct hash_table *converted_files;
write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
  /* Rather than just writing over the original .html file with the
     converted version, save the former to *.orig.  Note we only do
     this for files we've _successfully_ downloaded, so we don't
     clobber .orig files sitting around from previous invocations.  */

  /* Construct the backup filename as the original name plus ".orig".  */
  size_t filename_len = strlen (file);
  char *filename_plus_orig_suffix;

  if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
      /* Just write "orig" over "html".  We need to do it this way
         because when we're checking to see if we've downloaded the
         file before (to see if we can skip downloading it), we don't
         know if it's a text/html file.  Therefore we don't know yet
         at that stage that -E is going to cause us to tack on
         ".html", so we need to compare vs. the original URL plus
         ".orig", not the original URL plus ".html.orig".  */
      filename_plus_orig_suffix = alloca (filename_len + 1);
      strcpy (filename_plus_orig_suffix, file);
      strcpy ((filename_plus_orig_suffix + filename_len) - 4, "orig");
  else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
      /* Append ".orig" to the name.  */
      filename_plus_orig_suffix = alloca (filename_len + sizeof (".orig"));
      strcpy (filename_plus_orig_suffix, file);
      strcpy (filename_plus_orig_suffix + filename_len, ".orig");
  if (!converted_files)
    converted_files = make_string_hash_table (0);

  /* We can get called twice on the same URL thanks to the
     convert_all_links() call in main().  If we write the .orig file
     each time in such a case, it'll end up containing the first-pass
     conversion, not the original file.  So, see if we've already been
     called on this file.  */
  if (!string_set_contains (converted_files, file))
      /* Rename <file> to <file>.orig before former gets written over.  */
      if (rename (file, filename_plus_orig_suffix) != 0)
        logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
                   file, filename_plus_orig_suffix, strerror (errno));

  /* Remember that we've already written a .orig backup for this file.
     Note that we never free this memory since we need it till the
     convert_all_links() call, which is one of the last things the
     program does before terminating.  BTW, I'm not sure if it would be
     safe to just set 'converted_file_ptr->string' to 'file' below,
     rather than making a copy of the string...  Another note is that I
     thought I could just add a field to the urlpos structure saying
     that we'd written a .orig file for this URL, but that didn't work,
     so I had to make this separate list.
     -- Dan Harkless <wget@harkless.org>

     This [adding a field to the urlpos structure] didn't work
     because convert_file() is called from convert_all_links at
     the end of the retrieval with a freshly built new urlpos
     list.
     -- Hrvoje Niksic <hniksic@xemacs.org>  */
  string_set_add (converted_files, file);
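
/* Illustrative example (not from the original source; file names are
   hypothetical): for a file saved as "page.html" only because -E
   appended ".html", the backup becomes "page.orig" ("orig" written
   over "html" in place); for a file downloaded normally as
   "page.txt", the backup becomes "page.txt.orig".  */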
static bool find_fragment (const char *, int, const char **, const char **);

/* Replace an attribute's original text with NEW_TEXT.  */

replace_attr (const char *p, int size, FILE *fp, const char *new_text)
  bool quote_flag = false;
  char quote_char = '\"';       /* use "..." for quoting, unless the
                                   original value is quoted, in which
                                   case reuse its quoting char.  */
  const char *frag_beg, *frag_end;
  /* Structure of our string is:
       "...old-contents..."
       <---    size    --->  (with quotes)
     OR:
       ...old-contents...
       <---    size   -->    (no quotes)   */
  if (*p == '\"' || *p == '\'')
      size -= 2;                /* disregard opening and closing quote */

  putc (quote_char, fp);
  fputs (new_text, fp);

  /* Look for fragment identifier, if any.  */
  if (find_fragment (p, size, &frag_beg, &frag_end))
    fwrite (frag_beg, 1, frag_end - frag_beg, fp);
  putc (quote_char, fp);
/* The same as REPLACE_ATTR, but used when replacing
   <meta http-equiv=refresh content="new_text"> because we need to
   append "timeout_value; URL=" before the new_text.  */

replace_attr_refresh_hack (const char *p, int size, FILE *fp,
                           const char *new_text, int timeout)
  char *new_with_timeout = (char *)alloca (numdigit (timeout)
                                           + 6 /* "; URL=" */
                                           + strlen (new_text)
                                           + 1);
  sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);

  return replace_attr (p, size, fp, new_with_timeout);
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
   preceded by '&'.  If the character is not found, return false.  If
   the character is found, return true and set BP and EP to point to
   the beginning and end of the region.

   This is used for finding the fragment identifiers in URLs.  */
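
/* Illustrative example (not part of the original source): in the
   attribute value "page.html#section2", BP would point at '#' and EP
   at the end of the string, so the "#section2" fragment is appended
   unchanged after the converted link.  The "not preceded by '&'" rule
   keeps numeric character references such as "&#37;" inside a URL
   from being mistaken for a fragment.  */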
find_fragment (const char *beg, int size, const char **bp, const char **ep)
  const char *end = beg + size;
  bool saw_amp = false;

  for (; beg < end; beg++)
/* Quote FILE for use as local reference to an HTML file.

   We quote ? as %3F to avoid passing part of the file name as the
   parameter when browsing the converted file through HTTP.  However,
   it is safe to do this only when `--html-extension' is turned on.
   This is because converting "index.html?foo=bar" to
   "index.html%3Ffoo=bar" would break local browsing, as the latter
   isn't even recognized as an HTML file!  However, converting
   "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
   safe for both local and HTTP-served browsing.

   We always quote "#" as "%23" and "%" as "%25" because those
   characters have special meanings in URLs.  */
local_quote_string (const char *file)
  char *any = strpbrk (file, "?#%");
  if (!any)
    return html_quote_string (file);

  /* Allocate space assuming the worst-case scenario, each character
     having to be quoted.  */
  to = newname = (char *)alloca (3 * strlen (file) + 1);
  for (from = file; *from; from++)
        if (opt.html_extension)

  return html_quote_string (newname);
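
/* Illustrative example (not part of the original source; file names
   are hypothetical): with `--html-extension' in effect, a local name
   like "index.html?foo=bar#top" comes out as
   "index.html%3Ffoo=bar%23top"; without it, '?' is left alone and
   only '#' and '%' are quoted, so "a%b#c" becomes "a%25b%23c".  */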
/* Book-keeping code for dl_file_url_map, dl_url_file_map,
   downloaded_html_list, and downloaded_html_set.  Other code calls
   these functions to let us know that a file has been downloaded.  */

#define ENSURE_TABLES_EXIST do {                        \
  if (!dl_file_url_map)                                 \
    dl_file_url_map = make_string_hash_table (0);       \
  if (!dl_url_file_map)                                 \
    dl_url_file_map = make_string_hash_table (0);       \
} while (0)
/* Return true if S1 and S2 are the same, except for "/index.html".
   The cases in which it returns true are (substitute any substring
   for "foo"):

   m("foo/index.html", "foo/")  ==> 1
   m("foo/", "foo/index.html")  ==> 1
   m("foo", "foo/index.html")   ==> 1
   m("foo", "foo/")             ==> 1
   m("foo", "foo")              ==> 1  */
match_except_index (const char *s1, const char *s2)
  /* Skip common substring.  */
  for (i = 0; *s1 && *s2 && *s1 == *s2; s1++, s2++, i++)

    /* Strings differ at the very beginning -- bail out.  We need to
       check this explicitly to avoid `lng - 1' reading outside the
       array.  */

    /* Both strings hit EOF -- strings are equal.  */

    /* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux".  */

    /* S1 is the longer one.  */

    /* S2 is the longer one.  */

  /* foo/index.html */  /* or */  /* foo/index.html */

  /* The right-hand case.  */
  if (*lng == '/' && *(lng + 1) == '\0')

  return 0 == strcmp (lng, "/index.html");
dissociate_urls_from_file_mapper (void *key, void *value, void *arg)
  char *mapping_url = (char *)key;
  char *mapping_file = (char *)value;
  char *file = (char *)arg;

  if (0 == strcmp (mapping_file, file))
      hash_table_remove (dl_url_file_map, mapping_url);
      xfree (mapping_file);

  /* Continue mapping.  */

/* Remove all associations from various URLs to FILE from dl_url_file_map.  */

dissociate_urls_from_file (const char *file)
  /* Can't use hash_table_iter_* because the table mutates while mapping.  */
  hash_table_for_each (dl_url_file_map, dissociate_urls_from_file_mapper,
                       (char *) file);
/* Register that URL has been successfully downloaded to FILE.  This
   is used by the link conversion code to convert references to URLs
   to references to local files.  It is also being used to check if a
   URL has already been downloaded.  */
register_download (const char *url, const char *file)
  char *old_file, *old_url;

  /* With some forms of retrieval, it is possible, although not likely
     or particularly desirable.  If both are downloaded, the second
     download will override the first one.  When that happens,
     dissociate the old file name from the URL.  */

  if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
      if (0 == strcmp (url, old_url))
        /* We have somehow managed to download the same URL twice.
           Nothing to do.  */

      if (match_except_index (url, old_url)
          && !hash_table_contains (dl_url_file_map, url))
        /* The two URLs differ only in the "index.html" ending.  For
           example, one is "http://www.server.com/", and the other is
           "http://www.server.com/index.html".  Don't remove the old
           one, just add the new one as a non-canonical entry.  */

      hash_table_remove (dl_file_url_map, file);

      /* Remove all the URLs that point to this file.  Yes, there can
         be more than one such URL, because we store redirections as
         multiple entries in dl_url_file_map.  For example, if URL1
         redirects to URL2 which gets downloaded to FILE, we map both
         URL1 and URL2 to FILE in dl_url_file_map.  (dl_file_url_map
         only points to URL2.)  When another URL gets loaded to FILE,
         we want both URL1 and URL2 dissociated from it.

         This is a relatively expensive operation because it performs
         a linear search of the whole hash table, but it should be
         called very rarely, only when two URLs resolve to the same
         file name, *and* the "<file>.1" extensions are turned off.
         In other words, almost never.  */
      dissociate_urls_from_file (file);

  hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));
  /* A URL->FILE mapping is not possible without a FILE->URL mapping.
     If the latter were present, it should have been removed by the
     above `if'.  So we could write:

         assert (!hash_table_contains (dl_url_file_map, url));

     The above is correct when running in recursive mode where the
     same URL always resolves to the same file.  But if you do
     something like:

         wget URL URL

     then the first URL will resolve to "FILE", and the other to
     "FILE.1".  In that case, FILE.1 will not be found in
     dl_file_url_map, but URL will still point to FILE in
     dl_url_file_map.  */
  if (hash_table_get_pair (dl_url_file_map, url, &old_url, &old_file))
      hash_table_remove (dl_url_file_map, url);

  hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));
/* Register that FROM has been redirected to TO.  This assumes that TO
   is successfully downloaded and already registered using
   register_download() above.  */

register_redirection (const char *from, const char *to)
  file = hash_table_get (dl_url_file_map, to);
  assert (file != NULL);
  if (!hash_table_contains (dl_url_file_map, from))
    hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));
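
/* Illustrative example (not part of the original source; the URLs and
   file name are hypothetical): after

     register_download ("http://host/new.html", "host/new.html");
     register_redirection ("http://host/old.html", "http://host/new.html");

   dl_url_file_map maps both "http://host/old.html" and
   "http://host/new.html" to "host/new.html", while dl_file_url_map
   maps "host/new.html" back to "http://host/new.html" only.  */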
/* Register that the file has been deleted.  */

register_delete_file (const char *file)
  char *old_url, *old_file;

  if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))
    return;

  hash_table_remove (dl_file_url_map, file);
  dissociate_urls_from_file (file);

/* Register that FILE is an HTML file that has been downloaded.  */

register_html (const char *url, const char *file)
  if (!downloaded_html_set)
    downloaded_html_set = make_string_hash_table (0);
  string_set_add (downloaded_html_set, file);
static void downloaded_files_free (void);
static void nonexisting_urls_free (void);

/* Clean up the data structures associated with this file.  */

convert_cleanup (void)
      free_keys_and_values (dl_file_url_map);
      hash_table_destroy (dl_file_url_map);
      dl_file_url_map = NULL;

      free_keys_and_values (dl_url_file_map);
      hash_table_destroy (dl_url_file_map);
      dl_url_file_map = NULL;

  if (downloaded_html_set)
    string_set_free (downloaded_html_set);
  downloaded_files_free ();
  nonexisting_urls_free ();
  string_set_free (converted_files);
/* Book-keeping code for downloaded files that enables extension
   hacks.  */

/* This table should really be merged with dl_file_url_map and
   downloaded_html_files.  This was originally a list, but I changed
   it to a hash table because it was actually taking a lot of time to
   find things in it.  */

static struct hash_table *downloaded_files_hash;
/* We're storing "modes" of type downloaded_file_t in the hash table.
   However, our hash tables only accept pointers for keys and values.
   So when we need a pointer, we use the address of a
   downloaded_file_t variable of static storage.  */
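
/* Illustrative sketch (not part of the original source): because the
   hash table stores void * values, an enumerator such as
   FILE_DOWNLOADED_NORMALLY cannot be stored directly.  Instead, every
   entry recorded with that mode shares the address of the same static
   variable, e.g.

     hash_table_put (downloaded_files_hash, xstrdup (file),
                     downloaded_mode_to_ptr (FILE_DOWNLOADED_NORMALLY));

   and a later lookup recovers the mode by dereferencing the stored
   pointer.  */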
static downloaded_file_t *
downloaded_mode_to_ptr (downloaded_file_t mode)
  static downloaded_file_t
    v1 = FILE_NOT_ALREADY_DOWNLOADED,
    v2 = FILE_DOWNLOADED_NORMALLY,
    v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,

    case FILE_NOT_ALREADY_DOWNLOADED:
    case FILE_DOWNLOADED_NORMALLY:
    case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:
/* Remembers which files have been downloaded.  In the standard case,
   should be called with mode == FILE_DOWNLOADED_NORMALLY for each
   file we actually download successfully (i.e. not for ones we have
   failures on or that we skip due to -N).

   When we've downloaded a file and tacked on a ".html" extension due
   to -E, call this function with
   FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
   FILE_DOWNLOADED_NORMALLY.

   If you just want to check if a file has been previously added
   without adding it, call with mode == CHECK_FOR_FILE.  Please be
   sure to call this function with local filenames, not remote
   URLs.  */
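
/* Illustrative usage (not part of the original source; the file name
   is hypothetical):

     downloaded_file (FILE_DOWNLOADED_NORMALLY, "host/page.html");
     ...
     if (downloaded_file (CHECK_FOR_FILE, "host/page.html")
         != FILE_NOT_ALREADY_DOWNLOADED)
       ... the file was already seen earlier in this run ...
*/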
downloaded_file (downloaded_file_t mode, const char *file)
  downloaded_file_t *ptr;

  if (mode == CHECK_FOR_FILE)
      if (!downloaded_files_hash)
        return FILE_NOT_ALREADY_DOWNLOADED;
      ptr = hash_table_get (downloaded_files_hash, file);
      if (!ptr)
        return FILE_NOT_ALREADY_DOWNLOADED;

  if (!downloaded_files_hash)
    downloaded_files_hash = make_string_hash_table (0);

  ptr = hash_table_get (downloaded_files_hash, file);

  ptr = downloaded_mode_to_ptr (mode);
  hash_table_put (downloaded_files_hash, xstrdup (file), ptr);

  return FILE_NOT_ALREADY_DOWNLOADED;
downloaded_files_free (void)
  if (downloaded_files_hash)
      hash_table_iterator iter;
      for (hash_table_iterate (downloaded_files_hash, &iter);
           hash_table_iter_next (&iter);

      hash_table_destroy (downloaded_files_hash);
      downloaded_files_hash = NULL;
/* Remembers broken links.  */

struct broken_urls_list
  char *url;
  struct broken_urls_list *next;

in_list (const struct broken_urls_list *list, const char *url)
  const struct broken_urls_list *ptr;

  for (ptr = list; ptr; ptr = ptr->next)
      /* str[case]cmp is inadequate for URL comparison */
      if (are_urls_equal (url, ptr->url)) return true;
nonexisting_url (const char *url, const char *referrer)
  struct broken_urls_list *list;

  /* Ignore robots.txt URLs */
  if (is_robots_txt_url (url))
    return;

  if (!nonexisting_urls_hash)
    nonexisting_urls_hash = make_string_hash_table (0);

  list = hash_table_get (nonexisting_urls_hash, url);
  if (!list)
      list = (struct broken_urls_list *) xnew0 (struct broken_urls_list);
      list->url = referrer ? xstrdup (referrer) : NULL;
      hash_table_put (nonexisting_urls_hash, xstrdup (url), list);
  else if (list && !in_list (list, referrer))
      /* Append referrer at the end of the list */
      struct broken_urls_list *newnode;

      while (list->next) list = list->next;

      newnode = xnew0 (struct broken_urls_list);
      newnode->url = xstrdup (referrer);
      list->next = newnode;
nonexisting_urls_free (void)
  if (nonexisting_urls_hash)
      hash_table_iterator iter;
      for (hash_table_iterate (nonexisting_urls_hash, &iter);
           hash_table_iter_next (&iter);

      hash_table_destroy (nonexisting_urls_hash);
      nonexisting_urls_hash = NULL;
print_broken_links (void)
  hash_table_iterator iter;

  if (!nonexisting_urls_hash)
      logprintf (LOG_NOTQUIET, _("Found no broken links.\n\n"));

  num_elems = hash_table_count (nonexisting_urls_hash);
  assert (num_elems > 0);

      logprintf (LOG_NOTQUIET, _("Found %d broken links.\n\n"),
                 num_elems);

      logprintf (LOG_NOTQUIET, _("Found 1 broken link.\n\n"));

  for (hash_table_iterate (nonexisting_urls_hash, &iter);
       hash_table_iter_next (&iter);
      struct broken_urls_list *list;

      logprintf (LOG_NOTQUIET, _("%s referred by:\n"), (const char *)iter.key);

      for (list = (struct broken_urls_list *) iter.value;
           list; list = list->next)
        logprintf (LOG_NOTQUIET, _(" %s\n"), list->url);

  logputs (LOG_NOTQUIET, "\n");
/* The function returns the pointer to the malloc-ed quoted version of
   string s.  It will recognize and quote numeric and special graphic
   entities, as per RFC1866:

   `&' -> `&amp;'
   `<' -> `&lt;'
   `>' -> `&gt;'
   `"' -> `&quot;'

   No other entities are recognized or replaced.  */

html_quote_string (const char *s)
  /* Pass through the string, and count the new size.  */
  for (i = 0; *s; s++, i++)
      if (*s == '&')
        i += 4;                 /* `amp;' */
      else if (*s == '<' || *s == '>')
        i += 3;                 /* `lt;' and `gt;' */
      else if (*s == '\"')
        i += 5;                 /* `quot;' */

  res = xmalloc (i + 1);

  for (p = res; *s; s++)
        *p++ = (*s == '<' ? 'l' : 'g');