/* Collect URLs from HTML source.
- Copyright (C) 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
+ Copyright (C) 1998-2006 Free Software Foundation, Inc.
This file is part of GNU Wget.
GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
+the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
GNU Wget is distributed in the hope that it will be useful,
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
-along with Wget; if not, write to the Free Software
-Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+along with Wget. If not, see <http://www.gnu.org/licenses/>.
In addition, as a special exception, the Free Software Foundation
gives permission to link the code of its release of Wget with the
#include <config.h>
#include <stdio.h>
-#ifdef HAVE_STRING_H
-# include <string.h>
-#else
-# include <strings.h>
-#endif
+#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
#include "utils.h"
#include "hash.h"
#include "convert.h"
-
-#ifndef errno
-extern int errno;
-#endif
+#include "recur.h" /* declaration of get_urls_html */
struct map_context;
-typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,
- struct map_context *));
+typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
-#define DECLARE_TAG_HANDLER(fun) \
- static void fun PARAMS ((int, struct taginfo *, struct map_context *))
+#define DECLARE_TAG_HANDLER(fun) \
+ static void fun (int, struct taginfo *, struct map_context *)
DECLARE_TAG_HANDLER (tag_find_urls);
DECLARE_TAG_HANDLER (tag_handle_base);
TAG_LAYER,
TAG_LINK,
TAG_META,
+ TAG_OBJECT,
TAG_OVERLAY,
TAG_SCRIPT,
TAG_TABLE,
{ TAG_LAYER, "layer", tag_find_urls },
{ TAG_LINK, "link", tag_handle_link },
{ TAG_META, "meta", tag_handle_meta },
+ { TAG_OBJECT, "object", tag_find_urls },
{ TAG_OVERLAY, "overlay", tag_find_urls },
{ TAG_SCRIPT, "script", tag_find_urls },
{ TAG_TABLE, "table", tag_find_urls },
{ TAG_IMG, "src", ATTR_INLINE },
{ TAG_INPUT, "src", ATTR_INLINE },
{ TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML },
+ { TAG_OBJECT, "data", ATTR_INLINE },
{ TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML },
{ TAG_SCRIPT, "src", ATTR_INLINE },
{ TAG_TABLE, "background", ATTR_INLINE },
"action" /* used by tag_handle_form */
};
-struct hash_table *interesting_tags;
-struct hash_table *interesting_attributes;
+static struct hash_table *interesting_tags;
+static struct hash_table *interesting_attributes;
static void
init_interesting (void)
changed through <base href=...>. */
const char *parent_base; /* Base of the current document. */
const char *document_file; /* File name of this document. */
- int nofollow; /* whether NOFOLLOW was specified in a
+ bool nofollow; /* whether NOFOLLOW was specified in a
<meta name=robots> tag. */
struct urlpos *head, *tail; /* List of URLs that is being
&& (0 == strcasecmp (rel, "stylesheet")
|| 0 == strcasecmp (rel, "shortcut icon")))
up->link_inline_p = 1;
+ else
+ /* The external ones usually point to HTML pages, such as
+ <link rel="next" href="..."> */
+ up->link_expect_html = 1;
}
}
}
if (!content)
return;
if (!strcasecmp (content, "none"))
- ctx->nofollow = 1;
+ ctx->nofollow = true;
else
{
while (*content)
else
end = content + strlen (content);
if (!strncasecmp (content, "nofollow", end - content))
- ctx->nofollow = 1;
+ ctx->nofollow = true;
content = end;
}
}
<base href=...> and does the right thing. */
struct urlpos *
-get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
+get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)
{
struct file_memory *fm;
struct map_context ctx;
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
return NULL;
}
- DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
+ DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
ctx.text = fm->content;
ctx.head = ctx.tail = NULL;
ctx.base = NULL;
ctx.parent_base = url ? url : opt.base_href;
ctx.document_file = file;
- ctx.nofollow = 0;
+ ctx.nofollow = false;
if (!interesting_tags)
init_interesting ();
/* Specify MHT_TRIM_VALUES because of buggy HTML generators that
- generate <a href=" foo"> instead of <a href="foo"> (Netscape
- ignores spaces as well.) If you really mean space, use &32; or
- %20. */
+ generate <a href=" foo"> instead of <a href="foo"> (browsers
+ ignore spaces as well.) If you really mean space, use &32; or
+ %20. MHT_TRIM_VALUES also causes squashing of embedded newlines,
+ e.g. in <img src="foo.[newline]html">. Such newlines are also
+ ignored by IE and Mozilla and are presumably introduced by
+ writing HTML with editors that force word wrap. */
flags = MHT_TRIM_VALUES;
if (opt.strict_comments)
flags |= MHT_STRICT_COMMENTS;
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
return NULL;
}
- DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
+ DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
head = tail = NULL;
text = fm->content;
url = url_parse (url_text, &up_error_code);
if (!url)
{
- logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
+ logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
file, url_text, url_error (up_error_code));
xfree (url_text);
continue;
xfree (url_text);
entry = xnew0 (struct urlpos);
- entry->next = NULL;
entry->url = url;
if (!head)