/* Collect URLs from HTML source.
Copyright (C) 1998, 2000 Free Software Foundation, Inc.
-This file is part of Wget.
+This file is part of GNU Wget.
-This program is free software; you can redistribute it and/or modify
+GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
-This program is distributed in the hope that it will be useful,
+GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
+along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#include <config.h>
# include <strings.h>
#endif
#include <stdlib.h>
-#include <ctype.h>
#include <errno.h>
#include <assert.h>
{ "th", TC_LINK }
};
+
/* Flags for specific url-attr pairs handled through TC_LINK: */
+
+/* This tag points to an external document not necessary for rendering this
+ document (i.e. it's not an inlined image, stylesheet, etc.). */
#define AF_EXTERNAL 1
+
/* For tags handled by TC_LINK: attributes that contain URLs to
download. */
static struct {
/* Normally here we could say:
interesting_tags[i] = name;
But we need to respect the settings of --ignore-tags and
- --follow-tags, so the code gets a bit harier. */
+ --follow-tags, so the code gets a bit hairier. */
if (opt.ignore_tags)
{
through if there's no match. */
int j, lose = 0;
for (j = 0; opt.ignore_tags[j] != NULL; j++)
- /* Loop through all the tags this user doesn't care
- about. */
+ /* Loop through all the tags this user doesn't care about. */
if (strcasecmp(opt.ignore_tags[j], name) == EQ)
{
lose = 1;
if (opt.follow_tags)
{
- /* --follow-tags was specified. Only match these specific
- tags, so return FALSE if we don't match one of them. */
+ /* --follow-tags was specified. Only match these specific tags, so
+ continue back to the top of the enclosing for loop if we don't match one of them. */
int j, win = 0;
for (j = 0; opt.follow_tags[j] != NULL; j++)
/* Loop through all the tags this user cares about. */
break;
}
if (!win)
- continue; /* wasn't one of the explicitly
- desired tags */
+ continue; /* wasn't one of the explicitly desired tags */
}
/* If we get to here, --follow-tags isn't being used or the
- tag is among the ones that are follwed, and --ignore-tags,
+ tag is among the ones that are followed, and --ignore-tags,
if specified, didn't include this tag, so it's an
"interesting" one. */
interesting_tags[ind++] = name;
complete_uri = xstrdup (link_uri);
}
else
- complete_uri = url_concat (base, link_uri);
+ complete_uri = uri_merge (base, link_uri);
DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
closure->document_file, base ? base : "(null)",
closure->tail = closure->head = newel;
}
-/* #### Document what this does.
+/* Examine name and attributes of TAG and take appropriate action.
+ What will be done depends on TAG's category and attribute values.
+ Tags of TC_LINK category have attributes that contain links to
+ follow; tags of TC_SPEC category need to be handled specially.
+
#### It would be nice to split this into several functions. */
static void
if (closure->dash_p_leaf_HTML
&& (url_tag_attr_map[i].flags & AF_EXTERNAL))
/* If we're at a -p leaf node, we don't want to retrieve
- links to references we know are external, such as <a
- href=...>. */
+ links to references we know are external to this document,
+ such as <a href=...>. */
continue;
/* This find_attr() buried in a loop may seem inefficient
if (closure->base)
xfree (closure->base);
if (closure->parent_base)
- closure->base = url_concat (closure->parent_base, newbase);
+ closure->base = uri_merge (closure->parent_base, newbase);
else
closure->base = xstrdup (newbase);
}
and we're at a leaf node (relative to the -l
max. depth) in the HTML document tree, the only
<LINK> tag we'll follow is a <LINK REL=
- "stylesheet">, as it's necessary for displaying
+ "stylesheet">, as it'll be necessary for displaying
this document properly. We won't follow other
<LINK> tags, like <LINK REL="home">, for instance,
as they refer to external documents. */
}
}
-/* Scan FILE, retrieving links to HTML documents from it. Each link is
-
- Similar to get_urls_file, but for HTML files. FILE is scanned as
- an HTML document. get_urls_html() constructs the URLs from the
- relative href-s.
+/* Analyze the HTML tags in FILE and construct a list of URLs referenced from
+ it. It merges relative links in FILE with THIS_URL. It is aware
+ of <base href=...> and does the right thing.
- If SILENT is non-zero, do not barf on baseless relative links. */
+ If dash_p_leaf_HTML is non-zero, only the elements needed to render
+ FILE ("non-external" links) will be returned. */
urlpos *
get_urls_html (const char *file, const char *this_url, int dash_p_leaf_HTML,
int *meta_disallow_follow)