/* Collect URLs from HTML source.
- Copyright (C) 1998-2006 Free Software Foundation, Inc.
+ Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
+ 2007, 2008 Free Software Foundation, Inc.
This file is part of GNU Wget.
You should have received a copy of the GNU General Public License
along with Wget. If not, see <http://www.gnu.org/licenses/>.
-In addition, as a special exception, the Free Software Foundation
-gives permission to link the code of its release of Wget with the
-OpenSSL project's "OpenSSL" library (or with modified versions of it
-that use the same license as the "OpenSSL" library), and distribute
-the linked executables. You must obey the GNU General Public License
-in all respects for all of the code used other than "OpenSSL". If you
-modify this file, you may extend this exception to your version of the
-file, but you are not obligated to do so. If you do not wish to do
-so, delete this exception statement from your version. */
+Additional permission under GNU GPL version 3 section 7
-#include <config.h>
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work. */
+
+#include "wget.h"
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
-#include "wget.h"
#include "html-parse.h"
#include "url.h"
#include "utils.h"
#include "hash.h"
#include "convert.h"
#include "recur.h" /* declaration of get_urls_html */
+#include "iri.h"
struct map_context;
matches the user's preferences as specified through --ignore-tags
and --follow-tags. */
- int i;
+ size_t i;
interesting_tags = make_nocase_string_hash_table (countof (known_tags));
/* First, add all the tags we know hot to handle, mapped to their
static void
tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
{
- int i, attrind;
+ size_t i;
+ int attrind;
int first = -1;
for (i = 0; i < countof (tag_url_attributes); i++)
/* Find whether TAG/ATTRIND is a combination that contains a
URL. */
char *link = tag->attrs[attrind].value;
- const int size = countof (tag_url_attributes);
+ const size_t size = countof (tag_url_attributes);
/* If you're cringing at the inefficiency of the nested loops,
remember that they both iterate over a very small number of
if (!refresh)
return;
- for (p = refresh; ISDIGIT (*p); p++)
+ for (p = refresh; c_isdigit (*p); p++)
timeout = 10 * timeout + *p - '0';
if (*p++ != ';')
return;
- while (ISSPACE (*p))
+ while (c_isspace (*p))
++p;
- if (!( TOUPPER (*p) == 'U'
- && TOUPPER (*(p + 1)) == 'R'
- && TOUPPER (*(p + 2)) == 'L'
+ if (!( c_toupper (*p) == 'U'
+ && c_toupper (*(p + 1)) == 'R'
+ && c_toupper (*(p + 2)) == 'L'
&& *(p + 3) == '='))
return;
p += 4;
- while (ISSPACE (*p))
+ while (c_isspace (*p))
++p;
entry = append_url (p, tag, attrind, ctx);
entry->link_expect_html = 1;
}
}
+ else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
+ {
+ /* Handle stuff like:
+ <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
+
+ char *mcharset;
+ char *content = find_attr (tag, "content", NULL);
+ if (!content)
+ return;
+
+ mcharset = parse_charset (content);
+ if (!mcharset)
+ return;
+
+ logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));
+
+ /* sXXXav: Not used yet */
+ xfree (mcharset);
+ }
else if (name && 0 == strcasecmp (name, "robots"))
{
/* Handle stuff like:
text = line_end;
/* Strip whitespace from the beginning and end of line. */
- while (line_beg < line_end && ISSPACE (*line_beg))
+ while (line_beg < line_end && c_isspace (*line_beg))
++line_beg;
- while (line_end > line_beg && ISSPACE (*(line_end - 1)))
+ while (line_end > line_beg && c_isspace (*(line_end - 1)))
--line_end;
if (line_beg == line_end)