/* Collect URLs from HTML source.
Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
- 2007 Free Software Foundation, Inc.
+ 2007, 2008 Free Software Foundation, Inc.
This file is part of GNU Wget.
shall include the source code for the parts of OpenSSL used as well
as that of the covered work. */
-#include <config.h>
+#include "wget.h"
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
-#include "wget.h"
#include "html-parse.h"
#include "url.h"
#include "utils.h"
#include "hash.h"
#include "convert.h"
#include "recur.h" /* declaration of get_urls_html */
+#include "iri.h"
struct map_context;
matches the user's preferences as specified through --ignore-tags
and --follow-tags. */
- int i;
+ size_t i;
interesting_tags = make_nocase_string_hash_table (countof (known_tags));
/* First, add all the tags we know how to handle, mapped to their
struct urlpos *newel;
const char *base = ctx->base ? ctx->base : ctx->parent_base;
struct url *url;
+ bool utf8_encode = false;
if (!base)
{
return NULL;
}
- url = url_parse (link_uri, NULL);
+ url = url_parse (link_uri, NULL, &utf8_encode);
if (!url)
{
DEBUGP (("%s: link \"%s\" doesn't parse.\n",
DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
ctx->document_file, base, link_uri, complete_uri));
- url = url_parse (complete_uri, NULL);
+ url = url_parse (complete_uri, NULL, &utf8_encode);
if (!url)
{
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
static void
tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
{
- int i, attrind;
+ size_t i;
+ int attrind;
int first = -1;
for (i = 0; i < countof (tag_url_attributes); i++)
/* Find whether TAG/ATTRIND is a combination that contains a
URL. */
char *link = tag->attrs[attrind].value;
- const int size = countof (tag_url_attributes);
+ const size_t size = countof (tag_url_attributes);
/* If you're cringing at the inefficiency of the nested loops,
remember that they both iterate over a very small number of
if (!refresh)
return;
- for (p = refresh; ISDIGIT (*p); p++)
+ for (p = refresh; c_isdigit (*p); p++)
timeout = 10 * timeout + *p - '0';
if (*p++ != ';')
return;
- while (ISSPACE (*p))
+ while (c_isspace (*p))
++p;
- if (!( TOUPPER (*p) == 'U'
- && TOUPPER (*(p + 1)) == 'R'
- && TOUPPER (*(p + 2)) == 'L'
+ if (!( c_toupper (*p) == 'U'
+ && c_toupper (*(p + 1)) == 'R'
+ && c_toupper (*(p + 2)) == 'L'
&& *(p + 3) == '='))
return;
p += 4;
- while (ISSPACE (*p))
+ while (c_isspace (*p))
++p;
entry = append_url (p, tag, attrind, ctx);
entry->link_expect_html = 1;
}
}
+ else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
+ {
+ /* Handle stuff like:
+ <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
+
+ char *mcharset;
+ char *content = find_attr (tag, "content", NULL);
+ if (!content)
+ return;
+
+ mcharset = parse_charset (content);
+ if (!mcharset)
+ return;
+
+ /*logprintf (LOG_VERBOSE, "Meta tag charset : %s\n", quote (mcharset));*/
+
+ set_current_charset (mcharset);
+ xfree (mcharset);
+ }
else if (name && 0 == strcasecmp (name, "robots"))
{
/* Handle stuff like:
struct file_memory *fm;
struct urlpos *head, *tail;
const char *text, *text_end;
+ bool utf8_encode = false;
/* Load the file. */
fm = read_file (file);
text = line_end;
/* Strip whitespace from the beginning and end of line. */
- while (line_beg < line_end && ISSPACE (*line_beg))
+ while (line_beg < line_end && c_isspace (*line_beg))
++line_beg;
- while (line_end > line_beg && ISSPACE (*(line_end - 1)))
+ while (line_end > line_beg && c_isspace (*(line_end - 1)))
--line_end;
if (line_beg == line_end)
url_text = merged;
}
- url = url_parse (url_text, &up_error_code);
+ url = url_parse (url_text, &up_error_code, &utf8_encode);
if (!url)
{
logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),