+ 2008-06-30 Micah Cowan <micah@cowan.name>
+
+ * NEWS: Entries for 1.11.4.
+
+ * AUTHORS: Added Steven Schubiger.
+
+2008-06-26 Xavier Saint <wget@sxav.eu>
+
+ * configure.ac: IRI support requires libiconv; check for it.
+
+2008-06-14 Xavier Saint <wget@sxav.eu>
+
+ * configure.ac: Add support for IRIs.
+
2008-05-29 Micah Cowan <micah@cowan.name>
* po/*.po: Updated from TP (the 1.11.3 set).
md5/m4/stdint.m4, md5/md5.c, md5/md5.h, md5/stdint.in.h,
md5/wchar.in.h: Updated from gnulib.
+ 2008-04-24 Micah Cowan <micah@cowan.name>
+
+ * NEWS: Removed info about move to Automake, Gnulib. Added item
+ about the addition of CSS support.
+
+ 2008-04-22 Micah Cowan <micah@cowan.name>
+
+ * ylwrap: Added via automake -ac.
+
+ 2008-04-22 Ted Mielczarek <ted.mielczarek@gmail.com>
+
+ * configure.ac: Added check for lex.
+
2008-04-14 Micah Cowan <micah@cowan.name>
* GNUmakefile, lib/Makefile.am, lib/error.c, lib/error.h,
AC_PROG_RANLIB
+ AC_PROG_LEX
+
dnl Turn on optimization by default. Specifically:
dnl
dnl if the user hasn't specified CFLAGS, then
fi
AC_SUBST(COMMENT_IF_NO_POD2MAN)
+
+dnl
+dnl Check for IDN/IRIs
+dnl
+
+AC_ARG_ENABLE(iri,
+ AC_HELP_STRING([--disable-iri],[disable IDN/IRI support]),
+ [case "${enable_iri}" in
+ no)
+ dnl Disable IRI checking
+ AC_MSG_NOTICE([disabling IRIs at user request])
+ iri=no
+ ;;
+ yes)
+ dnl IRIs explicitly enabled
+ iri=yes
+ force_iri=yes
+ ;;
+ auto)
+ dnl Auto-detect IRI
+ iri=yes
+ ;;
+ *)
+ AC_MSG_ERROR([Invalid --enable-iri argument \`$enable_iri'])
+ ;;
+ esac
+ ], [
+ dnl If nothing is specified, assume auto-detection
+ iri=yes
+ ]
+)
+
+AC_ARG_WITH(libidn, AC_HELP_STRING([--with-libidn=[DIR]],
+ [Support IDN/IRIs (needs GNU Libidn)]),
+ libidn=$withval, libidn="")
+if test "X$iri" != "Xno"; then
+ AM_ICONV
+
+ if test "X$am_cv_func_iconv" != "Xyes"; then
+ iri=no
+ if test "X$force_iri" = "Xyes"; then
+ AC_MSG_ERROR([Libiconv is required for IRI support])
+ else
+ AC_MSG_NOTICE([disabling IRIs because libiconv wasn't found])
+ fi
+ fi
+fi
+
+if test "X$iri" != "Xno"; then
+ if test "$libidn" != ""; then
+ LDFLAGS="${LDFLAGS} -L$libidn/lib"
+ CPPFLAGS="${CPPFLAGS} -I$libidn/include"
+ fi
+ AC_CHECK_HEADER(idna.h,
+ AC_CHECK_LIB(idn, stringprep_check_version,
+ [iri=yes LIBS="${LIBS} -lidn"], iri=no),
+ iri=no)
+
+ if test "X$iri" != "Xno" ; then
+ AC_DEFINE(ENABLE_IRI, 1, [Define if IRI support is enabled.])
+ AC_MSG_NOTICE([Enabling support for IRIs.])
+ else
+ AC_MSG_WARN([Libidn not found])
+ fi
+fi
+
+
+dnl Needed by src/Makefile.am
+AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])
+
+
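For orientation, here is a minimal sketch of how a source file could consume the ENABLE_IRI define produced by this check; the function below is purely illustrative and not part of the change set:

    #include "config.h"   /* defines ENABLE_IRI when configure enables it */
    #include <stdio.h>

    /* Print the "iri" feature marker in the style of build_info.c. */
    static void
    print_iri_feature (void)
    {
    #ifdef ENABLE_IRI
      puts ("+iri");
    #else
      puts ("-iri");
    #endif
    }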
dnl
dnl Create output
dnl
+2008-07-02 Xavier Saint <wget@sxav.eu>
+
+ * iri.c, iri.h: New function idn_decode() to decode an
+ ASCII-encoded hostname to the locale encoding.
+
+ * host.c: Show the hostname to be resolved in both the locale
+ and ASCII encodings.
+
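A decoder like the one described can be sketched on top of GNU Libidn's idna.h; the wrapper below is illustrative, and its flags and error handling are assumptions:

    #include <idna.h>       /* GNU Libidn */
    #include <stddef.h>

    /* Decode an ACE ("xn--...") hostname back into the locale's
       encoding.  Returns a newly allocated string, or NULL on error. */
    char *
    idn_decode (const char *host)
    {
      char *decoded = NULL;
      /* 8zlz: zero-terminated input, locale-encoded output. */
      if (idna_to_unicode_8zlz (host, &decoded, 0) != IDNA_SUCCESS)
        return NULL;
      return decoded;
    }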
+ 2008-06-28 Steven Schubiger <stsc@members.fsf.org>
+
+ * retr.c (retrieve_from_file): Allow reading the list of links
+ from an external file (HTTP/FTP).
+
+2008-06-26 Xavier Saint <wget@sxav.eu>
+
+ * iri.c, iri.h: New functions locale_to_utf8() and
+ idn_encode(), adding basic IRI/IDN capabilities.
+
+ * url.c: Convert URLs from the locale encoding to UTF-8,
+ allowing basic IRI/IDN support.
+
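The locale-to-UTF-8 step can be sketched directly with iconv; the name matches the entry above, but the body below is an assumption rather than the committed code:

    #include <iconv.h>
    #include <stdlib.h>
    #include <string.h>

    /* Convert STR from the LOCALE encoding to freshly allocated UTF-8;
       return NULL if the conversion is unavailable or fails. */
    char *
    locale_to_utf8 (const char *str, const char *locale)
    {
      iconv_t cd;
      size_t inlen, outlen;
      char *out, *inp, *outp;

      cd = iconv_open ("UTF-8", locale);
      if (cd == (iconv_t) -1)
        return NULL;

      inlen = strlen (str);
      outlen = 4 * inlen + 1;          /* worst-case UTF-8 expansion */
      out = malloc (outlen);
      if (!out)
        {
          iconv_close (cd);
          return NULL;
        }
      inp = (char *) str;
      outp = out;

      if (iconv (cd, &inp, &inlen, &outp, &outlen) == (size_t) -1)
        {
          free (out);
          out = NULL;
        }
      else
        *outp = '\0';

      iconv_close (cd);
      return out;
    }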
+ 2008-06-25 Steven Schubiger <stsc@members.fsf.org>
+
+ * ftp.c (getftp): When spidering an FTP URL, emit a diagnostic
+ message if the remote file exists.
+
2008-06-24 Steven Schubiger <stsc@members.fsf.org>
* http.c (http_loop): Replace escnonprint() occurence with
* http.c: Make -nv --spider include the file's name when it
exists.
-
+
2008-06-22 Micah Cowan <micah@cowan.name>
* Makefile.am (version.c): Fixed version string invocation so it
string vars pointers-to-const, and moved line lengths
below 80 (in Makefile.am, not in version.c).
+2008-06-19 Xavier Saint <wget@sxav.eu>
+
+ * iri.c, iri.h: New function check_encoding_name() as
+ a preliminary encoding name check.
+
+ * main.c, iri.c: Make use of check_encoding_name().
+
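As a rough illustration of such a preliminary check; the rejection criteria below are assumptions, not the committed logic:

    #include <ctype.h>
    #include <stdbool.h>

    /* Cheap sanity check of a user-supplied encoding name before it
       is handed to iconv: non-empty, ASCII-only, no whitespace. */
    bool
    check_encoding_name (const char *encoding)
    {
      const char *s;

      for (s = encoding; *s; s++)
        if ((unsigned char) *s > 127 || isspace ((unsigned char) *s))
          return false;
      return s != encoding;            /* reject the empty string */
    }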
+2008-06-19 Xavier Saint <wget@sxav.eu>
+
+ * iri.c: Include the missing stringprep.h header and add a
+ cast.
+
+ * init.c: Set default initial values for opt.enable_iri,
+ opt.locale, and opt.encoding_remote.
+
+2008-06-19 Xavier Saint <wget@sxav.eu>
+
+ * iri.c, iri.h: Add a new function find_locale() to determine
+ the local system encoding.
+
+ * main.c: Make use of find_locale().
+
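Given the <langinfo.h> include this series adds to main.c, find_locale can be sketched around nl_langinfo(); the ASCII fallback below is an assumption:

    #include <langinfo.h>
    #include <locale.h>

    /* Report the character encoding of the current locale, falling
       back to ASCII when the system reports nothing usable. */
    const char *
    find_locale (void)
    {
      const char *charset;

      setlocale (LC_CTYPE, "");        /* honor the user's environment */
      charset = nl_langinfo (CODESET);
      if (!charset || !*charset)
        charset = "ASCII";
      return charset;
    }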
+2008-06-19 Xavier Saint <wget@sxav.eu>
+
+ * html-url.c: Parse the "content-type" meta tag to retrieve
+ the page encoding.
+
+ * iri.h: Make the no-op version of parse_charset() return
+ NULL.
+
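The charset extraction that both this meta-tag handler and http.c's Content-Type parsing rely on might look roughly like the following simplified, case-sensitive sketch; the real parse_charset may differ:

    #include <ctype.h>
    #include <stdlib.h>
    #include <string.h>

    /* Extract the charset name from a string such as
       "text/html; charset=UTF-8".  Returns a fresh copy or NULL. */
    char *
    parse_charset (const char *str)
    {
      const char *cs, *end;
      size_t len;
      char *ret;

      cs = strstr (str, "charset=");
      if (!cs)
        return NULL;
      cs += strlen ("charset=");
      if (*cs == '"' || *cs == '\'')
        cs++;
      for (end = cs;
           *end && *end != '"' && *end != '\'' && *end != ';'
             && !isspace ((unsigned char) *end);
           end++)
        ;
      if (end == cs)
        return NULL;
      len = end - cs;
      ret = malloc (len + 1);
      if (ret)
        {
          memcpy (ret, cs, len);
          ret[len] = '\0';
        }
      return ret;
    }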
2008-06-16 Micah Cowan <micah@cowan.name>
* http.c (http_loop): When hstat.len is higher than the
successfully completed content's length, but it's because we
_set_ it that way, don't abort.
+2008-06-14 Xavier Saint <wget@sxav.eu>
+
+ * iri.c, iri.h: New files.
+
+ * Makefile.am: Add iri.h, and iri.c conditionally.
+
+ * build_info.c: Add the compiled-in feature "iri".
+
+ * http.c: Include iri.h and parse the charset from the
+ Content-Type header.
+
+ * init.c, main.c, options.h: If an option isn't supported at
+ compile time, don't remove it; show a dummy message instead
+ when it is used.
+
2008-06-13 Micah Cowan <micah@cowan.name>
* build_info.c: ENABLE_NTLM, not HAVE_NTLM; distinguish OpenSSL
default.
2008-05-17 Kenny Parnell <k.parnell@gmail.com>
-
+
(cmd_spec_prefer_family): Initialize prefer_family to prefer_none.
2008-05-17 Micah Cowan <micah@cowan.name>
-
+
* main.c (main): Handle Ctrl-D on command-line.
2008-05-15 Steven Schubiger <schubiger@gmail.com>
* options.h: Add an according boolean member to the options
struct.
-
+
* sysdep.h: Comment the defines __EXTENSIONS__ and _GNU_SOURCE
out, because they're now defined independently by config.h.
* Makefile.am: -I foo -> -Ifoo.
+ 2008-04-24 Micah Cowan <micah@cowan.name>
+
+ * main.c: Revised usage description of --convert-links to apply
+ to CSS as well as to HTML.
+
2008-04-23 Micah Cowan <micah@cowan.name>
* utils.c (test_dir_matches_p): Added a test for the case
described in issue #20518.
+ 2008-04-22 Micah Cowan <micah@cowan.name>
+
+ * Makefile.am, css.lex, css.l: Renamed css.lex to css.l.
+ * recur.c (retrieve_tree): Fix typo to allow text/css files to
+ be parsed.
+
+ 2008-04-22 Ted Mielczarek <ted.mielczarek@gmail.com>
+
+ * css.lex, css-url.c, css-url.h: Added to implement support for
+ parsing CSS in Wget.
+ * convert.c: Convert links in CSS files, too.
+ * convert.h (convert_options): Added for options link_css_p,
+ link_expect_css.
+ * convert.h: Added prototype for new register_css function.
+ * html-parse.c: Added support for parsing element content, in
+ addition to tag starts and ends.
+ * html-parse.h (taginfo): Added delimiter fields for element
+ content.
+ * html-url.h: Added.
+ * html-url.c (append_url): No longer internal-linkage only. Now
+ takes position and size as explicit parameters.
+ * html-url.c: Use new html-url.h header, add support for
+ handling of "style" HTML attributes. Mark URIs obtained from
+ link tags with rel="stylesheet" with link_expect_css. Adapt
+ uses of append_url to supply the newly-added parameters for
+ position and size.
+ * http.c: Add detection for when the content-type is text/css;
+ and ensure that such files have the ".css" filename extension,
+ when --convert-links is active.
+ * recur.h: Remove declarations for functions found in
+ html-url.c (moved to html-url.h).
+ * recur.c: Add support for culling links from CSS files, too,
+ and tracking for when we're expecting the file to be CSS (even
+ when its content type isn't text/css).
+ * retr.c (retrieve_url): Add registration of CSS files.
+ * wget.h: Added TEXTCSS to dt flags enum.
+ * Makefile.am: Added css.lex, css-url.c, css-url.h, html-url.h
+ to wget_SOURCES.
+
2008-04-22 Jim Paris <jim@jtan.com>
* openssl.c (ssl_init): Enable combined certificate/key in
# Version: @VERSION@
#
+if IRI_IS_ENABLED
+IRI_OBJ = iri.c
+endif
+
# The following line is losing on some versions of make!
DEFS = @DEFS@ -DSYSTEM_WGETRC=\"$(sysconfdir)/wgetrc\" -DLOCALEDIR=\"$(localedir)\"
LIBS = @LIBSSL@ @LIBGNUTLS@ @LIBINTL@ @LIBS@
bin_PROGRAMS = wget
wget_SOURCES = build_info.c cmpt.c connect.c convert.c cookies.c ftp.c \
+ css.l css-url.c \
ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \
http.c init.c log.c main.c netrc.c progress.c ptimer.c \
recur.c res.c retr.c snprintf.c spider.c url.c \
- utils.c \
- css-url.h connect.h convert.h cookies.h \
+ utils.c $(IRI_OBJ) \
- connect.h convert.h cookies.h \
- ftp.h gen-md5.h hash.h host.h html-parse.h \
- http.h http-ntlm.h init.h iri.h log.h mswindows.h netrc.h \
++ css-url.h connect.h convert.h cookies.h \
+ ftp.h gen-md5.h hash.h host.h html-parse.h html-url.h \
+ http.h http-ntlm.h init.h log.h mswindows.h netrc.h \
options.h progress.h ptimer.h recur.h res.h retr.h \
spider.h ssl.h sysdep.h url.h utils.h wget.h
nodist_wget_SOURCES = version.c
#include "utils.h"
#include "hash.h"
#include "convert.h"
- #include "recur.h" /* declaration of get_urls_html */
+ #include "recur.h"
+ #include "html-url.h"
+ #include "css-url.h"
+#include "iri.h"
- struct map_context;
-
typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
#define DECLARE_TAG_HANDLER(fun) \
from the information above. However, some places in the code refer
to the attributes not mentioned here. We add them manually. */
static const char *additional_attributes[] = {
- "rel", /* used by tag_handle_link */
- "http-equiv", /* used by tag_handle_meta */
- "name", /* used by tag_handle_meta */
- "content", /* used by tag_handle_meta */
- "action" /* used by tag_handle_form */
+ "rel", /* used by tag_handle_link */
+ "http-equiv", /* used by tag_handle_meta */
+ "name", /* used by tag_handle_meta */
+ "content", /* used by tag_handle_meta */
+ "action", /* used by tag_handle_form */
+ "style" /* used by check_style_attr */
};
static struct hash_table *interesting_tags;
return NULL;
}
- struct map_context {
- char *text; /* HTML text. */
- char *base; /* Base URI of the document, possibly
- changed through <base href=...>. */
- const char *parent_base; /* Base of the current document. */
- const char *document_file; /* File name of this document. */
- bool nofollow; /* whether NOFOLLOW was specified in a
- <meta name=robots> tag. */
-
- struct urlpos *head, *tail; /* List of URLs that is being
- built. */
- };
+ /* used for calls to append_url */
+ #define ATTR_POS(tag, attrind, ctx) \
+ (tag->attrs[attrind].value_raw_beginning - ctx->text)
+ #define ATTR_SIZE(tag, attrind) \
+ (tag->attrs[attrind].value_raw_size)
/* Append LINK_URI to the urlpos structure that is being built.
- LINK_URI will be merged with the current document base. TAG and
- ATTRIND are the necessary context to store the position and
- size. */
+   LINK_URI will be merged with the current document base.  POSITION
+   and SIZE record where in the document text the link was found. */
- static struct urlpos *
- append_url (const char *link_uri,
- struct taginfo *tag, int attrind, struct map_context *ctx)
+ struct urlpos *
+ append_url (const char *link_uri, int position, int size,
+ struct map_context *ctx)
{
int link_has_scheme = url_has_scheme (link_uri);
struct urlpos *newel;
return NULL;
}
+ set_ugly_no_encode (true);
url = url_parse (link_uri, NULL);
+ set_ugly_no_encode (false);
if (!url)
{
DEBUGP (("%s: link \"%s\" doesn't parse.\n",
DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
ctx->document_file, base, link_uri, complete_uri));
+ set_ugly_no_encode (true);
url = url_parse (complete_uri, NULL);
+ set_ugly_no_encode (false);
if (!url)
{
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
newel = xnew0 (struct urlpos);
newel->url = url;
- newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
- newel->size = tag->attrs[attrind].value_raw_size;
+ newel->pos = position;
+ newel->size = size;
/* A URL is relative if the host is not named, and the name does not
start with `/'. */
return newel;
}
\f
+ static void
+ check_style_attr (struct taginfo *tag, struct map_context *ctx)
+ {
+ int attrind;
+ char *style = find_attr (tag, "style", &attrind);
+ if (!style)
+ return;
+
+   /* The raw position and size include the quotes, hence the +1 and -2. */
+ get_urls_css (ctx, ATTR_POS(tag,attrind,ctx)+1, ATTR_SIZE(tag,attrind)-2);
+ }
+
/* All the tag_* functions are called from collect_tags_mapper, as
specified by KNOWN_TAGS. */
if (0 == strcasecmp (tag->attrs[attrind].name,
tag_url_attributes[i].attr_name))
{
- struct urlpos *up = append_url (link, tag, attrind, ctx);
+ struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
+ ATTR_SIZE(tag,attrind), ctx);
if (up)
{
int flags = tag_url_attributes[i].flags;
if (!newbase)
return;
- base_urlpos = append_url (newbase, tag, attrind, ctx);
+ base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
+ ATTR_SIZE(tag,attrind), ctx);
if (!base_urlpos)
return;
base_urlpos->ignore_when_downloading = 1;
{
int attrind;
char *action = find_attr (tag, "action", &attrind);
+
if (action)
{
- struct urlpos *up = append_url (action, tag, attrind, ctx);
+ struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
+ ATTR_SIZE(tag,attrind), ctx);
if (up)
up->ignore_when_downloading = 1;
}
*/
if (href)
{
- struct urlpos *up = append_url (href, tag, attrind, ctx);
+ struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
+ ATTR_SIZE(tag,attrind), ctx);
if (up)
{
char *rel = find_attr (tag, "rel", NULL);
- if (rel
- && (0 == strcasecmp (rel, "stylesheet")
- || 0 == strcasecmp (rel, "shortcut icon")))
- up->link_inline_p = 1;
+ if (rel)
+ {
+ if (0 == strcasecmp (rel, "stylesheet"))
+ {
+ up->link_inline_p = 1;
+ up->link_expect_css = 1;
+ }
+ else if (0 == strcasecmp (rel, "shortcut icon"))
+ {
+ up->link_inline_p = 1;
+ }
+ }
else
/* The external ones usually point to HTML pages, such as
<link rel="next" href="..."> */
while (c_isspace (*p))
++p;
- entry = append_url (p, tag, attrind, ctx);
+ entry = append_url (p, ATTR_POS(tag,attrind,ctx),
+ ATTR_SIZE(tag,attrind), ctx);
if (entry)
{
entry->link_refresh_p = 1;
entry->link_expect_html = 1;
}
}
+ else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
+ {
+ /* Handle stuff like:
+ <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
+
+ char *mcharset;
+ char *content = find_attr (tag, "content", NULL);
+ if (!content)
+ return;
+
+ mcharset = parse_charset (content);
+ if (!mcharset)
+ return;
+
+ set_current_charset (mcharset);
+ xfree (mcharset);
+ }
else if (name && 0 == strcasecmp (name, "robots"))
{
/* Handle stuff like:
struct map_context *ctx = (struct map_context *)arg;
/* Find the tag in our table of tags. This must not fail because
- map_html_tags only returns tags found in interesting_tags. */
+ map_html_tags only returns tags found in interesting_tags.
+
+     This has changed for now: we pass NULL as interesting_tags to
+     map_html_tags, so that every tag can be checked for a style
+     attribute. */
struct known_tag *t = hash_table_get (interesting_tags, tag->name);
- assert (t != NULL);
- t->handler (t->tagid, tag, ctx);
+ if (t != NULL)
+ t->handler (t->tagid, tag, ctx);
+
+ check_style_attr (tag, ctx);
+
+ if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) &&
+ tag->contents_begin && tag->contents_end)
+ {
+ /* parse contents */
+ get_urls_css (ctx, tag->contents_begin - ctx->text,
+ tag->contents_end - tag->contents_begin);
+ }
}
\f
/* Analyze HTML tags FILE and construct a list of URLs referenced from
if (opt.strict_comments)
flags |= MHT_STRICT_COMMENTS;
+ /* the NULL here used to be interesting_tags */
map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
- interesting_tags, interesting_attributes);
+ NULL, interesting_attributes);
DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
if (meta_disallow_follow)
url_text = merged;
}
+ set_ugly_no_encode (true);
url = url_parse (url_text, &up_error_code);
+ set_ugly_no_encode (false);
if (!url)
{
logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
#include "retr.h"
#include "connect.h"
#include "netrc.h"
+#include "iri.h"
#ifdef HAVE_SSL
# include "ssl.h"
#endif
extern char *version_string;
/* Forward decls. */
+ struct http_stat;
static char *create_authorization_line (const char *, const char *,
const char *, const char *,
const char *, bool *);
static char *basic_authentication_encode (const char *, const char *);
static bool known_authentication_scheme_p (const char *, const char *);
+ static void ensure_extension (struct http_stat *, const char *, int *);
static void load_cookies (void);
#ifndef MIN
#define TEXTHTML_S "text/html"
#define TEXTXHTML_S "application/xhtml+xml"
+ #define TEXTCSS_S "text/css"
/* Some status code validation macros: */
#define H_20X(x) (((x) >= 200) && ((x) < 300))
hs->local_file = url_file_name (u);
}
}
-
+
/* TODO: perform this check only once. */
if (!hs->existence_checked && file_exists_p (hs->local_file))
{
local_dot_orig_file_exists = true;
local_filename = filename_plus_orig_suffix;
}
- }
+ }
if (!local_dot_orig_file_exists)
/* Couldn't stat() <file>.orig, so try to stat() <file>. */
char *tmp = strchr (type, ';');
if (tmp)
{
+ /* sXXXav: only needed if IRI support is enabled */
+ char *tmp2 = tmp + 1;
+
while (tmp > type && c_isspace (tmp[-1]))
--tmp;
*tmp = '\0';
+
+ /* Try to get remote encoding if needed */
+ if (opt.enable_iri && !opt.encoding_remote)
+ set_current_charset (parse_charset (tmp2));
}
}
hs->newloc = resp_header_strdup (resp, "Location");
else
*dt &= ~TEXTHTML;
- if (opt.html_extension && (*dt & TEXTHTML))
- /* -E / --html-extension / html_extension = on was specified, and this is a
- text/html file. If some case-insensitive variation on ".htm[l]" isn't
- already the file's suffix, tack on ".html". */
- {
- char *last_period_in_local_filename = strrchr (hs->local_file, '.');
+ if (type &&
+ 0 == strncasecmp (type, TEXTCSS_S, strlen (TEXTCSS_S)))
+ *dt |= TEXTCSS;
+ else
+ *dt &= ~TEXTCSS;
- if (last_period_in_local_filename == NULL
- || !(0 == strcasecmp (last_period_in_local_filename, ".htm")
- || 0 == strcasecmp (last_period_in_local_filename, ".html")))
+ if (opt.html_extension)
+ {
+ if (*dt & TEXTHTML)
+ /* -E / --html-extension / html_extension = on was specified,
+ and this is a text/html file. If some case-insensitive
+ variation on ".htm[l]" isn't already the file's suffix,
+ tack on ".html". */
{
- int local_filename_len = strlen (hs->local_file);
- /* Resize the local file, allowing for ".html" preceded by
- optional ".NUMBER". */
- hs->local_file = xrealloc (hs->local_file,
- local_filename_len + 24 + sizeof (".html"));
- strcpy(hs->local_file + local_filename_len, ".html");
- /* If clobbering is not allowed and the file, as named,
- exists, tack on ".NUMBER.html" instead. */
- if (!ALLOW_CLOBBER && file_exists_p (hs->local_file))
- {
- int ext_num = 1;
- do
- sprintf (hs->local_file + local_filename_len,
- ".%d.html", ext_num++);
- while (file_exists_p (hs->local_file));
- }
- *dt |= ADDED_HTML_EXTENSION;
+ ensure_extension (hs, ".html", dt);
+ }
+ else if (*dt & TEXTCSS)
+ {
+ ensure_extension (hs, ".css", dt);
}
}
uerr_t err, ret = TRYLIMEXC;
time_t tmr = -1; /* remote time-stamp */
struct http_stat hstat; /* HTTP status */
- struct_stat st;
+ struct_stat st;
bool send_head_first = true;
/* Assert that no value for *LOCAL_FILE was passed. */
assert (local_file == NULL || *local_file == NULL);
-
+
/* Set LOCAL_FILE parameter. */
if (local_file && opt.output_document)
*local_file = HYPHENP (opt.output_document) ? NULL : xstrdup (opt.output_document);
-
+
/* Reset NEWLOC parameter. */
*newloc = NULL;
retrieve the file. But if the output_document was given, then this
test was already done and the file didn't exist. Hence the !opt.output_document */
logprintf (LOG_VERBOSE, _("\
-File %s already there; not retrieving.\n\n"),
+File %s already there; not retrieving.\n\n"),
quote (hstat.local_file));
/* If the file is there, we suppose it's retrieved OK. */
*dt |= RETROKF;
/* Reset the counter. */
count = 0;
-
+
/* Reset the document type. */
*dt = 0;
-
+
/* Skip preliminary HEAD request if we're not in spider mode AND
* if -O was given or HTTP Content-Disposition support is disabled. */
if (!opt.spider
/* Send preliminary HEAD request if -N is given and we have an existing
* destination file. */
- if (opt.timestamping
+ if (opt.timestamping
&& !opt.content_disposition
&& file_exists_p (url_file_name (u)))
send_head_first = true;
-
+
/* THE loop */
do
{
/* Increment the pass counter. */
++count;
sleep_between_retrievals (count);
-
+
/* Get the current time string. */
tms = datetime_str (time (NULL));
-
+
if (opt.spider && !got_head)
logprintf (LOG_VERBOSE, _("\
Spider mode enabled. Check if remote file exists.\n"));
if (opt.verbose)
{
char *hurl = url_string (u, URL_AUTH_HIDE_PASSWD);
-
- if (count > 1)
+
+ if (count > 1)
{
char tmp[256];
sprintf (tmp, _("(try:%2d)"), count);
logprintf (LOG_NOTQUIET, "--%s-- %s %s\n",
tms, tmp, hurl);
}
- else
+ else
{
logprintf (LOG_NOTQUIET, "--%s-- %s\n",
tms, hurl);
}
-
+
#ifdef WINDOWS
ws_changetitle (hurl);
#endif
/* Default document type is empty. However, if spider mode is
on or time-stamping is employed, HEAD_ONLY commands is
encoded within *dt. */
- if (send_head_first && !got_head)
+ if (send_head_first && !got_head)
*dt |= HEAD_ONLY;
else
*dt &= ~HEAD_ONLY;
/* Time? */
tms = datetime_str (time (NULL));
-
+
/* Get the new location (with or without the redirection). */
if (hstat.newloc)
*newloc = xstrdup (hstat.newloc);
hstat.statcode);
ret = WRONGCODE;
}
- else
+ else
{
ret = NEWLOCATION;
}
/* All possibilities should have been exhausted. */
abort ();
}
-
+
if (!(*dt & RETROKF))
{
char *hurl = NULL;
continue;
}
/* Maybe we should always keep track of broken links, not just in
- * spider mode. */
- else if (opt.spider)
+ * spider mode.
+     * Don't log an error if the URL was UTF-8-encoded, because we
+     * will retry it unencoded. */
+ else if (opt.spider && !get_utf8_encode ())
{
/* #### Again: ugly ugly ugly! */
- if (!hurl)
+ if (!hurl)
hurl = url_string (u, URL_AUTH_HIDE_PASSWD);
nonexisting_url (hurl);
logprintf (LOG_NOTQUIET, _("\
else
{
logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
- tms, hstat.statcode,
+ tms, hstat.statcode,
quotearg_style (escape_quoting_style, hstat.error));
}
logputs (LOG_VERBOSE, "\n");
cookie_jar_delete (wget_cookie_jar);
}
+ static void
+ ensure_extension (struct http_stat *hs, const char *ext, int *dt)
+ {
+   char *last_period_in_local_filename = strrchr (hs->local_file, '.');
+   char shortext[8];
+   int len = strlen (ext);
+
+   /* For a five-character extension such as ".html", also accept the
+      four-character variant (".htm"). */
+   shortext[0] = '\0';
+   if (len == 5)
+     {
+       strncpy (shortext, ext, len - 1);
+       shortext[len - 1] = '\0';
+     }
+
+ if (last_period_in_local_filename == NULL
+ || !(0 == strcasecmp (last_period_in_local_filename, shortext)
+ || 0 == strcasecmp (last_period_in_local_filename, ext)))
+ {
+ int local_filename_len = strlen (hs->local_file);
+      /* Resize the local file, allowing for the new extension
+         preceded by an optional ".NUMBER". */
+ hs->local_file = xrealloc (hs->local_file,
+ local_filename_len + 24 + len);
+ strcpy (hs->local_file + local_filename_len, ext);
+      /* If clobbering is not allowed and the file, as named,
+         exists, tack on ".NUMBER<ext>" instead. */
+ if (!ALLOW_CLOBBER && file_exists_p (hs->local_file))
+ {
+ int ext_num = 1;
+ do
+ sprintf (hs->local_file + local_filename_len,
+ ".%d%s", ext_num++, ext);
+ while (file_exists_p (hs->local_file));
+ }
+ *dt |= ADDED_HTML_EXTENSION;
+ }
+ }
+
#ifdef TESTING
#include <assert.h>
#include <errno.h>
#include <time.h>
+#ifdef ENABLE_IRI
+#include <langinfo.h>
+#endif
#include "utils.h"
#include "init.h"
#include "convert.h"
#include "spider.h"
#include "http.h" /* for save_cookies */
+#include "iri.h"
#include <getopt.h>
#include <getpass.h>
{ "inet6-only", '6', OPT_BOOLEAN, "inet6only", -1 },
#endif
{ "input-file", 'i', OPT_VALUE, "input", -1 },
+ { "iri", 0, OPT_BOOLEAN, "iri", -1 },
{ "keep-session-cookies", 0, OPT_BOOLEAN, "keepsessioncookies", -1 },
{ "level", 'l', OPT_VALUE, "reclevel", -1 },
{ "limit-rate", 0, OPT_VALUE, "limitrate", -1 },
{ "load-cookies", 0, OPT_VALUE, "loadcookies", -1 },
+ { "locale", 0, OPT_VALUE, "locale", -1 },
{ "max-redirect", 0, OPT_VALUE, "maxredirect", -1 },
{ "mirror", 'm', OPT_BOOLEAN, "mirror", -1 },
{ "no", 'n', OPT__NO, NULL, required_argument },
{ "referer", 0, OPT_VALUE, "referer", -1 },
{ "reject", 'R', OPT_VALUE, "reject", -1 },
{ "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
+ { "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1},
{ "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
{ "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 },
{ "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 },
N_("\
-nv, --no-verbose turn off verboseness, without being quiet.\n"),
N_("\
- -i, --input-file=FILE download URLs found in FILE.\n"),
+ -i, --input-file=FILE download URLs found in local or external FILE.\n"),
N_("\
-F, --force-html treat input file as HTML.\n"),
N_("\
N_("\
--delete-after delete files locally after downloading them.\n"),
N_("\
- -k, --convert-links make links in downloaded HTML point to local files.\n"),
+ -k, --convert-links make links in downloaded HTML or CSS point to\n\
+ local files.\n"),
N_("\
-K, --backup-converted before converting file X, back up as X.orig.\n"),
N_("\
exit (1);
}
+#ifdef ENABLE_IRI
+ if (opt.enable_iri)
+ {
+ if (opt.locale && !check_encoding_name (opt.locale))
+ opt.locale = NULL;
+
+ if (!opt.locale)
+ opt.locale = find_locale ();
+
+ if (opt.encoding_remote && !check_encoding_name (opt.encoding_remote))
+ opt.encoding_remote = NULL;
+ }
+#else
+ if (opt.enable_iri || opt.locale || opt.encoding_remote)
+ {
+      /* sXXXav: be more specific... */
+      printf (_("This version does not have support for IRIs\n"));
+      exit (1);
+ }
+#endif
+
if (opt.ask_passwd)
{
opt.passwd = prompt_for_password ();
char *filename = NULL, *redirected_URL = NULL;
int dt;
+ set_current_as_locale ();
+ set_ugly_no_encode (false);
+
if ((opt.recursive || opt.page_requisites)
&& (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t)))
{
int old_follow_ftp = opt.follow_ftp;
/* Turn opt.follow_ftp on in case of recursive FTP retrieval */
- if (url_scheme (*t) == SCHEME_FTP)
+ if (url_scheme (*t) == SCHEME_FTP)
opt.follow_ftp = 1;
-
+
status = retrieve_tree (*t);
opt.follow_ftp = old_follow_ftp;
}
else
- status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
+ {
+ set_remote_as_current ();
+ status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
+ }
if (opt.delete_after && file_exists_p(filename))
{
#include "hash.h"
#include "res.h"
#include "convert.h"
+ #include "html-url.h"
+ #include "css-url.h"
#include "spider.h"
-
+#include "iri.h"
+\f
/* Functions for maintaining the URL queue. */
struct queue_element {
int depth; /* the depth */
bool html_allowed; /* whether the document is allowed to
be treated as HTML. */
+  char *remote_encoding;         /* the remote charset in effect
+                                    when this URL was enqueued. */
+ bool css_allowed; /* whether the document is allowed to
+ be treated as CSS. */
struct queue_element *next; /* next element in queue */
};
static void
url_enqueue (struct url_queue *queue,
- const char *url, const char *referer, int depth, bool html_allowed)
+ const char *url, const char *referer, int depth,
+ bool html_allowed, bool css_allowed)
{
struct queue_element *qel = xnew (struct queue_element);
+ char *charset = get_current_charset ();
qel->url = url;
qel->referer = referer;
qel->depth = depth;
qel->html_allowed = html_allowed;
+ qel->css_allowed = css_allowed;
qel->next = NULL;
+ if (charset)
+ qel->remote_encoding = xstrdup (charset);
+ else
+ qel->remote_encoding = NULL;
+
++queue->count;
if (queue->count > queue->maxcount)
queue->maxcount = queue->count;
DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
if (queue->tail)
queue->tail->next = qel;
queue->tail = qel;
static bool
url_dequeue (struct url_queue *queue,
const char **url, const char **referer, int *depth,
- bool *html_allowed)
+ bool *html_allowed, bool *css_allowed)
{
struct queue_element *qel = queue->head;
if (!queue->head)
queue->tail = NULL;
+ set_remote_charset (qel->remote_encoding);
+ if (qel->remote_encoding)
+ xfree (qel->remote_encoding);
+
*url = qel->url;
*referer = qel->referer;
*depth = qel->depth;
*html_allowed = qel->html_allowed;
+ *css_allowed = qel->css_allowed;
--queue->count;
struct hash_table *blacklist;
int up_error_code;
- struct url *start_url_parsed = url_parse (start_url, &up_error_code);
+ struct url *start_url_parsed;
+ set_ugly_no_encode (true);
+  start_url_parsed = url_parse (start_url, &up_error_code);
+ set_ugly_no_encode (false);
if (!start_url_parsed)
{
logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
/* Enqueue the starting URL. Use start_url_parsed->url rather than
just URL so we enqueue the canonical form of the URL. */
- url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true);
+ url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, true, false);
string_set_add (blacklist, start_url_parsed->url);
while (1)
bool descend = false;
char *url, *referer, *file = NULL;
int depth;
- bool html_allowed;
+ bool html_allowed, css_allowed;
+ bool is_css = false;
bool dash_p_leaf_HTML = false;
if (opt.quota && total_downloaded_bytes > opt.quota)
if (!url_dequeue (queue,
(const char **)&url, (const char **)&referer,
- &depth, &html_allowed))
+ &depth, &html_allowed, &css_allowed))
break;
/* ...and download it. Note that this download is in most cases
DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
url, file));
+      /* This duplication is ugly; the two cases should be combined. */
if (html_allowed
&& downloaded_html_set
&& string_set_contains (downloaded_html_set, file))
- descend = true;
+ {
+ descend = true;
+ is_css = false;
+ }
+ if (css_allowed
+ && downloaded_css_set
+ && string_set_contains (downloaded_css_set, file))
+ {
+ descend = true;
+ is_css = true;
+ }
}
else
{
if (html_allowed && file && status == RETROK
&& (dt & RETROKF) && (dt & TEXTHTML))
- descend = true;
+ {
+ descend = true;
+ is_css = false;
+ }
+
+      /* This case is a little different: css_allowed can override the
+         content type, since many web servers serve CSS with an
+         incorrect content type. */
+ if (file && status == RETROK
+ && (dt & RETROKF) &&
+ ((dt & TEXTCSS) || css_allowed))
+ {
+ descend = true;
+ is_css = true;
+ }
if (redirected)
{
}
}
- /* If the downloaded document was HTML, parse it and enqueue the
+ /* If the downloaded document was HTML or CSS, parse it and enqueue the
links it contains. */
if (descend)
{
bool meta_disallow_follow = false;
struct urlpos *children
- = get_urls_html (file, url, &meta_disallow_follow);
+ = is_css ? get_urls_css_file (file, url) :
+ get_urls_html (file, url, &meta_disallow_follow);
if (opt.use_robots && meta_disallow_follow)
{
if (children)
{
struct urlpos *child = children;
- struct url *url_parsed = url_parsed = url_parse (url, NULL);
+ set_ugly_no_encode (true);
+ struct url *url_parsed = url_parse (url, NULL);
+ set_ugly_no_encode (false);
char *referer_url = url;
bool strip_auth = (url_parsed != NULL
&& url_parsed->user != NULL);
{
url_enqueue (queue, xstrdup (child->url->url),
xstrdup (referer_url), depth + 1,
- child->link_expect_html);
+ child->link_expect_html,
+ child->link_expect_css);
/* We blacklist the URL we have enqueued, because we
don't want to enqueue (and hence download) the
same URL twice. */
}
}
- if (file
- && (opt.delete_after
+ if (file
+ && (opt.delete_after
|| opt.spider /* opt.recursive is implicitely true */
|| !acceptable (file)))
{
/* Either --delete-after was specified, or we loaded this
- (otherwise unneeded because of --spider or rejected by -R)
- HTML file just to harvest its hyperlinks -- in either case,
+ (otherwise unneeded because of --spider or rejected by -R)
+ HTML file just to harvest its hyperlinks -- in either case,
delete the local file. */
DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
opt.delete_after ? "--delete-after" :
- (opt.spider ? "--spider" :
+ (opt.spider ? "--spider" :
"recursive rejection criteria")));
logprintf (LOG_VERBOSE,
(opt.delete_after || opt.spider
{
char *d1, *d2;
int d3;
- bool d4;
+ bool d4, d5;
while (url_dequeue (queue,
- (const char **)&d1, (const char **)&d2, &d3, &d4))
+ (const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
{
xfree (d1);
xfree_null (d2);
if (string_set_contains (blacklist, url))
{
- if (opt.spider)
+ if (opt.spider)
{
char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url)));
struct urlpos *upos;
bool success;
+ set_ugly_no_encode (true);
orig_parsed = url_parse (original, NULL);
assert (orig_parsed != NULL);
new_parsed = url_parse (redirected, NULL);
assert (new_parsed != NULL);
+ set_ugly_no_encode (false);
upos = xnew0 (struct urlpos);
upos->url = new_parsed;
#include "hash.h"
#include "convert.h"
#include "ptimer.h"
+#include "iri.h"
+ #include "html-url.h"
/* Total size of downloaded files. Used to enforce quota. */
SUM_SIZE_INT total_downloaded_bytes;
if (file)
*file = NULL;
+ reset_utf8_encode ();
+
+ second_try:
u = url_parse (url, &up_error_code);
if (!u)
{
return URLERROR;
}
if (!refurl)
refurl = opt.referer;
proxy = getproxy (u);
if (proxy)
{
+ /* sXXXav : support IRI for proxy */
/* Parse the proxy URL. */
+ set_ugly_no_encode (true);
proxy_url = url_parse (proxy, &up_error_code);
+ set_ugly_no_encode (false);
if (!proxy_url)
{
logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
xfree (mynewloc);
mynewloc = construced_newloc;
+ reset_utf8_encode ();
+
/* Now, see if this new location makes sense. */
newloc_parsed = url_parse (mynewloc, &up_error_code);
if (!newloc_parsed)
goto redirected;
}
- if (local_file)
+  /* If fetching failed, retry without UTF-8-encoding the URL. */
+ if (!(*dt & RETROKF) && get_utf8_encode ())
{
+ set_utf8_encode (false);
+ goto second_try;
+ }
+
+ if (local_file && *dt & RETROKF)
+ {
+      register_download (u->url, local_file);
+      if (redirection_count && 0 != strcmp (origurl, u->url))
+        register_redirection (origurl, u->url);
+      if (*dt & TEXTHTML)
+        register_html (u->url, local_file);
+      if (*dt & TEXTCSS)
+        register_css (u->url, local_file);
}
if (file)
uerr_t status;
struct urlpos *url_list, *cur_url;
- url_list = (html ? get_urls_html (file, NULL, NULL)
- : get_urls_file (file));
+ char *input_file = NULL;
+ const char *url = file;
+
status = RETROK; /* Suppose everything is OK. */
*count = 0; /* Reset the URL count. */
+
+ if (url_has_scheme (url))
+ {
+      int dt;
+      /* retrieve_url writes to *dt unconditionally, so pass a real
+         variable rather than NULL. */
+      status = retrieve_url (url, &input_file, NULL, NULL, &dt, false);
+ if (status != RETROK)
+ return status;
+ }
+ else
+ input_file = (char *) file;
+
+ url_list = (html ? get_urls_html (input_file, NULL, NULL)
+ : get_urls_file (input_file));
for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
{
int old_follow_ftp = opt.follow_ftp;
/* Turn opt.follow_ftp on in case of recursive FTP retrieval */
- if (cur_url->url->scheme == SCHEME_FTP)
+ if (cur_url->url->scheme == SCHEME_FTP)
opt.follow_ftp = 1;
-
+
status = retrieve_tree (cur_url->url->url);
opt.follow_ftp = old_follow_ftp;
url_uses_proxy (const char *url)
{
bool ret;
- struct url *u = url_parse (url, NULL);
+ struct url *u;
+  set_ugly_no_encode (true);
+  u = url_parse (url, NULL);
+  set_ugly_no_encode (false);
if (!u)
return false;
ret = getproxy (u) != NULL;