[svn] Treat the "shortcut icon" link as inline.

[wget] / src / html-url.c
diff --git a/src/html-url.c b/src/html-url.c

index 58cbabfe37de07ca835760efcf94aeb32ee00a07..74703ce6da932bd20782ad8dcdcede764437d242 100644 (file)
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -1,20 +1,20 @@
  /* Collect URLs from HTML source.
  /* Collect URLs from HTML source.
-   Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+   Copyright (C) 1998, 2000, 2001 Free Software Foundation, Inc.
  
  
-This file is part of Wget.
+This file is part of GNU Wget.
  
  
-This program is free software; you can redistribute it and/or modify
+GNU Wget is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.
  
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.
  
-This program is distributed in the hope that it will be useful,
+GNU Wget is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  
  You should have received a copy of the GNU General Public License
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  
  You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
+along with Wget; if not, write to the Free Software
  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  
  #include <config.h>
  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  
  #include <config.h>
@@ -26,7 +26,6 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  # include <strings.h>
  #endif
  #include <stdlib.h>
  # include <strings.h>
  #endif
  #include <stdlib.h>
-#include <ctype.h>
  #include <errno.h>
  #include <assert.h>
  
  #include <errno.h>
  #include <assert.h>
  
@@ -39,74 +38,89 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  extern int errno;
  #endif
  
  extern int errno;
  #endif
  
-enum tag_category { TC_LINK, TC_SPEC };
+struct map_context;
  
  
-/* Here we try to categorize the known tags.  Each tag has its ID and
-   cetegory.  Category TC_LINK means that one or more of its
-   attributes contain links that should be retrieved.  TC_SPEC means
-   that the tag is specific in some way, and has to be handled
-   specially. */
+typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,
+                                      struct map_context *));
+
+#define DECLARE_TAG_HANDLER(fun)                                       \
+  static void fun PARAMS ((int, struct taginfo *, struct map_context *))
+
+DECLARE_TAG_HANDLER (tag_find_urls);
+DECLARE_TAG_HANDLER (tag_handle_base);
+DECLARE_TAG_HANDLER (tag_handle_link);
+DECLARE_TAG_HANDLER (tag_handle_meta);
+
+/* The list of known tags and functions used for handling them.  Most
+   tags are simply harvested for URLs. */
  static struct {
    const char *name;
  static struct {
    const char *name;
-  enum tag_category category;
+  tag_handler_t handler;
  } known_tags[] = {
  #define TAG_A          0
  } known_tags[] = {
  #define TAG_A          0
-  { "a",       TC_LINK },
+  { "a",       tag_find_urls },
  #define TAG_APPLET     1
  #define TAG_APPLET     1
-  { "applet",  TC_LINK },
+  { "applet",  tag_find_urls },
  #define TAG_AREA       2
  #define TAG_AREA       2
-  { "area",    TC_LINK },
+  { "area",    tag_find_urls },
  #define TAG_BASE       3
  #define TAG_BASE       3
-  { "base",    TC_SPEC },
+  { "base",    tag_handle_base },
  #define TAG_BGSOUND    4
  #define TAG_BGSOUND    4
-  { "bgsound", TC_LINK },
+  { "bgsound", tag_find_urls },
  #define TAG_BODY       5
  #define TAG_BODY       5
-  { "body",    TC_LINK },
+  { "body",    tag_find_urls },
  #define TAG_EMBED      6
  #define TAG_EMBED      6
-  { "embed",   TC_LINK },
+  { "embed",   tag_find_urls },
  #define TAG_FIG                7
  #define TAG_FIG                7
-  { "fig",     TC_LINK },
+  { "fig",     tag_find_urls },
  #define TAG_FRAME      8
  #define TAG_FRAME      8
-  { "frame",   TC_LINK },
+  { "frame",   tag_find_urls },
  #define TAG_IFRAME     9
  #define TAG_IFRAME     9
-  { "iframe",  TC_LINK },
+  { "iframe",  tag_find_urls },
  #define TAG_IMG                10
  #define TAG_IMG                10
-  { "img",     TC_LINK },
+  { "img",     tag_find_urls },
  #define TAG_INPUT      11
  #define TAG_INPUT      11
-  { "input",   TC_LINK },
+  { "input",   tag_find_urls },
  #define TAG_LAYER      12
  #define TAG_LAYER      12
-  { "layer",   TC_LINK },
+  { "layer",   tag_find_urls },
  #define TAG_LINK       13
  #define TAG_LINK       13
-  { "link",    TC_SPEC },
+  { "link",    tag_handle_link },
  #define TAG_META       14
  #define TAG_META       14
-  { "meta",    TC_SPEC },
+  { "meta",    tag_handle_meta },
  #define TAG_OVERLAY    15
  #define TAG_OVERLAY    15
-  { "overlay", TC_LINK },
+  { "overlay", tag_find_urls },
  #define TAG_SCRIPT     16
  #define TAG_SCRIPT     16
-  { "script",  TC_LINK },
+  { "script",  tag_find_urls },
  #define TAG_TABLE      17
  #define TAG_TABLE      17
-  { "table",   TC_LINK },
+  { "table",   tag_find_urls },
  #define TAG_TD         18
  #define TAG_TD         18
-  { "td",      TC_LINK },
+  { "td",      tag_find_urls },
  #define TAG_TH         19
  #define TAG_TH         19
-  { "th",      TC_LINK }
+  { "th",      tag_find_urls }
  };
  
  };
  
-/* Flags for specific url-attr pairs handled through TC_LINK: */
-#define AF_EXTERNAL 1
+/* tag_url_attributes documents which attributes of which tags contain
+   URLs to harvest.  It is used by tag_find_urls.  */
  
  
-/* For tags handled by TC_LINK: attributes that contain URLs to
+/* Defines for the FLAGS field; currently only one flag is defined. */
+
+/* This tag points to an external document not necessary for rendering this 
+   document (i.e. it's not an inlined image, stylesheet, etc.). */
+#define TUA_EXTERNAL 1
+
+/* For tags handled by tag_find_urls: attributes that contain URLs to
     download. */
  static struct {
    int tagid;
    const char *attr_name;
    int flags;
     download. */
  static struct {
    int tagid;
    const char *attr_name;
    int flags;
-} url_tag_attr_map[] = {
-  { TAG_A,             "href",         AF_EXTERNAL },
+} tag_url_attributes[] = {
+  { TAG_A,             "href",         TUA_EXTERNAL },
    { TAG_APPLET,                "code",         0 },
    { TAG_APPLET,                "code",         0 },
-  { TAG_AREA,          "href",         AF_EXTERNAL },
+  { TAG_AREA,          "href",         TUA_EXTERNAL },
    { TAG_BGSOUND,       "src",          0 },
    { TAG_BODY,          "background",   0 },
    { TAG_BGSOUND,       "src",          0 },
    { TAG_BODY,          "background",   0 },
+  { TAG_EMBED,         "href",         TUA_EXTERNAL },
    { TAG_EMBED,         "src",          0 },
    { TAG_FIG,           "src",          0 },
    { TAG_FRAME,         "src",          0 },
    { TAG_EMBED,         "src",          0 },
    { TAG_FIG,           "src",          0 },
    { TAG_FRAME,         "src",          0 },
@@ -136,7 +150,7 @@ static const char *additional_attributes[] = {
  static const char **interesting_tags;
  static const char **interesting_attributes;
  
  static const char **interesting_tags;
  static const char **interesting_attributes;
  
-void
+static void
  init_interesting (void)
  {
    /* Init the variables interesting_tags and interesting_attributes
  init_interesting (void)
  {
    /* Init the variables interesting_tags and interesting_attributes
@@ -146,7 +160,10 @@ init_interesting (void)
  
       Here we also make sure that what we put in interesting_tags
       matches the user's preferences as specified through --ignore-tags
  
       Here we also make sure that what we put in interesting_tags
       matches the user's preferences as specified through --ignore-tags
-     and --follow-tags.  */
+     and --follow-tags.
+
+     This function is as large as this only because of the glorious
+     expressivity of the C programming language.  */
  
    {
      int i, ind = 0;
  
    {
      int i, ind = 0;
@@ -160,7 +177,7 @@ init_interesting (void)
         /* Normally here we could say:
            interesting_tags[i] = name;
            But we need to respect the settings of --ignore-tags and
         /* Normally here we could say:
            interesting_tags[i] = name;
            But we need to respect the settings of --ignore-tags and
-          --follow-tags, so the code gets a bit harier.  */
+          --follow-tags, so the code gets a bit hairier.  */
  
         if (opt.ignore_tags)
           {
  
         if (opt.ignore_tags)
           {
@@ -170,8 +187,7 @@ init_interesting (void)
                through if there's no match. */
             int j, lose = 0;
             for (j = 0; opt.ignore_tags[j] != NULL; j++)
                through if there's no match. */
             int j, lose = 0;
             for (j = 0; opt.ignore_tags[j] != NULL; j++)
-             /* Loop through all the tags this user doesn't care
-                 about. */
+             /* Loop through all the tags this user doesn't care about. */
               if (strcasecmp(opt.ignore_tags[j], name) == EQ)
                 {
                   lose = 1;
               if (strcasecmp(opt.ignore_tags[j], name) == EQ)
                 {
                   lose = 1;
@@ -183,8 +199,8 @@ init_interesting (void)
  
         if (opt.follow_tags)
           {
  
         if (opt.follow_tags)
           {
-           /* --follow-tags was specified.  Only match these specific
-              tags, so return FALSE if we don't match one of them. */
+           /* --follow-tags was specified.  Only match these specific tags, so
+              continue to the next loop iteration if none of them matches. */
             int j, win = 0;
             for (j = 0; opt.follow_tags[j] != NULL; j++)
               /* Loop through all the tags this user cares about. */
             int j, win = 0;
             for (j = 0; opt.follow_tags[j] != NULL; j++)
               /* Loop through all the tags this user cares about. */
@@ -194,12 +210,11 @@ init_interesting (void)
                   break;
                 }
             if (!win)
                   break;
                 }
             if (!win)
-             continue;         /* wasn't one of the explicitly
-                                   desired tags */
+             continue;  /* wasn't one of the explicitly desired tags */
           }
  
         /* If we get to here, --follow-tags isn't being used or the
           }
  
         /* If we get to here, --follow-tags isn't being used or the
-          tag is among the ones that are follwed, and --ignore-tags,
+          tag is among the ones that are followed, and --ignore-tags,
            if specified, didn't include this tag, so it's an
            "interesting" one. */
         interesting_tags[ind++] = name;
            if specified, didn't include this tag, so it's an
            "interesting" one. */
         interesting_tags[ind++] = name;
@@ -207,7 +222,7 @@ init_interesting (void)
      interesting_tags[ind] = NULL;
    }
  
      interesting_tags[ind] = NULL;
    }
  
-  /* The same for attributes, except we loop through url_tag_attr_map.
+  /* The same for attributes, except we loop through tag_url_attributes.
       Here we also need to make sure that the list of attributes is
       unique, and to include the attributes from additional_attributes.  */
    {
       Here we also need to make sure that the list of attributes is
       unique, and to include the attributes from additional_attributes.  */
    {
@@ -219,10 +234,10 @@ init_interesting (void)
        att[i] = additional_attributes[i];
      ind = i;
      att[ind] = NULL;
        att[i] = additional_attributes[i];
      ind = i;
      att[ind] = NULL;
-    for (i = 0; i < ARRAY_SIZE (url_tag_attr_map); i++)
+    for (i = 0; i < ARRAY_SIZE (tag_url_attributes); i++)
        {
         int j, seen = 0;
        {
         int j, seen = 0;
-       const char *look_for = url_tag_attr_map[i].attr_name;
+       const char *look_for = tag_url_attributes[i].attr_name;
         for (j = 0; j < ind - 1; j++)
           if (!strcmp (att[j], look_for))
             {
         for (j = 0; j < ind - 1; j++)
           if (!strcmp (att[j], look_for))
             {
@@ -262,278 +277,331 @@ find_tag (const char *tag_name)
  }
  
  /* Find the value of attribute named NAME in the taginfo TAG.  If the
  }
  
  /* Find the value of attribute named NAME in the taginfo TAG.  If the
-   attribute is not present, return NULL.  If ATTRID is non-NULL, the
-   exact identity of the attribute will be returned.  */
+   attribute is not present, return NULL.  If ATTRIND is non-NULL, the
+   index of the attribute in TAG will be stored there.  */
  static char *
  static char *
-find_attr (struct taginfo *tag, const char *name, int *attrid)
+find_attr (struct taginfo *tag, const char *name, int *attrind)
  {
    int i;
    for (i = 0; i < tag->nattrs; i++)
      if (!strcasecmp (tag->attrs[i].name, name))
        {
  {
    int i;
    for (i = 0; i < tag->nattrs; i++)
      if (!strcasecmp (tag->attrs[i].name, name))
        {
-       if (attrid)
-         *attrid = i;
+       if (attrind)
+         *attrind = i;
         return tag->attrs[i].value;
        }
    return NULL;
  }
  
         return tag->attrs[i].value;
        }
    return NULL;
  }
  
-struct collect_urls_closure {
+struct map_context {
    char *text;                  /* HTML text. */
    char *base;                  /* Base URI of the document, possibly
                                    changed through <base href=...>. */
    char *text;                  /* HTML text. */
    char *base;                  /* Base URI of the document, possibly
                                    changed through <base href=...>. */
-  urlpos *head, *tail;         /* List of URLs */
    const char *parent_base;     /* Base of the current document. */
    const char *document_file;   /* File name of this document. */
    const char *parent_base;     /* Base of the current document. */
    const char *document_file;   /* File name of this document. */
-  int dash_p_leaf_HTML;                /* Whether -p is specified, and this
-                                   document is the "leaf" node of the
-                                   HTML tree. */
    int nofollow;                        /* whether NOFOLLOW was specified in a
                                     <meta name=robots> tag. */
    int nofollow;                        /* whether NOFOLLOW was specified in a
                                     <meta name=robots> tag. */
-};
  
  
-/* Resolve LINK_URI and append it to closure->tail.  TAG and ATTRID
-   are the necessary context to store the position and size.  */
-
-static void
-handle_link (struct collect_urls_closure *closure, const char *link_uri,
-            struct taginfo *tag, int attrid)
-{
-  int no_proto = !has_proto (link_uri);
-  urlpos *newel;
+  struct urlpos *head, *tail;  /* List of URLs that is being
+                                  built. */
+};
  
  
-  const char *base = closure->base ? closure->base : closure->parent_base;
-  char *complete_uri;
+/* Append LINK_URI to the urlpos structure that is being built.
  
  
-  char *fragment = strrchr (link_uri, '#');
+   LINK_URI will be merged with the current document base.  TAG and
+   ATTRIND are the necessary context to store the position and
+   size.  */
  
  
-  if (fragment)
-    {
-      /* Nullify the fragment identifier, i.e. everything after the
-         last occurrence of `#', inclusive.  This copying is
-         relatively inefficient, but it doesn't matter because
-         fragment identifiers don't come up all that often.  */
-      int hashlen = fragment - link_uri;
-      char *p = alloca (hashlen + 1);
-      memcpy (p, link_uri, hashlen);
-      p[hashlen] = '\0';
-      link_uri = p;
-    }
+static struct urlpos *
+append_one_url (const char *link_uri, int inlinep,
+               struct taginfo *tag, int attrind, struct map_context *ctx)
+{
+  int link_has_scheme = url_has_scheme (link_uri);
+  struct urlpos *newel;
+  const char *base = ctx->base ? ctx->base : ctx->parent_base;
+  struct url *url;
  
    if (!base)
      {
  
    if (!base)
      {
-      if (no_proto)
+      DEBUGP (("%s: no base, merge will use \"%s\".\n",
+              ctx->document_file, link_uri));
+
+      if (!link_has_scheme)
         {
         {
-         /* We have no base, and the link does not have a protocol or
-             a host attached to it.  Nothing we can do.  */
-         /* #### Should we print a warning here?  Wget 1.5.x used to.  */
-         return;
+         /* Base URL is unavailable, and the link does not have a
+            location attached to it -- we have to give up.  Since
+            this can only happen when using `--force-html -i', print
+            a warning.  */
+         logprintf (LOG_NOTQUIET,
+                    _("%s: Cannot resolve incomplete link %s.\n"),
+                    ctx->document_file, link_uri);
+         return NULL;
+       }
+
+      url = url_parse (link_uri, NULL);
+      if (!url)
+       {
+         DEBUGP (("%s: link \"%s\" doesn't parse.\n",
+                  ctx->document_file, link_uri));
+         return NULL;
         }
         }
-      else
-       complete_uri = xstrdup (link_uri);
      }
    else
      }
    else
-    complete_uri = url_concat (base, link_uri);
+    {
+      /* Merge BASE with LINK_URI, but also make sure the result is
+        canonicalized, i.e. that "../" have been resolved.
+        (url_parse will do that for us.) */
  
  
-  DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
-          closure->document_file, base ? base : "(null)",
-          link_uri, complete_uri));
+      char *complete_uri = uri_merge (base, link_uri);
  
  
-  newel = (urlpos *)xmalloc (sizeof (urlpos));
+      DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
+              ctx->document_file, base, link_uri, complete_uri));
  
  
+      url = url_parse (complete_uri, NULL);
+      if (!url)
+       {
+         DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
+                  ctx->document_file, complete_uri));
+         xfree (complete_uri);
+         return NULL;
+       }
+      xfree (complete_uri);
+    }
+
+  DEBUGP (("appending \"%s\" to urlpos.\n", url->url));
+
+  newel = (struct urlpos *)xmalloc (sizeof (struct urlpos));
    memset (newel, 0, sizeof (*newel));
    memset (newel, 0, sizeof (*newel));
-  newel->next = NULL;
-  newel->url = complete_uri;
-  newel->pos = tag->attrs[attrid].value_raw_beginning - closure->text;
-  newel->size = tag->attrs[attrid].value_raw_size;
  
  
-  /* A URL is relative if the host and protocol are not named, and the
-     name does not start with `/'.  */
-  if (no_proto && *link_uri != '/')
+  newel->next = NULL;
+  newel->url = url;
+  newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text;
+  newel->size = tag->attrs[attrind].value_raw_size;
+  newel->link_inline_p = inlinep;
+
+  /* A URL is relative if the host is not named, and the name does not
+     start with `/'.  */
+  if (!link_has_scheme && *link_uri != '/')
      newel->link_relative_p = 1;
      newel->link_relative_p = 1;
-  else if (!no_proto)
+  else if (link_has_scheme)
      newel->link_complete_p = 1;
  
      newel->link_complete_p = 1;
  
-  if (closure->tail)
+  if (ctx->tail)
      {
      {
-      closure->tail->next = newel;
-      closure->tail = newel;
+      ctx->tail->next = newel;
+      ctx->tail = newel;
      }
    else
      }
    else
-    closure->tail = closure->head = newel;
+    ctx->tail = ctx->head = newel;
+
+  return newel;
  }
  }
+\f
+/* All the tag_* functions are called from collect_tags_mapper, as
+   specified by KNOWN_TAGS.  */
  
  
-/* #### Document what this does.
-   #### It would be nice to split this into several functions.  */
+/* Default tag handler: collect URLs from attributes specified for
+   this tag by tag_url_attributes.  */
  
  static void
  
  static void
-collect_tags_mapper (struct taginfo *tag, void *arg)
+tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
  {
  {
-  struct collect_urls_closure *closure = (struct collect_urls_closure *)arg;
-  int tagid = find_tag (tag->name);
-  assert (tagid != -1);
+  int i, attrind, first = -1;
+  int size = ARRAY_SIZE (tag_url_attributes);
  
  
-  switch (known_tags[tagid].category)
-    {
-    case TC_LINK:
+  for (i = 0; i < size; i++)
+    if (tag_url_attributes[i].tagid == tagid)
        {
        {
-       int i;
-       int size = ARRAY_SIZE (url_tag_attr_map);
-       for (i = 0; i < size; i++)
-         if (url_tag_attr_map[i].tagid == tagid)
-           break;
-       /* We've found the index of url_tag_attr_map where the
-           attributes of our tags begin.  Now, look for every one of
-           them, and handle it.  */
-       for (; (i < size && url_tag_attr_map[i].tagid == tagid); i++)
-         {
-           char *attr_value;
-           int id;
-           if (closure->dash_p_leaf_HTML
-               && (url_tag_attr_map[i].flags & AF_EXTERNAL))
-             /* If we're at a -p leaf node, we don't want to retrieve
-                 links to references we know are external, such as <a
-                 href=...>.  */
-             continue;
-
-           /* This find_attr() buried in a loop may seem inefficient
-               (O(n^2)), but it's not, since the number of attributes
-               (n) we loop over is extremely small.  In the worst case
-               of IMG with all its possible attributes, n^2 will be
-               only 9.  */
-           attr_value = find_attr (tag, url_tag_attr_map[i].attr_name, &id);
-           if (attr_value)
-             handle_link (closure, attr_value, tag, id);
-         }
+       /* We've found the index of tag_url_attributes where the
+          attributes of our tag begin.  */
+       first = i;
+       break;
        }
        }
-      break;
-    case TC_SPEC:
-      switch (tagid)
+  assert (first != -1);
+
+  /* Loop over the "interesting" attributes of this tag.  In this
+     example, it will loop over "src" and "lowsrc".
+
+       <img src="foo.png" lowsrc="bar.png">
+
+     This has to be done in the outer loop so that the attributes are
+     processed in the same order in which they appear in the page.
+     This is required when converting links.  */
+
+  for (attrind = 0; attrind < tag->nattrs; attrind++)
+    {
+      /* Find whether TAG/ATTRIND is a combination that contains a
+        URL. */
+      char *link = tag->attrs[attrind].value;
+
+      /* If you're cringing at the inefficiency of the nested loops,
+        remember that they both iterate over a laughably small
+        quantity of items.  The worst-case inner loop is for the IMG
+        tag, which has three attributes.  */
+      for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
         {
         {
-       case TAG_BASE:
-         {
-           char *newbase = find_attr (tag, "href", NULL);
-           if (!newbase)
-             break;
-           if (closure->base)
-             xfree (closure->base);
-           if (closure->parent_base)
-             closure->base = url_concat (closure->parent_base, newbase);
-           else
-             closure->base = xstrdup (newbase);
-         }
-         break;
-       case TAG_LINK:
-         {
-           int id;
-           char *rel  = find_attr (tag, "rel", NULL);
-           char *href = find_attr (tag, "href", &id);
-           if (href)
-             {
-               /* In the normal case, all <link href=...> tags are
-                  fair game.
-
-                  In the special case of when -p is active, however,
-                  and we're at a leaf node (relative to the -l
-                  max. depth) in the HTML document tree, the only
-                  <LINK> tag we'll follow is a <LINK REL=
-                  "stylesheet">, as it's necessary for displaying
-                  this document properly.  We won't follow other
-                  <LINK> tags, like <LINK REL="home">, for instance,
-                  as they refer to external documents.  */
-               if (!closure->dash_p_leaf_HTML
-                   || (rel && !strcasecmp (rel, "stylesheet")))
-                 handle_link (closure, href, tag, id);
-             }
-         }
-         break;
-       case TAG_META:
-         /* Some pages use a META tag to specify that the page be
-            refreshed by a new page after a given number of seconds.
-            The general format for this is:
+         if (0 == strcasecmp (tag->attrs[attrind].name,
+                              tag_url_attributes[i].attr_name))
+           {
+             int flags = tag_url_attributes[i].flags;
+             append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx);
+           }
+       }
+    }
+}
  
  
-            <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
+/* Handle the BASE tag, for <base href=...>. */
  
  
-            So we just need to skip past the "NUMBER; URL=" garbage
-            to get to the URL.  */
-         {
-           int id;
-           char *name = find_attr (tag, "name", NULL);
-           char *http_equiv = find_attr (tag, "http-equiv", &id);
-           if (http_equiv && !strcasecmp (http_equiv, "refresh"))
-             {
-               char *refresh = find_attr (tag, "content", NULL);
-               char *p = refresh;
-               int offset;
-               while (ISDIGIT (*p))
-                 ++p;
-               if (*p++ != ';')
-                 return;
-               while (ISSPACE (*p))
-                 ++p;
-               if (!(TOUPPER (*p) == 'U'
-                     && TOUPPER (*(p + 1)) == 'R'
-                     && TOUPPER (*(p + 2)) == 'L'
-                     && *(p + 3) == '='))
-                 return;
-               p += 4;
-               while (ISSPACE (*p))
-                 ++p;
-               offset = p - refresh;
-               tag->attrs[id].value_raw_beginning += offset;
-               tag->attrs[id].value_raw_size -= offset;
-               handle_link (closure, p, tag, id);
-             }
-           else if (name && !strcasecmp (name, "robots"))
-             {
-               /* Handle stuff like:
-                  <meta name="robots" content="index,nofollow"> */
-               char *content = find_attr (tag, "content", NULL);
-               if (!content)
-                 return;
-               if (!strcasecmp (content, "none"))
-                 closure->nofollow = 1;
-               else
-                 {
-                   while (*content)
-                     {
-                       /* Find the next occurrence of ',' or the end of
-                          the string.  */
-                       char *end = strchr (content, ',');
-                       if (end)
-                         ++end;
-                       else
-                         end = content + strlen (content);
-                       if (!strncasecmp (content, "nofollow", end - content))
-                         closure->nofollow = 1;
-                       content = end;
-                     }
-                 }
-             }
-         }
-         break;
-       default:
-         /* Category is TC_SPEC, but tag name is unhandled.  This
-             must not be.  */
-         abort ();
+static void
+tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
+{
+  struct urlpos *base_urlpos;
+  int attrind;
+  char *newbase = find_attr (tag, "href", &attrind);
+  if (!newbase)
+    return;
+
+  base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx);
+  if (!base_urlpos)
+    return;
+  base_urlpos->ignore_when_downloading = 1;
+  base_urlpos->link_base_p = 1;
+
+  if (ctx->base)
+    xfree (ctx->base);
+  if (ctx->parent_base)
+    ctx->base = uri_merge (ctx->parent_base, newbase);
+  else
+    ctx->base = xstrdup (newbase);
+}
+
+/* Handle the LINK tag.  It requires special handling because how its
+   links will be followed in -p mode depends on the REL attribute.  */
+
+static void
+tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
+{
+  int attrind;
+  char *href = find_attr (tag, "href", &attrind);
+
+  /* All <link href="..."> link references are external, except those
+     known not to be, such as style sheet and shortcut icon:
+
+       <link rel="stylesheet" href="...">
+       <link rel="shortcut icon" href="...">
+  */
+  if (href)
+    {
+      char *rel  = find_attr (tag, "rel", NULL);
+      int inlinep = (rel
+                    && (0 == strcasecmp (rel, "stylesheet")
+                        || 0 == strcasecmp (rel, "shortcut icon")));
+      append_one_url (href, inlinep, tag, attrind, ctx);
+    }
+}
+
+/* Handle the META tag.  This requires special handling because of the
+   refresh feature and because of robot exclusion.  */
+
+static void
+tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
+{
+  char *name = find_attr (tag, "name", NULL);
+  char *http_equiv = find_attr (tag, "http-equiv", NULL);
+
+  if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
+    {
+      /* Some pages use a META tag to specify that the page be
+        refreshed by a new page after a given number of seconds.  The
+        general format for this is:
+
+          <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
+
+        So we just need to skip past the "NUMBER; URL=" garbage to
+        get to the URL.  */
+
+      struct urlpos *entry;
+
+      int attrind;
+      char *p, *refresh = find_attr (tag, "content", &attrind);
+      int timeout = 0;
+
+      for (p = refresh; ISDIGIT (*p); p++)
+       timeout = 10 * timeout + *p - '0';
+      if (*p++ != ';')
+       return;
+
+      while (ISSPACE (*p))
+       ++p;
+      if (!(   TOUPPER (*p)       == 'U'
+           && TOUPPER (*(p + 1)) == 'R'
+           && TOUPPER (*(p + 2)) == 'L'
+           &&          *(p + 3)  == '='))
+       return;
+      p += 4;
+      while (ISSPACE (*p))
+       ++p;
+
+      entry = append_one_url (p, 0, tag, attrind, ctx);
+      if (entry)
+       {
+         entry->link_refresh_p = 1;
+         entry->refresh_timeout = timeout;
+       }
+    }
+  else if (name && 0 == strcasecmp (name, "robots"))
+    {
+      /* Handle stuff like:
+        <meta name="robots" content="index,nofollow"> */
+      char *content = find_attr (tag, "content", NULL);
+      if (!content)
+       return;
+      if (!strcasecmp (content, "none"))
+       ctx->nofollow = 1;
+      else
+       {
+         while (*content)
+           {
+             /* Find the next occurrence of ',' or the end of
+                the string.  */
+             char *end = strchr (content, ',');
+             if (end)
+               ++end;
+             else
+               end = content + strlen (content);
+             if (!strncasecmp (content, "nofollow", end - content))
+               ctx->nofollow = 1;
+             content = end;
+           }
         }
         }
-      break;
      }
  }
  
      }
  }
  
-/* Scan FILE, retrieving links to HTML documents from it.  Each link is 
+/* Examine name and attributes of TAG and take appropriate action
+   according to the tag.  */
  
  
-  Similar to get_urls_file, but for HTML files.  FILE is scanned as
-   an HTML document.  get_urls_html() constructs the URLs from the
-   relative href-s.
+static void
+collect_tags_mapper (struct taginfo *tag, void *arg)
+{
+  struct map_context *ctx = (struct map_context *)arg;
+  int tagid;
+  tag_handler_t handler;
+
+  tagid = find_tag (tag->name);
+  assert (tagid != -1);
+  handler = known_tags[tagid].handler;
  
  
-   If SILENT is non-zero, do not barf on baseless relative links.  */
-urlpos *
-get_urls_html (const char *file, const char *this_url, int dash_p_leaf_HTML,
-              int *meta_disallow_follow)
+  handler (tagid, tag, ctx);
+}
+\f
+/* Analyze the HTML tags in FILE and construct a list of URLs
+   referenced from it.  Relative links in FILE are merged with URL.
+   It is aware of <base href=...> and does the right thing.  */
+struct urlpos *
+get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
  {
    struct file_memory *fm;
  {
    struct file_memory *fm;
-  struct collect_urls_closure closure;
+  struct map_context ctx;
  
    /* Load the file. */
    fm = read_file (file);
  
    /* Load the file. */
    fm = read_file (file);
@@ -544,25 +612,31 @@ get_urls_html (const char *file, const char *this_url, int dash_p_leaf_HTML,
      }
    DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
  
      }
    DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
  
-  closure.text = fm->content;
-  closure.head = closure.tail = NULL;
-  closure.base = NULL;
-  closure.parent_base = this_url ? this_url : opt.base_href;
-  closure.document_file = file;
-  closure.dash_p_leaf_HTML = dash_p_leaf_HTML;
-  closure.nofollow = 0;
+  ctx.text = fm->content;
+  ctx.head = ctx.tail = NULL;
+  ctx.base = NULL;
+  ctx.parent_base = url ? url : opt.base_href;
+  ctx.document_file = file;
+  ctx.nofollow = 0;
  
    if (!interesting_tags)
      init_interesting ();
  
    map_html_tags (fm->content, fm->length, interesting_tags,
  
    if (!interesting_tags)
      init_interesting ();
  
    map_html_tags (fm->content, fm->length, interesting_tags,
-                interesting_attributes, collect_tags_mapper, &closure);
+                interesting_attributes, collect_tags_mapper, &ctx);
  
  
-  DEBUGP (("no-follow in %s: %d\n", file, closure.nofollow));
+  DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
    if (meta_disallow_follow)
    if (meta_disallow_follow)
-    *meta_disallow_follow = closure.nofollow;
+    *meta_disallow_follow = ctx.nofollow;
  
  
-  FREE_MAYBE (closure.base);
+  FREE_MAYBE (ctx.base);
    read_file_free (fm);
    read_file_free (fm);
-  return closure.head;
+  return ctx.head;
+}
+
+void
+cleanup_html_url (void)
+{
+  FREE_MAYBE (interesting_tags);
+  FREE_MAYBE (interesting_attributes);
  }
  }