/* Collect URLs from HTML source.
Copyright (C) 1998, 2000 Free Software Foundation, Inc.
-This file is part of Wget.
+This file is part of GNU Wget.
-This program is free software; you can redistribute it and/or modify
+GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
-This program is distributed in the hope that it will be useful,
+GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
+along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#include <config.h>
char *text; /* HTML text. */
char *base; /* Base URI of the document, possibly
changed through <base href=...>. */
- urlpos *head, *tail; /* List of URLs */
+ struct urlpos *head, *tail; /* List of URLs */
const char *parent_base; /* Base of the current document. */
const char *document_file; /* File name of this document. */
  int dash_p_leaf_HTML;		/* Whether -p is specified, and this
				   document is the "leaf" node of the
				   HTML tree. */
/* Resolve LINK_URI and append it to closure->tail. TAG and ATTRID
are the necessary context to store the position and size. */
-static void
+static struct urlpos *
handle_link (struct collect_urls_closure *closure, const char *link_uri,
struct taginfo *tag, int attrid)
{
- int no_proto = !has_proto (link_uri);
- urlpos *newel;
-
+ int link_has_scheme = url_has_scheme (link_uri);
+ struct urlpos *newel;
const char *base = closure->base ? closure->base : closure->parent_base;
- char *complete_uri;
-
- char *fragment = strrchr (link_uri, '#');
-
- if (fragment)
- {
- /* Nullify the fragment identifier, i.e. everything after the
- last occurrence of `#', inclusive. This copying is
- relatively inefficient, but it doesn't matter because
- fragment identifiers don't come up all that often. */
- int hashlen = fragment - link_uri;
- char *p = alloca (hashlen + 1);
- memcpy (p, link_uri, hashlen);
- p[hashlen] = '\0';
- link_uri = p;
- }
+ struct url *url;
if (!base)
{
- if (no_proto)
+ DEBUGP (("%s: no base, merge will use \"%s\".\n",
+ closure->document_file, link_uri));
+
+ if (!link_has_scheme)
{
- /* We have no base, and the link does not have a protocol or
- a host attached to it. Nothing we can do. */
+ /* We have no base, and the link does not have a host
+ attached to it. Nothing we can do. */
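+	  /* (Illustrative note, not part of the original change: this is
+	     the case of a bare link such as "foo.html" in a document whose
+	     own URL is unknown and for which no --base was supplied; the
+	     link is simply skipped.) */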
/* #### Should we print a warning here? Wget 1.5.x used to. */
- return;
+ return NULL;
+ }
+
+ url = url_parse (link_uri, NULL);
+ if (!url)
+ {
+ DEBUGP (("%s: link \"%s\" doesn't parse.\n",
+ closure->document_file, link_uri));
+ return NULL;
}
- else
- complete_uri = xstrdup (link_uri);
}
else
- complete_uri = uri_merge (base, link_uri);
+ {
+ /* Merge BASE with LINK_URI, but also make sure the result is
+ canonicalized, i.e. that "../" have been resolved.
+	 (url_parse will do that for us.) */
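+      /* (Illustrative example, not part of the original change: merging a
+	 hypothetical base "http://host/dir/page.html" with the link
+	 "../images/a.gif" should produce "http://host/images/a.gif".) */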
- DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
- closure->document_file, base ? base : "(null)",
- link_uri, complete_uri));
+ char *complete_uri = uri_merge (base, link_uri);
+
+ DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
+ closure->document_file, base, link_uri, complete_uri));
+
+ url = url_parse (complete_uri, NULL);
+ if (!url)
+ {
+ DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
+ closure->document_file, complete_uri));
+ xfree (complete_uri);
+ return NULL;
+ }
+ xfree (complete_uri);
+ }
- newel = (urlpos *)xmalloc (sizeof (urlpos));
+ newel = (struct urlpos *)xmalloc (sizeof (struct urlpos));
memset (newel, 0, sizeof (*newel));
newel->next = NULL;
- newel->url = complete_uri;
+ newel->url = url;
newel->pos = tag->attrs[attrid].value_raw_beginning - closure->text;
newel->size = tag->attrs[attrid].value_raw_size;
- /* A URL is relative if the host and protocol are not named, and the
- name does not start with `/'. */
- if (no_proto && *link_uri != '/')
+ /* A URL is relative if the host is not named, and the name does not
+ start with `/'. */
+ if (!link_has_scheme && *link_uri != '/')
newel->link_relative_p = 1;
- else if (!no_proto)
+ else if (link_has_scheme)
newel->link_complete_p = 1;
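+  /* (Illustrative values, not part of the original change: a link written
+     as "img/photo.png" is marked link_relative_p, "http://host/photo.png"
+     is marked link_complete_p, and "/photo.png" gets neither flag.) */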
  if (closure->tail)
    {
      closure->tail->next = newel;
      closure->tail = newel;
    }
else
closure->tail = closure->head = newel;
+
+ return newel;
}
/* Examine name and attributes of TAG and take appropriate action.
{
case TC_LINK:
{
- int i;
+ int i, id, first;
int size = ARRAY_SIZE (url_tag_attr_map);
	for (i = 0; i < size; i++)
	  if (url_tag_attr_map[i].tagid == tagid)
	    break;
/* We've found the index of url_tag_attr_map where the
attributes of our tags begin. Now, look for every one of
them, and handle it. */
- for (; (i < size && url_tag_attr_map[i].tagid == tagid); i++)
+      /* The attributes must be processed in the order in which they
+	 appear in the tag, because link conversion (-k) depends on that
+	 order. */
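+      /* (Illustrative note, not part of the original change: in a tag
+	 such as <img lowsrc="a.gif" src="b.gif">, "a.gif" is handled
+	 before "b.gif", since the outer loop below walks the attributes
+	 in document order.) */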
+ first = i;
+ for (id = 0; id < tag->nattrs; id++)
{
- char *attr_value;
- int id;
- if (closure->dash_p_leaf_HTML
- && (url_tag_attr_map[i].flags & AF_EXTERNAL))
- /* If we're at a -p leaf node, we don't want to retrieve
- links to references we know are external to this document,
- such as <a href=...>. */
- continue;
-
- /* This find_attr() buried in a loop may seem inefficient
- (O(n^2)), but it's not, since the number of attributes
- (n) we loop over is extremely small. In the worst case
- of IMG with all its possible attributes, n^2 will be
- only 9. */
- attr_value = find_attr (tag, url_tag_attr_map[i].attr_name, &id);
- if (attr_value)
- handle_link (closure, attr_value, tag, id);
+ /* This nested loop may seem inefficient (O(n^2)), but it's
+ not, since the number of attributes (n) we loop over is
+ extremely small. In the worst case of IMG with all its
+ possible attributes, n^2 will be only 9. */
+ for (i = first; (i < size && url_tag_attr_map[i].tagid == tagid);
+ i++)
+ {
+ char *attr_value;
+ if (closure->dash_p_leaf_HTML
+ && (url_tag_attr_map[i].flags & AF_EXTERNAL))
+ /* If we're at a -p leaf node, we don't want to retrieve
+ links to references we know are external to this document,
+ such as <a href=...>. */
+ continue;
+
+ if (!strcasecmp (tag->attrs[id].name,
+ url_tag_attr_map[i].attr_name))
+ {
+ attr_value = tag->attrs[id].value;
+ if (attr_value)
+ handle_link (closure, attr_value, tag, id);
+ }
+ }
}
}
break;
{
case TAG_BASE:
{
- char *newbase = find_attr (tag, "href", NULL);
+ struct urlpos *base_urlpos;
+ int id;
+ char *newbase = find_attr (tag, "href", &id);
if (!newbase)
break;
+
+ base_urlpos = handle_link (closure, newbase, tag, id);
+ if (!base_urlpos)
+ break;
+ base_urlpos->ignore_when_downloading = 1;
+ base_urlpos->link_base_p = 1;
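+
+      /* (Descriptive note, not part of the original change: recording the
+	 <base href=...> URL as a urlpos presumably lets link conversion
+	 rewrite it later, while ignore_when_downloading keeps it from
+	 being queued for retrieval.) */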
+
if (closure->base)
xfree (closure->base);
if (closure->parent_base)
}
/* Analyze the HTML tags in FILE and construct a list of URLs referenced from
- it. It merges relative links in FILE with THIS_URL. It is aware
- of <base href=...> and does the right thing.
+ it. It merges relative links in FILE with URL. It is aware of
+ <base href=...> and does the right thing.
If dash_p_leaf_HTML is non-zero, only the elements needed to render
FILE ("non-external" links) will be returned. */
-urlpos *
-get_urls_html (const char *file, const char *this_url, int dash_p_leaf_HTML,
+struct urlpos *
+get_urls_html (const char *file, const char *url, int dash_p_leaf_HTML,
int *meta_disallow_follow)
{
struct file_memory *fm;
closure.text = fm->content;
closure.head = closure.tail = NULL;
closure.base = NULL;
- closure.parent_base = this_url ? this_url : opt.base_href;
+ closure.parent_base = url ? url : opt.base_href;
closure.document_file = file;
closure.dash_p_leaf_HTML = dash_p_leaf_HTML;
closure.nofollow = 0;