[svn] Update copyright notices.

[wget] / src / html-url.c
diff --git a/src/html-url.c b/src/html-url.c

index 5d37cf0ad981f0690611812a49fbb67430670ecb..0659edd51c4427e129d57006cdc5f83f27a20433 100644 (file)
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -1,20 +1,20 @@
  /* Collect URLs from HTML source.
     Copyright (C) 1998, 2000 Free Software Foundation, Inc.
  
-This file is part of Wget.
+This file is part of GNU Wget.
  
-This program is free software; you can redistribute it and/or modify
+GNU Wget is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.
  
-This program is distributed in the hope that it will be useful,
+GNU Wget is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  
  You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
+along with Wget; if not, write to the Free Software
  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  
  #include <config.h>
@@ -26,7 +26,6 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  # include <strings.h>
  #endif
  #include <stdlib.h>
-#include <ctype.h>
  #include <errno.h>
  #include <assert.h>
  
@@ -92,9 +91,14 @@ static struct {
    { "th",      TC_LINK }
  };
  
+
  /* Flags for specific url-attr pairs handled through TC_LINK: */
+
+/* This tag points to an external document not necessary for rendering this 
+   document (i.e. it's not an inlined image, stylesheet, etc.). */
  #define AF_EXTERNAL 1
  
+
  /* For tags handled by TC_LINK: attributes that contain URLs to
     download. */
  static struct {
@@ -160,7 +164,7 @@ init_interesting (void)
         /* Normally here we could say:
            interesting_tags[i] = name;
            But we need to respect the settings of --ignore-tags and
-          --follow-tags, so the code gets a bit harier.  */
+          --follow-tags, so the code gets a bit hairier.  */
  
         if (opt.ignore_tags)
           {
@@ -170,8 +174,7 @@ init_interesting (void)
                through if there's no match. */
             int j, lose = 0;
             for (j = 0; opt.ignore_tags[j] != NULL; j++)
-             /* Loop through all the tags this user doesn't care
-                 about. */
+             /* Loop through all the tags this user doesn't care about. */
               if (strcasecmp(opt.ignore_tags[j], name) == EQ)
                 {
                   lose = 1;
@@ -183,8 +186,8 @@ init_interesting (void)
  
         if (opt.follow_tags)
           {
-           /* --follow-tags was specified.  Only match these specific
-              tags, so return FALSE if we don't match one of them. */
+           /* --follow-tags was specified.  Only match these specific tags, so
+              skip to the next iteration of the outer loop if none matches. */
             int j, win = 0;
             for (j = 0; opt.follow_tags[j] != NULL; j++)
               /* Loop through all the tags this user cares about. */
@@ -194,12 +197,11 @@ init_interesting (void)
                   break;
                 }
             if (!win)
-             continue;         /* wasn't one of the explicitly
-                                   desired tags */
+             continue;  /* wasn't one of the explicitly desired tags */
           }
  
         /* If we get to here, --follow-tags isn't being used or the
-          tag is among the ones that are follwed, and --ignore-tags,
+          tag is among the ones that are followed, and --ignore-tags,
            if specified, didn't include this tag, so it's an
            "interesting" one. */
         interesting_tags[ind++] = name;
@@ -333,7 +335,7 @@ handle_link (struct collect_urls_closure *closure, const char *link_uri,
         complete_uri = xstrdup (link_uri);
      }
    else
-    complete_uri = url_concat (base, link_uri);
+    complete_uri = uri_merge (base, link_uri);
  
    DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
            closure->document_file, base ? base : "(null)",
@@ -363,7 +365,11 @@ handle_link (struct collect_urls_closure *closure, const char *link_uri,
      closure->tail = closure->head = newel;
  }
  
-/* #### Document what this does.
+/* Examine name and attributes of TAG and take appropriate action.
+   What will be done depends on TAG's category and attribute values.
+   Tags of TC_LINK category have attributes that contain links to
+   follow; tags of TC_SPEC category need to be handled specially.
+
     #### It would be nice to split this into several functions.  */
  
  static void
@@ -392,8 +398,8 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
             if (closure->dash_p_leaf_HTML
                 && (url_tag_attr_map[i].flags & AF_EXTERNAL))
               /* If we're at a -p leaf node, we don't want to retrieve
-                 links to references we know are external, such as <a
-                 href=...>.  */
+                 links to references we know are external to this document,
+                such as <a href=...>.  */
               continue;
  
             /* This find_attr() buried in a loop may seem inefficient
@@ -418,7 +424,7 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
             if (closure->base)
               xfree (closure->base);
             if (closure->parent_base)
-             closure->base = url_concat (closure->parent_base, newbase);
+             closure->base = uri_merge (closure->parent_base, newbase);
             else
               closure->base = xstrdup (newbase);
           }
@@ -437,7 +443,7 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
                    and we're at a leaf node (relative to the -l
                    max. depth) in the HTML document tree, the only
                    <LINK> tag we'll follow is a <LINK REL=
-                  "stylesheet">, as it's necessary for displaying
+                  "stylesheet">, as it'll be necessary for displaying
                    this document properly.  We won't follow other
                    <LINK> tags, like <LINK REL="home">, for instance,
                    as they refer to external documents.  */
@@ -521,13 +527,12 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
      }
  }
  
-/* Scan FILE, retrieving links to HTML documents from it.  Each link is 
-
-  Similar to get_urls_file, but for HTML files.  FILE is scanned as
-   an HTML document.  get_urls_html() constructs the URLs from the
-   relative href-s.
+/* Analyze HTML tags in FILE and construct a list of URLs referenced from
+   it.  It merges relative links in FILE with THIS_URL.  It is aware
+   of <base href=...> and does the right thing.
  
-   If SILENT is non-zero, do not barf on baseless relative links.  */
+   If dash_p_leaf_HTML is non-zero, only the elements needed to render
+   FILE ("non-external" links) will be returned.  */
  urlpos *
  get_urls_html (const char *file, const char *this_url, int dash_p_leaf_HTML,
                int *meta_disallow_follow)