[svn] Split off non-URL related stuff from url.c to convert.c.

[wget] / src / html-url.c
diff --git a/src/html-url.c b/src/html-url.c

index 11789e59f017561f966b989a2a1d731b76d12c55..756bf2abb128180e539bade30815027430cf9c9f 100644 (file)
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -1,5 +1,5 @@
  /* Collect URLs from HTML source.
-   Copyright (C) 1998, 2000, 2001 Free Software Foundation, Inc.
+   Copyright (C) 1998, 2000, 2001, 2002 Free Software Foundation, Inc.
  
  This file is part of GNU Wget.
  
@@ -15,7 +15,17 @@ GNU General Public License for more details.
  
  You should have received a copy of the GNU General Public License
  along with Wget; if not, write to the Free Software
-Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables.  You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL".  If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so.  If you do not wish to do
+so, delete this exception statement from your version.  */
  
  #include <config.h>
  
@@ -33,6 +43,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  #include "html-parse.h"
  #include "url.h"
  #include "utils.h"
+#include "convert.h"
  
  #ifndef errno
  extern int errno;
@@ -171,7 +182,7 @@ init_interesting (void)
  
    {
      int i, ind = 0;
-    int size = ARRAY_SIZE (known_tags);
+    int size = countof (known_tags);
      interesting_tags = (const char **)xmalloc ((size + 1) * sizeof (char *));
  
      for (i = 0; i < size; i++)
@@ -231,14 +242,14 @@ init_interesting (void)
       unique, and to include the attributes from additional_attributes.  */
    {
      int i, ind;
-    const char **att = xmalloc ((ARRAY_SIZE (additional_attributes) + 1)
+    const char **att = xmalloc ((countof (additional_attributes) + 1)
                                 * sizeof (char *));
      /* First copy the "additional" attributes. */
-    for (i = 0; i < ARRAY_SIZE (additional_attributes); i++)
+    for (i = 0; i < countof (additional_attributes); i++)
        att[i] = additional_attributes[i];
      ind = i;
      att[ind] = NULL;
-    for (i = 0; i < ARRAY_SIZE (tag_url_attributes); i++)
+    for (i = 0; i < countof (tag_url_attributes); i++)
        {
         int j, seen = 0;
         const char *look_for = tag_url_attributes[i].attr_name;
@@ -267,7 +278,7 @@ find_tag (const char *tag_name)
    /* This is linear search; if the number of tags grow, we can switch
       to binary search.  */
  
-  for (i = 0; i < ARRAY_SIZE (known_tags); i++)
+  for (i = 0; i < countof (known_tags); i++)
      {
        int cmp = strcasecmp (known_tags[i].name, tag_name);
        /* known_tags are sorted alphabetically, so we can
@@ -411,7 +422,7 @@ static void
  tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
  {
    int i, attrind, first = -1;
-  int size = ARRAY_SIZE (tag_url_attributes);
+  int size = countof (tag_url_attributes);
  
    for (i = 0; i < size; i++)
      if (tag_url_attributes[i].tagid == tagid)
@@ -620,6 +631,7 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
  /* Analyze HTML tags FILE and construct a list of URLs referenced from
     it.  It merges relative links in FILE with URL.  It is aware of
     <base href=...> and does the right thing.  */
+
  struct urlpos *
  get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
  {
@@ -657,6 +669,91 @@ get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
    return ctx.head;
  }
  
+/* Read FILE as a plain list of URLs, one per line, and return them
+   as a linked list of urlpos entries, in the order they appear.
+   Whitespace around each line is ignored and empty lines are
+   skipped.  If opt.base_href is set, every line is merged with it
+   before being parsed.  A line that does not parse as a URL is
+   reported and skipped.  Returns NULL when FILE cannot be read or
+   yields no URLs.  This has little to do with HTML; it lives here
+   only because it resembles get_urls_html.  */
+
+struct urlpos *
+get_urls_file (const char *file)
+{
+  struct urlpos *first = NULL, *last = NULL;
+  const char *cursor, *limit;
+  struct file_memory *fm = read_file (file);
+
+  if (!fm)
+    {
+      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
+      return NULL;
+    }
+  DEBUGP (("Loaded %s (size %ld).\n", file, fm->length));
+
+  limit = fm->content + fm->length;
+  for (cursor = fm->content; cursor < limit; )
+    {
+      int err;
+      char *spec;
+      struct url *parsed;
+      struct urlpos *node;
+
+      /* Carve out the current line.  END starts one past the
+         newline, or at LIMIT when the last line has none.  */
+      const char *beg = cursor;
+      const char *end = memchr (cursor, '\n', limit - cursor);
+      end = end ? end + 1 : limit;
+
+      /* Advance now, so that every `continue' below moves on to
+         the next line.  */
+      cursor = end;
+
+      /* Narrow [BEG, END) until it has no surrounding whitespace.  */
+      for (; beg < end && ISSPACE (*beg); beg++)
+        ;
+      for (; end > beg && ISSPACE (end[-1]); end--)
+        ;
+      if (beg == end)
+        continue;
+
+      /* url_parse needs a zero-terminated string; copy the region.  */
+      spec = strdupdelim (beg, end);
+
+      if (opt.base_href)
+        {
+          /* Swap the copy for its merge with opt.base_href.  */
+          char *merged = uri_merge (opt.base_href, spec);
+          xfree (spec);
+          spec = merged;
+        }
+
+      /* SPEC is still needed by the error message; free it after.  */
+      parsed = url_parse (spec, &err);
+      if (!parsed)
+        logprintf (LOG_NOTQUIET, "%s: Invalid URL %s: %s\n",
+                   file, spec, url_error (err));
+      xfree (spec);
+      if (!parsed)
+        continue;
+
+      node = xmalloc (sizeof *node);
+      memset (node, 0, sizeof *node);
+      node->next = NULL;
+      node->url = parsed;
+
+      /* Append NODE to the list.  */
+      if (last)
+        last->next = node;
+      else
+        first = node;
+      last = node;
+    }
+  read_file_free (fm);
+  return first;
+}
+
  void
  cleanup_html_url (void)
  {