[svn] Rewrite parsing and handling of URLs.

[wget] / src / recur.c
diff --git a/src/recur.c b/src/recur.c

index 497c455fe2a8a39bd9002e1eb1de737d5a357e92..11c30a2157e00ba2f433cba0e98045b3bb9bb2a4 100644 (file)
--- a/src/recur.c
+++ b/src/recur.c
@@ -120,9 +120,8 @@ recursive_retrieve (const char *file, const char *this_url)
    int dt, inl, dash_p_leaf_HTML = FALSE;
    int meta_disallow_follow;
    int this_url_ftp;            /* See below the explanation */
-  uerr_t err;
    urlpos *url_list, *cur_url;
-  struct urlinfo *u;
+  struct url *u;
  
    assert (this_url != NULL);
    assert (file != NULL);
@@ -140,9 +139,8 @@ recursive_retrieve (const char *file, const char *this_url)
        hash_table_clear (undesirable_urls);
        string_set_add (undesirable_urls, this_url);
        /* Enter this_url to the hash table, in original and "enhanced" form.  */
-      u = newurl ();
-      err = parseurl (this_url, u, 0);
-      if (err == URLOK)
+      u = url_parse (this_url, NULL);
+      if (u)
         {
           string_set_add (undesirable_urls, u->url);
           if (opt.no_parent)
@@ -156,7 +154,7 @@ recursive_retrieve (const char *file, const char *this_url)
           DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
           base_dir = NULL;
         }
-      freeurl (u, 1);
+      url_free (u);
        depth = 1;
        first_time = 0;
      }
@@ -187,7 +185,7 @@ recursive_retrieve (const char *file, const char *this_url)
       that the retrieval is done through proxy.  In that case, FTP
       links will be followed by default and recursion will not be
       turned off when following them.  */
-  this_url_ftp = (urlproto (this_url) == URLFTP);
+  this_url_ftp = (url_scheme (this_url) == SCHEME_FTP);
  
    /* Get the URL-s from an HTML file: */
    url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
@@ -210,17 +208,10 @@ recursive_retrieve (const char *file, const char *this_url)
         break;
        /* Parse the URL for convenient use in other functions, as well
          as to get the optimized form.  It also checks URL integrity.  */
-      u = newurl ();
-      if (parseurl (cur_url->url, u, 0) != URLOK)
+      u = url_parse (cur_url->url, NULL);
+      if (!u)
         {
           DEBUGP (("Yuck!  A bad URL.\n"));
-         freeurl (u, 1);
-         continue;
-       }
-      if (u->proto == URLFILE)
-       {
-         DEBUGP (("Nothing to do with file:// around here.\n"));
-         freeurl (u, 1);
           continue;
         }
        assert (u->url != NULL);
@@ -254,7 +245,7 @@ recursive_retrieve (const char *file, const char *this_url)
  
        /* If it is FTP, and FTP is not followed, chuck it out.  */
        if (!inl)
-       if (u->proto == URLFTP && !opt.follow_ftp && !this_url_ftp)
+       if (u->scheme == SCHEME_FTP && !opt.follow_ftp && !this_url_ftp)
           {
             DEBUGP (("Uh, it is FTP but i'm not in the mood to follow FTP.\n"));
             string_set_add (undesirable_urls, constr);
@@ -262,7 +253,7 @@ recursive_retrieve (const char *file, const char *this_url)
           }
        /* If it is absolute link and they are not followed, chuck it
          out.  */
-      if (!inl && u->proto != URLFTP)
+      if (!inl && u->scheme != SCHEME_FTP)
         if (opt.relative_only && !cur_url->link_relative_p)
           {
             DEBUGP (("It doesn't really look like a relative link.\n"));
@@ -281,14 +272,14 @@ recursive_retrieve (const char *file, const char *this_url)
        if (!inl && opt.no_parent
           /* If the new URL is FTP and the old was not, ignore
               opt.no_parent.  */
-         && !(!this_url_ftp && u->proto == URLFTP))
+         && !(!this_url_ftp && u->scheme == SCHEME_FTP))
         {
           /* Check for base_dir first.  */
           if (!(base_dir && frontcmp (base_dir, u->dir)))
             {
               /* Failing that, check for parent dir.  */
-             struct urlinfo *ut = newurl ();
-             if (parseurl (this_url, ut, 0) != URLOK)
+             struct url *ut = url_parse (this_url, NULL);
+             if (!ut)
                 DEBUGP (("Double yuck!  The *base* URL is broken.\n"));
               else if (!frontcmp (ut->dir, u->dir))
                 {
@@ -297,7 +288,7 @@ recursive_retrieve (const char *file, const char *this_url)
                   string_set_add (undesirable_urls, constr);
                   inl = 1;
                 }
-             freeurl (ut, 1);
+             url_free (ut);
             }
         }
        /* If the file does not match the acceptance list, or is on the
@@ -349,7 +340,16 @@ recursive_retrieve (const char *file, const char *this_url)
        if (!inl)
         {
           if (!opt.simple_check)
-           opt_url (u);
+           {
+             /* Find the "true" host.  */
+             char *host = realhost (u->host);
+             xfree (u->host);
+             u->host = host;
+
+             /* Refresh the printed representation of the URL.  */
+             xfree (u->url);
+             u->url = url_string (u, 0);
+           }
           else
             {
               char *p;
@@ -357,7 +357,7 @@ recursive_retrieve (const char *file, const char *this_url)
               for (p = u->host; *p; p++)
                 *p = TOLOWER (*p);
               xfree (u->url);
-             u->url = str_url (u, 0);
+             u->url = url_string (u, 0);
             }
           xfree (constr);
           constr = xstrdup (u->url);
@@ -368,7 +368,7 @@ recursive_retrieve (const char *file, const char *this_url)
           /* This line is bogus. */
           /*string_set_add (undesirable_urls, constr);*/
  
-         if (!inl && !((u->proto == URLFTP) && !this_url_ftp))
+         if (!inl && !((u->scheme == SCHEME_FTP) && !this_url_ftp))
             if (!opt.spanhost && this_url && !same_host (this_url, constr))
               {
                 DEBUGP (("This is not the same hostname as the parent's.\n"));
@@ -377,7 +377,7 @@ recursive_retrieve (const char *file, const char *this_url)
               }
         }
       /* What about robots.txt?  */
-      if (!inl && opt.use_robots && u->proto == URLHTTP)
+      if (!inl && opt.use_robots && u->scheme == SCHEME_HTTP)
         {
           struct robot_specs *specs = res_get_specs (u->host, u->port);
           if (!specs)
           string_set_add (undesirable_urls, constr);
           /* Automatically followed FTPs will *not* be downloaded
              recursively.  */
-         if (u->proto == URLFTP)
+         if (u->scheme == SCHEME_FTP)
             {
               /* Don't you adore side-effects?  */
               opt.recursive = 0;
@@ -428,7 +428,7 @@ recursive_retrieve (const char *file, const char *this_url)
           /* Retrieve it.  */
           retrieve_url (constr, &filename, &newloc,
                        canon_this_url ? canon_this_url : this_url, &dt);
-         if (u->proto == URLFTP)
+         if (u->scheme == SCHEME_FTP)
             {
               /* Restore...  */
               opt.recursive = 1;
@@ -479,7 +479,7 @@ recursive_retrieve (const char *file, const char *this_url)
        /* Free filename and constr.  */
        FREE_MAYBE (filename);
        FREE_MAYBE (constr);
-      freeurl (u, 1);
+      url_free (u);
        /* Increment the pbuf for the appropriate size.  */
      }
    if (opt.convert_links && !opt.delete_after)
@@ -579,13 +579,9 @@ convert_all_links (void)
           char *local_name;
  
           /* The URL must be in canonical form to be compared.  */
-         struct urlinfo *u = newurl ();
-         uerr_t res = parseurl (cur_url->url, u, 0);
-         if (res != URLOK)
-           {
-             freeurl (u, 1);
-             continue;
-           }
+         struct url *u = url_parse (cur_url->url, NULL);
+         if (!u)
+           continue;
           /* We decide the direction of conversion according to whether
              a URL was downloaded.  Downloaded URLs will be converted
              ABS2REL, whereas non-downloaded will be converted REL2ABS.  */
@@ -614,7 +610,7 @@ convert_all_links (void)
                 cur_url->convert = CO_CONVERT_TO_COMPLETE;
               cur_url->local_name = NULL;
             }
-         freeurl (u, 1);
+         url_free (u);
         }
        /* Convert the links in the file.  */
        convert_links (html->string, urls);