Published in <sxsadxaae3t.fsf@florida.arsdigita.de>.
\f
* Changes in Wget 1.8.
-** "Recursive retrieval" now uses a breadth-first algorithm.
-Recursive downloads are faster and consume *significantly* less memory
-than before.
-
** A new progress indicator is now available. Try it with
--progress=bar or using `progress = bar' in `.wgetrc'.
+** "Recursive retrieval" has been revamped:
+
+*** Wget now traverses links breadth-first. This makes the
+calculation of depth much more reliable than before. Also, recursive
+downloads are faster and consume *significantly* less memory than
+before.
+
+*** Links are converted only when the entire retrieval is complete.
+This is the only safe thing to do, as only then is it known what URLs
+have been downloaded.
+
+*** BASE tags are handled correctly when converting links.  Since Wget
+already resolves <base href="..."> when resolving URLs, link
+conversion now makes the BASE tags point to an empty string.
+
** Host directories now contain port information if the URL is at a
non-standard port.
* Make -K compare X.orig to X and move the former on top of the latter if
they're the same, rather than leaving identical .orig files lying around.
-* Make `-k' convert <base href=...> too.
-
* Make `-k' check for files that were downloaded in the past and convert links
to them in newly-downloaded documents.
+2001-11-25 Hrvoje Niksic <hniksic@arsdigita.com>
+
+ * url.c (convert_links): Handle CO_NULLIFY_BASE.
+
+ * recur.c (retrieve_tree): Ignore download-ignorable children.
+ (convert_all_links): Specify CO_NULLIFY_BASE when link_base_p.
+
+ * html-url.c (handle_link): Return the newly created urlpos.
+ (collect_tags_mapper): When dealing with BASE, store the base
+ reference and mark it as download-ignorable.
+
2001-11-25 Hrvoje Niksic <hniksic@arsdigita.com>
* url.c (convert_links): Attempt to quote '?' as "%3F" when
/* Resolve LINK_URI and append it to closure->tail. TAG and ATTRID
are the necessary context to store the position and size. */
-static void
+static struct urlpos *
handle_link (struct collect_urls_closure *closure, const char *link_uri,
struct taginfo *tag, int attrid)
{
/* We have no base, and the link does not have a host
attached to it. Nothing we can do. */
/* #### Should we print a warning here? Wget 1.5.x used to. */
- return;
+ return NULL;
}
url = url_parse (link_uri, NULL);
{
DEBUGP (("%s: link \"%s\" doesn't parse.\n",
closure->document_file, link_uri));
- return;
+ return NULL;
}
}
else
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
closure->document_file, complete_uri));
xfree (complete_uri);
- return;
+ return NULL;
}
xfree (complete_uri);
}
}
else
closure->tail = closure->head = newel;
+
+ return newel;
}
/* Examine name and attributes of TAG and take appropriate action.
{
case TAG_BASE:
{
- char *newbase = find_attr (tag, "href", NULL);
+ struct urlpos *base_urlpos;
+ int id;
+ char *newbase = find_attr (tag, "href", &id);
if (!newbase)
break;
+
+ base_urlpos = handle_link (closure, newbase, tag, id);
+ if (!base_urlpos)
+ break;
+ base_urlpos->ignore_when_downloading = 1;
+ base_urlpos->link_base_p = 1;
+
if (closure->base)
xfree (closure->base);
if (closure->parent_base)
}
/* Analyze HTML tags FILE and construct a list of URLs referenced from
- it. It merges relative links in FILE with THIS_URL. It is aware
- of <base href=...> and does the right thing.
+ it. It merges relative links in FILE with URL. It is aware of
+ <base href=...> and does the right thing.
If dash_p_leaf_HTML is non-zero, only the elements needed to render
FILE ("non-external" links) will be returned. */
struct urlpos *
-get_urls_html (const char *file, const char *this_url, int dash_p_leaf_HTML,
+get_urls_html (const char *file, const char *url, int dash_p_leaf_HTML,
int *meta_disallow_follow)
{
struct file_memory *fm;
closure.text = fm->content;
closure.head = closure.tail = NULL;
closure.base = NULL;
- closure.parent_base = this_url ? this_url : opt.base_href;
+ closure.parent_base = url ? url : opt.base_href;
closure.document_file = file;
closure.dash_p_leaf_HTML = dash_p_leaf_HTML;
closure.nofollow = 0;
for (; child; child = child->next)
{
+ if (child->ignore_when_downloading)
+ continue;
if (descend_url_p (child, url_parsed, depth, start_url_parsed,
blacklist))
{
char *local_name;
struct url *u = cur_url->url;
+ if (cur_url->link_base_p)
+ {
+ /* Base references have been resolved by our parser, so
+ we turn the base URL into an empty string. (Perhaps
+ we should remove the tag entirely?) */
+ cur_url->convert = CO_NULLIFY_BASE;
+ continue;
+ }
+
/* We decide the direction of conversion according to whether
a URL was downloaded. Downloaded URLs will be converted
ABS2REL, whereas non-downloaded will be converted REL2ABS. */
DEBUGP (("%s marked for conversion, local %s\n",
u->url, local_name));
- /* Decide on the conversion direction. */
+ /* Decide on the conversion type. */
if (local_name)
{
/* We've downloaded this URL. Convert it to relative
char *filename = NULL, *new_file;
int dt;
+ if (cur_url->ignore_when_downloading)
+ continue;
+
if (downloaded_exceeds_quota ())
{
status = QUOTEXC;
}
\f
static void write_backup_file PARAMS ((const char *, downloaded_file_t));
-static void replace_attr PARAMS ((const char **, int, FILE *, const char *));
+static const char *replace_attr PARAMS ((const char *, int, FILE *, const char *));
static char *local_quote_string PARAMS ((const char *));
/* Change the links in one HTML file. LINKS is a list of links in the
read_file_free (fm);
return;
}
+
/* Here we loop through all the URLs in file, replacing those of
them that are downloaded with relative references. */
p = fm->content;
quote, to the outfile. */
fwrite (p, 1, url_start - p, fp);
p = url_start;
- if (link->convert == CO_CONVERT_TO_RELATIVE)
+
+ switch (link->convert)
{
+ case CO_CONVERT_TO_RELATIVE:
/* Convert absolute URL to relative. */
- char *newname = construct_relative (file, link->local_name);
- char *quoted_newname = local_quote_string (newname);
- replace_attr (&p, link->size, fp, quoted_newname);
- DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
- link->url->url, newname, link->pos, file));
- xfree (newname);
- xfree (quoted_newname);
- ++to_file_count;
- }
- else if (link->convert == CO_CONVERT_TO_COMPLETE)
- {
+ {
+ char *newname = construct_relative (file, link->local_name);
+ char *quoted_newname = local_quote_string (newname);
+ p = replace_attr (p, link->size, fp, quoted_newname);
+ DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
+ link->url->url, newname, link->pos, file));
+ xfree (newname);
+ xfree (quoted_newname);
+ ++to_file_count;
+ break;
+ }
+ case CO_CONVERT_TO_COMPLETE:
/* Convert the link to absolute URL. */
- char *newlink = link->url->url;
- char *quoted_newlink = html_quote_string (newlink);
- replace_attr (&p, link->size, fp, quoted_newlink);
- DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
- newlink, link->pos, file));
- xfree (quoted_newlink);
- ++to_url_count;
+ {
+ char *newlink = link->url->url;
+ char *quoted_newlink = html_quote_string (newlink);
+ p = replace_attr (p, link->size, fp, quoted_newlink);
+ DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
+ newlink, link->pos, file));
+ xfree (quoted_newlink);
+ ++to_url_count;
+ break;
+ }
+ case CO_NULLIFY_BASE:
+ /* Change the base href to "". */
+ p = replace_attr (p, link->size, fp, "");
+ break;
+ case CO_NOCONVERT:
+ abort ();
+ break;
}
}
+
/* Output the rest of the file. */
if (p - fm->content < fm->length)
fwrite (p, 1, fm->length - (p - fm->content), fp);
fclose (fp);
read_file_free (fm);
+
logprintf (LOG_VERBOSE,
_("%d-%d\n"), to_file_count, to_url_count);
}
static int find_fragment PARAMS ((const char *, int, const char **,
const char **));
-static void
-replace_attr (const char **pp, int raw_size, FILE *fp, const char *new_str)
+/* Replace an attribute's original text with NEW_TEXT. */
+
+static const char *
+replace_attr (const char *p, int size, FILE *fp, const char *new_text)
{
- const char *p = *pp;
int quote_flag = 0;
- int size = raw_size;
- char quote_char = '\"';
+ char quote_char = '\"'; /* use "..." for quoting, unless the
+ original value is quoted, in which
+ case reuse its quoting char. */
const char *frag_beg, *frag_end;
/* Structure of our string is:
size -= 2; /* disregard opening and closing quote */
}
putc (quote_char, fp);
- fputs (new_str, fp);
+ fputs (new_text, fp);
/* Look for fragment identifier, if any. */
if (find_fragment (p, size, &frag_beg, &frag_end))
if (quote_flag)
++p;
putc (quote_char, fp);
- *pp = p;
+
+ return p;
}
/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
CO_NOCONVERT = 0, /* don't convert this URL */
CO_CONVERT_TO_RELATIVE, /* convert to relative, e.g. to
"../../otherdir/foo.gif" */
- CO_CONVERT_TO_COMPLETE /* convert to absolute, e.g. to
+ CO_CONVERT_TO_COMPLETE, /* convert to absolute, e.g. to
"http://orighost/somedir/bar.jpg". */
+ CO_NULLIFY_BASE /* change to empty string. */
};
/* A structure that defines the whereabouts of a URL, i.e. its
char *local_name; /* local file to which it was saved
(used by convert_links) */
+ int ignore_when_downloading; /* reserved for special links such as
+ <base href="..."> which are used
+ when converting links, but ignored
+ when downloading. */
+
/* Information about the original link: */
int link_relative_p; /* was the link relative? */
int link_complete_p; /* was the link complete (with the
host name, etc.) */
+ int link_base_p; /* was the link <base href=...> */
/* Conversion requirements: */
enum convert_options convert; /* is conversion required? */