Published in <sxsd727cvc0.fsf@florida.arsdigita.de>.
+2001-11-25 Hrvoje Niksic <hniksic@arsdigita.com>
+
+ * recur.c (descend_url_p): Be more conservative with blacklisting
+ URLs.
+ (convert_all_links): Print how many files have been converted, and
+ how long it took.
+
+ * progress.c (create_image): Place the number of downloaded bytes
+ right after the progress bar.
+
+ * utils.c (suffix): Return a pointer into the string.
+
2001-11-25 Hrvoje Niksic <hniksic@arsdigita.com>
* url.c (convert_links): Handle CO_NULLIFY_BASE.
&& (!strcmp (suf, "html") || !strcmp (suf, "htm")))
*dt |= TEXTHTML;
- FREE_MAYBE (suf);
FREE_MAYBE (dummy);
return RETROK;
}
long size = bp->initial_length + bp->count;
/* The progress bar should look like this:
- xx% [=======> ] xx KB/s nnnnn ETA 00:00
+ xx% [=======> ] nn.nnn rrK/s ETA 00:00
Calculate its geometry:
- "xx% " or "100%" - percentage - 4 chars exactly
- "[]" - progress bar decorations - 2 chars exactly
- "1012.56K/s " - dl rate - 11 chars exactly
- "n,nnn,nnn,nnn " - downloaded bytes - 14 or less chars
- "ETA xx:xx:xx" - ETA - 12 or less chars
+ "xx% " or "100%" - percentage - 4 chars exactly
+ "[]" - progress bar decorations - 2 chars exactly
+ " n,nnn,nnn,nnn" - downloaded bytes - 14 or less chars
+ " 1012.56K/s" - dl rate - 11 chars exactly
+ " ETA xx:xx:xx" - ETA - 13 or less chars
- "=====>..." - progress bar content - the rest
+ "=====>..." - progress bar content - the rest
*/
- int progress_size = screen_width - (4 + 2 + 11 + 14 + 12);
+ int progress_size = screen_width - (4 + 2 + 14 + 11 + 13);
if (progress_size < 5)
progress_size = 0;
- /* "xxx%" */
+ /* "xx% " */
if (bp->total_length > 0)
{
int percentage = (int)(100.0 * size / bp->total_length);
}
else
{
- int i = 5;
- while (i--)
- *p++ = ' ';
+ *p++ = ' ';
+ *p++ = ' ';
+ *p++ = ' ';
+ *p++ = ' ';
}
- /* The progress bar: "|====> |" */
+ /* The progress bar: "[====> ]" */
if (progress_size && bp->total_length > 0)
{
double fraction = (double)size / bp->total_length;
++bp->tick;
}
- /* "1012.45K/s " */
+ /* " 1,234,567" */
+ /* If there are 7 or less digits (9 because of "legible" comas),
+ print the number in constant space. This will prevent the rest
+ of the line jerking at the beginning of download, but without
+ assigning maximum width in all cases. */
+ sprintf (p, " %9s", legible (size));
+ p += strlen (p);
+
+ /* " 1012.45K/s" */
if (dltime && bp->count)
{
static char *short_units[] = { "B/s", "K/s", "M/s", "G/s" };
int units = 0;
double dlrate = calc_rate (bp->count, dltime, &units);
- sprintf (p, "%7.2f%s ", dlrate, short_units[units]);
+ sprintf (p, " %7.2f%s", dlrate, short_units[units]);
p += strlen (p);
}
else
{
- strcpy (p, " --.-- K/s ");
- p += 12;
+ strcpy (p, " --.--K/s");
+ p += 11;
}
- /* "1,234,567 " */
- /* If there are 7 or less digits (9 because of "legible" comas),
- print the number in constant space. This will prevent the "ETA"
- string from jerking as the data begins to arrive. */
- sprintf (p, "%9s", legible (size));
- p += strlen (p);
- *p++ = ' ';
-
- /* "ETA xx:xx:xx" */
+ /* " ETA xx:xx:xx" */
if (bp->total_length > 0 && bp->count > 0)
{
int eta, eta_hrs, eta_min, eta_sec;
/*printf ("\neta: %d, %d %d %d\n", eta, eta_hrs, eta_min, eta_sec);*/
/*printf ("\n%ld %f %ld %ld\n", dltime, tm_sofar, bytes_remaining, bp->count);*/
+ *p++ = ' ';
*p++ = 'E';
*p++ = 'T';
*p++ = 'A';
}
else if (bp->total_length > 0)
{
- strcpy (p, "ETA --:--");
- p += 9;
+ strcpy (p, " ETA --:--");
+ p += 10;
}
assert (p - bp->buffer <= screen_width);
xfree (qel);
return 1;
}
-
+\f
static int descend_url_p PARAMS ((const struct urlpos *, struct url *, int,
struct url *, struct hash_table *));
/* The queue of URLs we need to load. */
struct url_queue *queue = url_queue_new ();
- /* The URLs we decided we don't want to load. */
+ /* The URLs we do not wish to enqueue, because they are already in
+ the queue, but haven't been downloaded yet. */
struct hash_table *blacklist = make_string_hash_table (0);
/* We'll need various components of this, so better get it over with
tree. The recursion is partial in that we won't
traverse any <A> or <AREA> tags, nor any <LINK> tags
except for <LINK REL="stylesheet">. */
- /* #### This would be the place to implement the TODO
- entry saying that -p should do two more hops on
- framesets. */
dash_p_leaf_HTML = TRUE;
else
{
/* Based on the context provided by retrieve_tree, decide whether a
URL is to be descended to. This is only ever called from
- retrieve_tree, but is in a separate function for clarity. */
+ retrieve_tree, but is in a separate function for clarity.
+
+ The most expensive checks (such as those for robots) are memoized
+ by storing these URLs to BLACKLIST. This may or may not help. It
+ will help if those URLs are encountered many times. */
static int
descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
&& !(u->scheme == SCHEME_FTP && opt.follow_ftp))
{
DEBUGP (("Not following non-HTTP schemes.\n"));
- goto blacklist;
+ goto out;
}
/* 2. If it is an absolute link and they are not followed, throw it
if (opt.relative_only && !upos->link_relative_p)
{
DEBUGP (("It doesn't really look like a relative link.\n"));
- goto blacklist;
+ goto out;
}
/* 3. If its domain is not to be accepted/looked-up, chuck it
if (!accept_domain (u))
{
DEBUGP (("The domain was not accepted.\n"));
- goto blacklist;
+ goto out;
}
/* 4. Check for parent directory.
if (!frontcmp (parent->dir, u->dir))
{
DEBUGP (("Trying to escape the root directory with no_parent in effect.\n"));
- goto blacklist;
+ goto out;
}
}
if (!accdir (u->dir, ALLABS))
{
DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
- goto blacklist;
+ goto out;
}
}
/* 6. */
{
- char *suf = NULL;
+ char *suf;
/* Check for acceptance/rejection rules. We ignore these rules
for HTML documents because they might lead to other files which
need to be downloaded. Of course, we don't know which
{
DEBUGP (("%s (%s) does not match acc/rej rules.\n",
url, u->file));
- FREE_MAYBE (suf);
- goto blacklist;
+ goto out;
}
}
- FREE_MAYBE (suf);
}
/* 7. */
{
DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
u->host, parent->host));
- goto blacklist;
+ goto out;
}
/* 8. */
if (!res_match_path (specs, u->path))
{
DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
- goto blacklist;
+ string_set_add (blacklist, url);
+ goto out;
}
}
return 1;
- blacklist:
- string_set_add (blacklist, url);
-
out:
DEBUGP (("Decided NOT to load it.\n"));
convert_all_links (void)
{
slist *html;
+ struct wget_timer *timer;
+ long msecs;
+ int file_count = 0;
+
+ timer = wtimer_new ();
/* Destructively reverse downloaded_html_files to get it in the right order.
recursive_retrieve() used slist_prepend() consistently. */
cur_url->local_name = NULL;
}
}
+
/* Convert the links in the file. */
convert_links (html->string, urls);
+ ++file_count;
+
/* Free the data. */
free_urlpos (urls);
}
+
+ msecs = wtimer_elapsed (timer);
+ wtimer_delete (timer);
+ logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
+ file_count, (double)msecs / 1000);
}
/* Cleanup the data structures associated with recursive retrieving
char *suf = suffix (u->local);
if (suf && (!strcasecmp (suf, "html") || !strcasecmp (suf, "htm")))
*dt |= TEXTHTML;
- FREE_MAYBE (suf);
}
#endif
}
return 0;
}
-/* Return the malloc-ed suffix of STR. For instance:
+/* Return the location of STR's suffix (file extension). Examples:
suffix ("foo.bar") -> "bar"
suffix ("foo.bar.baz") -> "baz"
suffix ("/foo/bar") -> NULL
{
int i;
- for (i = strlen (str); i && str[i] != '/' && str[i] != '.'; i--);
+ for (i = strlen (str); i && str[i] != '/' && str[i] != '.'; i--)
+ ;
+
if (str[i++] == '.')
- return xstrdup (str + i);
+ return (char *)str + i;
else
return NULL;
}