From cfd7b9a95112926333757b2f35e8861e69059502 Mon Sep 17 00:00:00 2001 From: abbotti Date: Fri, 12 Apr 2002 11:53:39 -0700 Subject: [PATCH] [svn] Use new function to test filename for common html suffixes. Submitted by Ian Abbott in <3CB72D29.4898.1F34872@localhost> with minor changes to formatting and comments. --- src/ChangeLog | 13 +++++++++++++ src/http.c | 7 +++---- src/recur.c | 6 ++---- src/retr.c | 7 +++---- src/utils.c | 25 +++++++++++++++++++++++++ src/utils.h | 2 ++ 6 files changed, 48 insertions(+), 12 deletions(-) diff --git a/src/ChangeLog b/src/ChangeLog index 00a440b1..5fa69110 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,16 @@ +2002-04-12 Ian Abbott + + * utils.c (has_html_suffix_p): New function to test filename for + common html extensions. + + * utils.h: Declare it. + + * http.c (http_loop): Use it instead of previous test. + + * retr.c (retrieve_url): Ditto. + + * recur.c (download_child_p): Ditto. + 2002-04-12 Hrvoje Niksic * config.h.in: Define _VA_LIST on Solaris to prevent stdio.h from diff --git a/src/http.c b/src/http.c index 6dacacab..3e62856d 100644 --- a/src/http.c +++ b/src/http.c @@ -1405,7 +1405,7 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer, int use_ts, got_head = 0; /* time-stamping info */ char *filename_plus_orig_suffix; char *local_filename = NULL; - char *tms, *suf, *locf, *tmrate; + char *tms, *locf, *tmrate; uerr_t err; time_t tml = -1, tmr = -1; /* local and remote time-stamps */ long local_size = 0; /* the size of the local file */ @@ -1465,9 +1465,8 @@ File `%s' already there, will not retrieve.\n"), *hstat.local_file); *dt |= RETROKF; /* #### Bogusness alert. */ - /* If its suffix is "html" or "htm", assume text/html. */ - if (((suf = suffix (*hstat.local_file)) != NULL) - && (!strcmp (suf, "html") || !strcmp (suf, "htm"))) + /* If its suffix is "html" or "htm" or similar, assume text/html. */ + if (has_html_suffix_p (*hstat.local_file)) *dt |= TEXTHTML; FREE_MAYBE (dummy); diff --git a/src/recur.c b/src/recur.c index a1fe72ae..1d6a6988 100644 --- a/src/recur.c +++ b/src/recur.c @@ -510,7 +510,6 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, /* 6. */ { - char *suf; /* Check for acceptance/rejection rules. We ignore these rules for HTML documents because they might lead to other files which need to be downloaded. Of course, we don't know which @@ -521,14 +520,13 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, * u->file is not "" (i.e. it is not a directory) and either: + there is no file suffix, - + or there is a suffix, but is not "html" or "htm", + + or there is a suffix, but is not "html" or "htm" or similar, + both: - recursion is not infinite, - and we are at its very end. */ if (u->file[0] != '\0' - && ((suf = suffix (url)) == NULL - || (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm")) + && (!has_html_suffix_p (url) || (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel))) { if (!acceptable (u->file)) diff --git a/src/retr.c b/src/retr.c index 36eb3481..c35dde11 100644 --- a/src/retr.c +++ b/src/retr.c @@ -384,12 +384,11 @@ retrieve_url (const char *origurl, char **file, char **newloc, /* There is a possibility of having HTTP being redirected to FTP. In these cases we must decide whether the text is HTML - according to the suffix. The HTML suffixes are `.html' and - `.htm', case-insensitive. */ + according to the suffix. The HTML suffixes are `.html', + `.htm' and a few others, case-insensitive. */ if (redirection_count && local_file && u->scheme == SCHEME_FTP) { - char *suf = suffix (local_file); - if (suf && (!strcasecmp (suf, "html") || !strcasecmp (suf, "htm"))) + if (has_html_suffix_p (local_file)) *dt |= TEXTHTML; } } diff --git a/src/utils.c b/src/utils.c index 3f04edaf..ca8505ac 100644 --- a/src/utils.c +++ b/src/utils.c @@ -792,6 +792,31 @@ suffix (const char *str) return NULL; } +/* Return non-zero if FNAME ends with a typical HTML suffix. The + following (case-insensitive) suffixes are presumed to be HTML files: + + html + htm + ?html (`?' matches one character) + + #### CAVEAT. This is not necessarily a good indication that FNAME + refers to a file that contains HTML! */ +int +has_html_suffix_p (const char *fname) +{ + char *suf; + + if ((suf = suffix (fname)) == NULL) + return 0; + if (!strcasecmp (suf, "html")) + return 1; + if (!strcasecmp (suf, "htm")) + return 1; + if (suf[0] && !strcasecmp (suf + 1, "html")) + return 1; + return 0; +} + /* Read a line from FP and return the pointer to freshly allocated storage. The stoarage space is obtained through malloc() and should be freed with free() when it is no longer needed. diff --git a/src/utils.h b/src/utils.h index 0cba3018..162ddd70 100644 --- a/src/utils.h +++ b/src/utils.h @@ -70,6 +70,8 @@ int accdir PARAMS ((const char *s, enum accd)); char *suffix PARAMS ((const char *s)); int match_tail PARAMS ((const char *, const char *)); +int has_html_suffix_p PARAMS ((const char *)); + char *read_whole_line PARAMS ((FILE *)); struct file_memory *read_file PARAMS ((const char *)); void read_file_free PARAMS ((struct file_memory *)); -- 2.39.2