From f5a10978710a3e9907fbef31b7df9f414acccb16 Mon Sep 17 00:00:00 2001 From: Gijs van Tulder Date: Wed, 9 May 2012 21:18:23 +0200 Subject: [PATCH] Add support for -accept-regex and --reject-regex. --- ChangeLog | 5 +++ bootstrap.conf | 1 + configure.ac | 12 ++++++ src/ChangeLog | 9 +++++ src/init.c | 29 ++++++++++++++ src/main.c | 43 +++++++++++++++++++++ src/options.h | 13 +++++++ src/recur.c | 5 +++ src/utils.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++++ src/utils.h | 9 +++++ 10 files changed, 227 insertions(+) diff --git a/ChangeLog b/ChangeLog index f3e4e566..51632aa2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2012-04-11 Gijs van Tulder + + * bootstrap.conf (gnulib_modules): Include module `regex'. + * configure.ac: Check for PCRE library. + 2012-03-25 Ray Satiro * configure.ac: Fix build under mingw when OpenSSL is used. diff --git a/bootstrap.conf b/bootstrap.conf index 7ab3ad0a..56b7b278 100644 --- a/bootstrap.conf +++ b/bootstrap.conf @@ -58,6 +58,7 @@ pipe quote quotearg recv +regex select send setsockopt diff --git a/configure.ac b/configure.ac index eb1b8b7d..45cebcab 100644 --- a/configure.ac +++ b/configure.ac @@ -532,6 +532,18 @@ AC_CHECK_HEADER(uuid/uuid.h, ]) ) +dnl +dnl Check for PCRE +dnl + +AC_CHECK_HEADER(pcre.h, + AC_CHECK_LIB(pcre, pcre_compile, + [LIBS="${LIBS} -lpcre" + AC_DEFINE([HAVE_LIBPCRE], 1, + [Define if libpcre is available.]) + ]) +) + dnl Needed by src/Makefile.am AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"]) diff --git a/src/ChangeLog b/src/ChangeLog index a1d2c2a5..32d58192 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,12 @@ +2012-04-11 Gijs van Tulder + + * init.c: Add --accept-regex, --reject-regex and --regex-type. + * main.c: Likewise. + * options.c: Likewise. + * recur.c: Likewise. + * utils.c: Add regex-related functions. + * utils.h: Add regex-related functions. + 2012-03-30 Tim Ruehsen * convert.c (convert_links_in_hashtable): Mmake it static. diff --git a/src/init.c b/src/init.c index 76cb2295..57a4f00d 100644 --- a/src/init.c +++ b/src/init.c @@ -46,6 +46,10 @@ as that of the covered work. */ # endif #endif +#include +#ifdef HAVE_LIBPCRE +# include +#endif #ifdef HAVE_PWD_H # include @@ -94,6 +98,7 @@ CMD_DECLARE (cmd_spec_mirror); CMD_DECLARE (cmd_spec_prefer_family); CMD_DECLARE (cmd_spec_progress); CMD_DECLARE (cmd_spec_recursive); +CMD_DECLARE (cmd_spec_regex_type); CMD_DECLARE (cmd_spec_restrict_file_names); #ifdef HAVE_SSL CMD_DECLARE (cmd_spec_secure_protocol); @@ -116,6 +121,7 @@ static const struct { } commands[] = { /* KEEP THIS LIST ALPHABETICALLY SORTED */ { "accept", &opt.accepts, cmd_vector }, + { "acceptregex", &opt.acceptregex_s, cmd_string }, { "addhostdir", &opt.add_hostdir, cmd_boolean }, { "adjustextension", &opt.adjust_extension, cmd_boolean }, { "alwaysrest", &opt.always_rest, cmd_boolean }, /* deprecated */ @@ -236,7 +242,9 @@ static const struct { { "reclevel", &opt.reclevel, cmd_number_inf }, { "recursive", NULL, cmd_spec_recursive }, { "referer", &opt.referer, cmd_string }, + { "regextype", &opt.regex_type, cmd_spec_regex_type }, { "reject", &opt.rejects, cmd_vector }, + { "rejectregex", &opt.rejectregex_s, cmd_string }, { "relativeonly", &opt.relative_only, cmd_boolean }, { "remoteencoding", &opt.encoding_remote, cmd_string }, { "removelisting", &opt.remove_listing, cmd_boolean }, @@ -361,6 +369,8 @@ defaults (void) opt.restrict_files_nonascii = false; opt.restrict_files_case = restrict_no_case_restriction; + opt.regex_type = regex_type_posix; + opt.max_redirect = 20; opt.waitretry = 10; @@ -1368,6 +1378,25 @@ cmd_spec_recursive (const char *com, const char *val, void *place_ignored) return true; } +/* Validate --regex-type and set the choice. */ + +static bool +cmd_spec_regex_type (const char *com, const char *val, void *place_ignored) +{ + static const struct decode_item choices[] = { + { "posix", regex_type_posix }, +#ifdef HAVE_LIBPCRE + { "pcre", regex_type_pcre }, +#endif + }; + int regex_type = regex_type_posix; + int ok = decode_string (val, choices, countof (choices), ®ex_type); + if (!ok) + fprintf (stderr, _("%s: %s: Invalid value %s.\n"), exec_name, com, quote (val)); + opt.regex_type = regex_type; + return ok; +} + static bool cmd_spec_restrict_file_names (const char *com, const char *val, void *place_ignored) { diff --git a/src/main.c b/src/main.c index 438085e4..aac01ac8 100644 --- a/src/main.c +++ b/src/main.c @@ -158,6 +158,7 @@ struct cmdline_option { static struct cmdline_option option_data[] = { { "accept", 'A', OPT_VALUE, "accept", -1 }, + { "accept-regex", 0, OPT_VALUE, "acceptregex", -1 }, { "adjust-extension", 'E', OPT_BOOLEAN, "adjustextension", -1 }, { "append-output", 'a', OPT__APPEND_OUTPUT, NULL, required_argument }, { "ask-password", 0, OPT_BOOLEAN, "askpassword", -1 }, @@ -262,7 +263,9 @@ static struct cmdline_option option_data[] = { "read-timeout", 0, OPT_VALUE, "readtimeout", -1 }, { "recursive", 'r', OPT_BOOLEAN, "recursive", -1 }, { "referer", 0, OPT_VALUE, "referer", -1 }, + { "regex-type", 0, OPT_VALUE, "regextype", -1 }, { "reject", 'R', OPT_VALUE, "reject", -1 }, + { "reject-regex", 0, OPT_VALUE, "rejectregex", -1 }, { "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 }, { "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1 }, { "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 }, @@ -722,6 +725,17 @@ Recursive accept/reject:\n"), -A, --accept=LIST comma-separated list of accepted extensions.\n"), N_("\ -R, --reject=LIST comma-separated list of rejected extensions.\n"), + N_("\ + --accept-regex=REGEX regex matching accepted URLs.\n"), + N_("\ + --reject-regex=REGEX regex matching rejected URLs.\n"), +#ifdef HAVE_LIBPCRE + N_("\ + --regex-type=TYPE regex type (posix|pcre).\n"), +#else + N_("\ + --regex-type=TYPE regex type (posix).\n"), +#endif N_("\ -D, --domains=LIST comma-separated list of accepted domains.\n"), N_("\ @@ -1323,6 +1337,35 @@ for details.\n\n")); exit (1); } + /* Compile the regular expressions. */ + switch (opt.regex_type) + { +#ifdef HAVE_LIBPCRE + case regex_type_pcre: + opt.regex_compile_fun = compile_pcre_regex; + opt.regex_match_fun = match_pcre_regex; + break; +#endif + + case regex_type_posix: + default: + opt.regex_compile_fun = compile_posix_regex; + opt.regex_match_fun = match_posix_regex; + break; + } + if (opt.acceptregex_s) + { + opt.acceptregex = opt.regex_compile_fun (opt.acceptregex_s); + if (!opt.acceptregex) + exit (1); + } + if (opt.rejectregex_s) + { + opt.rejectregex = opt.regex_compile_fun (opt.rejectregex_s); + if (!opt.rejectregex) + exit (1); + } + #ifdef ENABLE_IRI if (opt.enable_iri) { diff --git a/src/options.h b/src/options.h index 1f429906..0da79379 100644 --- a/src/options.h +++ b/src/options.h @@ -74,6 +74,19 @@ struct options bool ignore_case; /* Whether to ignore case when matching dirs and files */ + char *acceptregex_s; /* Patterns to accept (a regex string). */ + char *rejectregex_s; /* Patterns to reject (a regex string). */ + void *acceptregex; /* Patterns to accept (a regex struct). */ + void *rejectregex; /* Patterns to reject (a regex struct). */ + enum { +#ifdef HAVE_LIBPCRE + regex_type_pcre, +#endif + regex_type_posix + } regex_type; /* The regex library. */ + void *(*regex_compile_fun)(const char *); /* Function to compile a regex. */ + bool (*regex_match_fun)(const void *, const char *); /* Function to match a string to a regex. */ + char **domains; /* See host.c */ char **exclude_domains; bool dns_cache; /* whether we cache DNS lookups. */ diff --git a/src/recur.c b/src/recur.c index 139fe2e3..72274fb5 100644 --- a/src/recur.c +++ b/src/recur.c @@ -586,6 +586,11 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, goto out; } } + if (!accept_url (url)) + { + DEBUGP (("%s is excluded/not-included through regex.\n", url)); + goto out; + } /* 6. Check for acceptance/rejection rules. We ignore these rules for directories (no file name to match) and for non-leaf HTMLs, diff --git a/src/utils.c b/src/utils.c index 4188ced7..55a8a8d2 100644 --- a/src/utils.c +++ b/src/utils.c @@ -73,6 +73,11 @@ as that of the covered work. */ #include #include +#include +#ifdef HAVE_LIBPCRE +# include +#endif + #ifndef HAVE_SIGSETJMP /* If sigsetjmp is a macro, configure won't pick it up. */ # ifdef sigsetjmp @@ -917,6 +922,19 @@ acceptable (const char *s) return true; } +/* Determine whether an URL is acceptable to be followed, according to + regex patterns to accept/reject. */ +bool +accept_url (const char *s) +{ + if (opt.acceptregex && !opt.regex_match_fun (opt.acceptregex, s)) + return false; + if (opt.rejectregex && opt.regex_match_fun (opt.rejectregex, s)) + return false; + + return true; +} + /* Check if D2 is a subdirectory of D1. E.g. if D1 is `/something', subdir_p() will return true if and only if D2 begins with `/something/' or is exactly '/something'. */ @@ -2309,6 +2327,89 @@ base64_decode (const char *base64, void *dest) return q - (char *) dest; } +#ifdef HAVE_LIBPCRE +/* Compiles the PCRE regex. */ +void * +compile_pcre_regex (const char *str) +{ + const char *errbuf; + int erroffset; + pcre *regex = pcre_compile (str, 0, &errbuf, &erroffset, 0); + if (! regex) + { + fprintf (stderr, _("Invalid regular expression %s, %s\n"), + quote (str), errbuf); + return false; + } + return regex; +} +#endif + +/* Compiles the POSIX regex. */ +void * +compile_posix_regex (const char *str) +{ + regex_t *regex = xmalloc (sizeof (regex_t)); + int errcode = regcomp ((regex_t *) regex, str, REG_EXTENDED | REG_NOSUB); + if (errcode != 0) + { + int errbuf_size = regerror (errcode, (regex_t *) regex, NULL, 0); + char *errbuf = xmalloc (errbuf_size); + errbuf_size = regerror (errcode, (regex_t *) regex, errbuf, errbuf_size); + fprintf (stderr, _("Invalid regular expression %s, %s\n"), + quote (str), errbuf); + xfree (errbuf); + return NULL; + } + + return regex; +} + +#ifdef HAVE_LIBPCRE +#define OVECCOUNT 30 +/* Matches a PCRE regex. */ +bool +match_pcre_regex (const void *regex, const char *str) +{ + int l = strlen (str); + int ovector[OVECCOUNT]; + + int rc = pcre_exec ((pcre *) regex, 0, str, l, 0, 0, ovector, OVECCOUNT); + if (rc == PCRE_ERROR_NOMATCH) + return false; + else if (rc < 0) + { + logprintf (LOG_VERBOSE, _("Error while matching %s: %d\n"), + quote (str), rc); + return false; + } + else + return true; +} +#undef OVECCOUNT +#endif + +/* Matches a POSIX regex. */ +bool +match_posix_regex (const void *regex, const char *str) +{ + int rc = regexec ((regex_t *) regex, str, 0, NULL, 0); + if (rc == REG_NOMATCH) + return false; + else if (rc == 0) + return true; + else + { + int errbuf_size = regerror (rc, opt.acceptregex, NULL, 0); + char *errbuf = xmalloc (errbuf_size); + errbuf_size = regerror (rc, opt.acceptregex, errbuf, errbuf_size); + logprintf (LOG_VERBOSE, _("Error while matching %s: %d\n"), + quote (str), rc); + xfree (errbuf); + return false; + } +} + #undef IS_ASCII #undef NEXT_CHAR diff --git a/src/utils.h b/src/utils.h index 514c5f26..409cdc5a 100644 --- a/src/utils.h +++ b/src/utils.h @@ -90,6 +90,7 @@ char *file_merge (const char *, const char *); int fnmatch_nocase (const char *, const char *, int); bool acceptable (const char *); +bool accept_url (const char *); bool accdir (const char *s); char *suffix (const char *s); bool match_tail (const char *, const char *, bool); @@ -142,6 +143,14 @@ void xsleep (double); int base64_encode (const void *, int, char *); int base64_decode (const char *, void *); +#ifdef HAVE_LIBPCRE +void *compile_pcre_regex (const char *); +bool match_pcre_regex (const void *, const char *); +#endif + +void *compile_posix_regex (const char *); +bool match_posix_regex (const void *, const char *); + void stable_sort (void *, size_t, size_t, int (*) (const void *, const void *)); const char *print_decimal (double); -- 2.39.2