You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
-Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables. You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL". If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so. If you do not wish to do
+so, delete this exception statement from your version. */
/* This file implements the Robot Exclusion Standard (RES).
#include <stdio.h>
#include <stdlib.h>
-#ifdef HAVE_STRING_H
-# include <string.h>
-#else
-# include <strings.h>
-#endif /* HAVE_STRING_H */
+#include <string.h>
#include <errno.h>
#include <assert.h>
struct path_info {
char *path;
- int allowedp;
- int user_agent_exact_p;
+ bool allowedp;
+ bool user_agent_exact_p;
};
struct robot_specs {
static void
match_user_agent (const char *agent, int length,
- int *matches, int *exact_match)
+ bool *matches, bool *exact_match)
{
if (length == 1 && *agent == '*')
{
- *matches = 1;
- *exact_match = 0;
+ *matches = true;
+ *exact_match = false;
}
else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget"))
{
- *matches = 1;
- *exact_match = 1;
+ *matches = true;
+ *exact_match = true;
}
else
{
- *matches = 0;
- *exact_match = 0;
+ *matches = false;
+ *exact_match = false;
}
}
static void
add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
- int allowedp, int exactp)
+ bool allowedp, bool exactp)
{
struct path_info pp;
if (path_b < path_e && *path_b == '/')
specs->paths[specs->count - 1] = pp;
}
-/* Recreate SPECS->paths with only those paths that have non-zero
- user_agent_exact_p. */
+/* Recreate SPECS->paths with only those paths that have
+ user_agent_exact_p set to true. */
static void
prune_non_exact (struct robot_specs *specs)
for (i = 0; i < specs->count; i++)
if (specs->paths[i].user_agent_exact_p)
++cnt;
- newpaths = xmalloc (cnt * sizeof (struct path_info));
+ newpaths = xnew_array (struct path_info, cnt);
for (i = 0, j = 0; i < specs->count; i++)
if (specs->paths[i].user_agent_exact_p)
newpaths[j++] = specs->paths[i];
const char *p = source;
const char *end = source + length;
- /* non-zero if last applicable user-agent field matches Wget. */
- int user_agent_applies = 0;
+ /* true if last applicable user-agent field matches Wget. */
+ bool user_agent_applies = false;
- /* non-zero if last applicable user-agent field *exactly* matches
+ /* true if last applicable user-agent field *exactly* matches
Wget. */
- int user_agent_exact = 0;
+ bool user_agent_exact = false;
/* whether we ever encountered exact user agent. */
- int found_exact = 0;
+ bool found_exact = false;
/* count of allow/disallow lines in the current "record", i.e. after
the last `user-agent' instructions. */
int record_count = 0;
- struct robot_specs *specs = xmalloc (sizeof (struct robot_specs));
- memset (specs, '\0', sizeof (struct robot_specs));
+ struct robot_specs *specs = xnew0 (struct robot_specs);
while (1)
{
until it matches, and if that happens, we must not call
it any more, until the next record. Hence the other part
of the condition. */
- if (record_count != 0 || user_agent_applies == 0)
+ if (record_count != 0 || user_agent_applies == false)
match_user_agent (value_b, value_e - value_b,
&user_agent_applies, &user_agent_exact);
if (user_agent_exact)
- found_exact = 1;
+ found_exact = true;
record_count = 0;
}
else if (FIELD_IS ("allow"))
{
if (user_agent_applies)
{
- add_path (specs, value_b, value_e, 1, user_agent_exact);
+ add_path (specs, value_b, value_e, true, user_agent_exact);
}
++record_count;
}
{
if (user_agent_applies)
{
- int allowed = 0;
+ bool allowed = false;
if (value_b == value_e)
- /* Empty "disallow" line means everything is
- *allowed*! */
- allowed = 1;
+ /* Empty "disallow" line means everything is *allowed*! */
+ allowed = true;
add_path (specs, value_b, value_e, allowed, user_agent_exact);
}
++record_count;
struct file_memory *fm = read_file (filename);
if (!fm)
{
- logprintf (LOG_NOTQUIET, "Cannot open %s: %s",
+ logprintf (LOG_NOTQUIET, _("Cannot open %s: %s"),
filename, strerror (errno));
return NULL;
}
int i;
for (i = 0; i < specs->count; i++)
xfree (specs->paths[i].path);
- FREE_MAYBE (specs->paths);
+ xfree_null (specs->paths);
xfree (specs);
}
\f
that number is not a numerical representation of '/', decode C and
advance the pointer. */
-#define DECODE_MAYBE(c, ptr) do { \
- if (c == '%' && ISXDIGIT (ptr[1]) && ISXDIGIT (ptr[2])) \
- { \
- char decoded \
- = (XCHAR_TO_XDIGIT (ptr[1]) << 4) + XCHAR_TO_XDIGIT (ptr[2]); \
- if (decoded != '/') \
- { \
- c = decoded; \
- ptr += 2; \
- } \
- } \
+#define DECODE_MAYBE(c, ptr) do { \
+ if (c == '%' && ISXDIGIT (ptr[1]) && ISXDIGIT (ptr[2])) \
+ { \
+ char decoded = X2DIGITS_TO_NUM (ptr[1], ptr[2]); \
+ if (decoded != '/') \
+ { \
+ c = decoded; \
+ ptr += 2; \
+ } \
+ } \
} while (0)
-/* The inner matching engine: return non-zero if RECORD_PATH matches
+/* The inner matching engine: return true if RECORD_PATH matches
URL_PATH. The rules for matching are described at
- <http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html>,
- section 3.2.2. */
+ <http://www.robotstxt.org/wc/norobots-rfc.txt>, section 3.2.2. */
-static int
+static bool
matches (const char *record_path, const char *url_path)
{
const char *rp = record_path;
char rc = *rp;
char uc = *up;
if (!rc)
- return 1;
+ return true;
if (!uc)
- return 0;
+ return false;
DECODE_MAYBE(rc, rp);
DECODE_MAYBE(uc, up);
if (rc != uc)
- return 0;
+ return false;
}
}
matches, return its allow/reject status. If none matches,
retrieval is by default allowed. */
-int
+bool
res_match_path (const struct robot_specs *specs, const char *path)
{
int i;
if (!specs)
- return 1;
+ return true;
for (i = 0; i < specs->count; i++)
if (matches (specs->paths[i].path, path))
{
- int allowedp = specs->paths[i].allowedp;
+ bool allowedp = specs->paths[i].allowedp;
DEBUGP (("%s path %s because of rule `%s'.\n",
allowedp ? "Allowing" : "Rejecting",
path, specs->paths[i].path));
return allowedp;
}
- return 1;
+ return true;
}
\f
/* Registering the specs. */
result = alloca (HP_len + 1 + numdigit (port) + 1); \
memcpy (result, host, HP_len); \
result[HP_len] = ':'; \
- long_to_string (result + HP_len + 1, port); \
+ number_to_string (result + HP_len + 1, port); \
} while (0)
/* Register RES specs that belong to server on HOST:PORT. They will
if (!registered_specs)
registered_specs = make_nocase_string_hash_table (0);
- /* Required to shut up the compiler. */
- old = NULL;
- hp_old = NULL;
-
- if (hash_table_get_pair (registered_specs, hp, hp_old, old))
+ if (hash_table_get_pair (registered_specs, hp, &hp_old, &old))
{
if (old)
free_specs (old);
serves URL. The file will be named according to the currently
active rules, and the file name will be returned in *file.
- Return non-zero if robots were retrieved OK, zero otherwise. */
+ Return true if robots were retrieved OK, false otherwise. */
-int
+bool
res_retrieve_file (const char *url, char **file)
{
uerr_t err;