#undef U
#undef RU
-/* Decodes the forms %xy in a URL to the character the hexadecimal
- code of which is xy. xy are hexadecimal digits from
- [0123456789ABCDEF] (case-insensitive). If x or y are not
- hex-digits or `%' precedes `\0', the sequence is inserted
- literally. */
+/* URL-unescape the string S.
+
+ This is done by transforming the sequences "%HH" to the character
+ represented by the hexadecimal digits HH. If % is not followed by
+ two hexadecimal digits, it is inserted literally.
+
+ The transformation is done in place. If you need the original
+ string intact, make a copy before calling this function. */
static void
url_unescape (char *s)
*t = '\0';
}
-/* Like url_escape, but return S if there are no unsafe chars. */
+/* The core of url_escape_* functions. Escapes the characters that
+ match the provided mask in urlchr_table.
+
+ If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
+ will be returned unchanged. If ALLOW_PASSTHROUGH is zero, a
+ freshly allocated string will be returned in all cases. */
static char *
-url_escape_allow_passthrough (const char *s)
+url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
{
const char *p1;
char *p2, *newstr;
int addition = 0;
for (p1 = s; *p1; p1++)
- if (URL_UNSAFE_CHAR (*p1))
+ if (urlchr_test (*p1, mask))
addition += 2; /* Two more characters (hex digits) */
if (!addition)
- return (char *)s;
+ return allow_passthrough ? (char *)s : xstrdup (s);
newlen = (p1 - s) + addition;
newstr = (char *)xmalloc (newlen + 1);
p2 = newstr;
while (*p1)
{
- if (URL_UNSAFE_CHAR (*p1))
+ /* Quote the characters that match the test mask. */
+ if (urlchr_test (*p1, mask))
{
unsigned char c = *p1++;
*p2++ = '%';
else
*p2++ = *p1++;
}
- *p2 = '\0';
assert (p2 - newstr == newlen);
+ *p2 = '\0';
return newstr;
}
-/* Encode the unsafe characters (as determined by URL_UNSAFE_CHAR) in a
- given string, returning a malloc-ed %XX encoded string. */
-
+/* URL-escape the unsafe characters (see urlchr_table) in a given
+ string, returning a freshly allocated string.
+
+ Passthrough is disabled in the url_escape_1 call, so the caller
+ gets a string of its own even when S needs no escaping. */
+
char *
url_escape (const char *s)
{
- char *encoded = url_escape_allow_passthrough (s);
- if (encoded != s)
- return encoded;
- else
- return xstrdup (s);
+ return url_escape_1 (s, urlchr_unsafe, 0);
}
-/* Encode unsafe characters in PTR to %xx. If such encoding is done,
- the old value of PTR is freed and PTR is made to point to the newly
- allocated storage. */
-
-#define ENCODE(ptr) do { \
- char *e_new = url_escape_allow_passthrough (ptr); \
- if (e_new != ptr) \
- { \
- xfree (ptr); \
- ptr = e_new; \
- } \
-} while (0)
+/* URL-escape the unsafe characters (see urlchr_table) in a given
+ string. If no characters are unsafe, S is returned.
+
+ Thin wrapper around url_escape_1 with the urlchr_unsafe mask and
+ ALLOW_PASSTHROUGH set. The result aliases S when nothing was
+ escaped, so compare it against S before freeing it. */
+
+static char *
+url_escape_allow_passthrough (const char *s)
+{
+ return url_escape_1 (s, urlchr_unsafe, 1);
+}
\f
enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };
assert (p2 - newstr == newlen);
return newstr;
}
-
-/* Run PTR_VAR through reencode_escapes. If a new string is consed,
- free PTR_VAR and make it point to the new storage. Obviously,
- PTR_VAR needs to be an lvalue. */
-
-#define REENCODE(ptr_var) do { \
- char *rf_new = reencode_escapes (ptr_var); \
- if (rf_new != ptr_var) \
- { \
- xfree (ptr_var); \
- ptr_var = rf_new; \
- } \
-} while (0)
\f
/* Returns the scheme type if the scheme is supported, or
SCHEME_INVALID if not. */
/* Like strpbrk, with the exception that it returns the pointer to the
terminating zero (end-of-string aka "eos") if no matching character
- is found. */
+ is found.
+
+ Although I normally balk at GCC-specific optimizations, one probably
+ makes sense here: glibc has optimizations that detect strpbrk being
+ called with a literal string as ACCEPT and inline the search. That
+ optimization is defeated if strpbrk is hidden within the call to
+ another function. (And no, making strpbrk_or_eos inline doesn't
+ help because the check for a literal ACCEPT is done in the
+ preprocessor.) */
+
+#ifdef __GNUC__
+
+#define strpbrk_or_eos(s, accept) ({ /* NB: S is expanded up to three times */ \
+ char *SOE_p = strpbrk (s, accept); \
+ if (!SOE_p) /* nothing from ACCEPT was found */ \
+ SOE_p = (char *)s + strlen (s); /* point at the terminating NUL */ \
+ SOE_p; /* value of the ({ ... }) expression */ \
+})
+
+#else /* not __GNUC__ */
static char *
strpbrk_or_eos (const char *s, const char *accept)
p = (char *)s + strlen (s);
return p;
}
+#endif
/* Turn STR into lowercase; return non-zero if a character was
actually changed. */
return full_path;
}
-/* Sync u->path and u->url with u->dir and u->file. */
+/* Escape unsafe and reserved characters, except for the slash
+ characters.
+
+ DIR is first escaped as a whole through url_escape_1 with the
+ urlchr_unsafe | urlchr_reserved mask; the "%2F" sequences in the
+ result are then turned back into '/' in place.
+
+ If url_escape_1 hands back DIR itself, DIR is returned as is and
+ the caller must not free the result. Otherwise the returned
+ string is the one obtained from url_escape_1. */
-static void
-sync_path (struct url *url)
+static char *
+url_escape_dir (const char *dir)
{
- char *newpath;
+ char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
+ char *h, *t;
+ if (newdir == dir) /* url_escape_1 passed DIR through unchanged */
+ return (char *)dir;
- xfree (url->path);
+ /* Unescape slashes in NEWDIR. Only the spelling "%2F" is
+ recognized; presumably url_escape_1 emits uppercase hex digits --
+ TODO confirm. The output never outgrows the input, so the
+ rewrite is done in place. */
+
+ h = newdir; /* hare: read position */
+ t = newdir; /* tortoise: write position */
- if (!*url->dir)
+ for (; *h; h++, t++)
{
- newpath = xstrdup (url->file);
- REENCODE (newpath);
+ if (*h == '%' && h[1] == '2' && h[2] == 'F')
+ {
+ *t = '/';
+ h += 2; /* together with the loop's h++, skips all of "%2F" */
+ }
+ else
+ *t = *h;
}
+ *t = '\0';
+
+ return newdir;
+}
+
+/* Sync u->path and u->url with u->dir and u->file. Called after
+ u->file or u->dir has been changed, typically by the FTP code.
+
+ The old u->path is freed and replaced with a newly assembled
+ string; the old u->url is freed and regenerated with url_string. */
+
+static void
+sync_path (struct url *u)
+{
+ char *newpath, *efile, *edir;
+
+ xfree (u->path);
+
+ /* u->dir and u->file are not escaped. URL-escape them before
+ reassembling them into u->path. That way, if they contain
+ separators like '?' or even if u->file contains slashes, the
+ path will be correctly assembled. (u->file can contain slashes
+ if the URL specifies it with %2f, or if an FTP server returns
+ it.) */
+ edir = url_escape_dir (u->dir);
+ efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
+
+ if (!*edir) /* empty directory: the path is just the escaped file */
+ newpath = xstrdup (efile);
else
{
- int dirlen = strlen (url->dir);
- int filelen = strlen (url->file);
+ int dirlen = strlen (edir);
+ int filelen = strlen (efile);
- newpath = xmalloc (dirlen + 1 + filelen + 1);
- memcpy (newpath, url->dir, dirlen);
- newpath[dirlen] = '/';
- memcpy (newpath + dirlen + 1, url->file, filelen);
- newpath[dirlen + 1 + filelen] = '\0';
- REENCODE (newpath);
+ /* Copy "DIR/FILE" to newpath. The allocation holds DIR, the
+ slash, FILE and the terminating NUL. */
+ char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
+ memcpy (p, edir, dirlen);
+ p += dirlen;
+ *p++ = '/';
+ memcpy (p, efile, filelen);
+ p += filelen;
+ *p++ = '\0';
}
- url->path = newpath;
+ u->path = newpath;
- /* Synchronize u->url. */
- xfree (url->url);
- url->url = url_string (url, 0);
+ if (edir != u->dir) /* the escape helpers may return their argument itself */
+ xfree (edir);
+ if (efile != u->file) /* likewise: free only a newly allocated copy */
+ xfree (efile);
+
+ /* Regenerate u->url as well. */
+ xfree (u->url);
+ u->url = url_string (u, 0);
}
/* Mutators. Code in ftp.c insists on changing u->dir and u->file.
/* A growable string structure, used by url_file_name and friends.
This should perhaps be moved to utils.c.
- The idea is to have an easy way to construct a string by having
- various functions append data to it. Instead of passing the
- obligatory BASEVAR, SIZEVAR and TAILPOS to all the functions in
- questions, we pass the pointer to this struct. */
+ The idea is to have a convenient and efficient way to construct a
+ string by having various functions append data to it. Instead of
+ passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
+ functions in question, we pass the pointer to this struct. */
struct growable {
char *base;
}
enum {
- filechr_unsafe_always = 1, /* always unsafe, e.g. / or \0 */
- filechr_unsafe_shell = 2, /* unsafe for shell use, e.g. control chars */
- filechr_unsafe_windows = 2, /* disallowed on Windows file system */
+ filechr_not_unix = 1, /* unusable on Unix, / and \0 */
+ filechr_not_windows = 2, /* unusable on Windows, one of \|/<>?:*"
+ NOTE(review): the filechr_table row for '|'
+ holds 0, not W -- confirm */
+ filechr_control = 4, /* a control character, e.g. 0-31; the table
+ also marks 128-159 with this bit */
};
#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))
/* Shorthands for the table: */
-#define A filechr_unsafe_always
-#define S filechr_unsafe_shell
-#define W filechr_unsafe_windows
+#define U filechr_not_unix
+#define W filechr_not_windows
+#define C filechr_control
-/* Forbidden chars:
+#define UW U|W
+#define UWC U|W|C
- always: \0, /
- Unix shell: 0-31, 128-159
- Windows: \, |, /, <, >, ?, :
+/* Table of characters unsafe under various conditions (see above).
Arguably we could also claim `%' to be unsafe, since we use it as
the escape character. If we ever want to be able to reliably
const static unsigned char filechr_table[256] =
{
- A, S, S, S, S, S, S, S, /* NUL SOH STX ETX EOT ENQ ACK BEL */
- S, S, S, S, S, S, S, S, /* BS HT LF VT FF CR SO SI */
- S, S, S, S, S, S, S, S, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
- S, S, S, S, S, S, S, S, /* CAN EM SUB ESC FS GS RS US */
+UWC, C, C, C, C, C, C, C, /* NUL SOH STX ETX EOT ENQ ACK BEL */
+ C, C, C, C, C, C, C, C, /* BS HT LF VT FF CR SO SI */
+ C, C, C, C, C, C, C, C, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
+ C, C, C, C, C, C, C, C, /* CAN EM SUB ESC FS GS RS US */
0, 0, W, 0, 0, 0, 0, 0, /* SP ! " # $ % & ' */
- 0, 0, W, 0, 0, 0, 0, A, /* ( ) * + , - . / */
+ 0, 0, W, 0, 0, 0, 0, UW, /* ( ) * + , - . / */
0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
0, 0, W, 0, W, 0, W, W, /* 8 9 : ; < = > ? */
0, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
0, 0, 0, 0, 0, 0, 0, 0, /* x y z { | } ~ DEL */
- S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 128-143 */
- S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 144-159 */
+ C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 128-143 */
+ C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 144-159 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
-/* Return non-zero if character CH is unsafe for use in file or
- directory name. Called by append_uri_pathel. */
-
-static inline int
-file_unsafe_char (char ch, int restrict)
-{
- int mask = filechr_unsafe_always;
- if (restrict == restrict_shell)
- mask |= filechr_unsafe_shell;
- else if (restrict == restrict_windows)
- mask |= (filechr_unsafe_shell | filechr_unsafe_windows);
- return FILE_CHAR_TEST (ch, mask);
-}
-
/* FN_PORT_SEP is the separator between host and port in file names
for non-standard port numbers. On Unix this is normally ':', as in
"www.xemacs.org:4001/index.html". Under Windows, we set it to +
because Windows can't handle ':' in file names. */
-#define FN_PORT_SEP (opt.restrict_file_names != restrict_windows ? ':' : '+')
+#define FN_PORT_SEP (opt.restrict_files_os != restrict_windows ? ':' : '+')
/* FN_QUERY_SEP is the separator between the file name and the URL
query, normally '?'. Since Windows cannot handle '?' as part of
file name, we use '@' instead there. */
-#define FN_QUERY_SEP (opt.restrict_file_names != restrict_windows ? '?' : '@')
+#define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
/* Quote path element, characters in [b, e), as file name, and append
the quoted string to DEST. Each character is quoted as per
const char *p;
int quoted, outlen;
- /* Currently restrict_for_windows is determined at compile time
- only. But some users download files to Windows partitions; they
- should be able to say --windows-file-names so Wget escapes
- characters invalid on Windows. Similar run-time restrictions for
- other file systems can be implemented. */
- const int restrict = opt.restrict_file_names;
+ int mask;
+ if (opt.restrict_files_os == restrict_unix)
+ mask = filechr_not_unix;
+ else
+ mask = filechr_not_windows;
+ if (opt.restrict_files_ctrl)
+ mask |= filechr_control;
/* Copy [b, e) to PATHEL and URL-unescape it. */
BOUNDED_TO_ALLOCA (b, e, pathel);
add for file quoting. */
quoted = 0;
for (p = pathel; *p; p++)
- if (file_unsafe_char (*p, restrict))
+ if (FILE_CHAR_TEST (*p, mask))
++quoted;
/* p - pathel is the string length. Each quoted char means two
char *q = TAIL (dest);
for (p = pathel; *p; p++)
{
- if (!file_unsafe_char (*p, restrict))
+ if (!FILE_CHAR_TEST (*p, mask))
*q++ = *p;
else
{
4) Hierarchy is built.
The exception is the case when file does exist and is a
- directory (actually support for bad httpd-s). */
+ directory (see `mkalldirs' for explanation). */
if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
&& !(file_exists_p (fname) && !file_non_directory_p (fname)))
- return fnres.base;
+ return fname;
- /* Find a unique name. */
- unique = unique_name (fname);
- xfree (fname);
+ unique = unique_name (fname, 1);
+ if (unique != fname)
+ xfree (fname);
return unique;
}