From 0a3697ad652df74ffeec8a97e1d23c343d8ef391 Mon Sep 17 00:00:00 2001 From: hniksic Date: Sun, 14 Sep 2003 15:04:13 -0700 Subject: [PATCH] [svn] New mechanism for quoting file names. Published in . --- NEWS | 11 +- doc/ChangeLog | 5 + doc/wget.texi | 37 ++++ src/ChangeLog | 28 +++ src/connect.c | 1 + src/ftp-ls.c | 7 +- src/ftp.c | 8 +- src/http.c | 4 +- src/init.c | 29 +++ src/main.c | 7 +- src/options.h | 6 + src/url.c | 561 ++++++++++++++++++++++++++++++-------------------- src/url.h | 4 +- 13 files changed, 474 insertions(+), 234 deletions(-) diff --git a/NEWS b/NEWS index 073a95bd..3cf33275 100644 --- a/NEWS +++ b/NEWS @@ -7,8 +7,6 @@ Please send GNU Wget bug reports to . * Changes in Wget 1.9. -** The build process now requires Autoconf 2.5x. - ** It is now possible to specify that POST method be used for HTTP requests. For example, `wget --post-data="id=foo&data=bar" URL' will send a POST request with the specified contents. @@ -32,6 +30,15 @@ considered a fatal error. ** The new option `--dns-cache=off' may be used to prevent Wget from caching DNS lookups. + +** The build process now requires Autoconf 2.5x. + +** Wget no longer quotes characters in local file names that would be +considered "unsafe" as part of URL. Quoting can still occur for +control characters or for '/', but no longer for frequent characters +such as space. You can use the new option --restrict-file-names to +enforce even stricter rules, which is useful when downloading to +Windows partitions. * Wget 1.8.1 is a bugfix release with no user-visible changes. diff --git a/doc/ChangeLog b/doc/ChangeLog index 1f0f1c09..e2570f07 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,3 +1,8 @@ +2003-09-14 Hrvoje Niksic + + * wget.texi (Download Options): Document the new option + --restrict-file-names and the corresponding wgetrc command. + 2003-09-10 Hrvoje Niksic * wget.texi (Download Options): Documented new option --dns-cache. 
diff --git a/doc/wget.texi b/doc/wget.texi index 19eb439a..4b0bb3c0 100644 --- a/doc/wget.texi +++ b/doc/wget.texi @@ -800,6 +800,39 @@ lookups where they're probably not needed. If you don't understand the above description, you probably won't need this option. + +@cindex file names, restrict +@cindex Windows file names +@itemx --restrict-file-names=none|unix|windows +Restrict characters that may occur in local file names created by Wget +from remote URLs. Characters that are considered @dfn{unsafe} under a +set of restrictions are escaped, i.e. replaced with @samp{%XX}, where +@samp{XX} is the hexadecimal code of the character. + +The default for this option depends on the operating system: on Unix and +Unix-like OS'es, it defaults to ``unix''. Under Windows and Cygwin, it +defaults to ``windows''. Changing the default is useful when you are +using a non-native partition, e.g. when downloading files to a Windows +partition mounted from Linux, or when using NFS-mounted or SMB-mounted +Windows drives. + +When set to ``none'', the only characters that are quoted are those that +are impossible to get into a file name---the NUL character and @samp{/}. +The control characters, newline, etc. are all placed into file names. + +When set to ``unix'', additional unsafe characters are those in the +0--31 range and in the 128--159 range. This is because those characters +are typically not printable. + +When set to ``windows'', all of the above are quoted, along with +@samp{\}, @samp{|}, @samp{:}, @samp{?}, @samp{"}, @samp{*}, @samp{<}, +and @samp{>}. Additionally, Wget in Windows mode uses @samp{+} instead +of @samp{:} to separate host and port in local file names, and uses +@samp{@@} instead of @samp{?} to separate the query portion of the file +name from the rest. Therefore, a URL that would be saved as +@samp{www.xemacs.org:4300/search.pl?input=blah} in Unix mode would be +saved as @samp{www.xemacs.org+4300/search.pl@@input=blah} in Windows +mode. 
@end table @node Directory Options, HTTP Options, Download Options, Invoking @@ -2241,6 +2274,10 @@ Links}). If set to on, remove @sc{ftp} listings downloaded by Wget. Setting it to off is the same as @samp{-nr}. +@item restrict_file_names = none/unix/windows +Restrict the file names generated by Wget from URLs. See +@samp{--restrict-file-names} for a more detailed description. + @item retr_symlinks = on/off When set to on, retrieve symbolic links as if they were plain files; the same as @samp{--retr-symlinks}. diff --git a/src/ChangeLog b/src/ChangeLog index d094f5c0..356082d3 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,31 @@ +2003-09-14 Hrvoje Niksic + + * url.c (append_uri_pathel): Use opt.restrict_file_names when + calling file_unsafe_char. + + * init.c: New command restrict_file_names. + + * main.c (main): New option --restrict-file-names[=windows,unix]. + + * url.c (url_file_name): Renamed from url_filename. + (url_file_name): Add directory and hostdir prefix here, not in + mkstruct. + (append_dir_structure): New function, does part of the work that + used to be in mkstruct. Iterates over path elements in u->path, + calling append_uri_pathel on each one to append it to the file + name. + (append_uri_pathel): URL-unescape a path element and reencode it + with a different set of rules, more appropriate for handling of + files. + (file_unsafe_char): New function, uses a lookup table to decide + whether a character should be escaped for use in file name. + (append_string): New utility function. + (append_char): Ditto. + (file_unsafe_char): New argument restrict_for_windows, decide + whether Windows file names should be escaped in run-time. + + * connect.c: Include to get prototype for abort(). 
+ 2003-09-14 Hrvoje Niksic * utils.c (wtimer_sys_set): Extracted the code that sets the diff --git a/src/connect.c b/src/connect.c index 99f0909d..26dc404d 100644 --- a/src/connect.c +++ b/src/connect.c @@ -30,6 +30,7 @@ so, delete this exception statement from your version. */ #include #include +#include #include #ifdef HAVE_UNISTD_H # include diff --git a/src/ftp-ls.c b/src/ftp-ls.c index 47982777..919b4a60 100644 --- a/src/ftp-ls.c +++ b/src/ftp-ls.c @@ -842,8 +842,8 @@ ftp_index (const char *file, struct url *u, struct fileinfo *f) { char *tmpu, *tmpp; /* temporary, clean user and passwd */ - tmpu = encode_string (u->user); - tmpp = u->passwd ? encode_string (u->passwd) : NULL; + tmpu = url_escape (u->user); + tmpp = u->passwd ? url_escape (u->passwd) : NULL; upwd = (char *)xmalloc (strlen (tmpu) + (tmpp ? (1 + strlen (tmpp)) : 0) + 2); sprintf (upwd, "%s%s%s@", tmpu, tmpp ? ":" : "", tmpp ? tmpp : ""); @@ -863,7 +863,8 @@ ftp_index (const char *file, struct url *u, struct fileinfo *f) fprintf (fp, " "); if (f->tstamp != -1) { - /* #### Should we translate the months? */ + /* #### Should we translate the months? Or, even better, use + ISO 8601 dates? */ static char *months[] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" diff --git a/src/ftp.c b/src/ftp.c index 3159171f..d70969ad 100644 --- a/src/ftp.c +++ b/src/ftp.c @@ -1025,7 +1025,7 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con) struct stat st; if (!con->target) - con->target = url_filename (u); + con->target = url_file_name (u); if (opt.noclobber && file_exists_p (con->target)) { @@ -1245,7 +1245,7 @@ ftp_get_listing (struct url *u, ccon *con, struct fileinfo **f) /* Find the listing file name. We do it by taking the file name of the URL and replacing the last component with the listing file name. 
*/ - uf = url_filename (u); + uf = url_file_name (u); lf = file_merge (uf, LIST_FILENAME); xfree (uf); DEBUGP ((_("Using `%s' as listing tmp file.\n"), lf)); @@ -1335,7 +1335,7 @@ ftp_retrieve_list (struct url *u, struct fileinfo *f, ccon *con) ofile = xstrdup (u->file); url_set_file (u, f->name); - con->target = url_filename (u); + con->target = url_file_name (u); err = RETROK; dlthis = 1; @@ -1723,7 +1723,7 @@ ftp_loop (struct url *u, int *dt, struct url *proxy) char *filename = (opt.output_document ? xstrdup (opt.output_document) : (con.target ? xstrdup (con.target) - : url_filename (u))); + : url_file_name (u))); res = ftp_index (filename, u, f); if (res == FTPOK && opt.verbose) { diff --git a/src/http.c b/src/http.c index 14176aac..82c6d8de 100644 --- a/src/http.c +++ b/src/http.c @@ -1614,12 +1614,12 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer, hstat.local_file = local_file; else if (local_file) { - *local_file = url_filename (u); + *local_file = url_file_name (u); hstat.local_file = local_file; } else { - dummy = url_filename (u); + dummy = url_file_name (u); hstat.local_file = &dummy; } diff --git a/src/init.c b/src/init.c index 124bfb10..bce2427a 100644 --- a/src/init.c +++ b/src/init.c @@ -100,6 +100,7 @@ CMD_DECLARE (cmd_spec_htmlify); CMD_DECLARE (cmd_spec_mirror); CMD_DECLARE (cmd_spec_progress); CMD_DECLARE (cmd_spec_recursive); +CMD_DECLARE (cmd_spec_restrict_file_names); CMD_DECLARE (cmd_spec_useragent); /* List of recognized commands, each consisting of name, closure and function. 
@@ -188,6 +189,7 @@ static struct { { "reject", &opt.rejects, cmd_vector }, { "relativeonly", &opt.relative_only, cmd_boolean }, { "removelisting", &opt.remove_listing, cmd_boolean }, + { "restrictfilenames", &opt.restrict_file_names, cmd_spec_restrict_file_names }, { "retrsymlinks", &opt.retr_symlinks, cmd_boolean }, { "retryconnrefused", &opt.retry_connrefused, cmd_boolean }, { "robots", &opt.use_robots, cmd_boolean }, @@ -281,6 +283,13 @@ defaults (void) opt.dots_in_line = 50; opt.dns_cache = 1; + + /* The default for file name restriction defaults to the OS type. */ +#if !defined(WINDOWS) && !defined(__CYGWIN__) + opt.restrict_file_names = restrict_shell; +#else + opt.restrict_file_names = restrict_windows; +#endif } /* Return the user's home directory (strdup-ed), or NULL if none is @@ -1008,6 +1017,26 @@ cmd_spec_recursive (const char *com, const char *val, void *closure) return 1; } +static int +cmd_spec_restrict_file_names (const char *com, const char *val, void *closure) +{ + /* The currently accepted values are `none', `unix', and + `windows'. 
*/ + if (0 == strcasecmp (val, "none")) + opt.restrict_file_names = restrict_none; + else if (0 == strcasecmp (val, "unix")) + opt.restrict_file_names = restrict_shell; + else if (0 == strcasecmp (val, "windows")) + opt.restrict_file_names = restrict_windows; + else + { + fprintf (stderr, _("%s: %s: Invalid specification `%s'.\n"), + exec_name, com, val); + return 0; + } + return 1; +} + static int cmd_spec_useragent (const char *com, const char *val, void *closure) { diff --git a/src/main.c b/src/main.c index 67bf55cd..77e1bf30 100644 --- a/src/main.c +++ b/src/main.c @@ -179,10 +179,11 @@ Download:\n\ --bind-address=ADDRESS bind to ADDRESS (hostname or IP) on local host.\n\ --limit-rate=RATE limit download rate to RATE.\n\ --dns-cache=off disable caching DNS lookups.\n\ + --restrict-file-names=MODE restrict chars in file names to MODE.\n\ \n"), stdout); fputs (_("\ Directories:\n\ - -nd --no-directories don\'t create directories.\n\ + -nd, --no-directories don\'t create directories.\n\ -x, --force-directories force creation of directories.\n\ -nH, --no-host-directories don\'t create host directories.\n\ -P, --directory-prefix=PREFIX save files to PREFIX/...\n\ @@ -344,6 +345,7 @@ main (int argc, char *const *argv) { "proxy-user", required_argument, NULL, 143 }, { "quota", required_argument, NULL, 'Q' }, { "reject", required_argument, NULL, 'R' }, + { "restrict-file-names", required_argument, NULL, 176 }, { "save-cookies", required_argument, NULL, 162 }, { "timeout", required_argument, NULL, 'T' }, { "tries", required_argument, NULL, 't' }, @@ -610,6 +612,9 @@ GNU General Public License for more details.\n")); case 175: setval ("dnscache", optarg); break; + case 176: + setval ("restrictfilenames", optarg); + break; case 'A': setval ("accept", optarg); break; diff --git a/src/options.h b/src/options.h index e7eff5e2..7010cd41 100644 --- a/src/options.h +++ b/src/options.h @@ -184,6 +184,12 @@ struct options char *post_data; /* POST query string */ char 
*post_file_name; /* File to post */ + + enum { + restrict_none, + restrict_shell, + restrict_windows + } restrict_file_names; /* whether we restrict file name chars. */ }; extern struct options opt; diff --git a/src/url.c b/src/url.c index eac1cfdd..3a8feb70 100644 --- a/src/url.c +++ b/src/url.c @@ -1,5 +1,6 @@ /* URL handling. - Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc. + Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003, 2003 + Free Software Foundation, Inc. This file is part of GNU Wget. @@ -95,24 +96,22 @@ static int path_simplify PARAMS ((char *)); code assumes ASCII character set and 8-bit chars. */ enum { + /* rfc1738 reserved chars, preserved from encoding. */ urlchr_reserved = 1, + + /* rfc1738 unsafe chars, plus some more. */ urlchr_unsafe = 2 }; +#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask)) +#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved) +#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe) + +/* Shorthands for the table: */ #define R urlchr_reserved #define U urlchr_unsafe #define RU R|U -#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask)) - -/* rfc1738 reserved chars, preserved from encoding. */ - -#define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved) - -/* rfc1738 unsafe chars, plus some more. */ - -#define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe) - const static unsigned char urlchr_table[256] = { U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */ @@ -142,6 +141,9 @@ const static unsigned char urlchr_table[256] = U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, }; +#undef R +#undef U +#undef RU /* Decodes the forms %xy in a URL to the character the hexadecimal code of which is xy. xy are hexadecimal digits from @@ -150,7 +152,7 @@ const static unsigned char urlchr_table[256] = literally. 
*/ static void -decode_string (char *s) +url_unescape (char *s) { char *t = s; /* t - tortoise */ char *h = s; /* h - hare */ @@ -175,10 +177,10 @@ decode_string (char *s) *t = '\0'; } -/* Like encode_string, but return S if there are no unsafe chars. */ +/* Like url_escape, but return S if there are no unsafe chars. */ static char * -encode_string_maybe (const char *s) +url_escape_allow_passthrough (const char *s) { const char *p1; char *p2, *newstr; @@ -186,7 +188,7 @@ encode_string_maybe (const char *s) int addition = 0; for (p1 = s; *p1; p1++) - if (UNSAFE_CHAR (*p1)) + if (URL_UNSAFE_CHAR (*p1)) addition += 2; /* Two more characters (hex digits) */ if (!addition) @@ -199,7 +201,7 @@ encode_string_maybe (const char *s) p2 = newstr; while (*p1) { - if (UNSAFE_CHAR (*p1)) + if (URL_UNSAFE_CHAR (*p1)) { unsigned char c = *p1++; *p2++ = '%'; @@ -215,13 +217,13 @@ encode_string_maybe (const char *s) return newstr; } -/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a +/* Encode the unsafe characters (as determined by URL_UNSAFE_CHAR) in a given string, returning a malloc-ed %XX encoded string. */ char * -encode_string (const char *s) +url_escape (const char *s) { - char *encoded = encode_string_maybe (s); + char *encoded = url_escape_allow_passthrough (s); if (encoded != s) return encoded; else @@ -232,13 +234,13 @@ encode_string (const char *s) the old value of PTR is freed and PTR is made to point to the newly allocated storage. 
*/ -#define ENCODE(ptr) do { \ - char *e_new = encode_string_maybe (ptr); \ - if (e_new != ptr) \ - { \ - xfree (ptr); \ - ptr = e_new; \ - } \ +#define ENCODE(ptr) do { \ + char *e_new = url_escape_allow_passthrough (ptr); \ + if (e_new != ptr) \ + { \ + xfree (ptr); \ + ptr = e_new; \ + } \ } while (0) enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH }; @@ -258,7 +260,7 @@ decide_copy_method (const char *p) char preempt = (XCHAR_TO_XDIGIT (*(p + 1)) << 4) + XCHAR_TO_XDIGIT (*(p + 2)); - if (UNSAFE_CHAR (preempt) || RESERVED_CHAR (preempt)) + if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt)) return CM_PASSTHROUGH; else return CM_DECODE; @@ -267,20 +269,20 @@ decide_copy_method (const char *p) /* Garbled %.. sequence: encode `%'. */ return CM_ENCODE; } - else if (UNSAFE_CHAR (*p) && !RESERVED_CHAR (*p)) + else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p)) return CM_ENCODE; else return CM_PASSTHROUGH; } -/* Translate a %-quoting (but possibly non-conformant) input string S - into a %-quoting (and conformant) output string. If no characters +/* Translate a %-escaped (but possibly non-conformant) input string S + into a %-escaped (and conformant) output string. If no characters are encoded or decoded, return the same string S; otherwise, return a freshly allocated string with the new contents. After a URL has been run through this function, the protocols that use `%' as the quote character can use the resulting string as-is, - while those that don't call decode_string() to get to the intended + while those that don't call url_unescape() to get to the intended data. This function is also stable: after an input string is transformed the first time, all further transformations of the result yield the same result string. @@ -293,20 +295,21 @@ decide_copy_method (const char *p) GET /abc%20def HTTP/1.0 - So it appears that the unsafe chars need to be quoted, as with - encode_string. But what if we're requested to download - `abc%20def'? 
Remember that %-encoding is valid URL syntax, so what - the user meant was a literal space, and he was kind enough to quote - it. In that case, Wget should obviously leave the `%20' as is, and - send the same request as above. So in this case we may not call - encode_string. - - But what if the requested URI is `abc%20 def'? If we call - encode_string, we end up with `/abc%2520%20def', which is almost - certainly not intended. If we don't call encode_string, we are - left with the embedded space and cannot send the request. What the + It appears that the unsafe chars need to be quoted, for example + with url_escape. But what if we're requested to download + `abc%20def'? url_escape transforms "%" to "%25", which would leave + us with `abc%2520def'. This is incorrect -- since %-escapes are + part of URL syntax, "%20" is the correct way to denote a literal + space on the Wget command line. This leaves us in the conclusion + that in that case Wget should not call url_escape, but leave the + `%20' as is. + + And what if the requested URI is `abc%20 def'? If we call + url_escape, we end up with `/abc%2520%20def', which is almost + certainly not intended. If we don't call url_escape, we are left + with the embedded space and cannot complete the request. What the user meant was for Wget to request `/abc%20%20def', and this is - where reencode_string kicks in. + where reencode_escapes kicks in. Wget used to solve this by first decoding %-quotes, and then encoding all the "unsafe" characters found in the resulting string. @@ -317,7 +320,7 @@ decide_copy_method (const char *p) is inevitable because by the second step we would lose information on whether the `+' was originally encoded or not. Both results were wrong because in CGI parameters + means space, while %2B means - literal plus. reencode_string correctly translates the above to + literal plus. reencode_escapes correctly translates the above to "a%2B+b", i.e. returns the original string. 
This function uses an algorithm proposed by Anon Sricharoenchai: @@ -352,7 +355,7 @@ decide_copy_method (const char *p) "foo%2b+bar" -> "foo%2b+bar" */ static char * -reencode_string (const char *s) +reencode_escapes (const char *s) { const char *p1; char *newstr, *p2; @@ -417,12 +420,12 @@ reencode_string (const char *s) return newstr; } -/* Run PTR_VAR through reencode_string. If a new string is consed, +/* Run PTR_VAR through reencode_escapes. If a new string is consed, free PTR_VAR and make it point to the new storage. Obviously, PTR_VAR needs to be an lvalue. */ #define REENCODE(ptr_var) do { \ - char *rf_new = reencode_string (ptr_var); \ + char *rf_new = reencode_escapes (ptr_var); \ if (rf_new != ptr_var) \ { \ xfree (ptr_var); \ @@ -544,9 +547,9 @@ parse_uname (const char *str, int len, char **user, char **passwd) (*user)[len] = '\0'; if (*user) - decode_string (*user); + url_unescape (*user); if (*passwd) - decode_string (*passwd); + url_unescape (*passwd); return 1; } @@ -611,6 +614,10 @@ rewrite_shorthand_url (const char *url) static void parse_path PARAMS ((const char *, char **, char **)); +/* Like strpbrk, with the exception that it returns the pointer to the + terminating zero (end-of-string aka "eos") if no matching character + is found. */ + static char * strpbrk_or_eos (const char *s, const char *accept) { @@ -825,7 +832,7 @@ url_parse (const char *url, int *error) return NULL; } - url_encoded = reencode_string (url); + url_encoded = reencode_escapes (url); p = url_encoded; p += strlen (supported_schemes[scheme].leading_string); @@ -1016,9 +1023,9 @@ url_parse (const char *url, int *error) else { if (url_encoded == url) - u->url = xstrdup (url); + u->url = xstrdup (url); else - u->url = url_encoded; + u->url = url_encoded; } url_encoded = NULL; @@ -1032,13 +1039,13 @@ url_error (int error_code) return parse_errors[error_code]; } +/* Parse PATH into dir and file. PATH is extracted from the URL and + is URL-escaped. 
The function returns unescaped DIR and FILE. */ + static void -parse_path (const char *quoted_path, char **dir, char **file) +parse_path (const char *path, char **dir, char **file) { - char *path, *last_slash; - - STRDUP_ALLOCA (path, quoted_path); - decode_string (path); + char *last_slash; last_slash = strrchr (path, '/'); if (!last_slash) @@ -1051,6 +1058,8 @@ parse_path (const char *quoted_path, char **dir, char **file) *dir = strdupdelim (path, last_slash); *file = xstrdup (last_slash + 1); } + url_unescape (*dir); + url_unescape (*file); } /* Note: URL's "full path" is the path with the query string and @@ -1303,8 +1312,6 @@ rotate_backups(const char *fname) { sprintf (from, "%s.%d", fname, i - 1); sprintf (to, "%s.%d", fname, i); - /* #### This will fail on machines without the rename() system - call. */ rename (from, to); } @@ -1323,11 +1330,14 @@ mkalldirs (const char *path) int res; p = path + strlen (path); - for (; *p != '/' && p != path; p--); + for (; *p != '/' && p != path; p--) + ; + /* Don't create if it's just a file. */ if ((p == path) && (*p != '/')) return 0; t = strdupdelim (path, p); + /* Check whether the directory exists. */ if ((stat (t, &st) == 0)) { @@ -1360,194 +1370,302 @@ mkalldirs (const char *path) xfree (t); return res; } + +/* Functions for constructing the file name out of URL components. */ -static int -count_slashes (const char *s) +/* A growable string structure, used by url_file_name and friends. + This should perhaps be moved to utils.c. + + The idea is to have an easy way to construct a string by having + various functions append data to it. Instead of passing the + obligatory BASEVAR, SIZEVAR and TAILPOS to all the functions in + questions, we pass the pointer to this struct. */ + +struct growable { + char *base; + int size; + int tail; +}; + +/* Ensure that the string can accept APPEND_COUNT more characters past + the current TAIL position. If necessary, this will grow the string + and update its allocated size. 
If the string is already large + enough to take TAIL+APPEND_COUNT characters, this does nothing. */ +#define GROW(g, append_size) do { \ + struct growable *G_ = g; \ + DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char); \ +} while (0) + +/* Return the tail position of the string. */ +#define TAIL(r) ((r)->base + (r)->tail) + +/* Move the tail position by APPEND_COUNT characters. */ +#define TAIL_INCR(r, append_count) ((r)->tail += append_count) + +/* Append the string STR to DEST. NOTICE: the string in DEST is not + terminated. */ + +static void +append_string (const char *str, struct growable *dest) { - int i = 0; - while (*s) - if (*s++ == '/') - ++i; - return i; + int l = strlen (str); + GROW (dest, l); + memcpy (TAIL (dest), str, l); + TAIL_INCR (dest, l); } -/* Return the path name of the URL-equivalent file name, with a - remote-like structure of directories. */ -static char * -mkstruct (const struct url *u) +/* Append CH to DEST. For example, append_char (0, DEST) + zero-terminates DEST. */ + +static void +append_char (char ch, struct growable *dest) { - char *dir, *file; - char *res, *dirpref; - int l; + GROW (dest, 1); + *TAIL (dest) = ch; + TAIL_INCR (dest, 1); +} - if (opt.cut_dirs) - { - char *ptr = u->dir + (*u->dir == '/'); - int slash_count = 1 + count_slashes (ptr); - int cut = MINVAL (opt.cut_dirs, slash_count); - for (; cut && *ptr; ptr++) - if (*ptr == '/') - --cut; - STRDUP_ALLOCA (dir, ptr); - } - else - dir = u->dir + (*u->dir == '/'); +enum { + filechr_unsafe_always = 1, /* always unsafe, e.g. / or \0 */ + filechr_unsafe_shell = 2, /* unsafe for shell use, e.g. control chars */ + filechr_unsafe_windows = 4, /* disallowed on Windows file system */ +}; - /* Check for the true name (or at least a consistent name for saving - to directory) of HOST, reusing the hlist if possible. */ - if (opt.add_hostdir) - { - /* Add dir_prefix and hostname (if required) to the beginning of - dir. 
*/ - dirpref = (char *)alloca (strlen (opt.dir_prefix) + 1 - + strlen (u->host) - + 1 + numdigit (u->port) - + 1); - if (!DOTP (opt.dir_prefix)) - sprintf (dirpref, "%s/%s", opt.dir_prefix, u->host); - else - strcpy (dirpref, u->host); +#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask)) - if (u->port != scheme_default_port (u->scheme)) - { - int len = strlen (dirpref); - dirpref[len] = ':'; - number_to_string (dirpref + len + 1, u->port); - } - } - else /* not add_hostdir */ - { - if (!DOTP (opt.dir_prefix)) - dirpref = opt.dir_prefix; - else - dirpref = ""; - } +/* Shorthands for the table: */ +#define A filechr_unsafe_always +#define S filechr_unsafe_shell +#define W filechr_unsafe_windows - /* If there is a prefix, prepend it. */ - if (*dirpref) - { - char *newdir = (char *)alloca (strlen (dirpref) + 1 + strlen (dir) + 2); - sprintf (newdir, "%s%s%s", dirpref, *dir == '/' ? "" : "/", dir); - dir = newdir; - } +/* Forbidden chars: - l = strlen (dir); - if (l && dir[l - 1] == '/') - dir[l - 1] = '\0'; + always: \0, / + Unix shell: 0-31, 128-159 + Windows: \, |, /, <, >, ?, :, ", * - if (!*u->file) - file = "index.html"; - else - file = u->file; + Arguably we could also claim `%' to be unsafe, since we use it as + the escape character. If we ever want to be able to reliably + translate file name back to URL, this would become + crucial. Right now, it's better to be minimal in escaping. */ + +const static unsigned char filechr_table[256] = +{ + A, S, S, S, S, S, S, S, /* NUL SOH STX ETX EOT ENQ ACK BEL */ + S, S, S, S, S, S, S, S, /* BS HT LF VT FF CR SO SI */ + S, S, S, S, S, S, S, S, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */ + S, S, S, S, S, S, S, S, /* CAN EM SUB ESC FS GS RS US */ + 0, 0, W, 0, 0, 0, 0, 0, /* SP ! " # $ % & ' */ + 0, 0, W, 0, 0, 0, 0, A, /* ( ) * + , - . / */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */ + 0, 0, W, 0, W, 0, W, W, /* 8 9 : ; < = > ? 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */ + 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */ + 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */ + 0, 0, 0, 0, W, 0, 0, 0, /* X Y Z [ \ ] ^ _ */ + 0, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */ + 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */ + 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */ + 0, 0, 0, 0, W, 0, 0, 0, /* x y z { | } ~ DEL */ - /* Finally, construct the full name. */ - res = (char *)xmalloc (strlen (dir) + 1 + strlen (file) - + 1); - sprintf (res, "%s%s%s", dir, *dir ? "/" : "", file); + S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 128-143 */ + S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, /* 144-159 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - return res; + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/* Return non-zero if character CH is unsafe for use in file or + directory name. Called by append_uri_pathel. */ + +static inline int +file_unsafe_char (char ch, int restrict) +{ + int mask = filechr_unsafe_always; + if (restrict == restrict_shell) + mask |= filechr_unsafe_shell; + else if (restrict == restrict_windows) + mask |= (filechr_unsafe_shell | filechr_unsafe_windows); + return FILE_CHAR_TEST (ch, mask); } -/* Compose a file name out of BASE, an unescaped file name, and QUERY, - an escaped query string. The trick is to make sure that unsafe - characters in BASE are escaped, and that slashes in QUERY are also - escaped. */ +/* FN_PORT_SEP is the separator between host and port in file names + for non-standard port numbers. On Unix this is normally ':', as in + "www.xemacs.org:4001/index.html". Under Windows, we set it to + + because Windows can't handle ':' in file names. */ +#define FN_PORT_SEP (opt.restrict_file_names != restrict_windows ? 
':' : '+') -static char * -compose_file_name (char *base, char *query) +/* FN_QUERY_SEP is the separator between the file name and the URL + query, normally '?'. Since Windows cannot handle '?' as part of + file name, we use '@' instead there. */ +#define FN_QUERY_SEP (opt.restrict_file_names != restrict_windows ? '?' : '@') + +/* Quote path element, characters in [b, e), as file name, and append + the quoted string to DEST. Each character is quoted as per + file_unsafe_char and the corresponding table. */ + +static void +append_uri_pathel (const char *b, const char *e, struct growable *dest) { - char result[256]; - char *from; - char *to = result; + char *pathel; + int pathlen; - /* Copy BASE to RESULT and encode all unsafe characters. */ - from = base; - while (*from && to - result < sizeof (result)) - { - if (UNSAFE_CHAR (*from)) - { - unsigned char c = *from++; - *to++ = '%'; - *to++ = XDIGIT_TO_XCHAR (c >> 4); - *to++ = XDIGIT_TO_XCHAR (c & 0xf); - } - else - *to++ = *from++; + const char *p; + int quoted, outlen; + + /* Currently restrict_for_windows is determined at compile time + only. But some users download files to Windows partitions; they + should be able to say --windows-file-names so Wget escapes + characters invalid on Windows. Similar run-time restrictions for + other file systems can be implemented. */ + const int restrict = opt.restrict_file_names; + + /* Copy [b, e) to PATHEL and URL-unescape it. */ + BOUNDED_TO_ALLOCA (b, e, pathel); + url_unescape (pathel); + pathlen = strlen (pathel); + + /* Go through PATHEL and check how many characters we'll need to + add for file quoting. */ + quoted = 0; + for (p = pathel; *p; p++) + if (file_unsafe_char (*p, restrict)) + ++quoted; + + /* p - pathel is the string length. Each quoted char means two + additional characters in the string, hence 2*quoted. 
*/ + outlen = (p - pathel) + (2 * quoted); + GROW (dest, outlen); + + if (!quoted) + { + /* If there's nothing to quote, we don't need to go through the + string the second time. */ + memcpy (TAIL (dest), pathel, outlen); } - - if (query && to - result < sizeof (result)) + else { - *to++ = '?'; - - /* Copy QUERY to RESULT and encode all '/' characters. */ - from = query; - while (*from && to - result < sizeof (result)) + char *q = TAIL (dest); + for (p = pathel; *p; p++) { - if (*from == '/') + if (!file_unsafe_char (*p, restrict)) + *q++ = *p; + else { - *to++ = '%'; - *to++ = '2'; - *to++ = 'F'; - ++from; + unsigned char ch = *p; + *q++ = '%'; + *q++ = XDIGIT_TO_XCHAR (ch >> 4); + *q++ = XDIGIT_TO_XCHAR (ch & 0xf); } - else - *to++ = *from++; } + assert (q - TAIL (dest) == outlen); } + TAIL_INCR (dest, outlen); +} - if (to - result < sizeof (result)) - *to = '\0'; - else - /* Truncate input which is too long, presumably due to a huge - query string. */ - result[sizeof (result) - 1] = '\0'; +/* Append to DEST the directory structure that corresponds to the + directory part of URL's path. For example, if the URL is + http://server/dir1/dir2/file, this appends "/dir1/dir2". + + Each path element ("dir1" and "dir2" in the above example) is + examined, url-unescaped, and re-escaped as file name element. + + Additionally, it cuts as many directories from the path as + specified by opt.cut_dirs. For example, if opt.cut_dirs is 1, it + will produce "/dir2" for the above example. For 2 or more, it will + produce "". + + Each component of the path is quoted for use as file name. */ - return xstrdup (result); +static void +append_dir_structure (const struct url *u, struct growable *dest) +{ + char *pathel, *next; + int cut = opt.cut_dirs; + + /* Go through the path components, de-URL-quote them, and quote them + (if necessary) as file names. 
*/
+
+  pathel = u->path;
+  for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
+    {
+      if (cut-- > 0)
+	continue;
+      if (pathel == next)
+	/* Ignore empty pathels.  path_simplify should remove
+	   occurrences of "//" from the path, but it has special cases
+	   for starting / which generates an empty pathel here.  */
+	continue;
+
+      if (dest->tail)
+	append_char ('/', dest);
+      append_uri_pathel (pathel, next, dest);
+    }
 }
 
-/* Create a unique filename, corresponding to a given URL.  Calls
-   mkstruct if necessary.  Does *not* actually create any directories.  */
+/* Return a unique file name that matches the given URL as well as
+   possible.  Does not create directories on the file system.  */
+
 char *
-url_filename (const struct url *u)
+url_file_name (const struct url *u)
 {
-  char *file, *name;
+  struct growable fnres;
+
+  char *u_file, *u_query;
+  char *fname, *unique;
 
-  char *query = u->query && *u->query ? u->query : NULL;
+  fnres.base = NULL;
+  fnres.size = 0;
+  fnres.tail = 0;
 
+  /* Start with the directory prefix, if specified. */
+  if (!DOTP (opt.dir_prefix))
+    append_string (opt.dir_prefix, &fnres);
+
+  /* If "dirstruct" is turned on (typically the case with -r), add
+     the host and port (unless those have been turned off) and
+     directory structure.  */
   if (opt.dirstruct)
     {
-      char *base = mkstruct (u);
-      file = compose_file_name (base, query);
-      xfree (base);
-    }
-  else
-    {
-      char *base = *u->file ? u->file : "index.html";
-      file = compose_file_name (base, query);
-
-      /* Check whether the prefix directory is something other than "."
-	 before prepending it.  */
-      if (!DOTP (opt.dir_prefix))
+      if (opt.add_hostdir)
 	{
-	  /* #### should just realloc FILE and prepend dir_prefix.
*/ - char *nfile = (char *)xmalloc (strlen (opt.dir_prefix) - + 1 + strlen (file) + 1); - sprintf (nfile, "%s/%s", opt.dir_prefix, file); - xfree (file); - file = nfile; + if (fnres.tail) + append_char ('/', &fnres); + append_string (u->host, &fnres); + if (u->port != scheme_default_port (u->scheme)) + { + char portstr[24]; + number_to_string (portstr, u->port); + append_char (FN_PORT_SEP, &fnres); + append_string (portstr, &fnres); + } } + + append_dir_structure (u, &fnres); } - /* DOS-ish file systems don't like `%' signs in them; we change it - to `@'. */ -#ifdef WINDOWS - { - char *p = file; - for (p = file; *p; p++) - if (*p == '%') - *p = '@'; - } -#endif /* WINDOWS */ + /* Add the file name. */ + if (fnres.tail) + append_char ('/', &fnres); + u_file = *u->file ? u->file : "index.html"; + append_uri_pathel (u_file, u_file + strlen (u_file), &fnres); + + /* Append "?query" to the file name. */ + u_query = u->query && *u->query ? u->query : NULL; + if (u_query) + { + append_char (FN_QUERY_SEP, &fnres); + append_uri_pathel (u_query, u_query + strlen (u_query), &fnres); + } + + /* Zero-terminate the file name. */ + append_char ('\0', &fnres); + + fname = fnres.base; /* Check the cases in which the unique extensions are not used: 1) Clobbering is turned off (-nc). @@ -1557,17 +1675,18 @@ url_filename (const struct url *u) The exception is the case when file does exist and is a directory (actually support for bad httpd-s). */ + if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct) - && !(file_exists_p (file) && !file_non_directory_p (file))) - return file; + && !(file_exists_p (fname) && !file_non_directory_p (fname))) + return fnres.base; /* Find a unique name. */ - name = unique_name (file); - xfree (file); - return name; + unique = unique_name (fname); + xfree (fname); + return unique; } -/* Return the langth of URL's path. Path is considered to be +/* Return the length of URL's path. 
Path is considered to be terminated by one of '?', ';', '#', or by the end of the string. */ static int @@ -1680,8 +1799,10 @@ path_simplify (char *path) else if (*p == '/') { /* Remove empty path elements. Not mandated by rfc1808 et - al, but empty path elements are not all that useful, and - the rest of Wget might not deal with them well. */ + al, but it seems like a good idea to get rid of them. + Supporting them properly is hard (in which directory do + you save http://x.com///y.html?) and they don't seem to + bring much gain. */ char *q = p; while (*q == '/') ++q; @@ -1964,13 +2085,13 @@ url_string (const struct url *url, int hide_password) /* Make sure the user name and password are quoted. */ if (url->user) { - quoted_user = encode_string_maybe (url->user); + quoted_user = url_escape_allow_passthrough (url->user); if (url->passwd) { if (hide_password) quoted_passwd = HIDDEN_PASSWORD; else - quoted_passwd = encode_string_maybe (url->passwd); + quoted_passwd = url_escape_allow_passthrough (url->passwd); } } diff --git a/src/url.h b/src/url.h index bd88d950..d80fe54d 100644 --- a/src/url.h +++ b/src/url.h @@ -130,7 +130,7 @@ typedef enum /* Function declarations */ -char *encode_string PARAMS ((const char *)); +char *url_escape PARAMS ((const char *)); struct url *url_parse PARAMS ((const char *, int *)); const char *url_error PARAMS ((int)); @@ -157,7 +157,7 @@ char *uri_merge PARAMS ((const char *, const char *)); void rotate_backups PARAMS ((const char *)); int mkalldirs PARAMS ((const char *)); -char *url_filename PARAMS ((const struct url *)); +char *url_file_name PARAMS ((const struct url *)); char *getproxy PARAMS ((struct url *)); int no_proxy_match PARAMS ((const char *, const char **)); -- 2.39.2