--restrict-file-names=ascii
author Micah Cowan <micah@cowan.name>
Tue, 28 Jul 2009 06:41:26 +0000 (23:41 -0700)
committer Micah Cowan <micah@cowan.name>
Tue, 28 Jul 2009 06:41:26 +0000 (23:41 -0700)
src/ChangeLog
src/init.c
src/options.h
src/url.c
tests/ChangeLog
tests/Test-restrict-ascii.px [new file with mode: 0755]
tests/run-px

index 7c7e4e972febccf3e913acfeeeb99587b6218d2d..afac4339986cc9bc34e6b6042133d3af1d4e77c9 100644 (file)
@@ -1,3 +1,14 @@
+2009-07-27  Micah Cowan  <micah@cowan.name>
+
+       * options.h (struct options): Added restrict_files_nonascii
+       boolean field.
+
+       * url.c (FILE_CHAR_TEST): Add check for chars outside the ASCII
+       range.
+
+       * init.c (defaults): Add restrict_files_nonascii to initialization.
+       (cmd_spec_restrict_file_names): Allow parsing of "ascii" keyword.
+
 2009-07-27  Marcel Telka  <marcel@telka.sk>
 
        * iri.c (do_conversion): Typo: invalide -> invalid
index 172b54b562fd8e302df5aa2569814be2507cf14f..5a4bbe50438351ed5683166eba48261f50369f69 100644 (file)
@@ -329,6 +329,7 @@ defaults (void)
   opt.restrict_files_os = restrict_unix;
 #endif
   opt.restrict_files_ctrl = true;
+  opt.restrict_files_nonascii = false;
   opt.restrict_files_case = restrict_no_case_restriction;
 
   opt.max_redirect = 20;
@@ -1275,6 +1276,7 @@ cmd_spec_restrict_file_names (const char *com, const char *val, void *place_igno
   int restrict_os = opt.restrict_files_os;
   int restrict_ctrl = opt.restrict_files_ctrl;
   int restrict_case = opt.restrict_files_case;
+  int restrict_nonascii = opt.restrict_files_nonascii;
 
   const char *end;
 
@@ -1285,7 +1287,7 @@ cmd_spec_restrict_file_names (const char *com, const char *val, void *place_igno
       end = strchr (val, ',');
       if (!end)
         end = val + strlen (val);
-      
+
       if (VAL_IS ("unix"))
         restrict_os = restrict_unix;
       else if (VAL_IS ("windows"))
@@ -1296,10 +1298,13 @@ cmd_spec_restrict_file_names (const char *com, const char *val, void *place_igno
         restrict_case = restrict_uppercase;
       else if (VAL_IS ("nocontrol"))
         restrict_ctrl = false;
+      else if (VAL_IS ("ascii"))
+        restrict_nonascii = true;
       else
         {
-          fprintf (stderr,
-                   _("%s: %s: Invalid restriction %s, use [unix|windows],[lowercase|uppercase],[nocontrol].\n"),
+          fprintf (stderr, _("\
+%s: %s: Invalid restriction %s,\n\
+    use [unix|windows],[lowercase|uppercase],[nocontrol],[ascii].\n"),
                    exec_name, com, quote (val));
           return false;
         }
@@ -1314,6 +1319,7 @@ cmd_spec_restrict_file_names (const char *com, const char *val, void *place_igno
   opt.restrict_files_os = restrict_os;
   opt.restrict_files_ctrl = restrict_ctrl;
   opt.restrict_files_case = restrict_case;
+  opt.restrict_files_nonascii = restrict_nonascii;
   
   return true;
 }
index 382fe312c516aae637ba25bf57fe9b9de55f047a..cc0d20e635094253cff5072446072ec1f3ff799e 100644 (file)
@@ -209,6 +209,8 @@ struct options
   bool restrict_files_ctrl;    /* non-zero if control chars in URLs
                                   are restricted from appearing in
                                   generated file names. */
+  bool restrict_files_nonascii; /* non-zero if bytes with values greater
+                                   than 127 are restricted. */
   enum {
     restrict_no_case_restriction,
     restrict_lowercase,
index 2453157a617d0b4df79afe57a607ccf7644d4d04..afc31811d443378c7576a5eb17d6e919ba792298 100644 (file)
--- a/src/url.c
+++ b/src/url.c
@@ -1291,7 +1291,9 @@ enum {
   filechr_control     = 4       /* a control character, e.g. 0-31 */
 };
 
-#define FILE_CHAR_TEST(c, mask) (filechr_table[(unsigned char)(c)] & (mask))
+#define FILE_CHAR_TEST(c, mask) \
+    ((opt.restrict_files_nonascii && !c_isascii ((unsigned char)(c))) || \
+    (filechr_table[(unsigned char)(c)] & (mask)))
 
 /* Shorthands for the table: */
 #define U filechr_not_unix
index d1a8af8b428e121ddd0e26b4108406c37c05d97b..785b36e41f4e203bdbdf0c3a534106d144ca19e3 100644 (file)
@@ -1,3 +1,9 @@
+2009-07-27  Micah Cowan  <micah@cowan.name>
+
+       * Test-restrict-ascii.px: New.
+
+       * run-px: Added Test-restrict-ascii.px.
+
 2009-07-26  Micah Cowan  <micah@cowan.name>
 
        * Test-ftp-iri.px, Test-ftp-iri-fallback.px,
diff --git a/tests/Test-restrict-ascii.px b/tests/Test-restrict-ascii.px
new file mode 100755 (executable)
index 0000000..ec76349
--- /dev/null
@@ -0,0 +1,69 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use HTTPTest;
+
+# This program tests that --restrict-file-names=ascii can be used to
+# ensure that all high-valued bytes are escaped. The sample filename was
+# chosen because in former versions of Wget, one could either choose not
+# to escape any portion of the UTF-8 filename via
+# --restrict-file-names=nocontrol (which would only be helpful if one
+# was _on_ a UTF-8 system), or else Wget would escape _portions_ of
+# characters, leaving irrelevant "latin1"-looking characters combined
+# with percent-encoded "control" characters, instead of encoding all the
+# bytes of an entire non-ASCII UTF-8 character.
+
+###############################################################################
+
+# The Greek word "gnosis", as percent-encoded UTF-8 bytes.
+my $gnosis = '%CE%B3%CE%BD%CF%89%CF%83%CE%B9%CF%82';
+
+my $mainpage = <<EOF;
+<html>
+<head>
+  <title>Some Page Title</title>
+</head>
+<body>
+  <p>
+    Some text...
+  </p>
+</body>
+</html>
+EOF
+
+# code, msg, headers, content
+my %urls = (
+    "/$gnosis.html" => {
+        code => "200",
+        msg => "Dontcare",
+        headers => {
+            "Content-type" => "text/html",
+        },
+        content => $mainpage,
+    },
+);
+
+my $cmdline = $WgetTest::WGETPATH . " --restrict-file-names=ascii "
+    . "http://localhost:{{port}}/${gnosis}.html";
+
+my $expected_error_code = 0;
+
+my %expected_downloaded_files = (
+    "${gnosis}.html" => {
+        content => $mainpage,
+    },
+);
+
+###############################################################################
+
+my $the_test = HTTPTest->new (name => "Test-restrict-ascii",
+                              input => \%urls, 
+                              cmdline => $cmdline, 
+                              errcode => $expected_error_code, 
+                              output => \%expected_downloaded_files);
+exit $the_test->run();
+
+# vim: et ts=4 sw=4
+
index b480f1416c82a72977e4876b4d1c64c346df82cd..17a52cdffc9b088008e616c12110c3f7822a602a 100755 (executable)
@@ -60,6 +60,7 @@ my @tests = (
     'Test-O-nonexisting.px',
     'Test-O.px',
     'Test-O-nc.px',
+    'Test-restrict-ascii.px',
     'Test-Restrict-Lowercase.px',
     'Test-Restrict-Uppercase.px',
     'Test--spider-fail.px',