4 # The Intltool Message Merger
6 # Copyright (C) 2000, 2002 Free Software Foundation.
7 # Copyright (C) 2000, 2001 Eazel, Inc
9 # Intltool is free software; you can redistribute it and/or
10 # modify it under the terms of the GNU General Public License
11 # version 2 published by the Free Software Foundation.
13 # Intltool is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 # General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with this program; if not, write to the Free Software
20 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 # As a special exception to the GNU General Public License, if you
23 # distribute this file as part of a program that contains a
24 # configuration script generated by Autoconf, you may include it under
25 # the same distribution terms that you use for the rest of that program.
27 # Authors: Maciej Stachowiak <mjs@noisehavoc.org>
28 # Kenneth Christiansen <kenneth@gnu.org>
29 # Darin Adler <darin@bentspoon.com>
31 # Proper XML UTF-8'ification written by Cyrille Chepelov <chepelov@calixo.net>
34 ## Release information
35 my $PROGRAM = "intltool-merge";
36 my $PACKAGE = "intltool";
43 ## Scalars used by the option stuff
47 my $XML_STYLE_ARG = 0;
48 my $KEYS_STYLE_ARG = 0;
49 my $DESKTOP_STYLE_ARG = 0;
50 my $SCHEMAS_STYLE_ARG = 0;
52 my $PASS_THROUGH_ARG = 0;
60 "version" => \$VERSION_ARG,
61 "quiet|q" => \$QUIET_ARG,
62 "oaf-style|o" => \$BA_STYLE_ARG, ## for compatibility
63 "ba-style|b" => \$BA_STYLE_ARG,
64 "xml-style|x" => \$XML_STYLE_ARG,
65 "keys-style|k" => \$KEYS_STYLE_ARG,
66 "desktop-style|d" => \$DESKTOP_STYLE_ARG,
67 "schemas-style|s" => \$SCHEMAS_STYLE_ARG,
68 "pass-through|p" => \$PASS_THROUGH_ARG,
69 "utf8|u" => \$UTF8_ARG,
70 "cache|c=s" => \$cache_file
77 my %po_files_by_lang = ();
78 my %translations = ();
80 # Use this instead of \w for XML files to handle more possible characters.
81 my $w = "[-A-Za-z0-9._:]";
83 # XML quoted string contents
92 } elsif ($BA_STYLE_ARG && @ARGV > 2) {
95 &ba_merge_translations;
97 } elsif ($XML_STYLE_ARG && @ARGV > 2) {
101 &xml_merge_translations;
103 } elsif ($KEYS_STYLE_ARG && @ARGV > 2) {
107 &keys_merge_translations;
109 } elsif ($DESKTOP_STYLE_ARG && @ARGV > 2) {
112 &desktop_merge_translations;
114 } elsif ($SCHEMAS_STYLE_ARG && @ARGV > 2) {
117 &schemas_merge_translations;
125 ## Sub for printing release information
128 print "${PROGRAM} (${PACKAGE}) ${VERSION}\n";
129 print "Written by Maciej Stachowiak, Darin Adler and Kenneth Christiansen.\n\n";
130 print "Copyright (C) 2000-2002 Free Software Foundation, Inc.\n";
131 print "Copyright (C) 2000-2001 Eazel, Inc.\n";
132 print "This is free software; see the source for copying conditions. There is NO\n";
133 print "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n";
137 ## Sub for printing usage information
140 print "Usage: ${PROGRAM} [OPTIONS] PO_DIRECTORY FILENAME OUTPUT_FILE\n";
141 print "Generates an output file that includes translated versions of some attributes,\n";
142 print "from an untranslated source and a po directory that includes translations.\n\n";
143 print " -b, --ba-style includes translations in the bonobo-activation style\n";
144 print " -d, --desktop-style includes translations in the desktop style\n";
145 print " -k, --keys-style includes translations in the keys style\n";
146 print " -s, --schemas-style includes translations in the schemas style\n";
147 print " -x, --xml-style includes translations in the standard xml style\n";
148 print " -u, --utf8 convert all strings to UTF-8 before merging\n";
149 print " -p, --pass-through use strings as found in .po files, without\n";
150 print " conversion (STRONGLY unrecommended with -x)\n";
151 print " -q, --quiet suppress most messages\n";
152 print " --help display this help and exit\n";
153 print " --version output version information and exit\n";
154 print "\nReport bugs to bugzilla.gnome.org, module intltool, or contact us through \n";
155 print "<xml-i18n-tools-list\@gnome.org>.\n";
160 ## Sub for printing error messages
163 print "Try `${PROGRAM} --help' for more information.\n";
170 print "Merging translations into $OUTFILE.\n" unless $QUIET_ARG;
181 &get_translation_database;
184 # General-purpose code for looking up translations in .po files
189 $tmp =~ s/^.*\/(.*)\.po$/$1/;
195 for my $po_file (glob "$PO_DIR/*.po") {
196 $po_files_by_lang{po_file2lang($po_file)} = $po_file;
202 my ($in_po_file) = @_;
205 open IN_PO_FILE, $in_po_file or die;
206 while (<IN_PO_FILE>) {
207 ## example: "Content-Type: text/plain; charset=ISO-8859-1\n"
208 if (/Content-Type\:.*charset=([-a-zA-Z0-9]+)\\n/) {
216 print "Warning: no encoding found in $in_po_file. Assuming ISO-8859-1\n";
217 $encoding = "ISO-8859-1";
222 sub utf8_sanity_check
225 if (!$PASS_THROUGH_ARG) {
226 $PASS_THROUGH_ARG="1";
231 sub get_translation_database
234 &get_cached_translation_database;
236 &create_translation_database;
240 sub get_newest_po_age
244 foreach my $file (values %po_files_by_lang) {
245 my $file_age = -M $file;
246 $newest_age = $file_age if !$newest_age || $file_age < $newest_age;
254 print "Generating and caching the translation database\n" unless $QUIET_ARG;
256 &create_translation_database;
258 open CACHE, ">$cache_file" || die;
259 print CACHE join "\x01", %translations;
265 print "Found cached translation database\n" unless $QUIET_ARG;
268 open CACHE, "<$cache_file" || die;
274 %translations = split "\x01", $contents;
277 sub get_cached_translation_database
279 my $cache_file_age = -M $cache_file;
280 if (defined $cache_file_age) {
281 if ($cache_file_age <= &get_newest_po_age) {
285 print "Found too-old cached translation database\n" unless $QUIET_ARG;
291 sub create_translation_database
293 for my $lang (keys %po_files_by_lang) {
294 my $po_file = $po_files_by_lang{$lang};
297 my $encoding = get_po_encoding ($po_file);
298 if (lc $encoding eq "utf-8") {
299 open PO_FILE, "<$po_file";
301 my $iconv = $ENV{"INTLTOOL_ICONV"} || "iconv";
302 open PO_FILE, "$iconv -f $encoding -t UTF-8 $po_file|";
305 open PO_FILE, "<$po_file";
314 $nextfuzzy = 1 if /^#, fuzzy/;
315 if (/^msgid "((\\.|[^\\])*)"/ ) {
316 $translations{$lang, $msgid} = $msgstr if $inmsgstr && $msgid && $msgstr;
323 $msgid = unescape_po_string($1);
329 if (/^msgstr "((\\.|[^\\])*)"/) {
330 $msgstr = unescape_po_string($1);
334 if (/^"((\\.|[^\\])*)"/) {
335 $msgid .= unescape_po_string($1) if $inmsgid;
336 $msgstr .= unescape_po_string($1) if $inmsgstr;
339 $translations{$lang, $msgid} = $msgstr if $inmsgstr && $msgid && $msgstr;
347 sub unescape_one_sequence
351 return "\\" if $sequence eq "\\\\";
352 return "\"" if $sequence eq "\\\"";
354 # gettext also handles \n, \t, \b, \r, \f, \v, \a, \xxx (octal),
355 # \xXX (hex) and has a comment saying they want to handle \u and \U.
360 sub unescape_po_string
364 $string =~ s/(\\.)/unescape_one_sequence($1)/eg;
382 my ($pre_encoded) = @_;
384 my @list_of_chars = unpack ('C*', $pre_encoded);
386 if ($PASS_THROUGH_ARG) {
387 return join ('', map (&entity_encode_int_even_high_bit, @list_of_chars));
389 return join ('', map (&entity_encode_int_minimalist, @list_of_chars));
393 sub entity_encode_int_minimalist
395 return """ if $_ == 34;
396 return "&" if $_ == 38;
397 return "'" if $_ == 39;
401 sub entity_encode_int_even_high_bit
403 if ($_ > 127 || $_ == 34 || $_ == 38 || $_ == 39) {
404 # the ($_ > 127) should probably be removed
405 return "&#" . $_ . ";";
411 sub entity_encoded_translation
413 my ($lang, $string) = @_;
415 my $translation = $translations{$lang, $string};
416 return $string if !$translation;
417 return entity_encode ($translation);
420 ## XML (bonobo-activation specific) merge code
422 sub ba_merge_translations
427 local $/; # slurp mode
428 open INPUT, "<$FILE" or die "can't open $FILE: $!";
433 open OUTPUT, ">$OUTFILE" or die "can't open $OUTFILE: $!";
435 while ($source =~ s|^(.*?)([ \t]*<\s*$w+\s+($w+\s*=\s*"$q"\s*)+/?>)([ \t]*\n)?||s) {
438 my $node = $2 . "\n";
442 while (s/(\s)_($w+\s*=\s*"($q)")/$1$2/s) {
443 push @strings, entity_decode($3);
448 for my $string (@strings) {
449 for my $lang (keys %po_files_by_lang) {
450 $langs{$lang} = 1 if $translations{$lang, $string};
454 for my $lang (sort keys %langs) {
456 s/(\sname\s*=\s*)"($q)"/$1"$2-$lang"/s;
457 s/(\s)_($w+\s*=\s*")($q)"/$1 . $2 . entity_encoded_translation($lang, $3) . '"'/seg;
462 print OUTPUT $source;
468 ## XML (non-bonobo-activation) merge code
470 sub xml_merge_translations
475 local $/; # slurp mode
476 open INPUT, "<$FILE" or die "can't open $FILE: $!";
481 open OUTPUT, ">$OUTFILE" or die;
483 # FIXME: support attribute translations
485 # Empty nodes never need translation, so unmark all of them.
486 # For example, <_foo/> is just replaced by <foo/>.
487 $source =~ s|<\s*_($w+)\s*/>|<$1/>|g;
489 # Support for <_foo>blah</_foo> style translations.
490 while ($source =~ s|^(.*?)([ \t]*)<\s*_($w+)\s*>(.*?)<\s*/_\3\s*>([ \t]*\n)?||s) {
497 print OUTPUT "$spaces<$tag>$string</$tag>\n";
499 $string =~ s/\s+/ /g;
502 $string = entity_decode($string);
504 for my $lang (sort keys %po_files_by_lang) {
505 my $translation = $translations{$lang, $string};
506 next if !$translation;
507 $translation = entity_encode($translation);
508 print OUTPUT "$spaces<$tag xml:lang=\"$lang\">$translation</$tag>\n";
512 print OUTPUT $source;
517 sub keys_merge_translations
519 open INPUT, "<${FILE}" or die;
520 open OUTPUT, ">${OUTFILE}" or die;
523 if (s/^(\s*)_(\w+=(.*))/$1$2/) {
528 my $non_translated_line = $_;
530 for my $lang (sort keys %po_files_by_lang) {
531 my $translation = $translations{$lang, $string};
532 next if !$translation;
534 $_ = $non_translated_line;
535 s/(\w+)=.*/[$lang]$1=$translation/;
547 sub desktop_merge_translations
549 open INPUT, "<${FILE}" or die;
550 open OUTPUT, ">${OUTFILE}" or die;
553 if (s/^(\s*)_(\w+=(.*))/$1$2/) {
558 my $non_translated_line = $_;
560 for my $lang (sort keys %po_files_by_lang) {
561 my $translation = $translations{$lang, $string};
562 next if !$translation;
564 $_ = $non_translated_line;
565 s/(\w+)=.*/${1}[$lang]=$translation/;
577 sub schemas_merge_translations
582 local $/; # slurp mode
583 open INPUT, "<$FILE" or die "can't open $FILE: $!";
588 open OUTPUT, ">$OUTFILE" or die;
590 # FIXME: support attribute translations
592 # Empty nodes never need translation, so unmark all of them.
593 # For example, <_foo/> is just replaced by <foo/>.
594 $source =~ s|<\s*_($w+)\s*/>|<$1/>|g;
596 # Support for <_foo>blah</_foo> style translations.
598 my $regex_start = "^(.*?)([ \t]*)<locale name=\"C\">";
599 my $regex_short = "([ \t\n]*)<short>(.*?)</short>";
600 my $regex_long = "([ \t\n]*)<long>(.*?)</long>";
601 my $regex_end = "([ \t\n]*)</locale>";
603 while ($source =~ s|$regex_start$regex_short$regex_long$regex_end||s) {
606 my $locale_start_spaces = $2;
607 my $locale_end_spaces = $7;
608 my $short_spaces = $3;
609 my $short_string = $4;
610 my $long_spaces = $5;
611 my $long_string = $6;
615 print OUTPUT "$locale_start_spaces<locale name=\"C\">";
616 print OUTPUT "$short_spaces<short>$short_string</short>";
617 print OUTPUT "$long_spaces<long>$long_string</long>";
618 print OUTPUT "$locale_end_spaces</locale>";
620 $short_string =~ s/\s+/ /g;
621 $short_string =~ s/^ //;
622 $short_string =~ s/ $//;
623 $short_string = entity_decode($short_string);
625 $long_string =~ s/\s+/ /g;
626 $long_string =~ s/^ //;
627 $long_string =~ s/ $//;
628 $long_string = entity_decode($long_string);
630 for my $lang (sort keys %po_files_by_lang) {
631 my $short_translation = $translations{$lang, $short_string};
632 my $long_translation = $translations{$lang, $long_string};
634 next if (!$short_translation && !$long_translation);
636 print OUTPUT "\n$locale_start_spaces<locale name=\"$lang\">";
638 if ($short_translation)
640 $short_translation = entity_encode($short_translation);
641 print OUTPUT "$short_spaces<short>$short_translation</short>";
644 if ($long_translation)
646 $long_translation = entity_encode($long_translation);
647 print OUTPUT "$long_spaces<long>$long_translation</long>";
650 print OUTPUT "$locale_end_spaces</locale>";
654 print OUTPUT $source;