#!/usr/bin/perl ######################################################################## # # Universal filter of Russian encodings version 2.1 # created by Serge Winitzki (1997-1999). This is free software. # Home page: http://www.geocities.com/CapeCanaveral/Lab/5735/1/ # # Features: # supported encodings: alt, iso, koi, lat, mac, win; # letters 'YO' and 'yo' correctly supported in all encodings; # strict 'Russkaja Latinica' conformance for the 'lat' encoding which # allows almost unambiguous repeated native<->latinized translations of text; # faster operation on Russian input (caveat: loads the whole file into memory); # determines the required encodings from invoked script's name (alt2koi etc.) # or from option string. # # Command line options (all options are case-insensitive): # -alt2koi or -mac2win or whatever select required encodings # # Options for lat -> ... conversion: # -tex do not translate text inside $..$, $$..$$ and \command names # -wisv translate w as v (default w is tshcha) # -qisja translate q as ja (default q is tshcha) # -usekh translate kh as h (default kh='k''h') # ######################################################################## # # Installation: # if needed, edit the first line to reflect your perl location (`which perl`); # put this script somewhere on the path with executable permission; # optionally make links to this script named alt2win, win2koi etc. # (The script can determine the source/target encoding from its *name*.) # e.g. copy this file to /usr/local/bin/323 and then say # cd /usr/local/bin; chmod 755 323 # ln -s 323 alt2koi; ln -s 323 koi2alt; and so on (optional) # for all needed combinations of alt, iso, koi, mac, win, lat. # After all this, use as a filter. For example, `lat2koi < file1 > file2` # or else have to specify encoding as `323 -lat2koi < file1 > file2` # ############################# start of script ########################## # # Direct native encodings: # $rusmac='\xDD\xDE\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xDF'; $rusalt='\xF0\xF1\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF'; $ruswin='\xA8\xB8\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF'; $ruskoi='\xB3\xA3\xE1\xE2\xF7\xE7\xE4\xE5\xF6\xFA\xE9\xEA\xEB\xEC\xED\xEE\xEF\xF0\xF2\xF3\xF4\xF5\xE6\xE8\xE3\xFE\xFB\xFD\xFF\xF9\xF8\xFC\xE0\xF1\xC1\xC2\xD7\xC7\xC4\xC5\xD6\xDA\xC9\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD2\xD3\xD4\xD5\xC6\xC8\xC3\xDE\xDB\xDD\xDF\xD9\xD8\xDC\xC0\xD1'; $rusiso='\xA1\xF1\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF'; ####################### main part of the script ######################## $from="nothing"; $to="nothing"; $lat_output="no"; #whether latinized output is requested. special flag. $lat_input="no"; #same for input $help='Universal converter of Russian encodings version 2.1 Created by Serge Winitzki, 1999. No warranty. This is free software. http://www.geocities.com/CapeCanaveral/Lab/5735/1/ Supported encodings: alt, iso, koi, lat, mac, win Example usage: 323 -alt2koi < inputfile > outputfile Or rename to "xxx2xxx" where xxx is one of the supported encodings and e.g. alt2koi < inputfile > outputfile Note: latinized encoding "lat" is implemented according to the "Russkaja Latinica" scheme. See http://www.geocities.com/Athens/Forum/5344/RL/ for more details. Sample options for "lat" input: 323 -lat2koi -usekh -wisv -qisja -tex < inputfile > outputfile See the script preamble for more information. '; if ("@ARGV" =~ /-([aciklmnostw]{3})2([aciklmnostw]{3})/i) { $a1=$1; $a2=$2; $error="Incorrect encoding '$a1 -> $a2' on command line."; } else { #decide the source and target encoding based on our name $name=`basename $0`; if ($name =~ /([aciklmnostw]{3})2([aciklmnostw]{3})/i) { #this should match koi2win etc. $a1="$1"; $a2="$2"; } $error="Incorrect usage of this script, see $0 for documentation."; } if ("@ARGV" =~ /help/i) { print $help . "\n"; exit; } { if ($a1 =~ /win/i) { $from="$ruswin"; } elsif ($a1 =~ /koi/i) { $from="$ruskoi"; } elsif ($a1 =~ /alt/i) { $from="$rusalt"; } elsif ($a1 =~ /mac/i) { $from="$rusmac"; } elsif ($a1 =~ /iso/i) { $from="$rusiso"; } elsif ($a1 =~ /lat/i) { $from="$ruskoi"; #this is because our latin table is for koi $lat_input="yes"; } if ($a2 =~ /win/i) { $to="$ruswin"; } elsif ($a2 =~ /koi/i) { $to="$ruskoi"; } elsif ($a2 =~ /alt/i) { $to="$rusalt"; } elsif ($a2 =~ /mac/i) { $to="$rusmac"; } elsif ($a2 =~ /iso/i) { $to="$rusiso"; } elsif ($a2 =~ /lat/i) { $to="$ruskoi"; #this is because our latin table is for koi $lat_output="yes"; } } if ($to eq "nothing" or $from eq "nothing") { #wrong options print "$error\n$0 -help for brief usage instructions.\n"; exit 1; } undef $/; #make it convert the whole file at once, usually much faster. while() { #main loop #effectively we want to do e.g. # eval ("tr/$ruswin/$rusalt/"); #because tr requires constant strings if ($lat_input eq "yes") { &translate_lat_to_koi(); #call special procedure operating on $_ } #now $_ contains all cyrillic text and we need to transform it eval ("tr/$from/$to/"); #we need to do this now #now $_ contains correctly transformed text if ($lat_output eq "yes") { &translate_koi_to_lat(); #call special procedure operating on $_ } print; } #################### end of main part of the script #################### sub translate_koi_to_lat { #use this procedure to replace each character in $_ #using Russkaja Latinica standard (by Alexy Khabrov and Serge Winitzki, 1995) #first, break digraphs Y-A, Y-U, Y-O - just in case we get them in the text although they are ungrammatical. Insert the canonical breaking char \\. s/([\xF9\xD9])([\xE1\xEF\xF5\xC1\xCF\xD5])/$1\\$2/g; #also break the sh-ch which should rarely happen but still s/([\xFB\xDB])([\xFE\xDE])/$1\\$2/g; #second, transform letters that require combinations. Using "x" for "kha", "j'" for "i kratkoe, "shch" for "tshcha", "e'" for "e oborotnoe". s/\xB3/Yo/g; s/\xF6/Zh/g; s/\xEA/J'/g; s/\xFE/Ch/g; s/\xFB/Sh/g; s/\xFD/Shch/g; s/\xFC/E'/g; s/\xE0/Yu/g; s/\xF1/Ya/g; s/\xA3/yo/g; s/\xD6/zh/g; s/\xCA/j'/g; s/\xDE/ch/g; s/\xDB/sh/g; s/\xDD/shch/g; s/\xDC/e'/g; s/\xC0/yu/g; s/\xD1/ya/g; #then replace other letters tr/\xE1\xE2\xF7\xE7\xE4\xE5\xFA\xE9\xEB\xEC\xED\xEE\xEF\xF0\xF2\xF3\xF4\xF5\xE6\xE8\xE3\xFF\xF9\xF8\xC1\xC2\xD7\xC7\xC4\xC5\xDA\xC9\xCB\xCC\xCD\xCE\xCF\xD0\xD2\xD3\xD4\xD5\xC6\xC8\xC3\xDF\xD9\xD8/ABVGDEZIKLMNOPRSTUFXC~Y'abvgdeziklmnoprstufxc~y'/; } sub translate_lat_to_koi { #operate on $_ only %translit=( "Shch" => "\xFD", "shch" => "\xDD", "Yo" => "\xB3", "yo" => "\xA3", "Jo" => "\xB3", "jo" => "\xA3", "Zh" => "\xF6", "zh" => "\xD6", "J'" => "\xEA", "j'" => "\xCA", "J`" => "\xEA", "j`" => "\xCA", "Ch" => "\xFE", "ch" => "\xDE", "Sh" => "\xFB", "sh" => "\xDB", "E'" => "\xFC", "e'" => "\xDC", "E`" => "\xFC", "e`" => "\xDC", "`E" => "\xFC", "`e" => "\xDC", "Yu" => "\xE0", "yu" => "\xC0", "Ju" => "\xE0", "ju" => "\xC0", "Ya" => "\xF1", "ya" => "\xD1", "Ja" => "\xF1", "ja" => "\xD1", ); %malleable=( # lowercase '~' => "\xDF", '`' => "\xD8", "'" => "\xD8", '@' => "\xDC", ); %malleable_uc=( # uppercase '~' => "\xFF", '`' => "\xF8", "'" => "\xF8", '@' => "\xFC", ); $i=0; #pointer into the input string ($_) $EnglishNow=0; #state flag for the digestion machine #now need to set some options $want_tex = ("@ARGV" =~ /-tex/i) ? 1 : 0; $want_wisv = ("@ARGV" =~ /-wisv/i) ? 1 : 0; $want_qisja = ("@ARGV" =~ /-qisja/i) ? 1 : 0; $want_kh = ("@ARGV" =~ /-usekh/i) ? 1 : 0; #need to modify the tables now if ($want_kh) { $translit{"Kh"} = "\xE8"; $translit{"kh"} = "\xC8"; } $output=""; #to hold the output text while ($i < length($_)) { #loop through the input # The current char is substr($_,i,1). # Note that $i will not always advance by 1 and sometimes be changed inside &digest_some() my $doutput = &digest_some(); $i += length($doutput); $output .= $doutput; } $_ = $output; } sub digest_some { # Return next output char(s), using $i as read-only position in $_ and using flags $want_tex and $want_wisv # our state: $EnglishNow=2 if inside $$ or after '\ ', 1 if inside \command, 0 if in Russian. # the '$' and \commands are all ignored unless $want_tex my $thischar = substr($_, $i, 1); #just caching, aren't going to change it my $nextchar = substr($_, $i+1, 1); #this may be changed if ($EnglishNow == 2) { if ($want_tex) { if ($thischar . $nextchar eq '$$') { $EnglishNow= 0; return '$$'; } if ($thischar eq '$') { $EnglishNow= 0; return '$'; } } # insert any additional switchers here if ($thischar . $nextchar eq '\\ ') { #switching back to Russian $EnglishNow= 0; $i += 2; #incrementing $i here since not returning anything return ""; } # ok, English is still going on return $thischar; } # case of $EnglishNow == 2 is done if ($EnglishNow == 1 and $want_tex) { if ($thischar eq ' ' or $thischar eq '\n') { # terminates \command $EnglishNow= 0; return $thischar; } if ($thischar . $nextchar eq '$$') { $EnglishNow= 2; return '$$'; } if ($thischar eq '$') { $EnglishNow= 2; return '$'; } if ($thischar eq '\\') { if ($nextchar =~ /[0-9A-z@\\\"\':]/) { # starts another \command right after this one $EnglishNow= 1; return $thischar; } } # didn't switch to Russian, continue without translation return $thischar; } # case of $EnglishNow == 1 is done if ($EnglishNow == 0) { if ($want_tex) { if ($thischar . $nextchar eq '$$') { $EnglishNow= 2; return '$$'; } if ($thischar eq '$') { $EnglishNow= 2; return '$'; } } if ($thischar eq '\\') { if ($want_tex) { if ($nextchar =~ /[0-9A-z@\\\"\':]/) { # starts \command $EnglishNow= 1; return $thischar; } } if ($nextchar eq ' ') { # switch to English now $EnglishNow = 2; $i += 2; return ""; } if ($nextchar eq '\\') { # double backslash, skipping one $i += 1; return "\\"; } #we get a backslash in Russian mode and not followed by space #tex mode quirks and double backslashes are already done #so we should swallow it and go on with the next char $i += 1; return ""; } # End of processing backslash char # all switches have been processed, now do Russian stuff # first, the 4-letter combination for "tshcha" if (substr($_, $i, 4) eq 'shch') { #lowercase $i += 3; return $translit{'shch'}; } if (substr($_, $i, 4) =~ /shch/i) { # uppercase: we now know it's not lowercase so any case combination works $i += 3; return $translit{'Shch'}; } #now looking for digraphs $digraph = $thischar . $nextchar; $digraph =~ tr/A-Z/a-z/; #now it's all lowercase if (defined($translit{$digraph})) { # Found a digraph! if ($nextchar =~ /[A-Z]/ or $thischar =~ /[A-Z]/) { # uppercase $thischar =~ tr/a-z/A-Z/; # Clobber, clobber $nextchar =~ tr/A-Z/a-z/; $digraph = $thischar . $nextchar; } $i += 1; return $translit{$digraph}; } # now search for malleables if (defined($malleable{$thischar})) { # Found a malleable. $prevchar = ($i>0) ? substr($_, $i-1, 1) : ""; if ($thischar eq '`' or $thischar eq "'") { if (not ($prevchar =~ /[\@A-Za-z]/) and $nextchar =~ /[\@A-Za-z]/) { # ' and ` at beginning of word are not translated return $thischar; } } if ($prevchar eq '\\') { return $thischar; # ' and ` prefixed by \ are not translated } if ($prevchar eq '^') { # Special cases. return $malleable_uc{$thischar}; } if ($prevchar eq '_') { return $malleable{$thischar}; } if (($prevchar =~ /[A-Z \n\t]/ or length($prevchar) == 0) and $nextchar =~ /[A-Z \n\t]/) { return $malleable_uc{$thischar}; } return $malleable{$thischar}; } #if we are still here, we have a simple letter if ($want_qisja) { $thischar = ($thischar eq 'Q') ? $translit{'Ja'} : (($thischar eq 'q') ? $translit{'ja'} : $thischar); } if ($want_wisv) { $thischar = ($thischar eq 'W') ? 'V' : (($thischar eq 'w') ? 'v' : $thischar); } $thischar =~ tr/ABVGDEZIKLMNOPRSTUFXHCYWQJabvgdeziklmnoprstufxhcywqj/\xE1\xE2\xF7\xE7\xE4\xE5\xFA\xE9\xEB\xEC\xED\xEE\xEF\xF0\xF2\xF3\xF4\xF5\xE6\xE8\xE8\xE3\xF9\xFD\xFD\xEA\xC1\xC2\xD7\xC7\xC4\xC5\xDA\xC9\xCB\xCC\xCD\xCE\xCF\xD0\xD2\xD3\xD4\xD5\xC6\xC8\xC8\xC3\xD9\xDD\xDD\xCA/; return $thischar; } # case of EnglishNow == 0 is done } __END__