#!/usr/bin/perl
########################################################################
#
#       Universal filter of Russian encodings version 2.1
#       created by Serge Winitzki (1997-1999). This is free software.
#       Home page: http://www.geocities.com/CapeCanaveral/Lab/5735/1/
#
#       Features:
#       supported encodings: alt, iso, koi, lat, mac, win;
#       letters 'YO' and 'yo' correctly supported in all encodings;
#       strict 'Russkaja Latinica' conformance for the 'lat' encoding which
#       allows almost unambiguous repeated native<->latinized translations of text;
#       faster operation on Russian input (caveat: loads the whole file into memory);
#       determines the required encodings from invoked script's name (alt2koi etc.)
#       or from option string.
#
#       Command line options (all options are case-insensitive):
#       -alt2koi or -mac2win or whatever        select required encodings
#
#       Options for lat -> ... conversion:
#       -tex    do not translate text inside $..$, $$..$$ and \command names
#       -wisv   translate w as v (default w is tshcha)
#       -qisja  translate q as ja (default q is tshcha)
#       -usekh  translate kh as h (default kh='k''h')
#       
########################################################################
#
#       Installation:
#       if needed, edit the first line to reflect your perl location (`which perl`);
#       put this script somewhere on the path with executable permission;
#       optionally make links to this script named alt2win, win2koi etc.
#       (The script can determine the source/target encoding from its *name*.)
#       e.g. copy this file to /usr/local/bin/323 and then say
#       cd /usr/local/bin; chmod 755 323
#       ln -s 323 alt2koi; ln -s 323 koi2alt; and so on (optional)
#       for all needed combinations of alt, iso, koi, mac, win, lat.
#       After all this, use as a filter. For example, `lat2koi < file1 > file2`
#       or else have to specify encoding as `323 -lat2koi < file1 > file2`
#
############################# start of script ##########################
#
#       Direct native encodings:
#
$rusmac='\xDD\xDE\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xDF';
$rusalt='\xF0\xF1\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF';
$ruswin='\xA8\xB8\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF';
$ruskoi='\xB3\xA3\xE1\xE2\xF7\xE7\xE4\xE5\xF6\xFA\xE9\xEA\xEB\xEC\xED\xEE\xEF\xF0\xF2\xF3\xF4\xF5\xE6\xE8\xE3\xFE\xFB\xFD\xFF\xF9\xF8\xFC\xE0\xF1\xC1\xC2\xD7\xC7\xC4\xC5\xD6\xDA\xC9\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD2\xD3\xD4\xD5\xC6\xC8\xC3\xDE\xDB\xDD\xDF\xD9\xD8\xDC\xC0\xD1';
$rusiso='\xA1\xF1\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF';

####################### main part of the script ########################

$from="nothing";
$to="nothing";
$lat_output="no";       #whether latinized output is requested. special flag.
$lat_input="no";        #same for input

$help='Universal converter of Russian encodings version 2.1
Created by Serge Winitzki, 1999. No warranty. This is free software.
http://www.geocities.com/CapeCanaveral/Lab/5735/1/

   Supported encodings: alt, iso, koi, lat, mac, win
   Example usage:

        323 -alt2koi < inputfile > outputfile

   Or rename to "xxx2xxx" where xxx is one of the supported encodings and e.g.

        alt2koi < inputfile > outputfile

   Note: latinized encoding "lat" is implemented according to the "Russkaja
Latinica" scheme. See http://www.geocities.com/Athens/Forum/5344/RL/ for
more details. Sample options for "lat" input:

        323 -lat2koi -usekh -wisv -qisja -tex < inputfile > outputfile

   See the script preamble for more information.
';

if ("@ARGV" =~ /-([aciklmnostw]{3})2([aciklmnostw]{3})/i) {
        $a1=$1;
        $a2=$2;
        $error="Incorrect encoding '$a1 -> $a2' on command line.";
} else {
        #decide the source and target encoding based on our name
        $name=`basename $0`;
        if ($name =~ /([aciklmnostw]{3})2([aciklmnostw]{3})/i) {        #this should match koi2win etc.
        $a1="$1";
        $a2="$2";
        }
        $error="Incorrect usage of this script, see $0 for documentation.";
}

if ("@ARGV" =~ /help/i) {
        print $help . "\n";
        exit;
}


{
        if ($a1 =~ /win/i) {
                $from="$ruswin";
        } elsif ($a1 =~ /koi/i) {
                $from="$ruskoi";
        } elsif ($a1 =~ /alt/i) {
                $from="$rusalt";
        } elsif ($a1 =~ /mac/i) {
                $from="$rusmac";
        } elsif ($a1 =~ /iso/i) {
                $from="$rusiso";
        } elsif ($a1 =~ /lat/i) {
                $from="$ruskoi";        #this is because our latin table is for koi
                $lat_input="yes";
        }
        if ($a2 =~ /win/i) {
                $to="$ruswin";
        } elsif ($a2 =~ /koi/i) {
                $to="$ruskoi";
        } elsif ($a2 =~ /alt/i) {
                $to="$rusalt";
        } elsif ($a2 =~ /mac/i) {
                $to="$rusmac";
        } elsif ($a2 =~ /iso/i) {
                $to="$rusiso";
        } elsif ($a2 =~ /lat/i) {
                $to="$ruskoi";  #this is because our latin table is for koi
                $lat_output="yes";
   }

}

if ($to eq "nothing" or $from eq "nothing") {   #wrong options
        print "$error\n$0 -help for brief usage instructions.\n";
        exit 1;
}

undef $/;       #make it convert the whole file at once, usually much faster.

while() {        #main loop

#effectively we want to do e.g.
# eval ("tr/$ruswin/$rusalt/"); #because tr requires constant strings


        if ($lat_input eq "yes") {
                &translate_lat_to_koi();        #call special procedure operating on $_
        }
        #now $_ contains all cyrillic text and we need to transform it
        eval ("tr/$from/$to/"); #we need to do this now
        #now $_ contains correctly transformed text
        if ($lat_output eq "yes") {
                &translate_koi_to_lat();        #call special procedure operating on $_
        }
        print;
}

#################### end of main part of the script ####################

sub translate_koi_to_lat {

#use this procedure to replace each character in $_

#using Russkaja Latinica standard (by Alexy Khabrov and Serge Winitzki, 1995)

#first, break digraphs Y-A, Y-U, Y-O - just in case we get them in the text although they are ungrammatical. Insert the canonical breaking char \\.

        s/([\xF9\xD9])([\xE1\xEF\xF5\xC1\xCF\xD5])/$1\\$2/g;

#also break the sh-ch which should rarely happen but still

        s/([\xFB\xDB])([\xFE\xDE])/$1\\$2/g;

#second, transform letters that require combinations. Using "x" for "kha", "j'" for "i kratkoe, "shch" for "tshcha", "e'" for "e oborotnoe".
        s/\xB3/Yo/g;
        s/\xF6/Zh/g;
        s/\xEA/J'/g;
        s/\xFE/Ch/g;
        s/\xFB/Sh/g;
        s/\xFD/Shch/g;
        s/\xFC/E'/g;
        s/\xE0/Yu/g;
        s/\xF1/Ya/g;

        s/\xA3/yo/g;
        s/\xD6/zh/g;
        s/\xCA/j'/g;
        s/\xDE/ch/g;
        s/\xDB/sh/g;
        s/\xDD/shch/g;
        s/\xDC/e'/g;
        s/\xC0/yu/g;
        s/\xD1/ya/g;

#then replace other letters

tr/\xE1\xE2\xF7\xE7\xE4\xE5\xFA\xE9\xEB\xEC\xED\xEE\xEF\xF0\xF2\xF3\xF4\xF5\xE6\xE8\xE3\xFF\xF9\xF8\xC1\xC2\xD7\xC7\xC4\xC5\xDA\xC9\xCB\xCC\xCD\xCE\xCF\xD0\xD2\xD3\xD4\xD5\xC6\xC8\xC3\xDF\xD9\xD8/ABVGDEZIKLMNOPRSTUFXC~Y'abvgdeziklmnoprstufxc~y'/;
}

sub translate_lat_to_koi {      #operate on $_ only

        %translit=(
                "Shch" => "\xFD",
                "shch" => "\xDD",
                "Yo" => "\xB3",
                "yo" => "\xA3",
                "Jo" => "\xB3",
                "jo" => "\xA3",
                "Zh" => "\xF6",
                "zh" => "\xD6",
                "J'" => "\xEA",
                "j'" => "\xCA",
                "J`" => "\xEA",
                "j`" => "\xCA",
                "Ch" => "\xFE",
                "ch" => "\xDE",
                "Sh" => "\xFB",
                "sh" => "\xDB",
                "E'" => "\xFC",
                "e'" => "\xDC",
                "E`" => "\xFC",
                "e`" => "\xDC",
                "`E" => "\xFC",
                "`e" => "\xDC",
                "Yu" => "\xE0",
                "yu" => "\xC0",
                "Ju" => "\xE0",
                "ju" => "\xC0",
                "Ya" => "\xF1",
                "ya" => "\xD1",
                "Ja" => "\xF1",
                "ja" => "\xD1",
        );

        %malleable=(    # lowercase
                '~' => "\xDF",
                '`' => "\xD8",
                "'" => "\xD8",
                '@' => "\xDC",
        );

        %malleable_uc=( # uppercase
                '~' => "\xFF",
                '`' => "\xF8",
                "'" => "\xF8",
                '@' => "\xFC",
        );

        $i=0;   #pointer into the input string ($_)
        
        $EnglishNow=0;  #state flag for the digestion machine
        #now need to set some options
        $want_tex = ("@ARGV" =~ /-tex/i) ? 1 : 0;
        $want_wisv = ("@ARGV" =~ /-wisv/i) ? 1 : 0;
        $want_qisja = ("@ARGV" =~ /-qisja/i) ? 1 : 0;
        $want_kh = ("@ARGV" =~ /-usekh/i) ? 1 : 0;
        
        #need to modify the tables now
        if ($want_kh) {
                $translit{"Kh"} = "\xE8";
                $translit{"kh"} = "\xC8";
        }
                
        $output="";     #to hold the output text

        while ($i < length($_)) {       #loop through the input
                # The current char is substr($_,i,1).
                # Note that $i will not always advance by 1 and sometimes be changed inside &digest_some()
                my $doutput = &digest_some();
                $i += length($doutput);
                $output .= $doutput;
        }
        $_ = $output;
}

sub digest_some {       # Return next output char(s), using $i as read-only position in $_ and using flags $want_tex and $want_wisv

# our state: $EnglishNow=2 if inside $$ or after '\ ', 1 if inside \command, 0 if in Russian.
# the '$' and \commands are all ignored unless $want_tex
        my $thischar = substr($_, $i, 1);       #just caching, aren't going to change it
        my $nextchar = substr($_, $i+1, 1);     #this may be changed

        if ($EnglishNow == 2) {
          if ($want_tex) {
                if ($thischar . $nextchar eq '$$') {
                        $EnglishNow= 0;
                        return '$$';
                }
                if ($thischar eq '$') {
                        $EnglishNow= 0;
                        return '$';
                }
          }
                # insert any additional switchers here
                if ($thischar . $nextchar eq '\\ ') {
                        #switching back to Russian
                        $EnglishNow= 0;
                        $i += 2;        #incrementing $i here since not returning anything
                        return "";
                }
                # ok, English is still going on
                return $thischar;
        } # case of $EnglishNow == 2 is done    

        if ($EnglishNow == 1 and $want_tex) {
                if ($thischar eq ' ' or $thischar eq '\n') {    # terminates \command
                        $EnglishNow= 0;
                        return $thischar;
                }
                if ($thischar . $nextchar eq '$$') {
                        $EnglishNow= 2;
                        return '$$';
                }
                if ($thischar eq '$') {
                        $EnglishNow= 2;
                        return '$';
                }
                if ($thischar eq '\\') {
                        if ($nextchar =~ /[0-9A-z@\\\"\':]/) { # starts another \command right after this one
                                $EnglishNow= 1;
                                return $thischar;
                        }
                }       
                # didn't switch to Russian, continue without translation
                return $thischar;
        } # case of $EnglishNow == 1 is done
        
        if ($EnglishNow == 0) {
          if ($want_tex) {
                if ($thischar . $nextchar eq '$$') {
                        $EnglishNow= 2;
                        return '$$';
                }
                if ($thischar eq '$') {
                        $EnglishNow= 2;
                        return '$';
                }
          }
          if ($thischar eq '\\') {
                if ($want_tex) {
                        if ($nextchar =~ /[0-9A-z@\\\"\':]/) { # starts \command
                                $EnglishNow= 1;
                                return $thischar;
                        }
                }
                if ($nextchar eq ' ') { # switch to English now
                        $EnglishNow = 2;
                        $i += 2;
                        return "";
                }
                if ($nextchar eq '\\') {        # double backslash, skipping one
                        $i += 1;
                        return "\\";
                }
                
                #we get a backslash in Russian mode and not followed by space
                #tex mode quirks and double backslashes are already done
                #so we should swallow it and go on with the next char
                $i += 1;
                return "";
          }     # End of processing backslash char
                # all switches have been processed, now do Russian stuff
                
                # first, the 4-letter combination for "tshcha"
                
                if (substr($_, $i, 4) eq 'shch') {      #lowercase
                        $i += 3;
                        return $translit{'shch'};
                }
                
                if (substr($_, $i, 4) =~ /shch/i) {     # uppercase: we now know it's not lowercase so any case combination works
                        $i += 3;
                        return $translit{'Shch'};
                }
                
                #now looking for digraphs
                $digraph = $thischar . $nextchar;
                $digraph =~ tr/A-Z/a-z/;        #now it's all lowercase
                if (defined($translit{$digraph})) {     # Found a digraph!
                        if ($nextchar =~ /[A-Z]/ or $thischar =~ /[A-Z]/) {     # uppercase
                                $thischar =~ tr/a-z/A-Z/;       # Clobber, clobber
                                $nextchar =~ tr/A-Z/a-z/;
                                $digraph = $thischar . $nextchar;
                        }
                        $i += 1;
                        return $translit{$digraph};
                }

                # now search for malleables
                if (defined($malleable{$thischar})) {   # Found a malleable.
                        $prevchar = ($i>0) ? substr($_, $i-1, 1) : "";
                        if ($thischar eq '`' or $thischar eq "'") {
                                if (not ($prevchar =~ /[\@A-Za-z]/) and $nextchar =~ /[\@A-Za-z]/) {    # ' and ` at beginning of word are not translated
                                        return $thischar;
                                }
                                
                        }
                        if ($prevchar eq '\\') {
                                return $thischar;       # ' and ` prefixed by \ are not translated
                        }
                        if ($prevchar eq '^') { # Special cases.
                                return $malleable_uc{$thischar};
                        }
                        if ($prevchar eq '_') {
                                return $malleable{$thischar};
                        }
                        if (($prevchar =~ /[A-Z \n\t]/ or length($prevchar) == 0) and $nextchar =~ /[A-Z \n\t]/) {
                                return $malleable_uc{$thischar};
                        }
                        return $malleable{$thischar};
                }

                #if we are still here, we have a simple letter
                if ($want_qisja) {
                        $thischar = ($thischar eq 'Q') ? $translit{'Ja'} : (($thischar eq 'q') ? $translit{'ja'} : $thischar);
                }
                if ($want_wisv) {
                        $thischar = ($thischar eq 'W') ? 'V' : (($thischar eq 'w') ? 'v' : $thischar);
                }
                $thischar =~ tr/ABVGDEZIKLMNOPRSTUFXHCYWQJabvgdeziklmnoprstufxhcywqj/\xE1\xE2\xF7\xE7\xE4\xE5\xFA\xE9\xEB\xEC\xED\xEE\xEF\xF0\xF2\xF3\xF4\xF5\xE6\xE8\xE8\xE3\xF9\xFD\xFD\xEA\xC1\xC2\xD7\xC7\xC4\xC5\xDA\xC9\xCB\xCC\xCD\xCE\xCF\xD0\xD2\xD3\xD4\xD5\xC6\xC8\xC8\xC3\xD9\xDD\xDD\xCA/;
                return $thischar;

        } # case of EnglishNow == 0 is done

}
__END__