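# NOTE: the next three lines are residue from the web file manager this file
# was copied out of. They are not part of the module and are kept only as
# comments so that the file parses as Perl.
# |
# Server : Apache/2.4.62 System : FreeBSD fbsdweb2.web.rcn.net 14.1-RELEASE FreeBSD 14.1-RELEASE releng/14.1-n267679-10e31f0946d8 GENERIC amd64 User : www ( 80) PHP Version : 8.3.8 Disable Function : NONE Directory : /domains/mandarintools/download/ |
# Upload File : |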
# -*- coding: cn-big5; -*-
package ChineseNumbers;
require Exporter;
use strict;
use subs qw{EnglishToChineseNumber ChineseToEnglishNumber};
# Author: Erik Peterson
# E-mail: [email protected]
# Source: http://www.mandarintools.com/numbers.html
#
# Usage:
#
# use ChineseNumbers;
#
# ChineseNumbers->EnglishToChineseNumber(enumber, [output_type])
# enumber is an integer
# output_type (which is optional) can be
# big5 : Output Chinese using Big5
# formalb5 : Output as formal numbers in Big5
# gb : Output using GB
# formalgb : Output as formal numbers in GB (not working yet)
# utf8 : Output as (traditional character) UTF-8
# unicodehex: Output as 4-digit Unicode hex blocks
# pinyin : Output as Hanyu Pinyin
# jyutpin : Output as Cantonese jyutpin romanization
# yalecant : Output as Cantonese Yale romanization
# The default is big5
#
# ChineseNumbers->ChineseToEnglishNumber(cnumber, [input_type])
# cnumber is a string in GB, Big5, or UTF-8
# input_type is "big5", "gb", or "utf8", depending on cnumber
# default is "big5"
#
# ChineseNumbers->chinese_output([option])
# Set the default output type used by EnglishToChineseNumber
# option can be any of the output options for EnglishToChineseNumber
# If no arguments, returns the current default
#
# ChineseNumbers->chinese_input([option])
# Set the default input type used by ChineseToEnglishNumber
# option can be "big5", "gb", or "utf8"
# If no arguments, returns the current default
# Nothing needs to run at compile time.
BEGIN { }
# Core lookup data. The file declares itself as Big5 (see the coding line at
# the top) and the conversion routines below step through strings two bytes
# at a time, so every Chinese literal here is expected to be a two-byte
# sequence.

# Sign character prepended to negative numbers.
my $minus = "�t";
# Digit characters indexed by their value, 0 through 9.
my @digits = ("�s", "�@", "�G", "�T", "�|", "��", "��", "�C", "�K", "�E");
# Reverse lookup: digit character => value. Two different characters map
# to 2 (presumably the second is the alternate "two" held in $ALTTWO --
# TODO confirm against the original bytes).
my %digits = ("�s", 0,
"�@", 1,
"�G", 2,
"��", 2,
"�T", 3,
"�|", 4,
"��", 5,
"��", 6,
"�C", 7,
"�K", 8,
"�E", 9);
# Place markers used inside one four-digit group: index 0 is the tens
# marker, 1 the hundreds marker, 2 the thousands marker.
my @beforeWan = ("�Q", "��", "�d");
# Group markers, indexed by (power of ten) / 4. Index 0 is empty because
# the lowest group carries no marker.
my @afterWan = ("", "�U", "��", "��", "��");
# Marker => value lookups for the two arrays above. NOTE(review): neither
# hash is referenced by any subroutine in this file.
my %beforeWan = ("�Q", 10,
"��", 100,
"�d", 1000);
my %afterWan = ("�U", 10000,
"��", 100000000,
"��", 1000000000000,
"��", 10000000000000000);
# Alternate character for "two". NOTE(review): only mentioned in
# commented-out code inside EnglishToChineseNumber; never emitted.
my $ALTTWO = "��";
# Numeric base used when splitting a number into decimal digits.
my $TEN = 10;
# Module-wide defaults, read and changed through chinese_output() and
# chinese_input().
my $default_outputtype = "big5";
my $default_inputtype = "big5";
# Traditional (Big5) character => simplified (GB) character. Used by
# EnglishToChineseNumber for the "gb" and "formalgb" output types.
my %trad2simp = ("�t" => "��",
"�s" => "��",
"�@" => "һ",
"�G" => "��",
"�T" => "�",
"�|" => "��",
"��" => "��",
"��" => "��",
"�C" => "��",
"�K" => "��",
"�E" => "��",
"�Q" => "ʮ",
"��" => "��",
"�d" => "ǧ",
"�U" => "��",
"��" => "��",
"��" => "��",
"��" => "��");
# Simplified (GB) character => traditional (Big5) character. Used by
# ChineseToEnglishNumber to normalise "gb" input before parsing.
my %simp2trad = ("��" => "�t",
"��" => "�s",
"һ" => "�@",
"��" => "�G",
"�" => "�T",
"��" => "�|",
"��" => "��",
"��" => "��",
"��" => "�C",
"��" => "�K",
"��" => "�E",
"ʮ" => "�Q",
"��" => "��",
"ǧ" => "�d",
"��" => "�U",
"��" => "��",
"��" => "��",
"��" => "��");
# Ordinary traditional character => formal numeral, both in Big5. Used by
# EnglishToChineseNumber for the "formalb5" output type.
my %trad2formal = ("�t" => "�t",
"�s" => "�s",
"�@" => "��",
"�G" => "�L",
"�T" => "��",
"�|" => "�v",
"��" => "��",
"��" => "��",
"�C" => "�m",
"�K" => "��",
"�E" => "�h",
"�Q" => "�B",
"��" => "��",
"�d" => "�a",
"�U" => "�U",
"��" => "��",
"��" => "��",
"��" => "��");
# Placeholder for formal numerals in GB: every value is the empty string.
# The only reference to this table is commented out in
# EnglishToChineseNumber, which uses %trad2simp for "formalgb" instead.
my %trad2formalgb = ("�t" => "",
"�s" => "",
"�@" => "",
"�G" => "",
"�T" => "",
"�|" => "",
"��" => "",
"��" => "",
"�C" => "",
"�K" => "",
"�E" => "",
"�Q" => "",
"��" => "",
"�d" => "",
"�U" => "",
"��" => "",
"��" => "",
"��" => "");
# Traditional (Big5) character => Unicode code point as four upper-case hex
# digits. Used for the "utf8" and "unicodehex" output types.
my %trad2unicode = ("�t" => "8CA0",
"�s" => "96F6",
"�@" => "4E00",
"�G" => "4E8C",
"�T" => "4E09",
"�|" => "56DB",
"��" => "4E94",
"��" => "516D",
"�C" => "4E03",
"�K" => "516B",
"�E" => "4E5D",
"�Q" => "5341",
"��" => "767E",
"�d" => "5343",
"�U" => "842C",
"��" => "5104",
"��" => "5146",
"��" => "5169");
# Unicode code point (four upper-case hex digits, as returned by utf82hex)
# => traditional (Big5) character. Entries tagged "simp" are the simplified
# code points, accepted so that simplified UTF-8 input maps to the same
# internal characters.
my %unicode2trad = ("8CA0" => "�t",
"8D1F" => "�t", # simp
"96F6" => "�s",
"4E00" => "�@",
"4E8C" => "�G",
"4E09" => "�T",
"56DB" => "�|",
"4E94" => "��",
"516D" => "��",
"4E03" => "�C",
"516B" => "�K",
"4E5D" => "�E",
"5341" => "�Q",
"767E" => "��",
"5343" => "�d",
"842C" => "�U",
"4E07" => "�U", # simp
"5104" => "��",
"4EBF" => "��", # simp
"5146" => "��",
"5169" => "��", # trad (the code point %trad2unicode emits)
"4E24" => "��"); # simp
# Romanization tables, keyed by traditional (Big5) character.
# EnglishToChineseNumber looks up one syllable per character and appends a
# single space after each one.

# Hanyu Pinyin with tone numbers ("pinyin" output type).
my %trad2pinyin = ("�t" => "fu4",
"�s" => "ling2",
"�@" => "yi1",
"�G" => "er4",
"�T" => "san1",
"�|" => "si4",
"��" => "wu3",
"��" => "liu4",
"�C" => "qi1",
"�K" => "ba1",
"�E" => "jiu3",
"�Q" => "shi2",
"��" => "bai3",
"�d" => "qian1",
"�U" => "wan4",
"��" => "yi4",
"��" => "zhao4",
"��" => "liang3");
# Cantonese Yale romanization ("yalecant" output type). Some syllables
# carry no tone digit; the values are reproduced as found.
my %trad2yalecant = ("�t" => "fu",
"�s" => "ling2",
"�@" => "yat",
"�G" => "yih7",
"�T" => "saam1",
"�|" => "sei5",
"��" => "ng4",
"��" => "luhk",
"�C" => "chat1",
"�K" => "baat1",
"�E" => "gao3",
"�Q" => "sap7",
"��" => "baak5",
"�d" => "chin1",
"�U" => "maahn",
"��" => "yik1",
"��" => "siu",
"��" => "leung4");
# Cantonese romanization selected by the "jyutpin" output type.
my %trad2jyutpin = ("�t" => "fu6",
"�s" => "ling4",
"�@" => "jat1",
"�G" => "ji6",
"�T" => "saam1",
"�|" => "sei3",
"��" => "ng5",
"��" => "luk6",
"�C" => "cat1",
"�K" => "baat3",
"�E" => "gau2",
"�Q" => "sap6",
"��" => "baak3",
"�d" => "cin1",
"�U" => "maan6",
"��" => "jik1",
"��" => "siu6",
"��" => "loeng5");
# Constructor: returns a new, empty object.
#
# The invocant may be a class name (ChineseNumbers->new), an existing
# object ($obj->new, which yields another object of the same class), or
# absent (plain function call, which falls back to this package).
# Blessing into the invocant's class, rather than always into the current
# package, lets subclasses inherit the constructor.
sub new {
    my($class) = shift;
    $class = ref($class) if ref($class);
    $class = __PACKAGE__ unless defined($class) && $class ne "";
    return bless {}, $class;
}
# The heart of the program. Does the actual conversion
# Convert an integer into a Chinese numeral string.
#
# Arguments (after the invocant):
#   enumber    - the number to convert. Every character other than digits,
#                "." and "-" is removed first (so "1,234" works), and any
#                fractional part is then truncated towards zero.
#   outputtype - optional, case-insensitive: big5, formalb5, gb, formalgb,
#                utf8, unicodehex, pinyin, jyutpin or yalecant. When it is
#                missing or empty the module default (see chinese_output)
#                is used. An unrecognised type produces Big5 output.
#
# Returns the converted string. For unicodehex, pinyin, jyutpin and
# yalecant every element is followed by one space, including the last.
# "formalgb" currently produces the same output as "gb".
sub EnglishToChineseNumber {
    my($self) = shift;
    my($enumber) = shift;
    my($outputtype) = shift;

    if (!defined($outputtype) || $outputtype eq "") {
        $outputtype = $default_outputtype;
    }
    $outputtype = lc($outputtype);

    my(@powers) = ();     # $powers[$p] is the decimal digit at 10**$p
    my($power) = 0;       # number of decimal digits once decomposed
    my($negative) = 0;    # true when the input was below zero
    my($inzero) = 0;      # inside a run of zeros that already got its zero
    my($canaddzero) = 0;  # a non-zero digit exists to the right
    my($cnumber) = "";    # result, built right to left in Big5

    # Strip formatting characters, then drop any fraction. Without the
    # truncation a value such as 0.5 passed the zero test below but
    # produced no digits at all, so the function returned an empty string
    # (or a lone minus sign for -0.5).
    $enumber = "" unless defined($enumber);
    $enumber =~ s/[^0-9\.-]//g;
    $enumber = int($enumber);

    # Zero has no decomposition; answer directly.
    if ($enumber == 0) {
        return $digits[0];
    }

    # Remember the sign and continue with the magnitude.
    if ($enumber < 0) {
        $negative = 1;
        $enumber = -$enumber;
    }

    # Peel off one decimal digit per pass, lowest power first. After each
    # pass $enumber has zeros in every position already handled.
    while ($TEN ** $power <= $enumber) {
        my($unit) = $TEN ** $power;
        my($remainder) = $enumber % ($TEN ** ($power + 1));
        $powers[$power] = $remainder / $unit;
        $enumber -= $remainder;
        $power++;
    }

    # Assemble the Chinese text from the lowest power upwards, always
    # prepending to $cnumber.
    my($i);
    for ($i = 0; $i < $power; $i++) {
        if (($i % 4) == 0) {
            # First position of a four-digit group: carries the group marker.
            if ($powers[$i] != 0) {
                $inzero = 0;
                $canaddzero = 1;
                $cnumber = $digits[$powers[$i]] . $afterWan[$i/4] . $cnumber;
            } else {
                # The group marker is still needed when any of the other
                # three digits of this group is non-zero.
                if ((($i+3 < $power) && $powers[$i+3] != 0) ||
                    (($i+2 < $power) && $powers[$i+2] != 0) ||
                    (($i+1 < $power) && $powers[$i+1] != 0))
                {
                    $cnumber = $afterWan[$i/4] . $cnumber;
                    $canaddzero = 0;
                }
            }
        } else {
            # Tens, hundreds or thousands position inside a group.
            if ($powers[$i] != 0) {
                $inzero = 0;
                $canaddzero = 1;
                if ($power == 2 && $i == 1 && $powers[$i] == 1) {
                    # 10 through 19 are written without a leading "one".
                    $cnumber = $beforeWan[($i % 4)-1] . $cnumber;
                } else {
                    # The alternate "two" ($ALTTWO) is never emitted here.
                    $cnumber = $digits[$powers[$i]] . $beforeWan[($i%4)-1] . $cnumber;
                }
            } else {
                # One zero character stands for a whole run of zeros, and
                # only when something non-zero lies to its right.
                if ($canaddzero == 1 && $inzero == 0) {
                    $inzero = 1;
                    $cnumber = $digits[0] . $cnumber;
                }
            }
        }
    }

    if ($negative == 1) {
        $cnumber = $minus . $cnumber;
    }

    # Output types that are a plain per-character table lookup:
    # [ lookup table, text appended after every converted character ].
    my(%conversion) = (
        "gb"         => [\%trad2simp,     ""],
        "formalb5"   => [\%trad2formal,   ""],
        "formalgb"   => [\%trad2simp,     ""],   # %trad2formalgb is not filled in yet
        "unicodehex" => [\%trad2unicode,  " "],
        "pinyin"     => [\%trad2pinyin,   " "],
        "jyutpin"    => [\%trad2jyutpin,  " "],
        "yalecant"   => [\%trad2yalecant, " "],
    );

    my($result) = "";
    my($j);
    if ($outputtype eq "utf8") {
        # Each two-byte character goes via its code point to UTF-8 bytes.
        for ($j = 0; $j < length($cnumber); $j += 2) {
            $result .= hex2utf8($trad2unicode{substr($cnumber, $j, 2)});
        }
    } elsif (exists($conversion{$outputtype})) {
        my($table, $separator) = @{$conversion{$outputtype}};
        for ($j = 0; $j < length($cnumber); $j += 2) {
            $result .= $table->{substr($cnumber, $j, 2)} . $separator;
        }
    } else {
        # "big5" and any unrecognised type: the internal form is the answer.
        $result = $cnumber;
    }
    return $result;
}
# Convert a Chinese numeral string into a number.
#
# Arguments (after the invocant):
#   inputnumber - the Chinese text.
#   inputtype   - optional, case-insensitive: "big5", "gb" or "utf8". When
#                 it is missing or empty the module default (see
#                 chinese_input) is used, mirroring how
#                 EnglishToChineseNumber treats its output type. Any other
#                 value is treated as Big5.
#
# The input is first normalised to the internal two-byte traditional form.
# A string of more than one character made up solely of digit characters
# is read positionally (one decimal digit per character) by
# ChineseToEnglishBrief; everything else is parsed by
# ChineseToEnglishFull, whose return value is passed through unchanged.
sub ChineseToEnglishNumber {
    my($self) = shift;
    my($inputnumber) = shift;
    my($inputtype) = shift;

    if (!defined($inputtype) || $inputtype eq "") {
        $inputtype = $default_inputtype;
    }
    $inputtype = lc($inputtype);

    my($cnumber) = "";
    my($j);
    if ($inputtype eq "gb") {
        # GB characters are two bytes each.
        for ($j = 0; $j < length($inputnumber); $j += 2) {
            $cnumber .= $simp2trad{substr($inputnumber, $j, 2)};
        }
    } elsif ($inputtype eq "utf8") {
        # Every character handled by this module is three bytes in UTF-8.
        for ($j = 0; $j < length($inputnumber); $j += 3) {
            $cnumber .= $unicode2trad{utf82hex(substr($inputnumber, $j, 3))};
        }
    } else {
        $cnumber = $inputnumber;
    }

    # Only strings longer than one character can be a positional digit run.
    my($alldigits) = 0;
    if (length($cnumber) > 2) {
        $alldigits = 1;
        my($i);
        for ($i = 0; $i < length($cnumber); $i += 2) {
            if (!defined($digits{substr($cnumber, $i, 2)})) {
                $alldigits = 0;
                last;    # one non-digit settles the question
            }
        }
    }

    if ($alldigits == 1) {
        return ChineseToEnglishBrief($cnumber);
    }
    return ChineseToEnglishFull($cnumber);
}
# Read a string made up only of digit characters as a positional decimal
# number: the first two-byte character is the most significant digit.
# Takes the string as its single argument and returns the numeric value.
sub ChineseToEnglishBrief {
    my($cnumber) = shift;
    my($sum) = 0;
    # Exponent of the leading digit; it drops by one per character.
    my($exponent) = length($cnumber) / 2 - 1;
    my($offset);
    for ($offset = 0; $offset < length($cnumber); $offset += 2) {
        $sum += $digits{substr($cnumber, $offset, 2)} * (10 ** $exponent);
        $exponent--;
    }
    return $sum;
}
# Parse a Chinese numeral written with place and group markers.
#
# Takes the numeral in the internal two-byte form as its single argument
# and walks it two bytes at a time, left to right. Returns the numeric
# value, or the empty string (after printing a message to STDERR) when an
# unrecognised character is found.
#
# State kept while scanning:
#   $leveltotal - value collected since the last group marker
#   $total      - sum of all completed groups
#   $power      - power of ten applied to whatever is still in $leveltotal
#                 when the string ends
sub ChineseToEnglishFull {
my($cnumber) = shift;
my($negative) = 0;
my($cnumlength) = length($cnumber);
my($i);
my($j, $digitval, $cchar);
my($power) = 0;
my($leveltotal) = 0;
my($total);
my($nextcchar);
#print "In Chinese to English Full<BR>";
for ($i = 0; $i < $cnumlength; $i+=2) {
#print "$i ";
$cchar = substr($cnumber, $i, 2);
#print "$cchar $leveltotal $power<BR>";
# A leading sign character marks the number as negative.
# NOTE(review): the first comparison is against the empty string, which
# substr cannot return at offset 0 of a non-empty string; the literal may
# have lost its bytes in transcoding -- confirm against the original file.
if ($i == 0 && ($cchar eq "" || $cchar eq '�t')) {
$negative = 1;
} elsif ($i == 0 && $cchar eq '��') {
# Do nothing for now
# Group marker worth 10**12. A marker with nothing collected before it
# counts as one unit. Afterwards $power is left one group lower (8) so that
# a trailing value with no marker of its own is scaled by that group.
} elsif ($cchar eq '��') {
$power = 12;
$leveltotal = 1 if $leveltotal == 0;
$total += $leveltotal * (10 ** $power);
$leveltotal = 0;
$power -= 4;
#$power = 0;
# Group marker worth 10**8; same handling, leaving $power at 4.
} elsif ($cchar eq '��') {
$power = 8;
$leveltotal = 1 if $leveltotal == 0;
$total += $leveltotal * (10** $power);
$leveltotal = 0;
$power -= 4;
#$power = 0;
# Group marker worth 10**4; same handling, leaving $power at 0.
} elsif ($cchar eq '�U') {
$power = 4;
$leveltotal = 1 if $leveltotal == 0;
$total += $leveltotal * (10**$power);
$leveltotal = 0;
$power -= 4;
#$power = 0;
# A place marker reached here was not consumed by a preceding digit (the
# digit branch below skips the marker it pairs with), so it counts once.
} elsif ($cchar eq '�d') {
$leveltotal += 1000;
} elsif ($cchar eq "��") {
$leveltotal += 100;
} elsif ($cchar eq "�Q") {
$leveltotal += 10;
# Adds nothing to the value, but resets $power so that what follows is
# taken at face value rather than scaled by the previous group.
} elsif ($cchar eq "�s") {
$power = 0;
# Digit characters. NOTE(review): as rendered here the first alternative
# is the same literal as the branch above and so could never match in this
# branch -- confirm against the original bytes.
} elsif ($cchar eq "�s" ||
$cchar eq "�@" ||
$cchar eq "��" ||
$cchar eq "�G" ||
$cchar eq "�T" ||
$cchar eq "�|" ||
$cchar eq "��" ||
$cchar eq "��" ||
$cchar eq "�C" ||
$cchar eq "�K" ||
$cchar eq "�E") {
$digitval = $digits{substr($cnumber, $i, 2)};
#print "Digit val is $digitval<BR>\n";
# Look one character ahead: a digit directly followed by a place marker
# is multiplied by that marker, and the marker is consumed by advancing $i.
if ($i+2 < $cnumlength) {
$nextcchar = substr($cnumber, $i+2, 2);
if ($nextcchar eq "�Q") {
$leveltotal += $digitval * 10;
$i+=2;
} elsif ($nextcchar eq "��") {
$leveltotal += $digitval * 100;
$i+=2;
} elsif ($nextcchar eq "�d") {
$leveltotal += $digitval * 1000;
$i+=2;
} else {
$leveltotal += $digitval;
}
} else {
# Last character of the string: a plain units digit.
$leveltotal += $digitval;
}
} else {
# Anything else is not part of a number: report it and give up.
print STDERR "Seems to be an error in the number. $cnumber\n";
return "";
# return negative infinity;
}
}
# Catch remaining leveltotal
# It is scaled by whatever $power was left at by the last group marker or
# zero character.
#print("Level total " + $leveltotal + " power " + $power + " ten to power " + (10**$power)/10);
$total += $leveltotal * 10** $power;
if ($negative == 1) { $total = -$total; }
return $total;
}
# Get or set the module-wide default output type used by
# EnglishToChineseNumber. With an argument after the invocant, that value
# becomes the new default; the default in effect afterwards is returned.
sub chinese_output {
    my($self, @args) = @_;
    $default_outputtype = $args[0] if @args;
    return $default_outputtype;
}
# Get or set the module-wide default input type used by
# ChineseToEnglishNumber. With an argument after the invocant, that value
# becomes the new default; the default in effect afterwards is returned.
sub chinese_input {
    my($self, @args) = @_;
    $default_inputtype = $args[0] if @args;
    return $default_inputtype;
}
# hex2utf8: Take a string of 4 hex digits (0-9A-F) and convert it
# to the corresponding (1, 2, or 3 byte) UTF-8 representation.
sub hex2utf8 {
    # Takes a Unicode code point written as hex digits, with an optional
    # "0x" or "0X" prefix, and returns its UTF-8 encoding as a string of
    # one to four bytes. Code points above U+FFFF get the four-byte form;
    # they were previously squeezed into three bytes, which produced
    # invalid output.
    my($hexchar) = shift;
    $hexchar = "" unless defined($hexchar);
    # Strip the prefix in either case so that hex() sees digits only.
    $hexchar =~ s/^0[xX]//;
    my($codepoint) = hex($hexchar);

    my(@bytes);
    if ($codepoint <= 0x7F) {
        # Single byte: 0xxxxxxx
        @bytes = ($codepoint);
    } elsif ($codepoint <= 0x7FF) {
        # Two bytes: 110xxxxx 10xxxxxx
        @bytes = (0xC0 | ($codepoint >> 6),
                  0x80 | ($codepoint & 0x3F));
    } elsif ($codepoint <= 0xFFFF) {
        # Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
        @bytes = (0xE0 | ($codepoint >> 12),
                  0x80 | (($codepoint >> 6) & 0x3F),
                  0x80 | ($codepoint & 0x3F));
    } else {
        # Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        @bytes = (0xF0 | (($codepoint >> 18) & 0x07),
                  0x80 | (($codepoint >> 12) & 0x3F),
                  0x80 | (($codepoint >> 6) & 0x3F),
                  0x80 | ($codepoint & 0x3F));
    }
    return pack("C*", @bytes);
}
# utf82hex: Take a string of UTF-8 bytes and return the code point of its
# FIRST character as upper-case hex digits, zero-padded to at least four
# ("4E00", "0041"). Anything after the first character is ignored, so the
# rest of the string is no longer decoded. An empty or undefined argument
# yields the empty string. Four-byte sequences are decoded as well and
# give five or six digits.
sub utf82hex {
    my($utfstring) = @_;
    return "" if !defined($utfstring) || length($utfstring) == 0;

    # Up to four bytes are needed; bytes missing from a truncated sequence
    # are treated as zero.
    my($lead, @tail) = unpack("C4", $utfstring);
    my($b2, $b3, $b4) = map { defined($_) ? $_ : 0 } @tail[0 .. 2];

    my($unival);
    if ($lead <= 0x7F) {
        # 1 byte long (ASCII)
        $unival = $lead;
    } elsif (($lead & 0xE0) == 0xC0) {
        # 2 bytes long
        $unival = (($lead & 0x1F) << 6) | ($b2 & 0x3F);
    } elsif (($lead & 0xF8) == 0xF0) {
        # 4 bytes long
        $unival = (($lead & 0x07) << 18) | (($b2 & 0x3F) << 12)
                | (($b3 & 0x3F) << 6) | ($b4 & 0x3F);
    } else {
        # 3 bytes long. Any other lead byte also lands here.
        $unival = (($lead & 0x0F) << 12) | (($b2 & 0x3F) << 6) | ($b3 & 0x3F);
    }
    return sprintf("%04X", $unival);
}
# Nothing needs to run when the interpreter exits.
END { }
# A module file must end with a true value so that require/use succeeds.
1;