|
Server : Apache/2.4.62 System : FreeBSD fbsdweb2.web.rcn.net 14.1-RELEASE FreeBSD 14.1-RELEASE releng/14.1-n267679-10e31f0946d8 GENERIC amd64 User : www ( 80) PHP Version : 8.3.8 Disable Function : NONE Directory : /domains/mandarintools/cgi-bin/ |
Upload File : |
#!/usr/bin/perl
#use lib '/chtools';
#use lib '/var/www/htdocs/eepeter/cgi-bin';
require "cgi-lib.pl";
require "segmenter_u8.pl";
# addTones($withnumbers)
#
# Convert numbered pinyin ("ni3 hao3", "nu:3", "lv4") into pinyin with tone
# marks ("nǐ hǎo", "nǚ", "lǜ").
#
#   $withnumbers  string containing pinyin syllables, each followed by a tone
#                 digit; "u:" or "v" stands for u-umlaut.
#   returns       the converted string (undef is passed through unchanged).
#
# Tone 5 (neutral) leaves the vowel unmarked, and every remaining "5" is
# removed from the string.  Digits that do not follow a vowel are left alone.
#
# The accented characters are UTF-8 literals in a file without "use utf8",
# so they are handled (and returned) as raw bytes, matching the rest of the
# script, which prints bytes to a "charset=utf-8" page.
{
    # Marked forms of every vowel spelling, indexed by tone number minus 1.
    my %TONE_MARKS = (
        'a'  => ['ā', 'á', 'ǎ', 'à', 'a'],
        'e'  => ['ē', 'é', 'ě', 'è', 'e'],
        'i'  => ['ī', 'í', 'ǐ', 'ì', 'i'],
        'o'  => ['ō', 'ó', 'ǒ', 'ò', 'o'],
        'u'  => ['ū', 'ú', 'ǔ', 'ù', 'u'],
        'u:' => ['ǖ', 'ǘ', 'ǚ', 'ǜ', 'ü'],
        'v'  => ['ǖ', 'ǘ', 'ǚ', 'ǜ', 'ü'],
        # Capitalised syllables (proper names such as "An1 hui1").
        'A'  => ['Ā', 'Á', 'Ǎ', 'À', 'A'],
        'E'  => ['Ē', 'É', 'Ě', 'È', 'E'],
        'I'  => ['Ī', 'Í', 'Ǐ', 'Ì', 'I'],
        'O'  => ['Ō', 'Ó', 'Ǒ', 'Ò', 'O'],
        'U'  => ['Ū', 'Ú', 'Ǔ', 'Ù', 'U'],
    );

    sub addTones {
        my ($withnumbers) = @_;
        return $withnumbers unless defined($withnumbers);

        # Step 1: move the tone digit so that it directly follows the vowel
        # that carries the mark.  Final consonants first ...
        $withnumbers =~ s/ng(\d)\b/${1}ng/g;
        $withnumbers =~ s/n(\d)\b/${1}n/g;
        $withnumbers =~ s/r(\d)\b/${1}r/g;      # "er2" -> "e2r" (was left unmarked)
        # ... then the diphthongs whose FIRST vowel takes the mark.
        $withnumbers =~ s/([aA])o(\d)\b/${1}${2}o/g;
        $withnumbers =~ s/([aA])i(\d)\b/${1}${2}i/g;
        $withnumbers =~ s/([eE])i(\d)\b/${1}${2}i/g;
        $withnumbers =~ s/([oO])u(\d)\b/${1}${2}u/g;

        # Step 2: replace every vowel+digit pair in a single pass.  "u:" is
        # tried before the single letters so that "u:3" is never read as
        # "u" followed by ":3".
        $withnumbers =~ s/(u:|[aeiouvAEIOU])([1-5])/$TONE_MARKS{$1}[$2 - 1]/ge;

        # Step 3: umlaut spellings that carried no tone digit.
        $withnumbers =~ s/u:|v/ü/g;

        # Step 4: drop neutral-tone digits that followed a consonant ("r5").
        $withnumbers =~ s/5//g;

        return $withnumbers;
    }
}
# ---------------------------------------------------------------------------
# Read the submitted form and obtain the text to annotate.
#
#   atype     annotation type, selects the output branch further down
#   addtones  true => convert numbered pinyin to tone marks (see addTones)
#   ctext     either the text itself or a URL (http/gopher/ftp) to fetch
#   words     extra dictionary entries, one per line
#
# Result: @srclines holds the text split into lines WITH the separators kept
# as their own elements, and %utfcharset has one key per 3-byte UTF-8
# sequence that occurs in the text.
# ---------------------------------------------------------------------------
&ReadParse(*values);
$atype    = $values{'atype'};
$addtones = $values{'addtones'};
$ctext    = $values{'ctext'};
$tagwords = $values{'words'};

# Text that is scanned for the characters actually used (see %utfcharset).
my $chartext;

if ($ctext =~ m#^\s*((?:http|gopher|ftp)://\S*)#) {
    # Only the first whitespace-delimited token is treated as the URL.
    my $url = $1;

    # SECURITY: $ctext comes straight from the request.  It used to be
    # interpolated into a backtick command line, which let the shell
    # interpret it (command injection, and URLs containing "&" or ";" were
    # cut short).  The list form of a pipe open starts ./lynx directly,
    # without a shell, and passes the URL as exactly one argument.
    $sourcetext = '';
    if (open(my $lynx, '-|', './lynx', '-assume_charset=gb2312', '-source', $url)) {
        local $/;                       # slurp the whole page
        $sourcetext = <$lynx>;
        close($lynx);
    }
    $sourcetext = '' unless defined($sourcetext);

    $sourcetext = formatHTML($sourcetext);
    $sourcetext =~ s/\r//g;
    $sourcetext =~ s/\n\s\s+/\n\n/g;
    @srclines = split(/(\n)/, $sourcetext);
    $chartext = $sourcetext;
} else {
    $ctext .= "\r\n";
    if ($atype eq 'addmargin') {
        # Split on blank-line paragraph breaks.  Both capture groups are
        # returned by split, so every break contributes two elements.
        # NOTE(review): the inner group looks unintentional - confirm before
        # changing, it alters the number of table rows in addmargin output.
        @srclines = split(/(\r\n(\r\n)+)/, $ctext, 80);
    } else {
        @srclines = split(/(\r\n)/, $ctext, 80);
    }
    $chartext = $ctext;
}

# Record every 3-byte UTF-8 sequence present in the text.  This used to be
# done only for pasted text, which left %utfcharset empty for fetched pages,
# so the "addpinyin" output had no pinyin for single characters in URL mode.
(@utfchars) = ($chartext =~ m/([\xE0-\xEF]..)/g);
foreach $char (@utfchars) { $utfcharset{$char} = 1; }
# ---------------------------------------------------------------------------
# Emit the HTTP header and the start of the page, load the dictionary,
# register the user's extra words, and segment the source text.
# ---------------------------------------------------------------------------
print "Content-type: text/html; charset=utf-8\n\n",
      "<HTML>\n",
      "<HEAD><TITLE>Chinese Annotation Results</TITLE>\n";

$anchor   = 0;                       # running number for in-page anchors
$chartype = $values{'chartype'};     # "simp", "trad", "both"
init_cedict($chartype);

# Register the user's extra words in %cwords before segmenting.
# NOTE(review): presumably the segmenter reads %cwords - confirm in
# segmenter_u8.pl.  An entry is either "word [pin1 yin1] definition" or a
# bare word.
@tagwords = split(/\r\n/, $tagwords . "\r\n", 100);
for my $entry (@tagwords) {
    unless ($entry =~ m/\s/) {
        $cwords{$entry} = "[] //";
        next;
    }
    my ($hanzi, $pinyin, $gloss) =
        ($entry =~ m/^(\S+)\s(\[[a-zA-Z0-5: ]+\])\s(.+)$/);
    $cwords{$hanzi} = "[$pinyin] $gloss";
}

# Segment every source line (separators included); one result per line.
for my $rawline (@srclines) {
    push @seglines, scalar(segmentLine($rawline));
}

# %segwords: the set of distinct tokens that occur in the segmented text.
for my $segmented (@seglines) {
    for my $token (split(/\s+/, $segmented)) {
        $segwords{$token} = 1;
    }
}

# The js_adddict output shows definitions in the browser status line and
# needs these two helper functions in the page head.
if ($atype eq "js_adddict") {
    print <<'JS';
<SCRIPT LANGUAGE=JAVASCRIPT>
<!--
// Status line display
function sline(txt) {
window.status=txt;
}
// Clear Status Line
function clearstat() {
window.status="";
}
//-->
</SCRIPT>
JS
}

print "</HEAD>\n<BODY>\n";
# ===========================================================================
# Page body: one branch per annotation type ($atype).
#
#   segment     segmented lines only
#   adddict     words linked to a definition list at the bottom of the page
#   js_adddict  as adddict, definitions read from cedict_ts.u8 and also
#               passed to sline() on mouse-over
#   topinyin    Chinese text replaced by pinyin
#   addpinyin   every character followed by its pinyin
#   addruby     pinyin attached as <ruby>/<rt> annotations
#   addmargin   table: text on the left, notes for newly seen words on the right
#
# Inputs prepared earlier in the script: @seglines (segmented lines),
# %segwords (tokens present in the text), %utfcharset, $tagwords (raw "words"
# form field), $chartype, $addtones, $anchor (starts at 0).
# %cdef may already hold entries when this code starts - presumably loaded
# by init_cedict; confirm in segmenter_u8.pl.
#
# Every branch first parses the user's extra entries.  An entry is either
# "word [pin1 yin1] definition" or a bare word.  Empty entries and entries
# that contain whitespace but do not have that shape are skipped; they used
# to be stored under the empty key, which turned empty tokens in the text
# into links / <STRONG></STRONG>.
#
# NOTE(review): the text and the user's entries are printed without HTML
# escaping, and dictionary text is placed inside an onMouseOver="..."
# attribute with backslash-escaped quotes (a double quote still ends the
# attribute).  Escaping has to be coordinated with what segmentLine emits
# and with the <BR> markers inserted below, so it is flagged here rather
# than changed.
# ===========================================================================
if ($atype eq "segment") {    # Just segment, no annotation
    foreach $segline (@seglines) {
        print $segline . "<BR>";
    }
} elsif ($atype eq "adddict") {
    $tagwords .= "\r\n";
    @tagwords = split(/\r\n/, $tagwords, 100);
    foreach $tagword (@tagwords) {
        next if $tagword eq '';
        if ($tagword =~ m/\s/) {
            ($chin, $py, $eng) = ($tagword =~ m/^(\S+) (\[[a-zA-Z0-5: ]+\]) (.+)$/);
            next unless defined($chin);         # malformed entry
            $py = addTones($py) if $addtones;
            $cdef{$chin} = "$chin\n$py\n$eng";
            $tagwords{$chin} = "$chin\n$py\n$eng";
        } else {
            $tagwords{$tagword} = $cdef{$tagword};
        }
    }

    foreach my $line (@seglines) {
        $line =~ s/\r\n/<BR>/;
        if ($line =~ m/^\s+$/) {
            print "<P>\n";
            next;
        }
        # The capture keeps the whitespace runs so they are printed back.
        @words = split(/(\s+)/, $line);
        foreach $word (@words) {
            if (exists($cdef{$word})) {
                # First occurrence of a word allocates its anchor number.
                unless (exists($canchor{$word})) {
                    $anchor++;
                    $canchor{$word} = $anchor;
                    $anchwords[$anchor] = $word;
                }
                print "<A HREF=\"\#$canchor{$word}\">$word</A>";
            } else {
                print "$word";
            }
        }
    }

    # Definition list, in order of first occurrence.
    print "<HR>\n";
    for ($i = 1; $i <= $anchor; $i++) {
        print "\n<A NAME=\"$i\">\n";
        print $cdef{$anchwords[$i]} . "<P>";
    }
    # Trailing blank lines so that the last anchors can scroll to the top.
    print " <BR>" x 35;
} elsif ($atype eq "js_adddict") {
    open(my $ced, '<', "cedict_ts.u8") or die "Can't open dictionary";
    while (<$ced>) {
        s/[\r\n]*$//;
        # Backslash-escape quotes: the entry is embedded in a JavaScript
        # string literal below (undone again for the definition list).
        s/\'/\\\'/g;
        s/\"/\\\"/g;
        ($tchin, $schin, $py, $eng) = (m/^(\S+) (\S+) (\[[a-zA-Z0-5: ]+\]) (.+)$/);
        next unless defined($tchin);            # comment / malformed line
        $py = addTones($py) if $addtones;
        # Keep only entries for words that occur in the text.
        if ($chartype eq "simp" and defined($segwords{$schin})) {
            $cdef{$schin} = "$schin\t$py\t$eng";
        } elsif ($chartype eq "trad" and defined($segwords{$tchin})) {
            $cdef{$tchin} = "$tchin\t$py\t$eng";
        } elsif ($chartype eq "both" and (defined($segwords{$tchin}) or defined($segwords{$schin}))) {
            $cdef{$schin} = "$schin\t$py\t$eng";
            $cdef{$tchin} = "$tchin\t$py\t$eng";
        }
    }
    close($ced);

    # User entries override dictionary entries.
    $tagwords .= "\r\n";
    @tagwords = split(/\r\n/, $tagwords, 100);
    foreach $tagword (@tagwords) {
        next if $tagword eq '';
        if ($tagword =~ m/\s/) {
            ($chin, $py, $eng) = ($tagword =~ m/^(\S+) (\[[a-zA-Z0-5: ]+\]) (.+)$/);
            next unless defined($chin);         # malformed entry
            $py = addTones($py) if $addtones;
            $cdef{$chin} = "$chin $py $eng";
            $tagwords{$chin} = "$chin\n$py\n$eng";
        } else {
            $tagwords{$tagword} = $cdef{$tagword};
        }
    }

    foreach my $line (@seglines) {
        $line =~ s/\n/<BR>/;
        if ($line =~ m/^\s+$/) {
            print "<P>\n";
            next;
        }
        @words = split(/(\s+)/, $line);
        foreach $word (@words) {
            if (exists($cdef{$word})) {
                unless (exists($canchor{$word})) {
                    $anchor++;
                    $canchor{$word} = $anchor;
                    $anchwords[$anchor] = $word;
                }
                print "<A HREF=\"\#$canchor{$word}\" onMouseOver=\"sline(\'$cdef{$word}\'); return true\" onMouseOut=\"clearstat()\">$word</A>";
            } else {
                print "$word";
            }
        }
    }

    print "<HR>\n";
    for ($i = 1; $i <= $anchor; $i++) {
        print "\n<A NAME=\"$i\">\n";
        # Undo the JavaScript quoting before showing the entry as text.
        $cdef{$anchwords[$i]} =~ s/\\\"/\"/g;
        $cdef{$anchwords[$i]} =~ s/\\\'/\'/g;
        print $cdef{$anchwords[$i]} . "<P>";
    }
    print "<BR>" x 25;
} elsif ($atype eq "topinyin") {
    # (Two statements that modified a stale $py before it was assigned were
    # removed here; their result was never read.)
    $tagwords .= "\r\n";
    @tagwords = split(/\r\n/, $tagwords, 100);
    foreach $tagword (@tagwords) {
        next if $tagword eq '';
        if ($tagword =~ m/\s/) {
            ($chin, $py, $eng) = ($tagword =~ m/^(\S+) (\[[a-zA-Z0-5: ]+\]) (.+)$/);
            next unless defined($chin);         # malformed entry
            $py = addTones($py) if $addtones;
            $cdef{$chin} = $py;
            $tagwords{$chin} = "$chin\n$py\n$eng";
        } else {
            $tagwords{$tagword} = $cdef{$tagword};
        }
    }

    # Per-character readings; only the first reading on a line is used.
    open(my $upy, '<', "uni8py.txt") or die "Can't open dictionary\n";
    while (<$upy>) {
        s/[\r\n]*$//;
        my($u8char, $py) = split;
        $py = addTones($py) if $addtones;
        ($fpy, $rest) = split(/\s/, $py);
        $u8py{$u8char} = $fpy;
    }
    close($upy);

    foreach my $line (@seglines) {
        $line =~ s/\n/<BR>/;
        if ($line =~ m/^\s+$/) {
            print "<P>\n";
            next;
        }
        @words = split(/(\s+)/, $line);
        foreach $word (@words) {
            if (defined($cdef{$word}) and lengthu8($word) > 1) {
                # Multi-character word with a known reading.
                print $cdef{$word};
            } elsif (vec($word, 0, 8) > 127) {
                # First byte is non-ASCII: spell it character by character.
                my($i);
                for ($i = 0; $i < lengthu8($word); $i++) {
                    print $u8py{substru8($word, $i, 1)};
                }
            } else {
                print "$word";
            }
        }
    }
} elsif ($atype eq "addpinyin") {
    open(my $ced, '<', "cedict_ts.u8") or die "Can't open dictionary";
    while (<$ced>) {
        ($tchin, $schin, $py, $eng) = (m/^(\S+) (\S+) \[([a-zA-Z0-5: ]+)\] (.+)$/);
        next unless defined($tchin);            # comment / malformed line
        $py = addTones($py) if $addtones;
        if ($chartype eq "simp" and defined($segwords{$schin})) {
            $cdef{$schin} = "$py";
        } elsif ($chartype eq "trad" and defined($segwords{$tchin})) {
            $cdef{$tchin} = "$py";
        } elsif ($chartype eq "both" and (defined($segwords{$tchin}) or defined($segwords{$schin}))) {
            $cdef{$schin} = "$py";
            $cdef{$tchin} = "$py";
        }
    }
    close($ced);

    $tagwords .= "\r\n";
    @tagwords = split(/\r\n/, $tagwords, 100);
    foreach $tagword (@tagwords) {
        next if $tagword eq '';
        if ($tagword =~ m/\s/) {
            ($chin, $py, $eng) = ($tagword =~ m/^(\S+) (\[[a-zA-Z0-5: ]+\]) (.+)$/);
            next unless defined($chin);         # malformed entry
            $py = addTones($py) if $addtones;
            $cdef{$chin} = $py;
            $tagwords{$chin} = "$chin\n$py\n$eng";
        } else {
            $tagwords{$tagword} = $cdef{$tagword};
        }
    }

    # Load readings only for characters that occur in the text.  The
    # membership test comes first so addTones runs only for those.
    open(my $upy, '<', "uni8py.txt") or die "Can't open dictionary\n";
    while (<$upy>) {
        s/[\r\n]*$//;
        my($u8char, $py) = split;
        next unless defined($u8char) and defined($utfcharset{$u8char});
        $py = addTones($py) if $addtones;
        ($fpy, $rest) = split(/\s/, $py, 2);
        $u8py{$u8char} = $fpy;
    }
    close($upy);

    foreach my $line (@seglines) {
        $line =~ s/\n/<BR>/;
        if ($line =~ m/^\s+$/) {
            print "<P>\n";
            next;
        }
        @words = split(/(\s+)/, $line);
        foreach $word (@words) {
            if (defined($cdef{$word}) and lengthu8($word) > 1) {
                # One syllable of the word's reading per character.
                my(@pys) = split(/\s+/, $cdef{$word});
                my($i);
                for ($i = 0; $i < lengthu8($word); $i++) {
                    print substru8($word, $i, 1);
                    print $pys[$i];
                }
            } elsif (vec($word, 0, 8) > 127) {
                my($i);
                for ($i = 0; $i < lengthu8($word); $i++) {
                    print substru8($word, $i, 1);
                    print $u8py{substru8($word, $i, 1)};
                }
            } else {
                print "$word";
            }
        }
    }
} elsif ($atype eq "addruby") {
    open(my $ced, '<', "cedict_ts.u8") or die "Can't open dictionary";
    while (<$ced>) {
        ($tchin, $schin, $py, $eng) = (m/^(\S+) (\S+) \[([a-zA-Z0-5: ]+)\] (.+)$/);
        next unless defined($tchin);            # comment / malformed line
        # Tones are added only for entries that are actually kept.
        if ($chartype eq "simp" and defined($segwords{$schin})) {
            $py = addTones($py) if $addtones;
            $cdef{$schin} = "$py";
        } elsif ($chartype eq "trad" and defined($segwords{$tchin})) {
            $py = addTones($py) if $addtones;
            $cdef{$tchin} = "$py";
        } elsif ($chartype eq "both" and (defined($segwords{$tchin}) or defined($segwords{$schin}))) {
            $py = addTones($py) if $addtones;
            $cdef{$schin} = "$py";
            $cdef{$tchin} = "$py";
        }
    }
    close($ced);

    $tagwords .= "\r\n";
    @tagwords = split(/\r\n/, $tagwords, 100);
    foreach $tagword (@tagwords) {
        next if $tagword eq '';
        if ($tagword =~ m/\s/) {
            ($chin, $py, $eng) = ($tagword =~ m/^(\S+) (\[[a-zA-Z0-5: ]+\]) (.+)$/);
            next unless defined($chin);         # malformed entry
            $py = addTones($py) if $addtones;
            $cdef{$chin} = $py;
            $tagwords{$chin} = "$chin\n$py\n$eng";
        } else {
            $tagwords{$tagword} = $cdef{$tagword};
        }
    }

    open(my $upy, '<', "uni8py.txt") or die "Can't open dictionary\n";
    while (<$upy>) {
        s/[\r\n]*$//;
        my($u8char, $py) = split;
        $py = addTones($py) if $addtones;
        ($fpy, $rest) = split(/\s+/, $py, 2);
        $u8py{$u8char} = $fpy;
    }
    close($upy);

    print "<style>\nRUBY { ruby-align:center }\n</style>\n";
    print "<font size=+1>";
    foreach $segline (@seglines) {
        $segline =~ s/\n/<BR>/;
        if ($segline =~ m/^\s+$/) {
            print "<P>\n";
            next;
        }
        @words = split(/(\s+)/, $segline);
        foreach $word (@words) {
            if (defined($cdef{$word}) and lengthu8($word) > 1) {
                my(@pys) = split(/\s+/, $cdef{$word});
                my($i);
                for ($i = 0; $i < lengthu8($word); $i++) {
                    print "<ruby>" . substru8($word, $i, 1);
                    print "<rt>" . $pys[$i] . "</ruby>";
                }
            } elsif (vec($word, 0, 8) > 127) {
                my($i);
                for ($i = 0; $i < lengthu8($word); $i++) {
                    print "<ruby>" . substru8($word, $i, 1);
                    print "<rt>" . $u8py{substru8($word, $i, 1)} . "</ruby>";
                }
            } else {
                print "$word";
            }
        }
    }
} elsif ($atype eq "addmargin") {
    $tagwords .= "\r\n";
    @tagwords = split(/\r\n/, $tagwords, 60);
    foreach $tagword (@tagwords) {
        next if $tagword eq '';
        if ($tagword =~ m/\s/) {
            ($chin, $py, $eng) = ($tagword =~ m/^(\S+)\s(\[[a-zA-Z0-5: ]+\])\s(.+)$/);
            next unless defined($chin);         # malformed entry
            $py = addTones($py) if $addtones;
            $cdef{$chin} = "$chin\n$py\n$eng";
            $tagwords{$chin} = "$chin\n$py\n$eng";
        } else {
            $tagwords{$tagword} = $cdef{$tagword};
        }
    }

    # One table row per segmented line: text left, side notes right.
    print "<TABLE>\n";
    foreach my $line (@seglines) {
        print "<TR>\n<TD WIDTH=\"80\%\">\n";
        @words = split(/\s+/, $line);

        # Words whose FIRST occurrence is in this row.  The old code reset
        # @anchwords for every row but still looped from 1 to the global
        # $anchor, printing one empty <SMALL></SMALL><P> per word already
        # shown in earlier rows.
        my @rownotes = ();
        foreach $word (@words) {
            if (exists($tagwords{$word})) {
                if (exists($canchor{$word})) {
                    print "$word";
                } else {
                    # First occurrence
                    $anchor++;
                    $canchor{$word} = $anchor;
                    push @rownotes, $word;
                    print "<STRONG>$word</STRONG>";
                }
            } else {
                print "$word";
            }
        }

        # Print sidenotes
        print "</TD>\n<TD WIDTH=\"20\%\">\n";
        foreach my $noted (@rownotes) {
            print "<SMALL>" . $cdef{$noted} . "</SMALL><P>";
        }
        print "</TD></TR>\n";
    }
    print "</TABLE>\n";
}

print "</BODY>\n</HTML>";
exit(0);
# formatHTML($htmltext)
#
# Reduce an HTML document to plain text with line breaks.
#
#   $htmltext  raw HTML source
#   returns    the text with all whitespace runs collapsed to one space,
#              a "\n" where the page had a <BR> or <P> tag, and every other
#              tag removed.  undef yields the empty string.
#
# Character entities (&amp; etc.) are NOT decoded.
sub formatHTML {
    my ($htmltext) = @_;
    return '' unless defined($htmltext);

    # Source line breaks carry no meaning in HTML.
    $htmltext =~ s/\s+/ /g;

    # Remove comments and script/style elements together with their
    # contents; with tag stripping alone the code inside them (and the
    # tail of any comment containing ">") leaked into the text.
    $htmltext =~ s/<!--.*?-->//gs;
    $htmltext =~ s{<(script|style)\b[^>]*>.*?</\1\s*>}{}gis;

    # Line and paragraph breaks.  Every spelling of the break tag counts:
    # <br>, <br/>, <br /> and <br clear="all"> (only the bare "<BR>" was
    # recognised before, the others vanished without a line break).
    $htmltext =~ s/<BR\b[^>]*>/\n/ig;
    $htmltext =~ s/<P(\s[^>]+)?>/\n/ig;

    # Drop all remaining tags.
    $htmltext =~ s/<[^>]+>//g;

    return $htmltext;
}