|
Server : Apache/2.4.62 System : FreeBSD fbsdweb2.web.rcn.net 14.1-RELEASE FreeBSD 14.1-RELEASE releng/14.1-n267679-10e31f0946d8 GENERIC amd64 User : www ( 80) PHP Version : 8.3.8 Disable Function : NONE Directory : /domains/mandarintools/cgi-bin/ |
Upload File : |
#!/usr/local/bin/perl5.8.8
use File::Temp "tempfile";
#use LWP::UserAgent;
use LWP::Simple;
use Encode;
use Encode::CN;
use Encode::TW;
require "cgi-lib.pl";
require "segmenter_db.pl";
my $debug = 1;
my $byword = 0;
sub addTones {
my($withnumbers) = shift;
my($i);
$withnumbers =~ s/ng(\d)\b/${1}ng/g;
$withnumbers =~ s/n(\d)\b/${1}n/g;
$withnumbers =~ s/ao(\d)\b/a${1}o/g;
$withnumbers =~ s/ai(\d)\b/a${1}i/g;
$withnumbers =~ s/ei(\d)\b/e${1}i/g;
$withnumbers =~ s/ou(\d)\b/o${1}u/g;
$withnumbers =~ s/r(\d)\b/${1}r/g;
@tonenums = ("a1", "a2", "a3", "a4", "a5", "e1", "e2", "e3", "e4", "e5",
"i1", "i2", "i3", "i4", "i5", "o1", "o2", "o3", "o4", "o5",
"u1", "u2", "u3", "u4", "u5",
"u:1", "u:2", "u:3", "u:4", "u:5", "u:",
"v1", "v2", "v3", "v4", "v5", "v",
"A1", "A2", "A3", "A4", "A5", "E1", "E2", "E3", "E4", "E5",
"I1", "I2", "I3", "I4", "I5", "O1", "O2", "O3", "O4", "O5",
"U1", "U2", "U3", "U4", "U5",
"U:1", "U:2", "U:3", "U:4", "U:5", "U:",
"V1", "V2", "V3", "V4", "V5", "V"
);
@tonemarks = ('ā', 'á', 'ǎ', 'à', 'a',
'ē', 'é', 'ě', 'è', 'e',
'ī', 'í', 'ǐ', 'ì', 'i',
'ō', 'ó', 'ǒ', 'ò', 'o',
'ū', 'ú', 'ǔ', 'ù', 'u',
'ǖ', 'ǘ', 'ǚ', 'ǜ', 'ü', 'ü',
'ǖ', 'ǘ', 'ǚ', 'ǜ', 'ü', 'ü',
'Ā', 'Á', 'Ǎ', 'À', 'A',
'Ē', 'É', 'Ě', 'È', 'E',
'Ī', 'Í', 'Ǐ', 'Ì', 'I',
'Ō', 'Ó', 'Ǒ', 'Ò', 'O',
'Ū', 'Ú', 'Ǔ', 'Ù', 'U',
'Ǖ', 'Ǘ', 'Ǚ', 'Ǜ', 'Ü', 'Ü',
'Ǖ', 'Ǘ', 'Ǚ', 'Ǜ', 'Ü', 'Ü'
);
for ($i = 0; $i < scalar(@tonenums); $i++) {
$withnumbers =~ s/$tonenums[$i]/$tonemarks[$i]/ge;
}
$withnumbers =~ s/5//g;
return $withnumbers;
}
&ReadParse(*values);
my $atype = $values{'atype'};
my $addtones = $values{'addtones'};
my $ctext = $values{'ctext'};
my $url = $values{'url'};
$url =~ s/^\s*//;
$url =~ s/\s*$//;
#my $tagwords = $values{'words'};
my $sourcetext;
if (length($url) > 0) {
if ($url !~ m/^\s*http\:\/\//) {
$url = "http://" . $url;
}
#print "Accessing: '$url'<br>\n" if $debug;
$sourcetext = get $url;
if (!defined($sourcetext)) {
$sourcetext = "Couldn't get $url";
}
if ($sourcetext =~ m/charset=gb/) {
$sourcetext = decode("euc-cn", $sourcetext);
$sourcetext = encode("utf8", $sourcetext);
} elsif ($sourcetext =~ m/charset=big5/) {
$sourcetext = decode("cp950", $sourcetext);
$sourcetext = encode("utf8", $sourcetext);
#from_to($sourcetext, "cp950", "utf8");
} elsif ($sourcetext =~ m/charset=utf/) {
#$sourcetext = decode("utf8", $sourcetext);
}
$sourcetext = formatHTML($sourcetext);
$sourcetext =~ s/^\s*//;
$sourcetext =~ s/\s*$//;
$sourcetext =~ s/\r//g;
$sourcetext =~ s/\n\s\s+/\n\n/g;
$sourcetext = "<html>\r\n<head><title>Chinese Annotation Results</title></head>\r\n<body>\r\n$sourcetext</body></html>";
#@srclines = split(/(\n)/, $sourcetext);
} else {
$ctext =~ s/\n/<br>\n/g;
$sourcetext = "<html>\r\n<head><title>Chinese Annotation Results</title></head>\r\n<body>\r\n$ctext</body></html>";
#if ($atype eq 'addmargin') {
#@srclines = split(/(\r\n(\r\n)+)/, $ctext, 80);
#} else {
#@srclines = split(/(\r\n)/, $ctext, 80);
#}
}
print "Content-type: text/html; charset=utf-8\n\n";
#print "<HTML>\n";
#print "<HEAD><TITLE>Chinese Annotation Results</TITLE>\n";
#$anchor = 0;
#$chartype = $values{'chartype'}; # "simp", "trad", "both"
#$chartype = "simp";
#print STDERR "Before cedict init<br>\n":
&init_cedict();
#print "After cedict init<br>\n":
#print "Total source lines " . scalar(@srclines) . "<br>\n";
#$count = 0;
#foreach $srcline (@srclines) {
#print "Srcline $srcline\n";
#if ($srcline =~ m/[\x80-\xff]/) {
#$seglines[$count++] = &segmentLine($srcline);
#$seglines[$count++] = $srcline;
# } else {
#$seglines[$count++] = $srcline;
# }
#last if $count > 15;
#}
#print "</HEAD>\n<BODY>\n";
$separator = " ";
&segmentLineOffsets($sourcetext);
my($seplen) = length($separator);
# for ($i = 0; $i < @offsets; $i++) {
# if ($offsets[$i] > 0) {
# print "$i " . substru8($cline, $i, $offsets[$i]) . "\n";
# }
# }
my($i);
$sourcetext = decode("utf8", $sourcetext);
my $intag = 1;
my $inscript = 0;
my $intextarea = 0;
my $inhead = 0;
my $incomment = 0;
for ($i = length($sourcetext)-1; $i >= 0; $i--) {
$tempchar = substr($sourcetext, $i, 1);
if ($tempchar eq "<") { $intag = 0; }
elsif ($tempchar eq ">") { $intag = 1; }
if ($tempchar eq '<') {
if ($i+8 < length($sourcetext) &&
substr($sourcetext, $i, 8) =~ m/^<\/script$/i) {
$inscript = 1;
} elsif ($i+7 < length($sourcetext) &&
substr($sourcetext, $i, 7) =~ m/^<script$/i) {
$inscript = 0;
} elsif ($i+6 < length($sourcetext) && # Here to handle GeoCities broken code
substr($sourcetext, $i, 6) =~ m/^<\/html$/i) {
$inscript = 0;
} elsif ($i+10 < length($sourcetext) &&
substr($sourcetext, $i, 10) =~ m/^<\/textarea$/i) {
$intextarea = 1;
} elsif ($i+9 < length($sourcetext) &&
substr($sourcetext, $i, 9) =~ m/^<textarea$/i) {
$intextarea = 0;
} elsif (i+6 < length($sourcetext) &&
substr($sourcetext, $i, 6) =~ m/^<\/head$/i) {
$intextarea = 1;
} elsif (i+5 < length($sourcetext) &&
substr($sourcetext, $i, 5) =~ m/^<head$/i) {
$intextarea = 0;
} elsif (i+4 < length($sourcetext) &&
substr($sourcetext, $i, 4) =~ m/^<!--$/i) {
$incomment = 0;
}
} elsif ($tempchar eq "-") {
if ($i+3 < length($sourcetext) &&
substr($sourcetext, $i, 3) =~ m/^-->$/i) {
$incomment = 1;
}
}
next if $intag || $inscript || $intextarea || $incomment;
#print "$i\n";
next if $tempchar !~ m/\p{Han}/;
while ($i >= 0 && $offsets[$i] == 0) {
$i--;
}
$cword = substr($sourcetext, $i, $offsets[$i]);
$testword = encode("utf8", $cword);
if (exists($cwords{$testword})) {
my $def = $cwords{$testword};
if ($addtones) {
$def =~ s/\[([^\]]+)\]/"\[" . &addTones($1) . "\]"/eg;
}
$def =~ m/\[([^\]]+)\]/;
$defpy = $1;
if ($atype eq "segment") {
substr($sourcetext, $i+$offsets[$i], 0, $separator);
} elsif ($atype eq "topinyin") {
$defpy =~ s/ //g;
substr($sourcetext, $i, length($cword), $separator . $defpy . $separator);
} elsif ($atype eq "addpinyin") {
@pys = split(/ /, $defpy);
if ($byword == 0) {
for ($pyindex = scalar(@pys)-1; $pyindex >= 0; $pyindex--) {
substr($sourcetext, $i+$pyindex+1, 0, $pys[$pyindex]);
#srctxt.insert(i+pyindex+1, (String)pys.elementAt(pyindex));
}
} else {
$defpy =~ s/ //g;
substr($sourcetext, $i + length($cword), $defpy);
#srctxt.insert(i+pyindex, defpy);
}
} elsif ($atype eq "addruby") {
@pys = split(/ /, $defpy);
if ($byword) {
$defpy =~ s/ //g;
substr($sourcetext, $i + length($cword), 0, $defpy);
} else {
for ($pyindex = scalar(@pys)-1; $pyindex >= 0; $pyindex--) {
substr($sourcetext, $i+$pyindex+1, 0, "<rt>" . $pys[pyindex] . "</ruby>");
substr($sourcetext, $i+$pyindex, 0, "<ruby>");
}
}
} elsif ($atype eq "popup") {
$def = $cword . "\t" . $def;
$def =~ s/\'/\’/g;
$def =~ s/\"/\"/g;
#$def =~ s/\t/<BR>\ \ /g;
#$def =~ s/(\[[^\]]+\])/&addTones($1)/eg if $addtones;
$def =~ s/\]\s+/\]\t/g;
substr($sourcetext, $i+length($cword), 0, "</span>");
substr($sourcetext, $i, 0, "<span onMouseOver=\"showdef(\'$def\', this); return true;\" onMouseOut=\"hidedef()\">");
} elsif ($atype eq "js_adddict") {
$def = $cword . " " . $def;
$def =~ s/\'/\’/g;
$def =~ s/\"/\"/g;
#$def =~ s/\t/<BR>\ \ /g;
#$def =~ s/(\[[^\]]+\])/&addTones($1)/eg if $addtones;
#$def =~ s/\]\s+/\] /g;
substr($sourcetext, $i+length($cword), 0, "</span>");
substr($sourcetext, $i, 0, "<span onMouseOver=\"sline(\'$def\'); return true;\" onMouseOut=\"clearstat()\">");
}
}
}
$sourcetext = encode("utf8", $sourcetext);
$insertloc = -1;
if ($sourcetext =~ m/<head>/i) {
$insertloc = @-[0];
$insertloc += 6;
} elsif ($sourcetext =~ m/<\/title>/i) {
$insertloc = @-[0];
$insertloc += 8;
} elsif ($sourcetext =~ m/<html>/i) {
$insertloc = @-[0];
$insertloc += 6;
} else {
$insertloc = 0; # Just put it at the beginning
}
if (length($url) > 0) {
if ($sourcetext =~ m/<base href=\"?([^ \">]+)/i) {
$basepath = $1;
} elsif ($sourcetext !~ m/<base/i) {
substr($sourcetext, $insertloc, 0, "\n<base href=\"$url\">\n");
} else { # Just add HREF
}
if ($sourcetext !~ m/charset=/i) {
substr($sourcetext, $insertloc, 0,
"\n<META HTTP-EQUIV=\"content-type\" CONTENT=\"text/html; charset=utf-8>");
} elsif ($sourcetext =~ s/charset=([^ \'\">]+)/charset=utf-8/i) {
}
}
if ($sourcetext =~ m/<body/i) {
$bodyloc = @-[0];
$bodyloc = index $sourcetext, ">", $bodyloc;
$bodyloc++;
}
if ($atype eq "js_adddict") {
$script = "
<SCRIPT LANGUAGE=JAVASCRIPT>
<!--
// Status line display
function sline(txt) {
window.status=txt;
}
// Clear Status Line
function clearstat() {
window.status=\"\";
}
//-->
</SCRIPT>
";
substr($sourcetext, $bodyloc, 0, $script);
} elsif ($atype eq "popup") {
$script = "
<SCRIPT>
<!--
document.write('<div id=\"defwin\" style=\"position:absolute;z-index:1;visibility:hidden;background-color:#ffffff;color:black\"><table cellspacing=0 cellpadding=5 border=3>' +
'<tr><td id=\"zhval\"></td></tr>' +
'</table></div>');
var ie=document.all
var ieNOTopera=document.all&&navigator.userAgent.indexOf(\"Opera\")==-1
function showdef(zhdef, text){
var t = document.getElementById(\"zhval\");
var defhtml = \"\";
var deffields = zhdef.split(\"\\t\");
defhtml = \"<font size=+3><span lang=zh>\" + deffields[0] + \"</span></font>\";
for (var i = 1; i < deffields.length; i+=2) {
defhtml += \"<br>\" + deffields[i] + \" \" + deffields[i+1];
}
t.innerHTML = defhtml;
var topmsg_obj = document.getElementById(\"defwin\")
var obj = text;
var curleft = 0;
var curtop = 0;
if (obj.offsetParent)
{
while (obj.offsetParent)
{
curleft += obj.offsetLeft
curtop += obj.offsetTop
obj = obj.offsetParent;
}
}
else if (obj.x) {
curleft += obj.x;
curtop += obj.y;
}
var dsoctop=ie? document.body.scrollTop : pageYOffset
var window_height=ieNOTopera? document.body.clientHeight : window.innerHeight
if (29 + curtop + topmsg_obj.offsetHeight > dsoctop + window_height) {
curtop = curtop - topmsg_obj.offsetHeight - 35
}
topmsg_obj.style.left = curleft;
topmsg_obj.style.top = curtop+29;
topmsg_obj.style.visibility=\"visible\";
}
function hidedef() {
var topmsg_obj = document.getElementById(\"defwin\");
topmsg_obj.style.visibility=\"hidden\";
}
-->
</SCRIPT>
";
substr($sourcetext, $bodyloc, 0, $script);
}
print $sourcetext;
dbmclose(%cwords);
exit(0);
sub formatHTML {
my($htmltext) = @_;
=comment
if ($htmltext =~ m/charset=gb/) {
#print "Converting from euc-cn to utf8 " . length($htmltext) . "<br>\n";
#@list = Encode->encodings();
#foreach $enc (@list) {
# print "$enc<br>\n";
#}
#from_to($htmltext, "euc-cn", "utf8");
#print "Converted from euc-cn to utf8 " . length($htmltext) . "\n";
$htmltext = decode("euc-cn", $htmltext);
#print "Between conversions" . length($htmltext) . "\n";
$htmltext = encode("utf8", $htmltext);
#print "After conversions" . length($htmltext) . "\n";
} elsif ($htmltext =~ m/charset=big5/) {
$htmltext = decode("cp950", $htmltext);
#print "Between conversions" . length($htmltext) . "\n";
$htmltext = encode("utf8", $htmltext);
#from_to($htmltext, "cp950", "utf8");
}
=cut
$htmltext =~ s/<style.*?<\/style>//igs;
$htmltext =~ s/<script.*?<\/script>//igs;
$htmltext =~ s/\ / /gi;
$htmltext =~ s/\&\#(\d+);//g; # Convert to UTF-8
$htmltext =~ s/\s+/ /g;
$htmltext =~ s/<BR>/\n/ig;
$htmltext =~ s/<P(\s[^>]+)?>/\n\n/ig;
$htmltext =~ s/<[^>]+>//g;
#print "HTML text " . length($htmltext) . "\n";
return $htmltext;
}
__END__
if ($atype eq "segment") { # Just segment, no annotation
foreach $segline (@seglines) {
print $segline . "<BR>";
}
} elsif ($atype eq "adddict") {
foreach $_ (@seglines) {
s/\r\n/<BR>/;
if (m/^\s+$/) {
print "<P>\n";
next;
}
@words = split(/(\s+)/);
foreach $word (@words) {
if (exists($cwords{$word})) {
if (exists($canchor{$word})) {
print "<A HREF=\"\#$canchor{$word}\">$word</A>";
} else {
$anchor++;
$canchor{$word} = $anchor;
$anchwords[$anchor] = $word;
print "<A HREF=\"\#$canchor{$word}\">$word</A>";
}
} else {
print "$word";
}
}
}
print "<HR>\n";
for ($i = 1; $i <= $anchor; $i++) {
print "\n<A NAME=\"$i\">\n";
$def = $cwords{$anchwords[$i]};
$def =~ s/(\[[^\]]+\])/&addTones($1)/eg if $addtones;
$def =~ s/\t/<br>\ \ \ /g;
print $anchwords[$i] . " " . $def . "<P>\n";
}
print " <BR>" x 35;
} elsif ($atype eq "js_adddict") {
foreach $_ (@seglines) {
s/\n/<BR>/;
if (m/^\s+$/) {
print "<P>\n";
next;
}
@words = split(/(\s+)/);
foreach $word (@words) {
if (exists($cwords{$word})) {
$def = $cwords{$word};
$def =~ s/\'/\’/g;
$def =~ s/\"/\"/g;
$def =~ s/\t/ /g;
$def =~ s/(\[[^\]]+\])/&addTones($1)/eg if $addtones;
print "<span onMouseOver=\"sline(\'$def\'); return true\" onMouseOut=\"clearstat()\">$word</span>\n";
} else {
print "$word";
}
}
}
} elsif ($atype eq "popup") {
foreach $_ (@seglines) {
s/\n/<BR>/;
if (m/^\s+$/) {
print "<P>\n";
next;
}
@words = split(/(\s+)/);
foreach $word (@words) {
if (exists($cwords{$word})) {
$def = $word . "\t" . $cwords{$word};
$def =~ s/\'/\’/g;
$def =~ s/\"/\"/g;
#$def =~ s/\t/<BR>\ \ /g;
$def =~ s/(\[[^\]]+\])/&addTones($1)/eg if $addtones;
$def =~ s/\]\s+/\]\t/g;
print "<span onMouseOver=\"showdef(\'$def\', this); return true\" onMouseOut=\"hidedef()\">$word</span>\n";
} else {
print "$word";
}
}
}
} elsif ($atype eq "topinyin") {
foreach $_ (@seglines) {
s/\n/<BR>/;
if (m/^\s+$/) {
print "<P>\n";
next;
}
@words = split(/(\s+)/);
foreach $word (@words) {
if (defined($cwords{$word})) {
($py) = ($cwords{$word} =~ m/^\[([^\]]+)\]/);
$py = addTones($py) if $addtones;
$py =~ s/ //g;
print "$py ";
} else {
print "$word";
}
}
}
} elsif ($atype eq "addpinyin") {
foreach $_ (@seglines) {
s/\n/<BR>/;
if (m/^\s+$/) {
print "<P>\n";
next;
}
@words = split(/(\s+)/);
foreach $word (@words) {
if (exists($cwords{$word})) {
($py) = ($cwords{$word} =~ m/^\[([^\]]+)\]/);
$py = addTones($py) if $addtones;
my(@pys) = split(/\s+/, $py);
my($i);
for ($i = 0; $i < lengthu8($word); $i++) {
print substru8($word, $i, 1);
print $pys[$i];
}
} else {
print "$word";
}
}
}
} elsif ($atype eq "addruby") {
print "<style>\nRUBY { ruby-align:center }\n</style>\n";
print "<font size=+1>";
foreach $segline (@seglines) {
$segline =~ s/\n/<BR>/;
if ($segline =~ m/^\s+$/) {
print "<P>\n";
next;
}
@words = split(/(\s+)/, $segline);
foreach $word (@words) {
if (exists($cwords{$word})) {
($py) = ($cwords{$word} =~ m/^\[([^\]]+)\]/);
$py = addTones($py) if $addtones;
my(@pys) = split(/\s+/, $py);
my($i);
for ($i = 0; $i < lengthu8($word); $i++) {
print "<ruby>" . substru8($word, $i, 1);
print "<rt>" . $pys[$i] . "</ruby>";
}
} else {
print "$word";
}
}
}
}