.

KGRKJGETMRETU895U-589TY5MIGM5JGB5SDFESFREWTGR54TY
Server : Apache/2.4.62
System : FreeBSD fbsdweb2.web.rcn.net 14.1-RELEASE FreeBSD 14.1-RELEASE releng/14.1-n267679-10e31f0946d8 GENERIC amd64
User : www ( 80)
PHP Version : 8.3.8
Disable Function : NONE
Directory : /domains/mandarintools/cgi-bin/
Upload File :
Current File : /domains/mandarintools/cgi-bin/annotate_db.pl
#!/usr/local/bin/perl5.8.8

use File::Temp "tempfile";

#use LWP::UserAgent;
use LWP::Simple;
use Encode;
use Encode::CN;
use Encode::TW;

require "cgi-lib.pl";
require "segmenter_db.pl";

my $debug = 1;
my $byword = 0;

sub addTones {
    my($withnumbers) = shift;
    my($i);
    $withnumbers =~ s/ng(\d)\b/${1}ng/g;
    $withnumbers =~ s/n(\d)\b/${1}n/g;
    $withnumbers =~ s/ao(\d)\b/a${1}o/g;
    $withnumbers =~ s/ai(\d)\b/a${1}i/g;
    $withnumbers =~ s/ei(\d)\b/e${1}i/g;
    $withnumbers =~ s/ou(\d)\b/o${1}u/g;
    $withnumbers =~ s/r(\d)\b/${1}r/g;

    @tonenums = ("a1", "a2", "a3", "a4", "a5", "e1", "e2", "e3", "e4", "e5",
		 "i1", "i2", "i3", "i4", "i5", "o1", "o2", "o3", "o4", "o5",
		 "u1", "u2", "u3", "u4", "u5", 
		 "u:1", "u:2", "u:3", "u:4", "u:5", "u:",
		 "v1", "v2", "v3", "v4", "v5", "v",
		 "A1", "A2", "A3", "A4", "A5", "E1", "E2", "E3", "E4", "E5",
		 "I1", "I2", "I3", "I4", "I5", "O1", "O2", "O3", "O4", "O5",
		 "U1", "U2", "U3", "U4", "U5", 
		 "U:1", "U:2", "U:3", "U:4", "U:5", "U:",
		 "V1", "V2", "V3", "V4", "V5", "V"
		 );
    @tonemarks = ('&#x0101;', '&aacute;', '&#x01ce;', '&#x00e0;', 'a', 
		  '&#x0113;', '&#x00e9;', '&#x011b;', '&#x00e8;', 'e', 
		  '&#x012b;', '&#x00ed;', '&#x01d0;', '&#x00ec;', 'i',
		  '&#x014d;', '&#x00f3;', '&#x01d2;', '&#x00f2;', 'o',
		  '&#x016b;', '&#x00fa;', '&#x01d4;', '&#x00f9;', 'u',
		  '&#x01d6;', '&#x01d8;', '&#x01da;', '&#x01dc;', '&#x00fc;', '&#x00fc;',
		  '&#x01d6;', '&#x01d8;', '&#x01da;', '&#x01dc;', '&#x00fc;', '&#x00fc;',

		  '&#x0100;', '&#x00c1;', '&#x01cd;', '&#x00c0;', 'A', 
		  '&#x0112;', '&#x00c9;', '&#x011a;', '&#x00c8;', 'E', 
		  '&#x012a;', '&#x00cd;', '&#x01cf;', '&#x00cc;', 'I',
		  '&#x014c;', '&#x00d3;', '&#x01d1;', '&#x00d2;', 'O',
		  '&#x016a;', '&#x00da;', '&#x01d3;', '&#x00d9;', 'U',
		  '&#x01d5;', '&#x01d7;', '&#x01d9;', '&#x01db;', '&#x00dc;', '&#x00dc;',
		  '&#x01d5;', '&#x01d7;', '&#x01d9;', '&#x01db;', '&#x00dc;', '&#x00dc;'
		  
		  ); 

    for ($i = 0; $i < scalar(@tonenums); $i++) {
	$withnumbers =~ s/$tonenums[$i]/$tonemarks[$i]/ge;
    } 
    $withnumbers =~ s/5//g;

    return $withnumbers;
}

&ReadParse(*values);
my $atype = $values{'atype'};
my $addtones = $values{'addtones'};

my $ctext = $values{'ctext'};
my $url = $values{'url'};
$url =~ s/^\s*//;
$url =~ s/\s*$//;
#my $tagwords = $values{'words'};

my $sourcetext;

if (length($url) > 0) {
    if ($url !~ m/^\s*http\:\/\//) {
	$url = "http://" . $url;
    }
    #print "Accessing: '$url'<br>\n" if $debug;
    $sourcetext = get $url;
    if (!defined($sourcetext)) {
	$sourcetext = "Couldn't get $url";
    }

    if ($sourcetext =~ m/charset=gb/) {
	$sourcetext = decode("euc-cn", $sourcetext);
	$sourcetext = encode("utf8", $sourcetext);
    } elsif ($sourcetext =~ m/charset=big5/) {
	$sourcetext = decode("cp950", $sourcetext);
	$sourcetext = encode("utf8", $sourcetext);
	#from_to($sourcetext, "cp950", "utf8");
    } elsif ($sourcetext =~ m/charset=utf/) {
	#$sourcetext = decode("utf8", $sourcetext);
    }


    $sourcetext = formatHTML($sourcetext);
    $sourcetext =~ s/^\s*//;
    $sourcetext =~ s/\s*$//;
    $sourcetext =~ s/\r//g;
    $sourcetext =~ s/\n\s\s+/\n\n/g;
    $sourcetext = "<html>\r\n<head><title>Chinese Annotation Results</title></head>\r\n<body>\r\n$sourcetext</body></html>";

    #@srclines = split(/(\n)/, $sourcetext);
    
} else {
    $ctext =~ s/\n/<br>\n/g;
    $sourcetext = "<html>\r\n<head><title>Chinese Annotation Results</title></head>\r\n<body>\r\n$ctext</body></html>";

	
    #if ($atype eq 'addmargin') {
    #@srclines = split(/(\r\n(\r\n)+)/, $ctext, 80);
    #} else {
    #@srclines = split(/(\r\n)/, $ctext, 80);
    #}
}

print "Content-type: text/html; charset=utf-8\n\n";
#print "<HTML>\n";
#print "<HEAD><TITLE>Chinese Annotation Results</TITLE>\n";


#$anchor = 0;

#$chartype = $values{'chartype'}; # "simp", "trad", "both"
#$chartype = "simp";
#print STDERR "Before cedict init<br>\n":
&init_cedict();
#print "After cedict init<br>\n":
#print "Total source lines " . scalar(@srclines) . "<br>\n";

#$count = 0;
#foreach $srcline (@srclines) {
    #print "Srcline $srcline\n";
#if ($srcline =~ m/[\x80-\xff]/) {
	#$seglines[$count++] = &segmentLine($srcline);
	#$seglines[$count++] = $srcline;
#    } else {
	#$seglines[$count++] = $srcline;
#    }
    #last if $count > 15;
#}



#print "</HEAD>\n<BODY>\n";

$separator = " ";
&segmentLineOffsets($sourcetext);
my($seplen) = length($separator);

#    for ($i = 0; $i < @offsets; $i++) {
#        if ($offsets[$i] > 0) {
#           print "$i " . substru8($cline, $i, $offsets[$i]) . "\n";
#        }
#    }
my($i);

$sourcetext = decode("utf8", $sourcetext);
my $intag = 1;
my $inscript = 0;
my $intextarea = 0;
my $inhead = 0;
my $incomment = 0;


for ($i = length($sourcetext)-1; $i >= 0; $i--) {
    $tempchar = substr($sourcetext, $i, 1);
    if ($tempchar eq "<") { $intag = 0; }
    elsif ($tempchar eq ">") { $intag = 1; }

    if ($tempchar eq '<') {
	if ($i+8 < length($sourcetext) &&
	    substr($sourcetext, $i, 8) =~ m/^<\/script$/i) {
	    $inscript = 1;
	} elsif ($i+7 < length($sourcetext) &&
		 substr($sourcetext, $i, 7) =~ m/^<script$/i) {
	    $inscript = 0;
	} elsif ($i+6 < length($sourcetext) && # Here to handle GeoCities broken code
		 substr($sourcetext, $i, 6) =~ m/^<\/html$/i) {
	    $inscript = 0;
	} elsif ($i+10 < length($sourcetext) &&
		 substr($sourcetext, $i, 10) =~ m/^<\/textarea$/i) {
	    $intextarea = 1;
	} elsif ($i+9 < length($sourcetext) &&
		 substr($sourcetext, $i, 9) =~ m/^<textarea$/i) {
	    $intextarea = 0;
	} elsif (i+6 < length($sourcetext) &&
		 substr($sourcetext, $i, 6) =~ m/^<\/head$/i) {
	    $intextarea = 1;
	} elsif (i+5 < length($sourcetext) &&
		 substr($sourcetext, $i, 5) =~ m/^<head$/i) {
	    $intextarea = 0;
	} elsif (i+4 < length($sourcetext) &&
		 substr($sourcetext, $i, 4) =~ m/^<!--$/i) {
	    $incomment = 0;
	} 
    } elsif ($tempchar eq "-") {
	if ($i+3 < length($sourcetext) &&
	    substr($sourcetext, $i, 3) =~ m/^-->$/i) {
	    $incomment = 1;
	} 
    }
    

    next if $intag || $inscript || $intextarea || $incomment;

    #print "$i\n";
    next if $tempchar !~ m/\p{Han}/;

    while ($i >= 0 && $offsets[$i] == 0) {
	$i--;
    }

    $cword = substr($sourcetext, $i, $offsets[$i]);
    $testword = encode("utf8", $cword);
    
    if (exists($cwords{$testword})) {
	my $def = $cwords{$testword};

	if ($addtones) {
	    $def =~ s/\[([^\]]+)\]/"\[" . &addTones($1) . "\]"/eg;
	}

	$def =~ m/\[([^\]]+)\]/;
	$defpy = $1;
	
	
	if ($atype eq "segment") {
	    substr($sourcetext, $i+$offsets[$i], 0, $separator);

	} elsif ($atype eq "topinyin") {
	    $defpy =~ s/ //g;
	    substr($sourcetext, $i, length($cword), $separator . $defpy . $separator);

	} elsif ($atype eq "addpinyin") {
	    @pys = split(/ /, $defpy);
	    if ($byword == 0) {
		for ($pyindex = scalar(@pys)-1; $pyindex >= 0; $pyindex--) {
		    substr($sourcetext, $i+$pyindex+1, 0, $pys[$pyindex]);
		    #srctxt.insert(i+pyindex+1, (String)pys.elementAt(pyindex));
		}
	    } else {
		$defpy =~ s/ //g;
		substr($sourcetext, $i + length($cword), $defpy);
		#srctxt.insert(i+pyindex, defpy);
	    }

	} elsif ($atype eq "addruby") {
	    @pys = split(/ /, $defpy);
	    if ($byword) {
		$defpy =~ s/ //g;
		substr($sourcetext, $i + length($cword), 0, $defpy);

	    } else {
		for ($pyindex = scalar(@pys)-1; $pyindex >= 0; $pyindex--) {
		    substr($sourcetext, $i+$pyindex+1, 0, "<rt>" . $pys[pyindex] . "</ruby>");
		    substr($sourcetext, $i+$pyindex, 0, "<ruby>");
		}
	    }
	} elsif ($atype eq "popup") {

	    $def = $cword . "\t" . $def;
	    $def =~ s/\'/\&rsquo;/g;
	    $def =~ s/\"/\&quot;/g;
	    #$def =~ s/\t/<BR>\&nbsp;\&nbsp; /g;
	    #$def =~ s/(\[[^\]]+\])/&addTones($1)/eg if $addtones;
	    $def =~ s/\]\s+/\]\t/g;
	    substr($sourcetext, $i+length($cword), 0, "</span>");
	    substr($sourcetext, $i, 0, "<span onMouseOver=\"showdef(\'$def\', this); return true;\" onMouseOut=\"hidedef()\">");

	} elsif ($atype eq "js_adddict") {
	    $def = $cword . " " . $def;
	    $def =~ s/\'/\&rsquo;/g;
	    $def =~ s/\"/\&quot;/g;
	    #$def =~ s/\t/<BR>\&nbsp;\&nbsp; /g;
	    #$def =~ s/(\[[^\]]+\])/&addTones($1)/eg if $addtones;
	    #$def =~ s/\]\s+/\] /g;
	    substr($sourcetext, $i+length($cword), 0, "</span>");
	    substr($sourcetext, $i, 0, "<span onMouseOver=\"sline(\'$def\'); return true;\" onMouseOut=\"clearstat()\">");

	}
    }

}


$sourcetext = encode("utf8", $sourcetext);

$insertloc = -1;
if ($sourcetext =~ m/<head>/i) {
    $insertloc = @-[0];
    $insertloc += 6;
} elsif ($sourcetext =~ m/<\/title>/i) {
    $insertloc = @-[0];
    $insertloc += 8;
} elsif ($sourcetext =~ m/<html>/i) {
    $insertloc = @-[0];
    $insertloc += 6;
} else {
    $insertloc = 0;  # Just put it at the beginning
}

if (length($url) > 0) {
    if ($sourcetext =~ m/<base href=\"?([^ \">]+)/i) {
	$basepath = $1;
    } elsif ($sourcetext !~ m/<base/i) {
	substr($sourcetext, $insertloc, 0, "\n<base href=\"$url\">\n");
    } else { # Just add HREF

    }

    
    if ($sourcetext !~ m/charset=/i) {
	substr($sourcetext, $insertloc, 0, 
	       "\n<META HTTP-EQUIV=\"content-type\" CONTENT=\"text/html; charset=utf-8>");
    } elsif ($sourcetext =~ s/charset=([^ \'\">]+)/charset=utf-8/i) {
    }


}

if ($sourcetext =~ m/<body/i) {
    $bodyloc = @-[0];
    $bodyloc = index $sourcetext, ">", $bodyloc;
    $bodyloc++;
}

if ($atype eq "js_adddict") {
    $script = "
<SCRIPT LANGUAGE=JAVASCRIPT>
<!-- 
// Status line display
 function sline(txt) {
  window.status=txt;
 }

//  Clear Status Line
  function clearstat() {
    window.status=\"\";
  }
//-->
</SCRIPT>
";
    substr($sourcetext, $bodyloc, 0, $script);
} elsif ($atype eq "popup") {

$script = "
<SCRIPT>
<!--

document.write('<div id=\"defwin\" style=\"position:absolute;z-index:1;visibility:hidden;background-color:#ffffff;color:black\"><table cellspacing=0 cellpadding=5 border=3>' +
            '<tr><td id=\"zhval\"></td></tr>' +
             '</table></div>');

var ie=document.all 
var ieNOTopera=document.all&&navigator.userAgent.indexOf(\"Opera\")==-1 

function showdef(zhdef, text){
   var t = document.getElementById(\"zhval\"); 
   var defhtml = \"\"; 
   var deffields = zhdef.split(\"\\t\");
   defhtml = \"<font size=+3><span lang=zh>\" + deffields[0] + \"</span></font>\"; 

   for (var i = 1; i < deffields.length; i+=2) {
      defhtml += \"<br>\" + deffields[i] + \"&nbsp;&nbsp;\" + deffields[i+1]; 
  }

   t.innerHTML = defhtml;

   var topmsg_obj = document.getElementById(\"defwin\") 
   var obj = text; 
       var curleft = 0; 
       var curtop = 0; 
       if (obj.offsetParent) 
       { 
               while (obj.offsetParent) 
               { 
                       curleft += obj.offsetLeft 
                       curtop += obj.offsetTop 
                       obj = obj.offsetParent; 
               } 
       } 
       else if (obj.x) {
         curleft += obj.x; 
         curtop += obj.y; 
       } 


   var dsoctop=ie? document.body.scrollTop : pageYOffset
   var window_height=ieNOTopera? document.body.clientHeight : window.innerHeight 

   if (29 + curtop + topmsg_obj.offsetHeight > dsoctop + window_height) { 
     curtop = curtop - topmsg_obj.offsetHeight - 35
   }

   topmsg_obj.style.left = curleft;
   topmsg_obj.style.top = curtop+29; 
   topmsg_obj.style.visibility=\"visible\";
} 

function hidedef() { 
   var topmsg_obj = document.getElementById(\"defwin\");
   topmsg_obj.style.visibility=\"hidden\";
}
-->
</SCRIPT>
";
    substr($sourcetext, $bodyloc, 0, $script);
}

print $sourcetext;

dbmclose(%cwords);
exit(0);


sub formatHTML {
    my($htmltext) = @_;

=comment
    if ($htmltext =~ m/charset=gb/) {
	#print "Converting from euc-cn to utf8 " . length($htmltext) . "<br>\n";
	#@list = Encode->encodings();
	#foreach $enc (@list) {
	#    print "$enc<br>\n";
	#}
	#from_to($htmltext, "euc-cn", "utf8");
	#print "Converted from euc-cn to utf8 " . length($htmltext) . "\n";
	$htmltext = decode("euc-cn", $htmltext);
	#print "Between conversions" . length($htmltext) . "\n";
	$htmltext = encode("utf8", $htmltext);
	#print "After conversions" . length($htmltext) . "\n";
    } elsif ($htmltext =~ m/charset=big5/) {
	$htmltext = decode("cp950", $htmltext);
	#print "Between conversions" . length($htmltext) . "\n";
	$htmltext = encode("utf8", $htmltext);
	#from_to($htmltext, "cp950", "utf8");
    }
=cut

    $htmltext =~ s/<style.*?<\/style>//igs;
    $htmltext =~ s/<script.*?<\/script>//igs;
    $htmltext =~ s/\&nbsp;/ /gi;
    $htmltext =~ s/\&\#(\d+);//g; # Convert to UTF-8
    $htmltext =~ s/\s+/ /g;
    $htmltext =~ s/<BR>/\n/ig;
    $htmltext =~ s/<P(\s[^>]+)?>/\n\n/ig;
    $htmltext =~ s/<[^>]+>//g;
    
    #print "HTML text " . length($htmltext) . "\n";

    return $htmltext;
}


__END__








if ($atype eq "segment") {  # Just segment, no annotation
    foreach $segline (@seglines) {
	print $segline . "<BR>";
    }
} elsif ($atype eq "adddict") {

    foreach $_ (@seglines) {
	s/\r\n/<BR>/;
	if (m/^\s+$/) {
	    print "<P>\n";
	    next;
	}
	
	@words = split(/(\s+)/);
	foreach $word (@words) {
	    if (exists($cwords{$word})) {
		if (exists($canchor{$word})) {
	  	   print "<A HREF=\"\#$canchor{$word}\">$word</A>";
	        } else {
   	 	   $anchor++;
		   $canchor{$word} = $anchor;
		   $anchwords[$anchor] = $word;
		   print "<A HREF=\"\#$canchor{$word}\">$word</A>";
	        }
	    } else {
		print "$word";
	    }
	}
    }
    print "<HR>\n";
    for ($i = 1; $i <= $anchor; $i++) {
	print "\n<A NAME=\"$i\">\n";
        $def = $cwords{$anchwords[$i]};
        $def =~ s/(\[[^\]]+\])/&addTones($1)/eg if $addtones;
        $def =~ s/\t/<br>\&nbsp;\&nbsp;\&nbsp;/g;
	print $anchwords[$i] . "&nbsp;&nbsp;" . $def . "<P>\n";
    }
    print "&nbsp;<BR>" x 35;


} elsif ($atype eq "js_adddict") {
    foreach $_ (@seglines) {
	s/\n/<BR>/;
	if (m/^\s+$/) {
	    print "<P>\n";
	    next;
	}
	
	@words = split(/(\s+)/);
	foreach $word (@words) {
	    if (exists($cwords{$word})) {
                $def = $cwords{$word};
                $def =~ s/\'/\&rsquo;/g;
                $def =~ s/\"/\&quot;/g;
                $def =~ s/\t/   /g;
                $def =~ s/(\[[^\]]+\])/&addTones($1)/eg if $addtones;
		print "<span onMouseOver=\"sline(\'$def\'); return true\" onMouseOut=\"clearstat()\">$word</span>\n";
	    } else {
		print "$word";
	    }
	}
    }

} elsif ($atype eq "popup") {

    foreach $_ (@seglines) {
	s/\n/<BR>/;
	if (m/^\s+$/) {
	    print "<P>\n";
	    next;
	}
	
	@words = split(/(\s+)/);
	foreach $word (@words) {
	    if (exists($cwords{$word})) {
                $def = $word . "\t" . $cwords{$word};
                $def =~ s/\'/\&rsquo;/g;
                $def =~ s/\"/\&quot;/g;
                #$def =~ s/\t/<BR>\&nbsp;\&nbsp; /g;
                $def =~ s/(\[[^\]]+\])/&addTones($1)/eg if $addtones;
                $def =~ s/\]\s+/\]\t/g;
		print "<span onMouseOver=\"showdef(\'$def\', this); return true\" onMouseOut=\"hidedef()\">$word</span>\n";
	    } else {
		print "$word";
	    }
	}
    }


} elsif ($atype eq "topinyin") {

    foreach $_ (@seglines) {
	s/\n/<BR>/;
	if (m/^\s+$/) {
	    print "<P>\n";
	    next;
	}
	
	@words = split(/(\s+)/);
	foreach $word (@words) {
	    if (defined($cwords{$word})) { 
                ($py) = ($cwords{$word} =~ m/^\[([^\]]+)\]/);
	        $py = addTones($py) if $addtones;
                $py =~ s/ //g;
		print "$py ";
	    } else {
		print "$word";
	    }
	}
    }
} elsif ($atype eq "addpinyin") {

    foreach $_ (@seglines) {
	s/\n/<BR>/;
	if (m/^\s+$/) {
	    print "<P>\n";
	    next;
	}
	
	@words = split(/(\s+)/);
	foreach $word (@words) {
	    if (exists($cwords{$word})) {
                ($py) = ($cwords{$word} =~ m/^\[([^\]]+)\]/);
	        $py = addTones($py) if $addtones;
		my(@pys) = split(/\s+/, $py);
		my($i);
		for ($i = 0; $i < lengthu8($word); $i++) {
		    print substru8($word, $i, 1);
		    print $pys[$i];
		}
	    } else {
		print "$word";
	    }
	}
    }
} elsif ($atype eq "addruby") {

    print "<style>\nRUBY { ruby-align:center }\n</style>\n";
    print "<font size=+1>";

    foreach $segline (@seglines) {
	$segline =~ s/\n/<BR>/;
	if ($segline =~ m/^\s+$/) {
	    print "<P>\n";
	    next;
	}
	
	@words = split(/(\s+)/, $segline);
	foreach $word (@words) {
	    if (exists($cwords{$word})) {
                ($py) = ($cwords{$word} =~ m/^\[([^\]]+)\]/);
	        $py = addTones($py) if $addtones;
		my(@pys) = split(/\s+/, $py);
		my($i);
		for ($i = 0; $i < lengthu8($word); $i++) {
		    print "<ruby>" . substru8($word, $i, 1);
		    print "<rt>" . $pys[$i] . "</ruby>";
		}
	    } else {
		print "$word";
	    }
	}
    }

}
Anon7 - 2021