.

KGRKJGETMRETU895U-589TY5MIGM5JGB5SDFESFREWTGR54TY
Server : Apache/2.4.62
System : FreeBSD fbsdweb2.web.rcn.net 14.1-RELEASE FreeBSD 14.1-RELEASE releng/14.1-n267679-10e31f0946d8 GENERIC amd64
User : www ( 80)
PHP Version : 8.3.8
Disable Function : NONE
Directory : /domains/mandarintools/cgi-bin/
Upload File :
Current File : /domains/mandarintools/cgi-bin/annotate_u8.pl
#!/usr/bin/perl

#use lib '/chtools';
#use lib '/var/www/htdocs/eepeter/cgi-bin';

require "cgi-lib.pl";
require "segmenter_u8.pl";

# addTones($pinyin)
#
# Convert numbered pinyin (e.g. "zhong1 guo2", "nu:3") into pinyin with
# tone marks, expressed as HTML character entities.
#
#   $pinyin  - string containing numbered pinyin syllables; any other text
#              (brackets, spaces) is passed through.
#
# Returns the converted string.  An undefined argument is returned as is.
#
# How it works:
#   1. The tone digit is moved so that it directly follows the vowel that
#      carries the mark (finals -ng, -n, -r and the diphthongs ao/ai/ei/ou).
#   2. Each "vowel + digit" pair is replaced by its entity.  "u:" and "v"
#      both stand for u-umlaut.
#   3. Any remaining neutral-tone digit 5 is dropped.
sub addTones {
    my ($withnumbers) = @_;
    return $withnumbers unless defined $withnumbers;

    # Step 1: move the tone digit next to the vowel that takes the mark.
    $withnumbers =~ s/ng(\d)\b/${1}ng/g;
    $withnumbers =~ s/n(\d)\b/${1}n/g;
    # Syllables ending in -r ("er2", "er4"): without this rule the digit
    # never touches a vowel and the syllable is left unconverted.
    $withnumbers =~ s/r(\d)\b/${1}r/g;
    $withnumbers =~ s/ao(\d)\b/a${1}o/g;
    $withnumbers =~ s/ai(\d)\b/a${1}i/g;
    $withnumbers =~ s/ei(\d)\b/e${1}i/g;
    $withnumbers =~ s/ou(\d)\b/o${1}u/g;

    # Step 2: ordered (numbered form => entity) table.  Order matters: the
    # toned "u:N" / "vN" forms must be tried before the bare "u:" / "v".
    my @tone_table = (
        [ 'a1',  '&#x0101;' ], [ 'a2',  '&aacute;' ], [ 'a3',  '&#x01ce;' ],
        [ 'a4',  '&#x00e0;' ], [ 'a5',  'a'        ],
        [ 'e1',  '&#x0113;' ], [ 'e2',  '&#x00e9;' ], [ 'e3',  '&#x011b;' ],
        [ 'e4',  '&#x00e8;' ], [ 'e5',  'e'        ],
        [ 'i1',  '&#x012b;' ], [ 'i2',  '&#x00ed;' ], [ 'i3',  '&#x01d0;' ],
        [ 'i4',  '&#x00ec;' ], [ 'i5',  'i'        ],
        [ 'o1',  '&#x014d;' ], [ 'o2',  '&#x00f3;' ], [ 'o3',  '&#x01d2;' ],
        [ 'o4',  '&#x00f2;' ], [ 'o5',  'o'        ],
        [ 'u1',  '&#x016b;' ], [ 'u2',  '&#x00fa;' ], [ 'u3',  '&#x01d4;' ],
        [ 'u4',  '&#x00f9;' ], [ 'u5',  'u'        ],
        [ 'u:1', '&#x01d6;' ], [ 'u:2', '&#x01d8;' ], [ 'u:3', '&#x01da;' ],
        [ 'u:4', '&#x01dc;' ], [ 'u:5', '&#x00fc;' ], [ 'u:',  '&#x00fc;' ],
        [ 'v1',  '&#x01d6;' ], [ 'v2',  '&#x01d8;' ], [ 'v3',  '&#x01da;' ],
        [ 'v4',  '&#x01dc;' ], [ 'v5',  '&#x00fc;' ], [ 'v',   '&#x00fc;' ],
    );

    for my $pair (@tone_table) {
        my ($numbered, $entity) = @$pair;
        # \Q..\E: treat the table key as literal text, not as a pattern.
        $withnumbers =~ s/\Q$numbered\E/$entity/g;
    }

    # Step 3: drop neutral-tone markers that did not follow a vowel.
    $withnumbers =~ s/5//g;

    return $withnumbers;
}

# ---------------------------------------------------------------------
# Request parsing and source-text acquisition.
#
# Reads the CGI parameters into %values and derives:
#   $atype     - annotation mode (compared against "segment", "adddict",
#                "js_adddict", "topinyin", "addpinyin", "addruby",
#                "addmargin" further down)
#   $addtones  - true when numbered pinyin should be converted by addTones
#   $ctext     - the text to annotate, or a URL to fetch it from
#   $tagwords  - user-supplied glossary lines
#
# Produces @srclines (the text split into pieces, separators kept) and
# %utfcharset (set of 3-byte UTF-8 sequences occurring in the text).
# ---------------------------------------------------------------------
&ReadParse(*values);
$atype = $values{'atype'};
$addtones = $values{'addtones'};

$ctext = $values{'ctext'};
$tagwords = $values{'words'};

if ($ctext =~ m#^\s*(http|gopher|ftp)://#) {
    # SECURITY: $ctext comes straight from the request.  It used to be
    # interpolated into a backtick command, which let a client run
    # arbitrary shell commands.  Only the first whitespace-free token is
    # accepted as the URL, and lynx is started with the list form of a
    # pipe open so no shell is involved.
    # NOTE(review): the script will still fetch any http/gopher/ftp URL a
    # client supplies (including internal hosts) -- consider an allow-list.
    my ($url) = ($ctext =~ m#^\s*((?:http|gopher|ftp)://\S+)#);

    $sourcetext = '';
    if (defined $url
        and open(my $lynx, '-|', './lynx', '-assume_charset=gb2312', '-source', $url)) {
        local $/;                       # slurp the whole page at once
        my $fetched = <$lynx>;
        $sourcetext = $fetched if defined $fetched;
        close($lynx);
    }

    $sourcetext = formatHTML($sourcetext);
    $sourcetext =~ s/\r//g;
    $sourcetext =~ s/\n\s\s+/\n\n/g;
    @srclines = split(/(\n)/, $sourcetext);

    # Collect the characters of the fetched page as well, so that modes
    # which filter on %utfcharset also work for URL input.
    (@utfchars) = ($sourcetext =~ m/([\xE0-\xEF]..)/g);
} else {
    $ctext .= "\r\n";
    (@utfchars) = ($ctext =~ m/([\xE0-\xEF]..)/g);

    # At most 80 pieces are produced; anything beyond that stays in the
    # last piece.
    if ($atype eq 'addmargin') {
        @srclines = split(/(\r\n(\r\n)+)/, $ctext, 80);
    } else {
        @srclines = split(/(\r\n)/, $ctext, 80);
    }
}

$utfcharset{$_} = 1 for @utfchars;

# ---------------------------------------------------------------------
# Emit the HTTP header and the start of the HTML document, initialise the
# dictionary, register the user's glossary words and segment the text.
# ---------------------------------------------------------------------
print "Content-type: text/html; charset=utf-8\n\n";
print "<HTML>\n";
print "<HEAD><TITLE>Chinese Annotation Results</TITLE>\n";

# Running counter used by the display modes to number glossary anchors.
$anchor = 0;

$chartype = $values{'chartype'}; # "simp", "trad", "both"
#$chartype = "simp";

# init_cedict is not defined in this file; presumably it comes from
# segmenter_u8.pl and loads the word list for the chosen character set.
&init_cedict($chartype);

# Register every user glossary line in %cwords.  %cwords is not read
# anywhere in this file; presumably the segmenter consults it -- verify.
# A line containing whitespace is parsed as "word [pinyin] definition";
# a line without whitespace is a bare word with an empty entry.
$tmptagwords = $tagwords . "\r\n";
@tagwords = split(/\r\n/, $tmptagwords, 100);
foreach $tagword (@tagwords) {
    if ($tagword =~ m/\s/) {
	($chin, $py, $eng) = ($tagword =~ m/^(\S+)\s(\[[a-zA-Z0-5: ]+\])\s(.+)$/);
	# NOTE(review): $py already includes its square brackets, so the
	# stored value starts with "[[" -- confirm this is what the consumer
	# of %cwords expects.  If the line does not match the pattern, all
	# three variables are empty and the key is the empty string.
	$cwords{$chin} = "[$py] $eng";
    } else {
	$cwords{$tagword} = "[] //";
    }
}


# Segment each source piece; segmentLine is defined outside this file.
# The display modes below split its result on whitespace to get words.
$count = 0;
foreach $srcline (@srclines) {
    $seglines[$count++] = segmentLine($srcline);
}

# Build %segwords: the set of distinct tokens occurring in the segmented
# text.  The dictionary-loading modes use it to keep only relevant entries.
foreach $segline (@seglines) {
    @segwords = split(/\s+/, $segline);
    foreach $segword (@segwords) {
	$segwords{$segword} = 1;
    }
}


# The js_adddict mode shows definitions in the browser status line, so the
# two helper functions it calls are emitted into the document head.
if ($atype eq "js_adddict") {
    print <<JS;
<SCRIPT LANGUAGE=JAVASCRIPT>
<!-- 
// Status line display
 function sline(txt) {
  window.status=txt;
 }

//  Clear Status Line
  function clearstat() {
    window.status="";
  }
//-->
</SCRIPT>
JS
}
print "</HEAD>\n<BODY>\n";


if ($atype eq "segment") {  # Just segment, no annotation
    # Segment-only mode: output every segmented line followed by a line
    # break, with no dictionary lookup.
    print map { $_ . "<BR>" } @seglines;
} elsif ($atype eq "adddict") {
    # adddict mode: every word that has an entry in %cdef becomes a link to
    # a numbered glossary entry printed after the text.
    #
    # Step 1: read the user's glossary lines.  A line with whitespace is
    # "word [pinyin] definition"; a bare word reuses whatever %cdef holds.
    $tagwords .= "\r\n";
    @tagwords = split(/\r\n/, $tagwords, 100);
    foreach my $tagword (@tagwords) {
        next if $tagword eq '';         # trailing piece produced by split

        if ($tagword =~ m/\s/) {
            my ($chin, $py, $eng) = ($tagword =~ m/^(\S+) (\[[a-zA-Z0-5: ]+\]) (.+)$/);
            # A malformed line used to create a definition under the empty
            # key, which then matched empty tokens in the text.
            next unless defined $chin;

            $py = addTones($py) if $addtones;
            $cdef{$chin} = "$chin\n$py\n$eng";
            $tagwords{$chin} = "$chin\n$py\n$eng";
        } else {
            $tagwords{$tagword} = $cdef{$tagword};
        }
    }

    # Step 2: print the text, linking each defined word to its entry.
    foreach my $segline (@seglines) {
        # NOTE(review): the other modes replace a bare "\n" here; this one
        # only replaces "\r\n".  Left unchanged -- confirm which is wanted.
        $segline =~ s/\r\n/<BR>/;
        if ($segline =~ m/^\s+$/) {
            print "<P>\n";
            next;
        }

        # The capturing split keeps the whitespace between words.
        foreach my $word (split(/(\s+)/, $segline)) {
            if ($word ne '' and exists($cdef{$word})) {
                # First occurrence: allocate the next anchor number.
                unless (exists($canchor{$word})) {
                    $anchor++;
                    $canchor{$word} = $anchor;
                    $anchwords[$anchor] = $word;
                }
                print "<A HREF=\"\#$canchor{$word}\">$word</A>";
            } else {
                print "$word";
            }
        }
    }

    # Step 3: the glossary, one named anchor per linked word.
    print "<HR>\n";
    for my $i (1 .. $anchor) {
        # The anchor is closed immediately; it was previously left open.
        print "\n<A NAME=\"$i\"></A>\n";
        print $cdef{$anchwords[$i]} . "<P>";
    }
    # Padding so that the last anchors can scroll to the top of the window.
    print "&nbsp;<BR>" x 35;

} elsif ($atype eq "js_adddict") {
    # js_adddict mode: like adddict, but each link also shows its definition
    # in the browser status line via sline()/clearstat().
    #
    # Step 1: load the dictionary entries for words that occur in the text.
    # Each line is "traditional simplified [pinyin] definition".
    open(my $ced, '<', 'cedict_ts.u8') or die "Can't open dictionary";
    while (my $entry = <$ced>) {
        $entry =~ s/[\r\n]*$//;

        my ($tchin, $schin, $py, $eng) =
            ($entry =~ m/^(\S+) (\S+) (\[[a-zA-Z0-5: ]+\]) (.+)$/);
        # Lines that do not parse used to be stored under the empty key.
        next unless defined $tchin;

        my @hits;
        if ($chartype eq "simp") {
            @hits = ($schin) if defined($segwords{$schin});
        } elsif ($chartype eq "trad") {
            @hits = ($tchin) if defined($segwords{$tchin});
        } elsif ($chartype eq "both") {
            @hits = ($schin, $tchin)
                if defined($segwords{$tchin}) or defined($segwords{$schin});
        }
        next unless @hits;

        # Convert tones only for entries that are actually kept.
        $py = addTones($py) if $addtones;
        $cdef{$_} = "$_\t$py\t$eng" for @hits;
    }
    close($ced);

    # Step 2: user glossary lines override or extend the dictionary.
    $tagwords .= "\r\n";
    @tagwords = split(/\r\n/, $tagwords, 100);
    foreach my $tagword (@tagwords) {
        next if $tagword eq '';         # trailing piece produced by split

        if ($tagword =~ m/\s/) {
            my ($chin, $py, $eng) = ($tagword =~ m/^(\S+) (\[[a-zA-Z0-5: ]+\]) (.+)$/);
            next unless defined $chin;  # ignore malformed lines

            $py = addTones($py) if $addtones;
            $cdef{$chin} = "$chin $py $eng";
            $tagwords{$chin} = "$chin\n$py\n$eng";
        } else {
            $tagwords{$tagword} = $cdef{$tagword};
        }
    }

    # Step 3: print the text with hover-enabled links.
    foreach my $segline (@seglines) {
        $segline =~ s/\n/<BR>/;
        if ($segline =~ m/^\s+$/) {
            print "<P>\n";
            next;
        }

        foreach my $word (split(/(\s+)/, $segline)) {
            unless ($word ne '' and exists($cdef{$word})) {
                print "$word";
                next;
            }

            # First occurrence: allocate the next anchor number.
            unless (exists($canchor{$word})) {
                $anchor++;
                $canchor{$word} = $anchor;
                $anchwords[$anchor] = $word;
            }

            # The definition is placed in a single-quoted JavaScript string
            # inside a double-quoted HTML attribute, so it is escaped for
            # both layers at the point of use.  A backslash-escaped double
            # quote does not protect an HTML attribute, hence &quot;.
            my $tip = $cdef{$word};
            $tip =~ s/\\/\\\\/g;
            $tip =~ s/'/\\'/g;
            $tip =~ s/"/&quot;/g;
            $tip =~ s/[\r\n]+/ /g;      # a raw newline would end the JS string

            print "<A HREF=\"\#$canchor{$word}\" onMouseOver=\"sline(\'$tip\'); return true\" onMouseOut=\"clearstat()\">$word</A>";
        }
    }

    # Step 4: the glossary.  %cdef holds the unescaped text, so it can be
    # printed directly.
    print "<HR>\n";
    for my $i (1 .. $anchor) {
        print "\n<A NAME=\"$i\"></A>\n";
        print $cdef{$anchwords[$i]} . "<P>";
    }
    # Padding so that the last anchors can scroll to the top of the window.
    print "<BR>" x 25;

} elsif ($atype eq "topinyin") {
	# topinyin mode: replace the Chinese text by its pinyin reading.
	# (The two statements that used to open this branch converted a stale
	# global $py that nothing read afterwards; they have been removed.)

    # Step 1: user glossary lines give the reading of whole words.
    $tagwords .= "\r\n";
    @tagwords = split(/\r\n/, $tagwords, 100);
    foreach my $tagword (@tagwords) {
        next if $tagword eq '';         # trailing piece produced by split

        if ($tagword =~ m/\s/) {
            my ($chin, $py, $eng) = ($tagword =~ m/^(\S+) (\[[a-zA-Z0-5: ]+\]) (.+)$/);
            next unless defined $chin;  # ignore malformed lines

            $py = addTones($py) if $addtones;
            $cdef{$chin} = $py;
            $tagwords{$chin} = "$chin\n$py\n$eng";
        } else {
            $tagwords{$tagword} = $cdef{$tagword};
        }
    }

    # Step 2: per-character readings.  The first two whitespace-separated
    # fields of each row are used: the character and its reading.
    open(my $upy, '<', 'uni8py.txt') or die "Can't open dictionary\n";
    while (my $row = <$upy>) {
        $row =~ s/[\r\n]*$//;
        my ($u8char, $py) = split(' ', $row);
        next unless defined $py;        # blank or incomplete row

        $py = addTones($py) if $addtones;
        $u8py{$u8char} = $py;
    }
    close($upy);

    # Step 3: print the text with readings substituted.
    foreach my $segline (@seglines) {
        $segline =~ s/\n/<BR>/;
        if ($segline =~ m/^\s+$/) {
            print "<P>\n";
            next;
        }

        foreach my $word (split(/(\s+)/, $segline)) {
            if (defined($cdef{$word}) and lengthu8($word) > 1) {
                # Multi-character word with a known reading.
                print $cdef{$word};
            } elsif (vec($word, 0, 8) > 127) {
                # Token starting with a non-ASCII byte: go character by
                # character.
                my $nchars = lengthu8($word);
                for my $i (0 .. $nchars - 1) {
                    my $char = substru8($word, $i, 1);
                    # Characters without a known reading (e.g. full-width
                    # punctuation) are kept instead of silently vanishing.
                    print defined($u8py{$char}) ? $u8py{$char} : $char;
                }
            } else {
                print "$word";
            }
        }
    }
} elsif ($atype eq "addpinyin") {
    # addpinyin mode: print every Chinese character followed by its pinyin.
    #
    # Step 1: word readings from the dictionary, restricted to words that
    # occur in the text.  Here the pinyin is captured without its brackets.
    open(my $ced, '<', 'cedict_ts.u8') or die "Can't open dictionary";
    while (my $entry = <$ced>) {
        my ($tchin, $schin, $py, $eng) =
            ($entry =~ m/^(\S+) (\S+) \[([a-zA-Z0-5: ]+)\] (.+)$/);
        next unless defined $tchin;     # skip lines that do not parse

        my @hits;
        if ($chartype eq "simp") {
            @hits = ($schin) if defined($segwords{$schin});
        } elsif ($chartype eq "trad") {
            @hits = ($tchin) if defined($segwords{$tchin});
        } elsif ($chartype eq "both") {
            @hits = ($schin, $tchin)
                if defined($segwords{$tchin}) or defined($segwords{$schin});
        }
        next unless @hits;

        # Tone conversion is comparatively expensive; do it only for the
        # entries that are kept rather than for every dictionary line.
        $py = addTones($py) if $addtones;
        $cdef{$_} = "$py" for @hits;
    }
    close($ced);

    # Step 2: user glossary lines override or extend the dictionary.
    $tagwords .= "\r\n";
    @tagwords = split(/\r\n/, $tagwords, 100);
    foreach my $tagword (@tagwords) {
        next if $tagword eq '';         # trailing piece produced by split

        if ($tagword =~ m/\s/) {
            my ($chin, $py, $eng) = ($tagword =~ m/^(\S+) (\[[a-zA-Z0-5: ]+\]) (.+)$/);
            next unless defined $chin;  # ignore malformed lines

            $py = addTones($py) if $addtones;
            $cdef{$chin} = $py;
            $tagwords{$chin} = "$chin\n$py\n$eng";
        } else {
            $tagwords{$tagword} = $cdef{$tagword};
        }
    }

    # Step 3: per-character readings, kept only for characters recorded in
    # %utfcharset.  The filter is applied before the tone conversion.
    open(my $upy, '<', 'uni8py.txt') or die "Can't open dictionary\n";
    while (my $row = <$upy>) {
        $row =~ s/[\r\n]*$//;
        my ($u8char, $py) = split(' ', $row);
        next unless defined $py;        # blank or incomplete row
        next unless defined($utfcharset{$u8char});

        $py = addTones($py) if $addtones;
        $u8py{$u8char} = $py;
    }
    close($upy);

    # Step 4: print the text, each character followed by its reading.
    foreach my $segline (@seglines) {
        $segline =~ s/\n/<BR>/;
        if ($segline =~ m/^\s+$/) {
            print "<P>\n";
            next;
        }

        foreach my $word (split(/(\s+)/, $segline)) {
            if (defined($cdef{$word}) and lengthu8($word) > 1) {
                # Word reading: one syllable per character, in order.
                my @pys = split(/\s+/, $cdef{$word});
                my $nchars = lengthu8($word);
                for my $i (0 .. $nchars - 1) {
                    print substru8($word, $i, 1);
                    print $pys[$i] if defined $pys[$i];
                }
            } elsif (vec($word, 0, 8) > 127) {
                # Token starting with a non-ASCII byte: look up each
                # character on its own.
                my $nchars = lengthu8($word);
                for my $i (0 .. $nchars - 1) {
                    my $char = substru8($word, $i, 1);
                    print $char;
                    print $u8py{$char} if defined $u8py{$char};
                }
            } else {
                print "$word";
            }
        }
    }
} elsif ($atype eq "addruby") {
    # addruby mode: print every Chinese character inside a <ruby> element
    # with its pinyin as the <rt> annotation.
    #
    # Step 1: word readings from the dictionary, restricted to words that
    # occur in the text.  The pinyin is captured without its brackets.
    open(my $ced, '<', 'cedict_ts.u8') or die "Can't open dictionary";
    while (my $entry = <$ced>) {
        my ($tchin, $schin, $py, $eng) =
            ($entry =~ m/^(\S+) (\S+) \[([a-zA-Z0-5: ]+)\] (.+)$/);
        next unless defined $tchin;     # skip lines that do not parse

        my @hits;
        if ($chartype eq "simp") {
            @hits = ($schin) if defined($segwords{$schin});
        } elsif ($chartype eq "trad") {
            @hits = ($tchin) if defined($segwords{$tchin});
        } elsif ($chartype eq "both") {
            @hits = ($schin, $tchin)
                if defined($segwords{$tchin}) or defined($segwords{$schin});
        }
        next unless @hits;

        $py = addTones($py) if $addtones;
        $cdef{$_} = "$py" for @hits;
    }
    close($ced);

    # Step 2: user glossary lines override or extend the dictionary.
    $tagwords .= "\r\n";
    @tagwords = split(/\r\n/, $tagwords, 100);
    foreach my $tagword (@tagwords) {
        next if $tagword eq '';         # trailing piece produced by split

        if ($tagword =~ m/\s/) {
            my ($chin, $py, $eng) = ($tagword =~ m/^(\S+) (\[[a-zA-Z0-5: ]+\]) (.+)$/);
            next unless defined $chin;  # ignore malformed lines

            $py = addTones($py) if $addtones;
            $cdef{$chin} = $py;
            $tagwords{$chin} = "$chin\n$py\n$eng";
        } else {
            $tagwords{$tagword} = $cdef{$tagword};
        }
    }

    # Step 3: per-character readings (character and reading are the first
    # two whitespace-separated fields of each row).
    open(my $upy, '<', 'uni8py.txt') or die "Can't open dictionary\n";
    while (my $row = <$upy>) {
        $row =~ s/[\r\n]*$//;
        my ($u8char, $py) = split(' ', $row);
        next unless defined $py;        # blank or incomplete row

        $py = addTones($py) if $addtones;
        $u8py{$u8char} = $py;
    }
    close($upy);

    print "<style>\nRUBY { ruby-align:center }\n</style>\n";
    print "<font size=+1>";

    # Step 4: print the text with ruby annotations.
    foreach my $segline (@seglines) {
        $segline =~ s/\n/<BR>/;
        if ($segline =~ m/^\s+$/) {
            print "<P>\n";
            next;
        }

        foreach my $word (split(/(\s+)/, $segline)) {
            if (defined($cdef{$word}) and lengthu8($word) > 1) {
                # Word reading: one syllable per character, in order.
                my @pys = split(/\s+/, $cdef{$word});
                my $nchars = lengthu8($word);
                for my $i (0 .. $nchars - 1) {
                    my $reading = defined($pys[$i]) ? $pys[$i] : '';
                    print "<ruby>" . substru8($word, $i, 1);
                    print "<rt>" . $reading . "</rt></ruby>";
                }
            } elsif (vec($word, 0, 8) > 127) {
                # Token starting with a non-ASCII byte: annotate each
                # character on its own.
                my $nchars = lengthu8($word);
                for my $i (0 .. $nchars - 1) {
                    my $char = substru8($word, $i, 1);
                    my $reading = defined($u8py{$char}) ? $u8py{$char} : '';
                    print "<ruby>" . $char;
                    print "<rt>" . $reading . "</rt></ruby>";
                }
            } else {
                print "$word";
            }
        }
    }

    # Close the element opened before the loop; it was left open before.
    print "</font>";


} elsif ($atype eq "addmargin") {
    # addmargin mode: a two-column table, one row per piece of text.  The
    # left cell holds the text with newly introduced glossary words in
    # bold; the right cell holds the notes for exactly those words.
    #
    # Step 1: read the user's glossary lines.
    $tagwords .= "\r\n";
    @tagwords = split(/\r\n/, $tagwords, 60);
    foreach my $tagword (@tagwords) {
        # The trailing empty piece produced by split used to register the
        # empty string as a glossary word, which then matched the empty
        # leading token of any row starting with whitespace.
        next if $tagword eq '';

        if ($tagword =~ m/\s/) {
            my ($chin, $py, $eng) = ($tagword =~ m/^(\S+)\s(\[[a-zA-Z0-5: ]+\])\s(.+)$/);
            next unless defined $chin;  # ignore malformed lines

            $py = addTones($py) if $addtones;
            $cdef{$chin} = "$chin\n$py\n$eng";
            $tagwords{$chin} = "$chin\n$py\n$eng";
        } else {
            $tagwords{$tagword} = $cdef{$tagword};
        }
    }

    print "<TABLE>\n";

    # Step 2: one table row per segmented piece.
    foreach my $segline (@seglines) {
        print "<TR>\n<TD WIDTH=\"80\%\">\n";

        # Glossary words that make their first appearance in this row.
        my @newwords;

        # Splitting on whitespace drops the spaces the segmenter inserted.
        foreach my $word (split(/\s+/, $segline)) {
            if (exists($tagwords{$word}) and not exists($canchor{$word})) {
                # First occurence
                $anchor++;
                $canchor{$word} = $anchor;
                $anchwords[$anchor] = $word;
                push @newwords, $word;
                print "<STRONG>$word</STRONG>";
            } else {
                print "$word";
            }
        }

        # Print sidenotes -- only for the words introduced in this row.
        # Looping over every anchor allocated so far produced a growing
        # run of empty notes, because the word list was reset per row
        # while the anchor counter was not.
        print "</TD>\n<TD WIDTH=\"20\%\">\n";
        foreach my $word (@newwords) {
            print "<SMALL>" . $cdef{$word} . "</SMALL><P>";
        }
        print "</TD></TR>\n";
    }

    print "</TABLE>\n";
}



# Close the HTML document opened above and end the request with exit
# status 0; the subroutine definition further down is not executed as
# top-level code.
print "</BODY>\n</HTML>";
exit(0);

# formatHTML($htmltext)
#
# Reduce an HTML document to plain text suitable for segmentation.
#
#   $htmltext - raw HTML source
#
# Returns the text with:
#   - every run of whitespace collapsed to a single space,
#   - <script>/<style> elements and comments removed together with their
#     content (their text is code, not prose to annotate),
#   - <BR> (also <br/> and <br />) and opening <P ...> tags turned into
#     newlines,
#   - all remaining tags removed.
# An undefined argument yields the empty string.  Character entities are
# not decoded.
sub formatHTML {
    my ($htmltext) = @_;
    return '' unless defined $htmltext;

    # Source line breaks carry no meaning in HTML; only tags create lines.
    $htmltext =~ s/\s+/ /g;

    # Must run before the generic tag stripper, which would remove the
    # tags but leave the script/style text behind.
    $htmltext =~ s{<(script|style)\b[^>]*>.*?</\1\s*>}{}isg;
    $htmltext =~ s{<!--.*?-->}{}sg;

    $htmltext =~ s{<BR\s*/?>}{\n}ig;
    # "(?:\s[^>]*)?" keeps <PRE>, <PARAM> etc. from being taken for <P>.
    $htmltext =~ s{<P(?:\s[^>]*)?>}{\n}ig;
    $htmltext =~ s{<[^>]+>}{}g;

    return $htmltext;
}
Anon7 - 2021