|
Server : Apache/2.4.62 System : FreeBSD fbsdweb2.web.rcn.net 14.1-RELEASE FreeBSD 14.1-RELEASE releng/14.1-n267679-10e31f0946d8 GENERIC amd64 User : www ( 80) PHP Version : 8.3.8 Disable Function : NONE Directory : /domains/mandarintools/cgi-bin/ |
Upload File : |
# -*- coding:cn-gb -*-
$debugon = 0;
require "segmenter.pl";
print "Loaded segmenter\n" if $debugon;
$tagset = "MUC";
# Load various word lists
@hashtypes = ("geonames", "geotypes", "surnames", "notname", "titles",
"persons", "orgnames", "orgwords", "orgtypes", "dates",
"times", "currency");
foreach $hashtype (@hashtypes) {
&load_hash($hashtype);
}
sub load_hash {
my($hashname) = shift;
my(@hashkey, @words);
my $line;
# Prepare place name info
open(HASH, "data/$hashname.txt") or die $!;
my($i) = -1;
while ($line = <HASH>) {
next if $line =~ m/^\s*$/ or $line =~ m/^\s*\#/;
$line =~ s/[\r\n]*$//;
($hashkey, $dummy) = split(/\t/, $line);
&load_hash_key($hashkey, $hashname);
}
close(HASH);
}
sub load_hash_key {
my($hashkey, $hashname) = @_;
my($windex);
# Load words into a hash tree, allowing quick and sequential checking
$hashkey =~ s/^\s*(.*?)\s*$/$1/;
my(@words) = split(/\s+/, $hashkey);
my($node) = \%{$hashname};
for ($windex = 0; $windex <= $#words; $windex++) {
unless (defined($$node{$words[$windex]})) {
if ($windex == $#words) {
$$node{$words[$windex]} = { "EOW" => 1 };
} else {
$$node{$words[$windex]} = {};
}
}
$node = $$node{$words[$windex]};
}
}
# Add the words in the data files to the segmenter lexicon so that they are
# segmented correctly
foreach $hashtype ("geonames", "geotypes", "titles", "persons", "orgnames", "orgwords",
"orgtypes", "currency") {
open(HF, "data/$hashtype.txt");
while ($dataline = <HF>) {
next if $dataline =~ m/^\#/;
($hashdata, $rest) = split(/\t/, $dataline, 2);
@hashwords = split(/\s/, $hashdata);
foreach $hashword (@hashwords) {
addsegword($hashword);
}
}
close(HF);
}
sub addNEE {
$filetxt = shift;
$segtxt = "";
$filetxt =~ s/\r//g;
my $line;
my @lines = split(/\n/, $filetxt);
# Segment the file and store results in $segtxt
foreach $line (@lines) {
$segtxt .= &segmentline($line);
$segtxt .= "\n";
}
# Print out the segmented file to use in debugging
if ($debugon) {
$segfile = "tempseg.txt" if $segfile eq "";
open(SEG, "> $segfile");
print SEG addnewline($segtxt, $filetxt);
close(SEG);
}
# Specify the tags to use for markup (two possibilities)
if ($tagset eq "MUC") {
@{"PERSON"} = ("<ENAMEX TYPE=\"PERSON\">", "</ENAMEX>");
@{"LOCATION"} = ("<ENAMEX TYPE=\"LOCATION\">", "</ENAMEX>");
@{"ORGS"} = ("<ENAMEX TYPE=\"ORGANIZATION\">", "</ENAMEX>");
@{"DATE"} = ("<TIMEX TYPE=\"DATE\">", "</TIMEX>");
@{"TIME"} = ("<TIMEX TYPE=\"TIME\">", "</TIMEX>");
@{"PERCENT"} = ("<NUMEX TYPE=\"PERCENT\">", "</NUMEX>");
@{"MONEY"} = ("<NUMEX TYPE=\"MONEY\">", "</NUMEX>");
$newline = "\n";
} elsif ($tagset eq "HTML") {
@{"PERSON"} = ("<FONT COLOR=RED>", "</FONT>");
@{"LOCATION"} = ("<FONT COLOR=GREEN>", "</FONT>");
@{"ORGS"} = ("<FONT COLOR=BLUE>", "</FONT>");
@{"DATE"} = ("<FONT COLOR=PURPLE>", "</FONT>");
@{"TIME"} = ("<FONT COLOR=GRAY>", "</FONT>");
@{"PERCENT"} = ("<FONT COLOR=ORANGE>", "</FONT>");
@{"MONEY"} = ("<FONT COLOR=YELLOW>", "</FONT>");
$newline = "<BR>\n";
}
# Split into sentences (either �� or two newlines)
@sentences = split(/(��\s*|\n\n)/, $segtxt);
print "Running rules\n" if $debugon;
# Process for each type
$sentpos = 0;
print "Total sentences: " . @sentences/2 . "\n" if $debugon;
for ($i = 0; $i < @sentences; $i++) {
print "$i: $sentences[$i]\n";
if ($sentences[$i] =~ m/^(��\s*|\n\n)$/) {
$sentpos += length($sentences[$i]);
next;
}
$sentence = $sentences[$i];
@sentwords = split(/(\s+)/, $sentence);
&placenames;
&persons;
&organizations;
×
&dates;
&money;
&percents;
$sentpos += length($sentences[$i]);
# if ($i % 10 == 0) { print "Sentence #" . $i/2 . "\n"; } # if $debugon;
}
print "Reconciling overlaps\n" if $debugon;
my $outtxt = &reconcile_and_tag();
return $outtxt;
}
sub placenames {
my(@location_rules) = ('(LOCATION (BEG &allforeign+ %geotypes END) ADD )',
'(LOCATION (BEG &allforeign+ END "���") ADD )',
'(LOCATION (BEG &allforeign+ END "����") ADD )',
'(LOCATION (BEG %geonames %geotypes END) ADD )',
'(LOCATION (BEG ANY END ANY "����") ADD)',
'(LOCATION (BEG ANY END "����") ADD)',
'(LOCATION (BEG %geonames END))');
foreach $rule (@location_rules) {
&run_rule($rule);
}
}
sub persons {
my(@person_rules) = ('(PERSON (BEG &allforeign+ "��" &allforeign+ END) ADD )',
'(PERSON (BEG &allforeign+ "." &allforeign+ END) ADD )',
'(PERSON (%titles BEG &allforeign+ END) ADD )',
'(PERSON (%titles BEG &isPossibleChineseName END) ADD )',
'(PERSON (%titles BEG %surname ANY END) ADD )',
'(PERSON (BEG &allforeign+ END %titles) ADD )',
'(PERSON (%orgtypes "��" BEG &allforeign+ END) ADD )',
'(PERSON (%geonames "��" BEG &allforeign+ END) ADD )',
'(PERSON (BEG &isChineseName END) ADD )',
'(PERSON (BEG %persons END))');
foreach $rule (@person_rules) {
&run_rule($rule);
}
}
sub organizations {
my(@org_rules) = ('(ORGS (BEG &allforeign %orgtypes END) ADD )',
'(ORGS (BEG %geonames %orgtypes END) ADD )',
'(ORGS (BEG %orgwords+ %orgtypes END) ADD )',
'(ORGS (BEG %orgnames END))');
foreach $rule (@org_rules) {
&run_rule($rule);
}
}
sub dates {
my(@date_rules) = ('(DATE (BEG &allnumbers "��" END))',
'(DATE (BEG &allnumbers "��" &allnumbers "��" END))',
'(DATE (BEG &allnumbers "��" &allnumbers "��" &allnumbers "��" END))',
'(DATE (BEG &allnumbers "��" &allnumbers "��" END))',
'(DATE (BEG &allnumbers "��" &allnumbers "��" END))',
'(DATE (BEG &allnumbers "��" END))',
'(DATE (BEG &ismonth &allnumbers "��" END))',
'(DATE (BEG &allnumbers "�·�" END))',
'(DATE (BEG %dates END))');
foreach $rule (@date_rules) {
&run_rule($rule);
}
}
sub times {
my(@time_rules) = ('(TIME (BEG &allnumbers "��" END))',
'(TIME (BEG &allnumbers "����" END))',
'(TIME (BEG %times END))');
foreach $rule (@time_rules) {
&run_rule($rule);
}
}
sub money {
my(@money_rules) = ('(MONEY (BEG &allnumbers+ %currency END))',
'(MONEY (BEG "$" &allnumbers END))');
foreach $rule (@money_rules) {
&run_rule($rule);
}
}
sub percents {
@percent_rules = ('(PERCENT (BEG "�ٷ�֮" &allnumbers+ END))',
'(PERCENT (BEG &allnumbers "��֮" &allnumbers END))',
'(PERCENT (BEG "��" &allnumbers+ END))',
'(PERCENT (BEG &allnumbers+ "��" END))',
'(PERCENT (BEG &allnumbers+ "�ٷֵ�" END))',
'(PERCENT (BEG &isPercent END))');
foreach $rule (@percent_rules) {
&run_rule($rule);
}
}
sub run_rule {
my($toprule) = shift;
print "Running $toprule\n";
my($curpos) = $sentpos;
my($start, $end, $node, $tempi, $j, $i, $sentstart);
my($type, $rule, $addkey) = ($toprule =~ m/^\((\w+)\s\((.+?)\)\s?(\w*)\s?\)$/);
@atoms = split(/\s+/, $rule);
for ($i = 0; $i < @sentwords; $i++) {
print "Sent word $i of \n" if $debugon;
if ($sentwords[$i] =~ /^\s+$/) { # Skip spaces
$curpos += length($sentwords[$i]);
next;
}
$tempos = $curpos;
$posholder = $tempos;
$tempi = $i;
$match = 1;
for ($j = 0; $j < @atoms; $j++) {
if ($atoms[$j] eq "BEG") { # start offset
$start = $tempos;
} elsif ($atoms[$j] eq "END") { #end offset
$end = $tempos;
} elsif ($atoms[$j] eq "ANY" and $tempi < @sentwords) { # Match anything
$tempos += length($sentwords[$tempi]);
$tempi++;
} elsif ($atoms[$j] =~ /^.?\"/) { # exact string match
my($negate) = 0;
if ($atoms[$j] =~ /^\-/) { $negate = 1; }
if ($atoms[$j] =~ /\*$/) { $kleene = 0; }
if ($atoms[$j] =~ /\+$/) { $kleene = 1; }
my($string) = ($atoms[$j] =~ /^.?\"(.*?)\".?$/);
if ($tempi < @sentwords and $string eq $sentwords[$tempi]) {
$tempos += length($sentwords[$tempi]);
$tempi++;
} else {
$match = 0;
last;
}
} elsif ($atoms[$j] =~ /^.?\&/) { # run sub on word, true or false
my($negate) = 0;
my($kleene) = -1;
my($localatom) = $atoms[$j];
if ($localatom =~ s/^\-//) { $negate = 1; }
if ($localatom =~ s/\*$//) { $kleene = 0; }
if ($localatom =~ s/\+$//) { $kleene = 1; }
my($subname) = ($localatom =~ m/^.?\&(.+)(\*|\+)?$/);
if ($kleene == -1) {
if (&{$subname}($sentwords[$tempi]) != 0) {
$tempos += length($sentwords[$tempi]);
$tempi++;
} else {
$match = 0;
last;
}
} elsif ($kleene == 0) {
while (&{$subname}($sentwords[$tempi]) != 0 and
$tempi < @sentwords) {
$tempos += length($sentwords[$tempi]);
$tempi++;
if ($sentwords[$tempi] =~ /^\s+$/) { # Handle whitespace
$tempos += length($sentwords[$tempi]);
$tempi++;
}
}
} elsif ($kleene == 1) {
if (&{$subname}($sentwords[$tempi]) != 0) {
$tempos += length($sentwords[$tempi]);
$tempi++;
} else {
$match = 0;
last;
}
if ($sentwords[$tempi] =~ /^\s+$/) { # Handle whitespace
$tempos += length($sentwords[$tempi]);
$tempi++;
}
while (&{$subname}($sentwords[$tempi]) != 0 and
$tempi < @sentwords) {
$tempos += length($sentwords[$tempi]);
$tempi++;
if ($sentwords[$tempi] =~ /^\s+$/) { # Handle whitespace
$tempos += length($sentwords[$tempi]);
$tempi++;
}
}
}
} elsif ($atoms[$j] =~ /^.?\%/) { # member of a set (hash), one or more tokens
my($negate) = 0;
my($i, $kleene, $node, $hashstart, $posholder, $arrayholder);
if ($atoms[$j] =~ /^\-/) { $negate = 1; }
if ($atoms[$j] =~ /\*$/) { $kleene = 0; }
if ($atoms[$j] =~ /\+$/) { $kleene = 1; }
my($hashname) = ($atoms[$j] =~ /^.?\%(.*?)(\*|\+)?$/);
$hashstart = $tempos;
$node = \%{$hashname};
$posholder = $hashstart;
$arrayholder = $tempi;
while (defined(${$node}{$sentwords[$tempi]}) or $sentwords[$tempi] =~ m/^\s+$/) {
unless ($sentwords[$tempi] =~ /^\s+$/) {
$node = \%{$$node{$sentwords[$tempi]}};
}
$tempos += length($sentwords[$tempi]);
$tempi++;
if (defined($$node{"EOW"})) { #Complete as is. Update placeholders
$posholder = $tempos;
$arrayholder = $tempi;
}
}
if ($posholder == $hashstart) { # Nothing worked, fail the match
$tempi = $arrayholder;
$match = 0;
last;
} else {
$tempos = $posholder;
$tempi = $arrayholder;
}
} # end hash match
if ($match == 0) { last; }; # No match, move on to next starting point
if ($tempi < @sentwords and $sentwords[$tempi] =~ /^\s+$/) { # Handle whitespace
$tempos += length($sentwords[$tempi]);
$tempi++;
}
} # end for $j
if ($j == @atoms and $match == 1) {
print "We have a match! $start\t$end\t$type\n" if $debugon;
push @markups, "$start\t$end\t$type";
# Add back into hash to find if it occurs again
$type2hash{"ORG"} = "orgnames";
$type2hash{"LOCATION"} = "geonames";
$type2hash{"PERSON"} = "persons";
print "Load hashkey\n";
if ($addkey ne "") {
# print substr($segtxt, $start, $end-$start), "\n";
load_hash_key(substr($segtxt, $start, $end - $start), $type2hash{$type});
}
$i = $tempi - 1;
$curpos = $tempos;
print "$i $tempi $curpos\n";
} else {
$curpos += length($sentwords[$i]);
}
} # end for $i
} # end run_rule
sub reconcile_and_tag {
my($fchar, $tchar);
# Establish a hierarchy of tag types
@hierarchy = ("LOCATION", "PERSON", "ORG", "DATE", "TIME", "MONEY",
"PERCENT");
for (0 .. $#hierarchy) {
$hierarchy{$hierarchy[$_]} = $_;
}
# Sort markups by starting offset
@markups = sort bypos @markups;
foreach $markup (@markups) {
print "Markup $markup\n";
my($start, $end, $type) = split(/\t/, $markup);
# print $markup, "\t", substr($segtxt, $start, $end-$start) , "\n";
}
# reconcile overlapping tags here
$i = 0; $j = 1;
while ($i <= $#markups) {
if ($i == $#markups) {
# The last one, add it
push @markups2, $markups[$i];
last;
}
($start, $end, $type) = split(/\t/, $markups[$i]);
($start2, $end2, $type2) = split(/\t/, $markups[$j]);
if ($end <= $start2) { # no possible overlap, add and move on
push @markups2, $markups[$i];
$i = $j; $j++;
} elsif ($start == $start2 and $end == $end2) {
# Share exact offsets, use hierarcy to determine which
if ($hierarchy{$type} < $hierarchy{$type2}) {
$j++;
} else {
$i = $j; $j++;
}
} elsif ($start < $start2 and $end2 <= $end) { # tag2 in tag1; add tag1
# One entirely contained in other, use larger entity
$j++;
} elsif ($start == $start2 and $end < $end2) { # tag1 in tag2; add tag2
$i = $j; $j++;
} elsif ($start < $start2 and $end > $start2 and $end < $end2) {
# Two tags partially overlap, use one higher in hierarchy
if ($hierarchy{$type} > $hierarchy{$type2}) {
$j++;
} else {
$i = $j; $j++;
}
}
}
# Uncomment when above code works
@markups = (@markups2);
# add SGML tags here
$previndex = 0;
for ($k = 0; $k <= $#markups; $k++) {
($start, $end, $type) = split(/\t/, $markups[$k]);
$tagtxt .= substr($segtxt, $previndex, $start-$previndex) .
${$type}[0];
$tagtxt .= substr($segtxt, $start, $end - $start) . ${$type}[1];
$previndex = $end;
}
$tagtxt .= substr($segtxt, $previndex, length($segtxt) - $previndex);
# take out extra spaces added by segmenter, add newlines
for ($i = 0, $j = 0; $j < length($tagtxt); $i++, $j++) {
$fchar = substr($filetxt, $i, 1);
while ($fchar eq "\n") {
$outtxt .= $newline;
$i++;
$fchar = substr($filetxt, $i, 1);
}
$tchar = substr($tagtxt, $j, 1);
if ($fchar eq $tchar) {
$outtxt .= $fchar;
} else {
while ($fchar ne $tchar and $j < length($tagtxt)) {
if ($tchar eq "<") {
$outtxt .= $tchar;
while ($tchar ne ">") {
$j++;
$tchar = substr($tagtxt, $j, 1);
$outtxt .= $tchar;
}
} elsif ($fchar eq "\n") {
while ($fchar eq "\n") {
$outtxt .= "<BR>\n";
$i++;
$fchar = substr($filetxt, $i, 1);
}
} elsif ($fchar eq " ") {
$outtxt .= " ";
$i++;
$fchar = substr($filetxt, $i, 1);
} else {
$j++;
$tchar = substr($tagtxt, $j, 1);
}
}
$outtxt .= $fchar;
}
}
return $outtxt;
}
# Function to sort markups by position and type before removing overlapping markups
sub bypos {
($starta, $enda, $typea) = split(/\t/, $a);
($startb, $endb, $typeb) = split(/\t/, $b);
$starta <=> $startb
or
$enda <=> $endb
or
$hierarchy{$typea} <=> $hierarchy{$typeb};
}
sub isPercent {
my($token) = shift;
if ($token =~ m/^[0-9][0-9]*(\.[0-9]+)?\%/) {
return 1;
} else {
return 0;
}
}
sub isChineseName {
my($token) = shift;
print "Is Chinese name: $token\n";
if (length($token) < 4 or length($token) > 6) {
return 0;
}
my $surchar = substr($token, 0, 2);
if (!defined($csurname{$surchar})) {
return 0;
}
if (defined($cwords{$token}) and $cwords{$token} == 1) {
return 0;
}
if (defined($geonames{$token})) {
return 0;
}
return 1;
}
sub isPossibleChineseName {
my($token) = shift;
my($surchar);
if (length($token) < 4 or length($token) > 6) {
return 0;
}
$surchar = substr($token, 0, 2);
if (!defined($csurname{$surchar}) and !defined($uncommoncsurname{$surchar})) {
return 0;
}
if ($cwords{$token} == 1) {
return 0;
}
if (defined($geonames{$token})) {
return 0;
}
return 1;
}
sub addnewline {
my($tagtxt, $filetxt) = @_;
my($i, $j, $outtxt, $fchar, $tchar);
for ($i = 0, $j=0; $j < length($tagtxt); $i++, $j++) {
$fchar = substr($filetxt, $i, 1);
while ($fchar eq "\n") {
$outtxt .= "\n";
$i++;
$fchar = substr($filetxt, $i, 1);
}
$tchar = substr($tagtxt, $j, 1);
if ($fchar eq $tchar) {
$outtxt .= $fchar;
} else {
while ($fchar ne $tchar and $j < length($tagtxt)) {
if ($tchar eq "<") {
$outtxt .= $tchar;
while ($tchar ne ">") {
$j++;
$tchar = substr($tagtxt, $j, 1);
$outtxt .= $tchar;
}
} else {
$outtxt .= $tchar;
$j++;
$tchar = substr($tagtxt, $j, 1);
}
}
$outtxt .= $fchar;
}
}
$outtxt;
}
sub ismonth {
my($testmonth) = shift;
if ($testmonth =~ m/��$/) {
return 1;
}
}
1;