.

KGRKJGETMRETU895U-589TY5MIGM5JGB5SDFESFREWTGR54TY
Server : Apache/2.4.62
System : FreeBSD fbsdweb2.web.rcn.net 14.1-RELEASE FreeBSD 14.1-RELEASE releng/14.1-n267679-10e31f0946d8 GENERIC amd64
User : www ( 80)
PHP Version : 8.3.8
Disable Function : NONE
Directory : /domains/mandarintools/cgi-bin/
Upload File :
Current File : /domains/mandarintools/cgi-bin/neelib.pl
# -*- coding:cn-gb -*-

$debugon = 0;


require "segmenter.pl";
print "Loaded segmenter\n" if $debugon;
$tagset = "MUC";


# Load various word lists
@hashtypes = ("geonames", "geotypes", "surnames", "notname", "titles",
	      "persons", "orgnames", "orgwords", "orgtypes", "dates",
	      "times", "currency");

foreach $hashtype (@hashtypes) {
    &load_hash($hashtype);
}

sub load_hash {
    my($hashname) = shift;
    my(@hashkey, @words);
    my $line;
# Prepare place name info
    open(HASH, "data/$hashname.txt") or die $!;
    my($i) = -1;
    while ($line = <HASH>) {
	next if $line =~ m/^\s*$/ or $line =~ m/^\s*\#/;
	$line =~ s/[\r\n]*$//;
	($hashkey, $dummy) = split(/\t/, $line);
	&load_hash_key($hashkey, $hashname);
    }
    close(HASH);
}


sub load_hash_key {
    my($hashkey, $hashname) = @_;
    my($windex);
    # Load words into a hash tree, allowing quick and sequential checking
    $hashkey =~ s/^\s*(.*?)\s*$/$1/;
    my(@words) = split(/\s+/, $hashkey);
    my($node) = \%{$hashname};
    for ($windex = 0; $windex <= $#words; $windex++) {
	unless (defined($$node{$words[$windex]})) {
	    if ($windex == $#words) {
		$$node{$words[$windex]} = { "EOW" => 1 };
	    } else {
		$$node{$words[$windex]} = {};
	    }
	}
	$node = $$node{$words[$windex]};
    }
}


# Add the words in the data files to the segmenter lexicon so that they are
# segmented correctly
foreach $hashtype ("geonames", "geotypes", "titles", "persons", "orgnames", "orgwords",
		   "orgtypes", "currency") {
    open(HF, "data/$hashtype.txt");
    while ($dataline = <HF>) {
	next if $dataline =~ m/^\#/;
	($hashdata, $rest) = split(/\t/, $dataline, 2);
	@hashwords = split(/\s/, $hashdata);
	foreach $hashword (@hashwords) {
	    addsegword($hashword);
	}
    }

    close(HF);

}




sub addNEE {
    $filetxt = shift;

    $segtxt = "";

    $filetxt =~ s/\r//g;
    my $line;
    my @lines = split(/\n/, $filetxt);
    # Segment the file and store results in $segtxt
    foreach $line (@lines) { 
	$segtxt .= &segmentline($line);
	$segtxt .= "\n";
    }

# Print out the segmented file to use in debugging
    if ($debugon) {
	$segfile = "tempseg.txt" if $segfile eq "";
	open(SEG, "> $segfile");
	print SEG addnewline($segtxt, $filetxt);
	close(SEG);
    }

# Specify the tags to use for markup (two possibilities)
    if ($tagset eq "MUC") {
	@{"PERSON"} = ("<ENAMEX TYPE=\"PERSON\">", "</ENAMEX>");
	@{"LOCATION"} = ("<ENAMEX TYPE=\"LOCATION\">", "</ENAMEX>");
	@{"ORGS"} = ("<ENAMEX TYPE=\"ORGANIZATION\">", "</ENAMEX>");
	@{"DATE"} = ("<TIMEX TYPE=\"DATE\">", "</TIMEX>");
	@{"TIME"} = ("<TIMEX TYPE=\"TIME\">", "</TIMEX>");
	@{"PERCENT"} = ("<NUMEX TYPE=\"PERCENT\">", "</NUMEX>");
	@{"MONEY"} = ("<NUMEX TYPE=\"MONEY\">", "</NUMEX>");
	$newline = "\n";
    } elsif ($tagset eq "HTML") {
	@{"PERSON"} = ("<FONT COLOR=RED>", "</FONT>");
	@{"LOCATION"} = ("<FONT COLOR=GREEN>", "</FONT>");
	@{"ORGS"} = ("<FONT COLOR=BLUE>", "</FONT>");
	@{"DATE"} = ("<FONT COLOR=PURPLE>", "</FONT>");
	@{"TIME"} = ("<FONT COLOR=GRAY>", "</FONT>");
	@{"PERCENT"} = ("<FONT COLOR=ORANGE>", "</FONT>");
	@{"MONEY"} = ("<FONT COLOR=YELLOW>", "</FONT>");
	$newline = "<BR>\n";
    }

# Split into sentences (either �� or two newlines)
    @sentences = split(/(��\s*|\n\n)/, $segtxt);


    print "Running rules\n" if $debugon;
# Process for each type
    $sentpos = 0;
    print "Total sentences: " . @sentences/2 . "\n" if $debugon;
    for ($i = 0; $i < @sentences; $i++) {
	print "$i: $sentences[$i]\n";
	if ($sentences[$i] =~ m/^(��\s*|\n\n)$/) {
	    $sentpos += length($sentences[$i]);
	    next;
	}
	$sentence = $sentences[$i];
	@sentwords = split(/(\s+)/, $sentence);
	&placenames;
	&persons;
	&organizations;
	&times;
	&dates;
	&money;
	&percents;
	$sentpos += length($sentences[$i]);
#    if ($i % 10 == 0) { print "Sentence #" . $i/2 . "\n"; } # if $debugon;
    }

    print "Reconciling overlaps\n" if $debugon;
    my $outtxt = &reconcile_and_tag();

    return $outtxt;

}


sub placenames {
    my(@location_rules) = ('(LOCATION (BEG &allforeign+ %geotypes END) ADD )',
			   '(LOCATION (BEG &allforeign+ END "���") ADD )',
			   '(LOCATION (BEG &allforeign+ END "����") ADD )',
			   '(LOCATION (BEG %geonames %geotypes END) ADD )',
			   '(LOCATION (BEG ANY END ANY "����") ADD)',
			   '(LOCATION (BEG ANY END "����") ADD)',
			   '(LOCATION (BEG %geonames END))');
    foreach $rule (@location_rules) {
	&run_rule($rule);
    }
}

sub persons {
    my(@person_rules) = ('(PERSON (BEG &allforeign+ "��" &allforeign+ END) ADD )',
			 '(PERSON (BEG &allforeign+ "." &allforeign+ END) ADD )',
			 '(PERSON (%titles BEG &allforeign+ END) ADD )',
			 '(PERSON (%titles BEG &isPossibleChineseName END) ADD )',
			 '(PERSON (%titles BEG %surname ANY END) ADD )',
			 '(PERSON (BEG &allforeign+ END %titles) ADD )',
			 '(PERSON (%orgtypes "��" BEG &allforeign+ END) ADD )',
			 '(PERSON (%geonames "��" BEG &allforeign+ END) ADD )',
			 '(PERSON (BEG &isChineseName END) ADD )',
			 '(PERSON (BEG %persons END))');
    foreach $rule (@person_rules) {
	&run_rule($rule);
    }
}

sub organizations {
    my(@org_rules) = ('(ORGS (BEG &allforeign %orgtypes END) ADD )',
		      '(ORGS (BEG %geonames %orgtypes END) ADD )',
		      '(ORGS (BEG %orgwords+ %orgtypes END) ADD )',
		      '(ORGS (BEG %orgnames END))');
    foreach $rule (@org_rules) {
	&run_rule($rule);
    }
}

sub dates {
    my(@date_rules) = ('(DATE (BEG &allnumbers "��" END))',
		       '(DATE (BEG &allnumbers "��" &allnumbers "��" END))',
		       '(DATE (BEG &allnumbers "��" &allnumbers "��" &allnumbers "��" END))',
		       '(DATE (BEG &allnumbers "��" &allnumbers "��" END))',
		       '(DATE (BEG &allnumbers "��" &allnumbers "��" END))',
		       '(DATE (BEG &allnumbers "��" END))',
		       '(DATE (BEG &ismonth &allnumbers "��" END))',
		       '(DATE (BEG &allnumbers "�·�" END))',
		       '(DATE (BEG %dates END))');
    foreach $rule (@date_rules) {
	&run_rule($rule);
    }
}


sub times {
    my(@time_rules) = ('(TIME (BEG &allnumbers "��" END))',
		       '(TIME (BEG &allnumbers "����" END))',
		       '(TIME (BEG %times END))');
    foreach $rule (@time_rules) {
	&run_rule($rule);
    }
}


sub money {
    my(@money_rules) = ('(MONEY (BEG &allnumbers+ %currency END))',
			'(MONEY (BEG "$" &allnumbers END))');
    foreach $rule (@money_rules) {
	&run_rule($rule);
    }
}

sub percents {
    @percent_rules = ('(PERCENT (BEG "�ٷ�֮" &allnumbers+ END))',
		      '(PERCENT (BEG &allnumbers "��֮" &allnumbers END))',
		      '(PERCENT (BEG "��" &allnumbers+ END))',
		      '(PERCENT (BEG &allnumbers+ "��" END))',
		      '(PERCENT (BEG &allnumbers+ "�ٷֵ�" END))',
		      '(PERCENT (BEG &isPercent END))');
    foreach $rule (@percent_rules) {
	&run_rule($rule);
    }
}


sub run_rule {
    my($toprule) = shift;

    print "Running $toprule\n";

    my($curpos) = $sentpos;
    my($start, $end, $node, $tempi, $j, $i, $sentstart);
    my($type, $rule, $addkey) = ($toprule =~ m/^\((\w+)\s\((.+?)\)\s?(\w*)\s?\)$/);

    @atoms = split(/\s+/, $rule);
    for ($i = 0; $i < @sentwords; $i++) {
	print "Sent word $i of \n" if $debugon;
	if ($sentwords[$i] =~ /^\s+$/) { # Skip spaces
	    $curpos += length($sentwords[$i]);
	    next;
	}
	$tempos = $curpos;
	$posholder = $tempos;
	$tempi = $i;
	$match = 1;
	for ($j = 0; $j < @atoms; $j++) {
	    if ($atoms[$j] eq "BEG") {  # start offset
		$start = $tempos;
	    } elsif ($atoms[$j] eq "END") { #end offset
		$end = $tempos;
	    } elsif ($atoms[$j] eq "ANY" and $tempi < @sentwords) { # Match anything
		$tempos += length($sentwords[$tempi]);
		$tempi++;
	    } elsif ($atoms[$j] =~ /^.?\"/) { # exact string match
		my($negate) = 0;
		if ($atoms[$j] =~ /^\-/) { $negate = 1; }
		if ($atoms[$j] =~ /\*$/) { $kleene = 0; }
		if ($atoms[$j] =~ /\+$/) { $kleene = 1; }

		my($string) = ($atoms[$j] =~ /^.?\"(.*?)\".?$/);
		if ($tempi < @sentwords and $string eq $sentwords[$tempi]) {
		    $tempos += length($sentwords[$tempi]);
		    $tempi++;
		} else {
		    $match = 0;
		    last;
		}
	    } elsif ($atoms[$j] =~ /^.?\&/) { # run sub on word, true or false
		my($negate) = 0;
		my($kleene) = -1;
		my($localatom) = $atoms[$j];
		if ($localatom =~ s/^\-//) { $negate = 1; }
		if ($localatom =~ s/\*$//) { $kleene = 0; }
		if ($localatom =~ s/\+$//) { $kleene = 1; }

		my($subname) = ($localatom =~ m/^.?\&(.+)(\*|\+)?$/);

		if ($kleene == -1) {
		    if (&{$subname}($sentwords[$tempi]) != 0) {
			$tempos += length($sentwords[$tempi]);
			$tempi++;
		    } else {
			$match = 0;
			last;
		    }
		} elsif ($kleene == 0) {
		    while (&{$subname}($sentwords[$tempi]) != 0 and
			   $tempi < @sentwords) {
			$tempos += length($sentwords[$tempi]);
			$tempi++;
			if ($sentwords[$tempi] =~ /^\s+$/) {  # Handle whitespace
			    $tempos += length($sentwords[$tempi]);
			    $tempi++;
			}
		    }
		} elsif ($kleene == 1) {
		    if (&{$subname}($sentwords[$tempi]) != 0) {
			$tempos += length($sentwords[$tempi]);
			$tempi++;
		    } else {
			$match = 0;
			last;
		    }
		    if ($sentwords[$tempi] =~ /^\s+$/) {  # Handle whitespace
			$tempos += length($sentwords[$tempi]);
			$tempi++;
		    }
		    while (&{$subname}($sentwords[$tempi]) != 0 and
			   $tempi < @sentwords) {
			$tempos += length($sentwords[$tempi]);
			$tempi++;
			if ($sentwords[$tempi] =~ /^\s+$/) {  # Handle whitespace
			    $tempos += length($sentwords[$tempi]);
			    $tempi++;
			}
		    }
		}
	    } elsif ($atoms[$j] =~ /^.?\%/) { # member of a set (hash), one or more tokens
		my($negate) = 0;
		my($i, $kleene, $node, $hashstart, $posholder, $arrayholder);
		if ($atoms[$j] =~ /^\-/) { $negate = 1; }
		if ($atoms[$j] =~ /\*$/) { $kleene = 0; }
		if ($atoms[$j] =~ /\+$/) { $kleene = 1; }

		my($hashname) = ($atoms[$j] =~ /^.?\%(.*?)(\*|\+)?$/);
		
		$hashstart = $tempos;
		$node = \%{$hashname};
		
		$posholder = $hashstart;
		$arrayholder = $tempi;
		
		while (defined(${$node}{$sentwords[$tempi]}) or $sentwords[$tempi] =~ m/^\s+$/) {
		    unless ($sentwords[$tempi] =~ /^\s+$/) {
			$node = \%{$$node{$sentwords[$tempi]}};
		    }
		    $tempos += length($sentwords[$tempi]);
		    $tempi++;
		    if (defined($$node{"EOW"})) { #Complete as is. Update placeholders
			$posholder = $tempos;
			$arrayholder = $tempi;
		    }
		}
		    
		if ($posholder == $hashstart) { # Nothing worked, fail the match
		    $tempi = $arrayholder;
		    $match = 0;
		    last;
		} else {
		    $tempos = $posholder;
		    $tempi = $arrayholder;
		}
		    
	    } # end hash match
	    if ($match == 0) { last; };  # No match, move on to next starting point
	    if ($tempi < @sentwords and $sentwords[$tempi] =~ /^\s+$/) {  # Handle whitespace
		$tempos += length($sentwords[$tempi]);
		$tempi++;
	    }
	} # end for $j

	if ($j == @atoms and $match == 1) {
            print "We have a match! $start\t$end\t$type\n" if $debugon;
	    push @markups, "$start\t$end\t$type";

	    # Add back into hash to find if it occurs again
	    $type2hash{"ORG"} = "orgnames";
	    $type2hash{"LOCATION"} = "geonames";
	    $type2hash{"PERSON"} = "persons";
	    print "Load hashkey\n";
	    if ($addkey ne "") {
	#	print substr($segtxt, $start, $end-$start), "\n";
		load_hash_key(substr($segtxt, $start, $end - $start), $type2hash{$type});
	    }

	    $i = $tempi - 1;
	    $curpos = $tempos;
	    print "$i $tempi $curpos\n";
	} else {
	    $curpos += length($sentwords[$i]);
	}
        
    } # end for $i
}  # end run_rule



sub reconcile_and_tag {
   my($fchar, $tchar);

 # Establish a hierarchy of tag types
    @hierarchy = ("LOCATION", "PERSON", "ORG", "DATE", "TIME", "MONEY",
		  "PERCENT");
    for (0 .. $#hierarchy) {
	$hierarchy{$hierarchy[$_]} = $_;
    }

 # Sort markups by starting offset
    @markups = sort bypos @markups;

    foreach $markup (@markups) {
	print "Markup $markup\n";
	my($start, $end, $type) = split(/\t/, $markup);
#	print $markup, "\t", substr($segtxt, $start, $end-$start) , "\n";
    }

 # reconcile overlapping tags here
    $i = 0; $j = 1;
    while ($i <= $#markups) {
	if ($i == $#markups) {
            # The last one, add it
	    push @markups2, $markups[$i];
	    last;
	}
	($start, $end, $type) = split(/\t/, $markups[$i]);
	($start2, $end2, $type2) = split(/\t/, $markups[$j]);
	if ($end <= $start2) {  # no possible overlap, add and move on
	    push @markups2, $markups[$i];
	    $i = $j; $j++;
	} elsif ($start == $start2 and $end == $end2) {
	    # Share exact offsets, use hierarcy to determine which
	    if ($hierarchy{$type} < $hierarchy{$type2}) {
		$j++;
	    } else {
		$i = $j; $j++;
	    }
	} elsif ($start < $start2 and $end2 <= $end) {  # tag2 in tag1; add tag1
	    # One entirely contained in other, use larger entity
	    $j++;
	} elsif ($start == $start2 and $end < $end2) {  # tag1 in tag2; add tag2
	    $i = $j; $j++;
	} elsif ($start < $start2 and $end > $start2 and $end < $end2) { 
	    # Two tags partially overlap, use one higher in hierarchy
	    if ($hierarchy{$type} > $hierarchy{$type2}) {
		$j++;
	    } else {
		$i = $j; $j++;
	    }
	}
    } 


# Uncomment when above code works
    @markups = (@markups2);

 # add SGML tags here
    $previndex = 0;
    for ($k = 0; $k <= $#markups; $k++) {
	($start, $end, $type) = split(/\t/, $markups[$k]);
	$tagtxt .= substr($segtxt, $previndex, $start-$previndex) . 
	    ${$type}[0];
	$tagtxt .= substr($segtxt, $start, $end - $start) . ${$type}[1];
	$previndex = $end;
    }
    $tagtxt .= substr($segtxt, $previndex, length($segtxt) - $previndex);
    
 # take out extra spaces added by segmenter, add newlines

    for ($i = 0, $j = 0; $j < length($tagtxt); $i++, $j++) {
	$fchar = substr($filetxt, $i, 1);
	while ($fchar eq "\n") {
	    $outtxt .= $newline;
	    $i++;
	    $fchar = substr($filetxt, $i, 1);
	}
	$tchar = substr($tagtxt, $j, 1);

	if ($fchar eq $tchar) {
	    $outtxt .= $fchar;
	} else {
	    while ($fchar ne $tchar and $j < length($tagtxt)) {
		if ($tchar eq "<") {
		    $outtxt .= $tchar;
		    while ($tchar ne ">") {
			$j++;
			$tchar = substr($tagtxt, $j, 1);
			$outtxt .= $tchar;
		    }
		} elsif ($fchar eq "\n") {
		    while ($fchar eq "\n") {
			$outtxt .= "<BR>\n";
			$i++;
			$fchar = substr($filetxt, $i, 1);
		    }
                } elsif ($fchar eq " ") {
		    $outtxt .= " ";
		    $i++;
		    $fchar = substr($filetxt, $i, 1);
                } else {
		    $j++;
		    $tchar = substr($tagtxt, $j, 1);
		}
	    }
	    $outtxt .= $fchar;
	}
    }
   return $outtxt;
}


# Function to sort markups by position and type before removing overlapping markups
sub bypos {
    ($starta, $enda, $typea) = split(/\t/, $a);
    ($startb, $endb, $typeb) = split(/\t/, $b);
    $starta <=> $startb 
	or
    $enda <=> $endb 
	or 
    $hierarchy{$typea} <=> $hierarchy{$typeb};
}





sub isPercent {
    my($token) = shift;
    if ($token =~ m/^[0-9][0-9]*(\.[0-9]+)?\%/) {
	return 1;
    } else {
	return 0;
    }
}

sub isChineseName {
    my($token) = shift;
    print "Is Chinese name: $token\n";

    if (length($token) < 4 or length($token) > 6) {
	return 0;
    }

    my $surchar = substr($token, 0, 2);
    if (!defined($csurname{$surchar})) {
	return 0;
    }
    if (defined($cwords{$token}) and $cwords{$token} == 1) {
	return 0;
    }
    if (defined($geonames{$token})) {
	return 0;
    }
    return 1;
}


sub isPossibleChineseName {
    my($token) = shift;
    my($surchar);
    if (length($token) < 4 or length($token) > 6) {
	return 0;
    }
    $surchar = substr($token, 0, 2);
    if (!defined($csurname{$surchar}) and !defined($uncommoncsurname{$surchar})) {
	return 0;
    }
    if ($cwords{$token} == 1) {
	return 0;
    }
    if (defined($geonames{$token})) {
	return 0;
    }
    return 1;
}


sub addnewline {
    my($tagtxt, $filetxt) = @_;
    my($i, $j, $outtxt, $fchar, $tchar);

    for ($i = 0, $j=0; $j < length($tagtxt); $i++, $j++) {
	$fchar = substr($filetxt, $i, 1);
	while ($fchar eq "\n") {
	    $outtxt .= "\n";
	    $i++;
	    $fchar = substr($filetxt, $i, 1);
	}
	$tchar = substr($tagtxt, $j, 1);

	if ($fchar eq $tchar) {
	    $outtxt .= $fchar;
	} else {
	    while ($fchar ne $tchar and $j < length($tagtxt)) {
		if ($tchar eq "<") {
		    $outtxt .= $tchar;
		    while ($tchar ne ">") {
			$j++;
			$tchar = substr($tagtxt, $j, 1);
			$outtxt .= $tchar;
		    }
		} else {
		    $outtxt .= $tchar;
		    $j++;
		    $tchar = substr($tagtxt, $j, 1);
		}
	    }
	    $outtxt .= $fchar;
	}
    }
    $outtxt;
}


sub ismonth {
    my($testmonth) = shift;
    if ($testmonth =~ m/��$/) {
	return 1;
    }
}



1;
Anon7 - 2021