|
Server : Apache/2.4.62 System : FreeBSD fbsdweb2.web.rcn.net 14.1-RELEASE FreeBSD 14.1-RELEASE releng/14.1-n267679-10e31f0946d8 GENERIC amd64 User : www ( 80) PHP Version : 8.3.8 Disable Function : NONE Directory : /domains/mandarintools/cgi-bin/ |
Upload File : |
#!/usr/bin/perl -- -*- coding:cn-gb -*-
#print "Content-type:text/plain\n\n";
# Read in the lexicon
open(WORDS, "wordlist.txt") or print "Content-type: text/plain\n\nCan't open wordlist\n";
$linecount = 0;
while ($line = <WORDS>) {
$line =~ s/[\r\n]*$//;
#print length($line), " ", $linecount, "\n";
$linecount++;
last if $linecount > 50000;
next if $line =~ /^\#/;
$cwords{$line} = 1;
if (length($line) == 6) {
if (!exists($cwords{substr($line, 0, 4)})) {
$cwords{substr($line, 0, 4)} = 2;
}
}
if (length($line) == 8) {
if (!exists($cwords{substr($line, 0, 4)})) {
$cwords{substr($line, 0, 4)} = 2;
}
if (!exists($cwords{substr($line, 0, 6)})) {
$cwords{substr($line, 0, 6)} = 2;
}
}
if (length($line) == 10) {
if (!exists($cwords{substr($line, 0, 4)})) {
$cwords{substr($line, 0, 4)} = 2;
}
if (!exists($cwords{substr($line, 0, 6)})) {
$cwords{substr($line, 0, 6)} = 2;
}
if (!exists($cwords{substr($line, 0, 8)})) {
$cwords{substr($line, 0, 8)} = 2;
}
}
}
close(WORDS);
#print "Loaded\n";
sub addsegword {
my($line) = shift;
next if $line =~ /^\#/;
$cwords{$line} = 1;
if (length($line) == 6) {
if (!exists($cwords{substr($line, 0, 4)})) {
$cwords{substr($line, 0, 4)} = 2;
}
}
if (length($line) == 8) {
if (!exists($cwords{substr($line, 0, 4)})) {
$cwords{substr($line, 0, 4)} = 2;
}
if (!exists($cwords{substr($line, 0, 6)})) {
$cwords{substr($line, 0, 6)} = 2;
}
}
if (length($line) == 10) {
if (!exists($cwords{substr($line, 0, 4)})) {
$cwords{substr($line, 0, 4)} = 2;
}
if (!exists($cwords{substr($line, 0, 6)})) {
$cwords{substr($line, 0, 6)} = 2;
}
if (!exists($cwords{substr($line, 0, 8)})) {
$cwords{substr($line, 0, 8)} = 2;
}
}
}
# Numbers
$numbers = "���һ��������߰˾�ʮ��ǧ���ڣ������������������������";
$numbers .= "��������ئ��Ҽ������½��ƾ�ʰ��Ǫ";
for ($n = 0; $n < length($numbers); $n+=2) {
$cnumbers{substr($numbers, $n, 2)} = 1;
}
# Wide ASCII words
$wascii = "��������������������������������";
$wascii .= "���£ãģţƣǣȣɣʣˣ̣ͣΣϣУѣңӣԣգ֣ףأ٣ڣ�";
$wascii .= "";
for ($n = 0; $n < length($wascii); $n+=2) {
$cascii{substr($wascii, $n, 2)} = 1;
}
# Foreign name transliteration characters
$foreign = "�����������°İͱ����ȱ������������Ĵ��µǵϵٶ���������Ʒѷ�";
$foreign .= "����ǸԸ���Ź����Ϻպ����������٤�ֽܽ����¿ƿɿ˿�";
$foreign .= "������������������¬³����������������ĦĪīķ��������";
$foreign .= "����Ŭŵ�����Ƥƥ����������Ƚ��ɣɭɳ��ʲʷʿ˹��̩̹��ͼ�������";
$foreign .= "ά���������ϣл���ѷ������Ү�ӡ�����ղ��������";
for ($n = 0; $n < length($foreign); $n+=2) {
$cforeign{substr($foreign, $n, 2)} = 1;
}
#Chinese surnames
$surname = "�������װ����������ϱ߱�ز��̲�᯲���³̳ٳ��ҳ�";
$surname .= "���������˵ҵ��Ŷ˶η�������쳷ѷ���������Ǹ�";
$surname .= "�߸깢�����������ȹŹ˹ٹعܹ���º̺κغպ�����";
$surname .= "���ƻ�����ͼ��ּ��彪������������ӿ��¿տ������";
$surname .= "����������������������������¡��¥¦¬�³";
$surname .= "½���������������éë÷����������ĪIJ������ťũ����";
$surname .= "����Ƥ��ƽ������������Ǯǿ��������������Ƚ����������";
$surname .= "������ɳ������������ʢʯʷ����˹������̷̸ۢ������";
$surname .= "��١���Ϳ�����ΣΤκ�ε�����������������ϰ�����";
$surname .= "�����л������������Ѧ�������������������ҦҶ��������";
$surname .= "������������������ξԪԬ��������ղտ����������";
$surname .= "֣����������ףׯ��������";
$uncommonsurname = "����ȫ������·�ţȨʱˮ��ϯӦӢ��"; # ��ͬ
for ($n = 0; $n < length($surname); $n+=2) {
$csurname{substr($surname, $n, 2)} = 1;
}
for ($n = 0; $n < length($uncommonsurname); $n+=2) {
$uncommoncsurname{substr($uncommonsurname, $n, 2)} = 1;
}
# Add in 2 character surnames; also add to lexicon so they'll be segmented as one unit
$csurname{"����"} = 1; $cwords{"����"} = 1;
$csurname{"����"} = 1; $cwords{"����"} = 1;
$csurname{"�ʸ�"} = 1; $cwords{"�ʸ�"} = 1;
$csurname{"Ľ��"} = 1; $cwords{"Ľ��"} = 1;
$csurname{"ŷ��"} = 1; $cwords{"ŷ��"} = 1;
$csurname{"����"} = 1; $cwords{"����"} = 1;
$csurname{"˾��"} = 1; $cwords{"˾��"} = 1;
$csurname{"˾��"} = 1; $cwords{"˾��"} = 1;
$csurname{"˾ͽ"} = 1; $cwords{"˾ͽ"} = 1;
$csurname{"�̨"} = 1; $cwords{"�̨"} = 1;
$csurname{"���"} = 1; $cwords{"���"} = 1;
$punctuation .= "����������������ѡ�����������������������";
#Not in name
$notname = "��˵���ں��DZ���������н�������Ϊ��";
$notname .= $punctuation;
for ($n = 0; $n < length($notname); $n+=2) {
$cnotname{substr($notname, $n, 2)} = 1;
}
sub add_ChineseNames {
($tmpline) = @_;
$tlen = length($tmpline);
$newline = "";
for ($m = 0; $m < $tlen; $m++) {
$tchar = substr($tmpline, $m, 1);
$currtoken = "";
if ($tchar =~ /^\s$/) {
$newline .= $tchar;
} else {
$currtoken = "";
while ($tchar !~ /^\s$/ and $m < $tlen) {
$currtoken .= $tchar;
$m++;
$tchar = substr($tmpline, $m, 1);
}
if (defined($csurname{$currtoken}) or
defined($uncommoncsurname{$currtoken})) { # found a surname, see what follows
# go past following spaces
$tchar = substr($tmpline, $m, 1);
$spaces = "";
while ($tchar =~ /\s/ and $m < $tlen) {
$spaces .= $tchar;
$m++;
$tchar = substr($tmpline, $m, 1);
}
# Get next token
$tchar = substr($tmpline, $m, 1);
$currtoken2 = "";
while ($tchar !~ /\s/ and $m < $tlen) {
$currtoken2 .= $tchar;
$m++;
$tchar = substr($tmpline, $m, 1);
}
# go past following spaces
$tchar = substr($tmpline, $m, 1);
$spaces2 = "";
while ($tchar =~ /\s/ and $m < $tlen) {
$spaces2 .= $tchar;
$m++;
$tchar = substr($tmpline, $m, 1);
}
# Get next token
$tchar = substr($tmpline, $m, 1);
$currtoken3 = "";
while ($tchar !~ /\s/ and $m < $tlen) {
$currtoken3 .= $tchar;
$m++;
$tchar = substr($tmpline, $m, 1);
}
if (isChinese($currtoken2) and (length($currtoken2) == 2) and
(!defined($cnotname{$currtoken2})) and
isChinese($currtoken3) and length($currtoken3) == 2 and
!defined($cnotname{$currtoken3}))
{
$newline .= $cname[0] . $currtoken . $currtoken2 . $currtoken3 . $cname[1];
#$cwords{$currtoken . $currtoken2 . $currtoken3} = 1;
#$cwords{$currtoken . $currtoken2} = 2; # short version for checking
} elsif (isChinese($currtoken2) and (length($currtoken2) == 2)
and (!defined($cnotname{$currtoken2})))
{
$newline .= $currtoken . $currtoken2 . $spaces2 . $currtoken3;
$cwords{$currtoken . $currtoken2} = 1;
} elsif (defined($csurname{$currtoken}) and
isChinese($currtoken2) and (length($currtoken2) == 4) and
($cwords{$currtoken2} != 1) and
(!defined($cnotname{$currtoken2})))
{
$newline .= $cname[0] . $currtoken . $currtoken2 . $cname[1] . $spaces2 . $currtoken3;
$cwords{$currtoken . $currtoken2} = 1;
$cwords{$currtoken . substr($currtoken2, 0, 2)} = 2; # short version to check
} elsif (defined($uncommoncsurname{$currtoken}) and
isChinese($currtoken2) and (length($currtoken2) == 4)
and (!defined($cnotname{$currtoken2}))
and ($cwords{$currtoken2} != 1))
{
$newline .= $cname[0] . $currtoken . $currtoken2 . $cname[1] . $spaces2 . $currtoken3;
$cwords{$currtoken . $currtoken2} = 1;
$cwords{$currtoken . substr($currtoken2, 0, 2)} = 2; # short version to check
} else {
$newline .= $currtoken . $spaces . $currtoken2 . $spaces2 . $currtoken3;
}
} else {
$newline .= $currtoken;
}
$m--; # reset so won't skip space
}
}
$newline;
}
#sub cword_start {
# my($tword) = @_;
# if (grep(/^$tword/, @cwordlist) > 0) {
# return 1;
# } else {
# return 0;
# }
#}
sub isChinese {
my($cchar) = shift;
for ($b = 0; $b < length($cchar); $b++) {
if (unpack("C", substr($cchar, $b, 1)) < 128) {
return 0;
}
}
return 1;
}
sub allnum {
my ($localnum) = shift;
# Need this if?
if ($localnum =~ m/[0-9][0-9]*(\.[0-9]+)?/) {
return 1;
}
for ($k = 0; $k < length($localnum); $k+=2) {
if (!defined($cnumbers{substr($localnum, $k, 2)})) {
return 0;
}
}
return 1;
}
sub allnumbers {
my($localnum) = shift;
if ($localnum =~ m/[0-9][0-9]*(\.[0-9]+)?/) {
return 1;
}
for ($k = 0; $k < length($localnum); $k+=2) {
if (!defined($cnumbers{substr($localnum, $k, 2)})) {
return 0;
}
}
return 1;
}
sub allwascii {
my($localstr) = shift;
for ($k = 0; $k < length($localstr); $k+=2) {
if (!defined($cascii{substr($localstr, $k, 2)})) {
return 0;
}
}
return 1;
}
sub allforeign {
my($localstr) = shift;
for ($k = 0; $k < length($localstr); $k+=2) {
if (!defined($cforeign{substr($localstr, $k, 2)})) {
return 0;
}
}
return 1;
}
sub segmentline() {
my($line) = shift;
$chinaccum = "";
$outline = "";
$linelen = length($line);
for ($i = 0; $i < $linelen; $i++) {
$char1 = substr($line, $i, 1);
if (unpack("C", $char1) > 127) {
$chinchar = substr($line, $i, 2);
if ($chinaccum eq "") {
$outline .= " " unless $i == 0;
$chinaccum = $chinchar;
} else {
if (defined($cwords{$chinaccum . $chinchar}) and
$cwords{$chinaccum . $chinchar} == 1) { # is in lexicon
$chinaccum .= $chinchar;
} elsif (allnum($chinaccum) and defined($cnumbers{$chinchar})) {
$chinaccum .= $chinchar;
} elsif (allwascii($chinaccum) and defined($cascii{$chinchar})) {
$chinaccum .= $chinchar;
} elsif (allforeign($chinaccum) and defined($cforeign{$chinchar}) and
defined($cwords{substr($line, $i, 4)}) and
$cwords{substr($line, $i, 4)} != 1 and
$cwords{substr($line, $i, 4)} != 2) {
$chinaccum .= $chinchar;
} elsif (defined($cwords{$chinaccum . $chinchar}) and
($cwords{$chinaccum . $chinchar} == 2) and
defined($cwords{$chinaccum . $chinchar . substr($line, $i+2, 2)}) and
(($cwords{$chinaccum . $chinchar . substr($line, $i+2, 2)} == 1) or
($cwords{$chinaccum . $chinchar . substr($line, $i+2, 2)} == 2)))
{ # starts a word in the lexicon
$chinaccum .= $chinchar;
} else {
$outline .= $chinaccum . " ";
$chinaccum = $chinchar; # start anew
}
}
$i++;
} else { # Plain ascii text, attach any accumulated Chinese and then ascii
if ($chinaccum ne "") {
$outline .= $chinaccum . " ";
$chinaccum = "";
}
$outline .= $char1;
}
}
$outline .= $chinaccum;
$chinline = add_ChineseNames($outline);
return $chinline;
}
#=cut
1;