|
Server : Apache/2.4.62 System : FreeBSD fbsdweb2.web.rcn.net 14.1-RELEASE FreeBSD 14.1-RELEASE releng/14.1-n267679-10e31f0946d8 GENERIC amd64 User : www ( 80) PHP Version : 8.3.8 Disable Function : NONE Directory : /domains/mandarintools/ |
Upload File : |
#!/usr/local/bin/perl
#
# Erik Peterson
# LING 467 Information Retrieval
# Assignment #5
# March 15, 1999
#
# Program to segment words in one or more text files
# or web pages.
# Based on:
# Simple program to count words in a text
# Case Insensitive - everything mapped to lower case
# split on whitespace or on non-word characters
# Author: George Wilson
&Init_Indexer;
$document_separator = '\-{6,}'; # Default document separator
$title_re = '^TITLE:\s+(.*)';
$document_number = 0;
$print_summary = 0;
open(DOC, "> erik_docindex.txt") or die "Can't open document data file\n";
open(WRD, "> erik_wordlist.txt") or die "Can't open word list file\n";
open(IDX, "> erik_wordindex.txt") or die "Can't open word index file\n";
binmode(IDX);
# Read in the files and/or web pages, segment, and print out totals
$total = 0;
if (@ARGV > 0) { # Read from files or web page
while (@ARGV > 0) {
$arg = shift @ARGV;
if ($arg eq "-help" or $arg eq "-h") {
print_help();
exit;
} elsif ($arg eq "-engfreq") {
# I downloaded a English word frequency list from the BNC
# Corpus. I could use this frequency list to compare with
# the word frequencies for a given document
# (Not yet implemented)
} elsif ($arg eq "-docsep") { # The document separator
$document_separator = shift @ARGV;
} elsif ($arg eq "-docnum") {
# User can select a document number to start counting from,
# Instead of the default zero
if ($ARGV[0] =~ m/^\d+$/) {
$document_number = shift @ARGV;
} else {
die "Invalid starting document number: $ARGV[0]\nExiting.";
}
} elsif ($arg eq "-summary") {
$print_summary = 1;
} elsif ($arg eq "-titlere") {
$title_re = shift @ARGV;
} else { # A file name or URL
# I need to extend this to handle wildcards in the filenames
if ($arg =~ m!http://!) { # HTML document
open(FD, `lynx -dump $arg |`);
#s/\[\d+\]//g;
#s/\nReferences\s+\d\..*$//s;
} elsif (-e $arg) { # Local file that exists
open(FD, $arg);
} else {
die "Argument $arg not valid file name or URL\n";
}
$current_offset = 0;
while (($current_document = Get_NextDoc(\*FD, $document_separator)) ne "") {
@words = &FindWords($current_document, "");
$title = &FindTitle($current_document, $title_re);
if ($title =~ m/^\s*$/) { # Skip empty docs
next;
}
@doclines = ($current_document =~ m/\n/gm);
$doctotal = 0;
undef %docuniq;
foreach $word (@words) {
next if defined($stopwords{$word});
next if $word =~ m/^\d([0-9,]*)?(\.\d+)?$/;
if (defined($wordindex{$word})) {
push @{$wordindex{$word}}, $document_number;
} else {
$wordindex{$word} = [];
push @{$wordindex{$word}}, $document_number;
}
$doctotal++;
$total++;
$docuniq{$word}++;
$wcount{$word}++;
}
#printf "DOC#: %5d Lines: %5d Words: %5d Uniq: %5d\n",
#$document_number, $#doclines+1, $doctotal, scalar(keys(%docuniq));
print DOC $title, "\t", $arg, "\t", $current_offset, "\t",
$document_length, "\n";
# End of current document. Can do something here later.
$current_offset += $document_length;
$document_number++;
}
close(FD);
# End of current file. Can do something here later.
}
}
} else { # No Arguments, read from STDIN
while (($current_document = Get_NextDoc(\*STDIN, $document_separator)) ne "") {
#print "$current_document";
@words = &FindWords($current_document, "");
@dl = ($current_document =~ m/\n/gm);
$doctotal = 0;
undef %docuniq;
foreach $word (@words) {
$doctotal++;
$total++;
$docuniq{$word}++;
$wcount{$word}++;
}
printf "DOC#: %5d Lines: %5d Words: %5d Uniq: %5d\n",
$document_number, $#dl+1, $doctotal, scalar(keys(%docuniq));
$document_number++;
}
}
# Write out word list and word index file
$current_offset = 0;
foreach $word (sort keys %wordindex) {
$docnos = join(",", @{$wordindex{$word}});
$docnos .= "\n";
print WRD $word, "\t", $current_offset, "\t", length($docnos), "\t", @{$wordindex{$word}} + 0, "\n";
print IDX $docnos;
$current_offset += length($docnos);
}
# Print out summary of words seen during indexing
if ($print_summary) {
$unique = 0;
foreach $word (sort keys(%wcount)) {
#next if defined($stopwords{$word}); # Skip the stopwords
printf "%7d %s\n", $wcount{$word}, $word;
$unique++;
}
printf "Total words: %d\n", $total;
printf "Unique words: %d\n", $unique;
}
#################### END of Main Routine ######################
sub Get_NextDoc {
my($fd, $sep) = @_;
my($curdoc) = "";
my($line);
while ($line = <$fd>) {
if ($line =~ m/$sep/) {
if ($curdoc eq "") {
# Handles a case where the document separator starts the
# file or two document separators happen one after another
$curdoc = " ";
}
$document_length = length($curdoc) + length($line);
return $curdoc;
} else {
$curdoc .= $line;
}
}
return $curdoc;
}
sub FindTitle {
my($document, $title_re) = @_;
if ($document =~ m/$title_re/m) {
return $1;
} elsif ($document =~ m/$\n*([^\n]+)\n/) {
return $1;
}
return "";
}
sub FindWords {
my($string, $flags) = @_;
$_ = $string;
# List of special words, names, etc. that may need to be
# recombined and how to recombine them
@special_words = (['AT\s+\&\s+T', 'AT\&T'],
['U\s+\.\s+S\s+\.', 'U.S.']
);
# translate to lower case
tr/A-Z/a-z/;
# Add a space to all possible word breaks, then rejoin what
# needs to be rejoined, get rid of tokens without a single word
# character, and break on spaces
@basewords = split(/(\W+)/);
# Add a space between all possible word boundaries
s/(\S)\b(\W+)/$1 $2/sg;
s/(\W+)\b(\S)/$1 $2/sg;
# Recombine the special cases
for ($i = 0; $i < @special_words; $i++) {
s/${$special_words[$i]}[0]/${$special_words[$i]}[1]/sg;
}
#print; # Can use this for debug
# Split on spaces
@words = split(/\s+/);
# Rejoin E-mail addresses
for ($i = 0; $i < @words; $i++) {
$word = "";
if ($words[$i] =~ m/^\w+$/ and
$i+1 < @words and $words[$i+1] eq "@" and
$i+2 < @words and $words[$i+2] =~ m/^\w+$/)
{
$word = $words[$i] . $words[$i+1] . $words[$i+2];
$j = $i + 3;
while ($j < @words and $words[$j] eq "." and
$j+1 < @words and $words[$j+1] =~ m/^\w+$/) {
$word .= $words[$j] . $words[$j+1];
$j+=2;
}
splice @words, $i, $j-$i, $word;
}
}
for ($i = 0; $i < @words; $i++) {
$number = "";
if ($words[$i] =~ m/^\d+(,\d+)?/) {
if ($i+1 < @words and $words[$i+1] eq "," and
$i+2 < @words and $words[$i+2] =~ /\d+/)
{
# Join comma separated numbers
$number = $words[$i] . $words[$i+1] . $words[$i+2];
splice @words, $i, 3, $number;
$i--; # Allows this to be linked to next token
} elsif ($i+1 < @words and $words[$i+1] eq "." and
$i+2 < @words and $words[$i+2] =~ /\d+/)
{
# Join decimal separated numbers
$number = $words[$i] . $words[$i+1] . $words[$i+2];
splice @words, $i, 3, $number;
}
}
}
# Money (must be done after combining numbers)
for ($i = 0; $i < @words; $i++) {
$money = "";
if ($words[$i] eq "\$" and
$i+1 < @words and $words[$i] =~ m/^\d+((,\d+)*(\.\d+))/)
{
$money = $words[$i] . $words[$i+1];
$j = $i+2;
while ($words[$j] =~ m/(hundred|thousand|million|billion)/) {
$money .= " " . $words[$j++];
}
splice @words, $i, $j-$i, $money;
}
}
# Times
for ($i = 0; $i < @words; $i++) {
last; # Delete if want to do times as whole units
$word = "";
if ($words[$i] =~ m/^\d{1,2}/ and
$i+1 < @words and $words[$i+1] eq ":" and
$i+2 < @words and $words[$i+2] =~ m/^\d+$/)
{
$word = $words[$i] . $words[$i+1] . $words[$i+2];
$j = $i + 3;
while ($j < @words and
$words[$j] =~ /(pm|am|est|mst|pst|edt|mdt|pdt)/) {
$word .= " " . $words[$j++];
}
splice @words, $i, $j-$i, $word;
}
}
# Other types that may be good to recombine
# 1. Person names
# 2. Web URL's
# 3. Dates
# 4. Percentages
# 5. General named entities
# Remove tokens that do not have at least one word character
for ($i = 0; $i < @words; $i++) {
if ($words[$i] !~ m/\w/) {
splice @words, $i, 1;
$i--;
} elsif ($words[$i] =~ m/^\W*$/) {
splice @words, $i, 1;
$i--;
}
}
return @words;
}
sub Init_Indexer {
# Create an array and hash of stop words for possible future use
open(STOP, "stopwords.txt") or die "Can't open stopword list.";
while (<STOP>) {
chomp;
push @stopwords, $_;
}
foreach $stopword (@stopwords) {
$stopwords{$stopword} = 1;
}
}
sub print_help {
print "
Usage: erikcount.pl [parameters]
Optional arguments:
-docnum [number] :
Document number to use as base for assigning new document id's.
-docsep [regular expression] :
Regular expression to use to find document boundaries. Can
repeat this before each filename to allow different separators
for different types of file formats.
-summary :
Print out summary information about all data processed, including
a listing of unique words found in the data, along with a count
of the total words and total unique words.
-titlere :
Regular expression to use for finding titles in a document.
Can repeat before each file for different documen types.
filename :
One or more filenames to be indexed.
URL :
URL of a web page to be indexed.
This program will read data from the files and\/or URL\'s given on
the command line and return a listing of all the words in the data,
sorted by frequency of occurance in the document. The total number of
words and total number of unique words is also returned. If no arguments
are given to the program, it will read data from STDIN.
";
}