.

KGRKJGETMRETU895U-589TY5MIGM5JGB5SDFESFREWTGR54TY
Server : Apache/2.4.62
System : FreeBSD fbsdweb2.web.rcn.net 14.1-RELEASE FreeBSD 14.1-RELEASE releng/14.1-n267679-10e31f0946d8 GENERIC amd64
User : www ( 80)
PHP Version : 8.3.8
Disable Function : NONE
Directory : /domains/mandarintools/
Upload File :
Current File : /domains/mandarintools/erikindex6.pl
#!/usr/local/bin/perl
#

# Erik Peterson
# LING 467 Information Retrieval
# Assignment #5
# March 15, 1999
#
# Program to segment words in one or more text files
# or web pages.

# Based on:
# Simple program to count words in a text
# Case Insensitive - everything mapped to lower case
# split on whitespace or on non-word characters
# Author: George Wilson


&Init_Indexer;

$document_separator = '\-{6,}';  # Default document separator
$title_re = '^TITLE:\s+(.*)';
$document_number = 0;
$print_summary = 0;

open(DOC, "> erik_docindex.txt") or die "Can't open document data file\n";
open(WRD, "> erik_wordlist.txt") or die "Can't open word list file\n";
open(IDX, "> erik_wordindex.txt") or die "Can't open word index file\n";
binmode(IDX);

# Read in the files and/or web pages, segment, and print out totals
$total  = 0;

if (@ARGV > 0) {   # Read from files or web page
    while (@ARGV > 0) {
	$arg = shift @ARGV;
	if ($arg eq "-help" or $arg eq "-h") {
	    print_help();
	    exit;
	} elsif ($arg eq "-engfreq") {
	    # I downloaded a English word frequency list from the BNC
	    # Corpus.  I could use this frequency list to compare with
	    # the word frequencies for a given document
	    # (Not yet implemented)
	} elsif ($arg eq "-docsep") {     # The document separator
	    $document_separator = shift @ARGV;
	} elsif ($arg eq "-docnum") {
	    # User can select a document number to start counting from,
	    # Instead of the default zero
	    if ($ARGV[0] =~ m/^\d+$/) {
		$document_number = shift @ARGV;
	    } else {
		die "Invalid starting document number: $ARGV[0]\nExiting.";
	    }
	} elsif ($arg eq "-summary") {
	    $print_summary = 1;
	} elsif ($arg eq "-titlere") {
	    $title_re = shift @ARGV;
	} else {  # A file name or URL
	    # I need to extend this to handle wildcards in the filenames

	    if ($arg =~ m!http://!) {  # HTML document
		open(FD, `lynx -dump $arg |`);
		#s/\[\d+\]//g;
		#s/\nReferences\s+\d\..*$//s;
	    } elsif (-e $arg) {  # Local file that exists
		open(FD, $arg);
	    } else {
		die "Argument $arg not valid file name or URL\n";
	    }

	    $current_offset = 0;
	    while (($current_document = Get_NextDoc(\*FD, $document_separator)) ne "") {
		@words = &FindWords($current_document, "");

		$title = &FindTitle($current_document, $title_re);
		if ($title =~ m/^\s*$/) {  # Skip empty docs
		    next;
		}

		@doclines = ($current_document =~ m/\n/gm);
		$doctotal = 0;
		undef %docuniq;
		foreach $word (@words) {
		    next if defined($stopwords{$word});
		    next if $word =~ m/^\d([0-9,]*)?(\.\d+)?$/;
		    if (defined($wordindex{$word})) {
			push @{$wordindex{$word}}, $document_number;
 		    } else {
                        $wordindex{$word} = [];
			push @{$wordindex{$word}}, $document_number;
		    }

		    $doctotal++;
		    $total++;
		    $docuniq{$word}++;
		    $wcount{$word}++;
		}

		#printf "DOC#: %5d   Lines: %5d   Words: %5d   Uniq: %5d\n",
		#$document_number, $#doclines+1, $doctotal, scalar(keys(%docuniq));
		print DOC $title, "\t", $arg, "\t", $current_offset, "\t",
		  $document_length, "\n";

		# End of current document.  Can do something here later.

		$current_offset += $document_length;
		$document_number++;
	    }
	    close(FD);

	    # End of current file.  Can do something here later.
	}
    }
} else {  # No Arguments, read from STDIN
    while (($current_document = Get_NextDoc(\*STDIN, $document_separator)) ne "") {
	#print "$current_document";
	@words = &FindWords($current_document, "");

	@dl = ($current_document =~ m/\n/gm);
	$doctotal = 0;
	undef %docuniq;
	foreach $word (@words) {
	    $doctotal++;
	    $total++;
	    $docuniq{$word}++;
	    $wcount{$word}++;
	}

	printf "DOC#: %5d   Lines: %5d   Words: %5d   Uniq: %5d\n",
	$document_number, $#dl+1, $doctotal, scalar(keys(%docuniq));

	$document_number++;
    }
}



# Write out word list and word index file
$current_offset = 0;
foreach $word (sort keys %wordindex) {
    $docnos = join(",", @{$wordindex{$word}});
    $docnos .= "\n";
    print WRD $word, "\t", $current_offset, "\t", length($docnos), "\t", @{$wordindex{$word}} + 0, "\n";
    print IDX $docnos;
    $current_offset += length($docnos);
}



# Print out summary of words seen during indexing
if ($print_summary) {
    $unique = 0;
    foreach $word (sort keys(%wcount)) {
	#next if defined($stopwords{$word});  # Skip the stopwords
	printf "%7d %s\n", $wcount{$word}, $word;
	$unique++;
    }

    printf "Total  words: %d\n", $total;
    printf "Unique words: %d\n", $unique;
}

####################  END of Main Routine ######################




sub Get_NextDoc {
    my($fd, $sep) = @_;
    my($curdoc) = "";
    my($line);

    while ($line = <$fd>) {
	if ($line =~ m/$sep/) {
	    if ($curdoc eq "") {
		# Handles a case where the document separator starts the
		# file or two document separators happen one after another
		$curdoc = " ";
	    }
	    $document_length = length($curdoc) + length($line);
	    return $curdoc;
	} else {
	    $curdoc .= $line;
	}
    }

    return $curdoc;
}


sub FindTitle {
    my($document, $title_re) = @_;

    if ($document =~ m/$title_re/m) {
	return $1;
    } elsif ($document =~ m/$\n*([^\n]+)\n/) {
	return $1;
    }

    return "";
}


sub FindWords {
    my($string, $flags) = @_;

    $_ = $string;

    # List of special words, names, etc. that may need to be
    # recombined and how to recombine them
    @special_words = (['AT\s+\&\s+T', 'AT\&T'], 
		      ['U\s+\.\s+S\s+\.', 'U.S.']
		      );


    # translate to lower case
    tr/A-Z/a-z/;

    # Add a space to all possible word breaks, then rejoin what
    # needs to be rejoined, get rid of tokens without a single word
    # character, and break on spaces

    @basewords = split(/(\W+)/);

    # Add a space between all possible word boundaries
    s/(\S)\b(\W+)/$1 $2/sg;
    s/(\W+)\b(\S)/$1 $2/sg;

    # Recombine the special cases
    for ($i = 0; $i < @special_words; $i++) {
	s/${$special_words[$i]}[0]/${$special_words[$i]}[1]/sg;
    }

    #print; # Can use this for debug

		      

    # Split on spaces
    @words = split(/\s+/);

    # Rejoin E-mail addresses
    for ($i = 0; $i < @words; $i++) {
	$word = "";
	if ($words[$i] =~ m/^\w+$/ and 
	    $i+1 < @words and $words[$i+1] eq "@" and
	    $i+2 < @words and $words[$i+2] =~ m/^\w+$/)
	{
	    $word = $words[$i] . $words[$i+1] . $words[$i+2];
	    $j = $i + 3;
	    while ($j < @words and $words[$j] eq "." and
		   $j+1 < @words and $words[$j+1] =~ m/^\w+$/) {
		$word .= $words[$j] . $words[$j+1];
		$j+=2;
	    }
	    splice @words, $i, $j-$i, $word;
	}
    }

    for ($i = 0; $i < @words; $i++) {
	$number = "";
	if ($words[$i] =~ m/^\d+(,\d+)?/) {
	    if ($i+1 < @words and $words[$i+1] eq "," and
		$i+2 < @words and $words[$i+2] =~ /\d+/) 
	    {
		# Join comma separated numbers
		$number = $words[$i] . $words[$i+1] . $words[$i+2];
		splice @words, $i, 3, $number;
		$i--;  # Allows this to be linked to next token
	    } elsif ($i+1 < @words and $words[$i+1] eq "." and
		$i+2 < @words and $words[$i+2] =~ /\d+/) 
	    {
		# Join decimal separated numbers
		$number = $words[$i] . $words[$i+1] . $words[$i+2];
		splice @words, $i, 3, $number;
	    }
	}
    }

    # Money (must be done after combining numbers)
    for ($i = 0; $i < @words; $i++) {
	$money = "";
	if ($words[$i] eq "\$" and 
	    $i+1 < @words and $words[$i] =~ m/^\d+((,\d+)*(\.\d+))/)
	{
	    $money = $words[$i] . $words[$i+1];
	    $j = $i+2;
	    while ($words[$j] =~ m/(hundred|thousand|million|billion)/) {
		$money .= " " . $words[$j++];
	    }
	    splice @words, $i, $j-$i, $money;
	}
    }


    # Times
    for ($i = 0; $i < @words; $i++) {
        last;  # Delete if want to do times as whole units
	$word = "";
	if ($words[$i] =~ m/^\d{1,2}/ and 
	    $i+1 < @words and $words[$i+1] eq ":" and
	    $i+2 < @words and $words[$i+2] =~ m/^\d+$/) 
	{
	    $word = $words[$i] . $words[$i+1] . $words[$i+2];
	    $j = $i + 3;
	    while ($j < @words and 
		   $words[$j] =~ /(pm|am|est|mst|pst|edt|mdt|pdt)/) { 
		$word .= " " . $words[$j++];
	    }
	    splice @words, $i, $j-$i, $word;
	}
    }



    # Other types that may be good to recombine
    #  1. Person names
    #  2. Web URL's
    #  3. Dates
    #  4. Percentages 
    #  5. General named entities

    # Remove tokens that do not have at least one word character
    for ($i = 0; $i < @words; $i++) {
	if ($words[$i] !~ m/\w/) {
	    splice @words, $i, 1;
	    $i--;
	} elsif ($words[$i] =~ m/^\W*$/) {
	    splice @words, $i, 1;
	    $i--;
	}
    }
    

    return @words;
}

sub Init_Indexer {
    # Create an array and hash of stop words for possible future use
    
    open(STOP, "stopwords.txt") or die "Can't open stopword list.";
    while (<STOP>) {
	chomp;
	push @stopwords, $_;
    }

    foreach $stopword (@stopwords) {
	$stopwords{$stopword} = 1;
    }
}


sub print_help {
    print "
Usage:  erikcount.pl [parameters]

Optional arguments:
   -docnum [number] :  
          Document number to use as base for assigning new document id's.
   -docsep [regular expression] :
          Regular expression to use to find document boundaries.  Can
          repeat this before each filename to allow different separators
          for different types of file formats.
   -summary :
          Print out summary information about all data processed, including
          a listing of unique words found in the data, along with a count
          of the total words and total unique words.
   -titlere :
          Regular expression to use for finding titles in a document.
          Can repeat before each file for different documen types.
   filename :  
          One or more filenames to be indexed.
   URL :
          URL of a web page to be indexed. 

    This program will read data from the files and\/or URL\'s given on
the command line and return a listing of all the words in the data,
sorted by frequency of occurance in the document.  The total number of
words and total number of unique words is also returned.  If no arguments
are given to the program, it will read data from STDIN.

";

}
Anon7 - 2021