Theory of Information Retrieval, Florida State University LIS-5263 (Fall 2003)
Written by Rich Ackerman, September 25, 2003
Version 1.0
http://www.hray.com/
There are three programs to run, in this order: idf.pl, then weight.pl, then find.cgi.
You'll find the include file util.pl at the bottom.
#!/usr/bin/perl
# rich ackerman
#
# idf.pl builds a file with each word and its idf = log10(N/n), where
#   N = total number of docs
#   n = number of docs containing the word
#
# idf.txt is the output file.
# Each line is: IDF, "word"
#
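# Worked example (hypothetical numbers, not from the course data): if the
# collection has N = 20 documents and a word appears in n = 5 of them,
# its idf is log10(20/5) = log10(4), or about 0.602.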
###########################################################################
$folder = "drj";
$N = 0;
opendir (DIR, $folder) or die "Could not open data directory: $!";
while ( defined ($filename = readdir(DIR)) ) {
    next if ($filename eq ".");
    next if ($filename eq "..");
    $N++;
    %doc_dict = ();
    open (SRC, "$folder/$filename") or die "Cannot open file: $!";
    while (<SRC>) {
        # break the current line down into an array of words
        # and count each word in the per-document dictionary
        if ($verbose) { print "."; }
        @words = split / /;
        for (@words) {
            $clean = clean($_);
            if (OK($clean) == 1) {
                $doc_dict{$clean}++;
                # The first time we see a word in this document,
                # we bump its document counter
                if ($doc_dict{$clean} == 1) {
                    $doc_counter{$clean}++;
                }
            }
        }
    }
    close (SRC);
}
$file = "idf.txt";
open (IDF, ">$file") || die "Cannot open $file: $!\n";
for (sort keys %doc_counter) {
    printf "%16s %d %d %f %f \n", $_, $doc_counter{$_}, $N,
        $N / $doc_counter{$_}, log10($N / $doc_counter{$_});
    $idf = log10($N / $doc_counter{$_});
    print IDF "$idf, $_\n";
}
close (IDF);
closedir (DIR);
sub OK {
    my ($word) = @_;
    if (length($word) < 4)  { return 0; }
    if ($word eq "2003")    { return 0; }
    if ($word =~ /session/) { return 0; }
    if ($word =~ /EDT/)     { return 0; }
    return 1;
}

sub log10 {
    my $n = shift;
    return log($n) / log(10);
}

sub clean {
    my ($word) = @_;
    $word = lc($word);  # convert to lower case
    $word =~ s/\W//g;   # strip non-word characters (keeps a-z, 0-9, underscore)
    return $word;
}
#!/usr/bin/perl
# rich ackerman
#
##############################################################################
#
# weight.pl
#
# Input:
#
# 1) Raw text in the underlying data directory, coincidentally named
#    after the professor.
#
# 2) The IDF file (idf.txt) with the inverse document frequencies.
#
# Output:
#
# Two files: fij.txt and weight.txt
#
# 1) fij.txt has tuples of [normalized term frequency, word, document name]
# for the words in the input files.
#
# 2) weight.txt has tuples of [w(i,j), word, document name]
#
###############################################################################
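#
# The weight written to weight.txt is the usual tf-idf product computed below:
#     w(i,j) = f(i,j) * idf(i)
# Worked example (hypothetical numbers): f(i,j) = 0.25 and idf(i) = 0.602
# give w(i,j) of about 0.15.
#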
require "util.pl";
$folder= "drj";
$N = 0;
# prepare output file for records from all files
$file = "fij.txt";
open (FIJ, ">$file") || die "Cannot open $file: $!\n";
open (W, ">weight.txt") || die "Cannot open weight.txt $!
\n";
########################################################
# LOAD UP the IDF hash - word and IDF
########################################################
%idf = ();
open (IDF, "idf.txt") || die "Cannot open idf.txt $! \n";
while (<IDF>) {
    chomp;
    ($myidf, $mykeyword) = split(/, /, $_, 2);
    # print "loading IDF: $myidf, $mykeyword\n";
    $idf{$mykeyword} = $myidf;
}
close IDF;
#########################################################
# DO FIJ STUFF: normalized freq for each document.
#########################################################
#
# Process all files in the data directory
# We are going to open up the directory and
# just loop through it until we have processed all
# the files in it. There are two nodes named
# " "." and ".." on Unix systems that we need to skip.
opendir (DIR, $folder) or die "Could not open data directory: $!";
while ( defined ($filename = readdir(DIR)) ) {
    next if ($filename eq ".");
    next if ($filename eq "..");
    # N keeps track of the number of files processed, as in the book.
    # We need this later for normalization.
    $N++;
    # The document dictionary is a hash table where each word
    # in each document gets counted. It gets initialized for
    # every document that we process, since we want the count
    # of each word in each document. Thus here we
    # start with an empty document dictionary:
    %doc_dict = ();
    # For each file, calculate the raw count [freq(i,j)] and
    # the normalized frequency [f(i,j)].
    # All we have to do is open the next file and
    # then read it one line at a time.
    open (SRC, "$folder/$filename") or die "Cannot open file: $!";
    while (<SRC>) {
        # Break the current line down into an array of words.
        # Then we loop through the array of words from
        # the current line. We clean the word first,
        # then see if it passes our rules. If so we
        # count it in the document dictionary.
        @words = split / /;
        for (@words) {
            $word = clean($_);
            if (OK($word) == 1) {
                $doc_dict{$word}++;
            }
        }
    }
    close (SRC);
    # At this point we have read an entire document
    # into the document dictionary. Now we need to
    # find the most common word in this document.
    # There's probably some Perl idiom to do this,
    # but I'll use brute force here.
    $max = 0;
    for (sort keys %doc_dict) {
        if ($doc_dict{$_} > $max) {
            $max = $doc_dict{$_};
            $max_word = $_;
        }
    }
    print "\nMax case: $max_word $max\n";
    print "Number of documents processed: $N\n";
    # The normalized frequency is f(i,j) = freq(i,j) / max_l freq(l,j),
    # i.e. the raw count divided by the count of the most frequent
    # term in this document (which is what $max holds).
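    # Worked example (hypothetical numbers): if "library" occurs 3 times
    # in a document whose most frequent term occurs 12 times, then
    # f(library, j) = 3 / 12 = 0.25.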
    for (sort keys %doc_dict) {
        $keyword = $_;
        $fij = $doc_dict{$keyword} / $max;
        # print "$fij, $keyword, $filename, $doc_dict{$keyword}\n";
        print FIJ "$fij, $keyword, $filename\n";
        # add in the weight calculation: w(i,j) = f(i,j) * idf(i)
        $weight = $fij * $idf{$keyword};
        print "$weight, $keyword, $filename\n";
        print W "$weight, $keyword, $filename\n";
    }
}
close FIJ;
close W;
closedir (DIR);
sub OK {
    my ($word) = @_;
    if (length($word) < 4)  { return 0; }
    if ($word eq "2003")    { return 0; }
    if ($word =~ /session/) { return 0; }
    if ($word =~ /EDT/)     { return 0; }
    return 1;
}
#!/usr/bin/perl
#
# find.cgi ranks the files according to the input terms
#
######################################################
#
# INPUT:
#
# weight.txt has: weight, keyword, document name
# idf.txt has: idf value, keyword (used for the query vector)
# The search terms arrive in the CGI parameter "TextLine".
#
# OUTPUT:
#
# The output is a ranking of all of the files that
# matched the input terms
#
########################################################
#
# First build the query vector.
# That means finding the idf weight of each word in ARGV
# The result will be a hash, qvector, with
# the search term as the key and the query weight
# as the value.
#
# If a term is used more than once in the list we could
# make it more important but we don't do that now. See
# p. 30 of book.
#
########################################################
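# Worked example (hypothetical query): for the query "digital library",
# the loop below builds qvector{digital} = idf{digital} and
# qvector{library} = idf{library}; a term that never made it into
# idf.txt simply gets a query weight of 0.
#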
# initialize HTTP state
use CGI;
$query = new CGI;
print "Content-type: text/html\n\n";
my $searchlist = $query->param("TextLine");
my @SearchSet = split / /, $searchlist;
# LOAD UP the IDF hash - word and IDF
open (IDF, "idf.txt") || die "Cannot open idf.txt $! \n";
while (<IDF>) {
    chomp;
    ($myidf, $mykeyword) = split(/, /, $_, 2);
    # print "loading IDF: $myidf, $mykeyword<br>\n";
    $idf{$mykeyword} = $myidf;
}
# Find the idf value for each input keyword and put it into
# the qvector hash.
for (@SearchSet) {
    $word = lc($_);
    if (exists $idf{$word}) { $qvector{$word} = $idf{$word}; }
    else                    { $qvector{$word} = 0; }
}
# Print out the query vector just for debugging:
print "<h3>Query vector</h3>\n";
for (sort keys %qvector) {
    print "QUERY VECTOR: $qvector{$_}, $_ <br>\n";
}
# Load up WEIGHTS (weight, word, filename)
open (WEIGHTS, "weight.txt") || die "Cannot open weight.txt: $!\n";
@weights = <WEIGHTS>;
# For debugging, just print out a few lines to see what we have to work with.
#
#print "<h3>Weights vector (small sample)</h3>";
#$i = 1500;
#while ($i < 1515) {
# print "WEIGHTS ARRAY: $weights[$i]<br>\n ";
# ++$i;
#}
#
# For each element in the query list, we want to:
# 1) extract all the lines of weights that contain the query term
# 2) add them to an array (@allmatches) for later processing
@allmatches = ();
for (sort keys %qvector) {
    $keyword = $_;
    @onematch = grep /\b$keyword\b/, @weights;
    push(@allmatches, @onematch);
}
# debug that step
print "<h3>Matching query vector to document vectors </h3>";
for (@allmatches) {
    print "MATCH: $_ <br>\n";
}
# so now we are almost done!
# Our hash "scores" will have the filename as the key
# and the running total as the value.
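# Worked example (hypothetical numbers): if the query vector has
# qvector{library} = 0.60 and a weight.txt line gives w(library) = 0.25 for
# a file named lecture03.txt, the loop below adds 0.60 * 0.25 = 0.15 to
# $scores{"lecture03.txt"}. Each document's final score is the sum of these
# products over its matching terms, i.e. an unnormalized dot product of the
# query vector and the document's weight vector.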
print "<h3>Adding products of query * doc weights </h3>";
for (@allmatches) {
    chomp;
    ($weight, $keyword, $filename) = split /, /;
    print "ADDING: $keyword in $filename w= $weight * qv= $qvector{$keyword}<br>\n";
    $scores{$filename} += $weight * $qvector{$keyword};
}
# finally output the rankings:
print "<h3>RANKINGS (alphabetic)</h3>\n";
for (sort keys %scores) {
    $me = "$scores{$_}, $_ \n";
    print "$me<br>\n";
    push @listing, $me;
}
print "<h3>\nRANKINGS BY VALUE</h3>";
@newlist = sort numericallydown @listing;
for(@newlist) {
print "$_<br>\n";
}
print "<p><a href='http://www.hray.com/5264/find.htm'>Play some more.</a>";
sub numericallydown {
    # sort comparator: descending by the leading numeric score
    $b <=> $a;
}
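The util.pl include file is not reproduced above. Since weight.pl declares its
own OK() and otherwise only needs the helpers that idf.pl defines inline, a
minimal sketch of util.pl, assuming it simply collects those shared subs, might
look like this (the actual file may differ):

# util.pl - shared helper subs (a minimal reconstruction, not necessarily
# the original file)

sub clean {
    my ($word) = @_;
    $word = lc($word);  # convert to lower case
    $word =~ s/\W//g;   # strip non-word characters
    return $word;
}

sub log10 {
    my $n = shift;
    return log($n) / log(10);
}

1;  # a required file must return a true value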