5263 - Vector Model Information Retrieval

Theory of Information Retrieval, Florida State University LIS-5263 (Fall, 2003)
Written by Rich Ackerman, September 25, 2003
Version 1.0
http://www.hray.com/


There are three programs to run. You run them in this order:

  1. idf.pl calculates inverse document frequencies and outputs idf.txt
  2. weight.pl calculates normalized term frequencies and document vector weights, outputting fij.txt and weight.txt
  3. find.cgi runs the search

You'll find the include file util.pl at the bottom.
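
A typical run might look like this (a sketch only: it assumes perl is on your PATH, the scripts sit next to the drj data directory, and find.cgi lives wherever your web server executes CGI scripts):

  perl idf.pl       # writes idf.txt
  perl weight.pl    # reads idf.txt, writes fij.txt and weight.txt

The search itself is then run from a browser through the form page (find.htm), which submits a TextLine parameter to find.cgi.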


idf.pl


#!/usr/bin/perl
# rich ackerman
#
# idf.pl builds a file with each word and its idf, log N/n
# N = total # of docs
# n = # of docs containing word
#
# idf.txt is output file.
# Each line is IDF, "word"
#
###########################################################################

$folder= "drj";
$N = 0;

opendir (DIR, $folder) or die "Could not open data directory: $!";

while ( defined ($filename = readdir(DIR))) {
    next if ($filename eq ".");
    next if ($filename eq "..");

    $N++;
    %doc_dict = ();
    open (SRC, "$folder/$filename") or die "Cannot open file: $!";
    while (<SRC>) {

        # break down current line into array of words
        # and count each word in the global dictionary

        if ($verbose) { print "."; }
        @words = split / /;
        for (@words) {
            $clean = clean($_);
            if (OK($clean) == 1) {

                $doc_dict{$clean}++;

                # The first time we add a word to the document file,
                # we bump the document counter

                if ($doc_dict{$clean} == 1) {
                    $doc_counter{$clean}++;
                }
            }
        }
    }
    close (SRC);
}
$max = 0;

$file = "idf.txt";
open (IDF, ">$file") || die "Cannot open $file: $!\n";
for (sort keys %doc_counter) {
    printf "%16s %d %d %f %f \n", $_, $doc_counter{$_}, $N, $N/$doc_counter{$_}, log10($N/$doc_counter{$_});
    $idf = log10 ( $N / $doc_counter{$_} );
    printf IDF "$idf, $_\n";
}

close (IDF);
closedir (DIR);

sub OK {
    local($word) = @_;
    if (length($word) < 4) { return 0; }
    if ($word eq "2003") { return 0; }
    if ($word =~ /session/) { return 0; }
    if ($word =~ /EDT/) { return 0; }
    return 1;
}

sub log10 {
    my $n = shift;
    return log($n)/log(10);
}

sub clean {
    local($word) = @_;
    $word = lc($word);   # convert to lower case
    $word =~ s/\W//g;    # strip out non-word characters (keep 0-9a-zA-Z_)
    return $word;
}
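
As a worked example (with made-up numbers): if the collection has N = 40 documents and the word "retrieval" appears in 4 of them, then idf = log10(40/4) = log10(10) = 1, and idf.txt gets a line like

  1, retrieval

Rare words get large idf values; a word that appears in every document gets idf = log10(1) = 0, so it can never contribute to a document's weight or score.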


weight.pl

#!/usr/bin/perl
# rich ackerman
#
##############################################################################
#
# weight.pl
#
# Input:
#
# 1) Raw text in underlying data directory coincidentally named after
# professor.
#
# 2) Reads the IDF file with inverse document frequencies
#
# Output:
#
# Two files: fij.txt and weight.txt
#
# 1) fij.txt has tuples of [normalized term frequency, word, document name]
# for the words in the input files.
#
# 2) weight.txt has tuples of [w(i,j), word, document name]
#
###############################################################################

require "util.pl";

$folder= "drj";
$N = 0;

# prepare output file for records from all files

$file = "fij.txt";

open (FIJ, ">$file") || die "Cannot open $file: $!\n";
open (W, ">weight.txt") || die "Cannot open weight.txt $! \n";

########################################################
# LOAD UP the IDF hash - word and IDF
########################################################

%idf = ();
open (IDF, "idf.txt") || die "Cannot open idf.txt $! \n";
while (<IDF>) {
    chomp;
    ($myidf, $mykeyword) = split(/, /, $_, 2);
    # print "loading IDF: $myidf, $mykeyword\n";
    $idf{$mykeyword} = $myidf;
}
close IDF;

#########################################################
# DO FIJ STUFF: normalized freq for each document.
#########################################################
#
# Process all files in the data directory
# We are going to open up the directory and
# just loop through it until we have processed all
# the files in it. There are two nodes named
# " "." and ".." on Unix systems that we need to skip.

opendir (DIR, $folder) or die "Could not open data directory: $!";
while ( defined ($filename = readdir(DIR))) {
    next if ($filename eq ".");
    next if ($filename eq "..");

    # N keeps track of number of files processed as in the book
    # We need this later for normalization.

    $N++;

    # The document dictionary is a hash table where each word
    # in each document gets counted. It gets initialized for
    # every document that we process, since we want the count
    # of each word in each document. Thus here we
    # start with an empty document dictionary:

    %doc_dict = ();

    # For each file, calculate the raw count [freq(i,j)] and
    # the normalized frequency [f(i,j)].
    # All we have to do is open the next file and
    # then read it one line at a time.

    open (SRC, "$folder/$filename") or die "Cannot open file: $!";
    while (<SRC>) {

        # Break down current line into array of words.
        # Then we loop through the array of words from
        # the current line. We clean the word first,
        # then see if it passes our rules. If so we
        # count it in the global dictionary

        @words = split / /;
        for (@words) {
            $word = clean($_);
            if (OK($word) == 1) {
                $doc_dict{$word}++;
            }
        }
    }
    close (SRC);

    # At this point we have input an entire document
    # into the document dictionary. Now we need to
    # find the most common word in this document.
    # There's probably some perl idiom to do this
    # but I'll use brute force here.

    $max = 0;
    for (sort keys %doc_dict) {
        if ($doc_dict{$_} > $max) {
            $max = $doc_dict{$_};
            $max_word = $_;
        }
    }

    print "\nMax case: $max_word $max\n";
    print "Number of documents processed: $N\n";

    # The normalized frequency is f(i,j) = freq(i,j) / max_l freq(l,j),
    # i.e. the raw count divided by the count of the most frequent term in this document.

    for (sort keys %doc_dict) {
        $keyword = $_;
        $fij = $doc_dict{$keyword} / $max;
        # print "$fij, $keyword, $filename, $doc_dict{$keyword}\n";
        printf FIJ "$fij, $_, $filename\n";

        # add in the weight calculation: w(i,j) = f(i,j) * idf(i)
        $weight = $fij * $idf{$keyword};
        print "$weight, $keyword, $filename\n";
        print W "$weight, $keyword, $filename\n";
    }
}

close FIJ;
close W;
closedir (DIR);

sub OK {
    local($word) = @_;
    if (length($word) < 4) { return 0; }
    if ($word eq "2003") { return 0; }
    if ($word =~ /session/) { return 0; }
    if ($word =~ /EDT/) { return 0; }
    return 1;
}
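
To make the weighting concrete (again with made-up numbers): suppose "retrieval" occurs 3 times in a document whose most frequent surviving term occurs 6 times. Then f(i,j) = 3/6 = 0.5, and with the idf of 1 from the earlier example, w(i,j) = 0.5 * 1 = 0.5. If that document were named drj01.txt (a hypothetical name; the real names are whatever sits in the drj directory), fij.txt would get the line "0.5, retrieval, drj01.txt", and weight.txt would get the same line, since the weight happens to equal the normalized frequency when the idf is 1.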


find.cgi

#!/usr/bin/perl
#
# find.cgi ranks the files according to the input terms
#
######################################################
#
# INPUT:
#
# weight.txt has: weight, keyword, document name
# idf.txt has: idf, keyword (used to build the query vector)
# @ARGV is a list of terms to search for.
#
# OUTPUT:
#
# The output is a ranking of all of the files that
# matched the input terms
#
########################################################
#
# First build the query vector.
# That means finding the idf weight of each word in ARGV
# The result will be a hash, qvector, with
# the search term as the key and the query weight
# as the value.
#
# If a term is used more than once in the list we could
# make it more important but we don't do that now. See
# p. 30 of book.
#
########################################################
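#
# As an illustration (hypothetical numbers): a search for
# "information retrieval" with idf(information) = 0.3 and
# idf(retrieval) = 1.0 would give the query vector
#
#   %qvector = ( 'information' => 0.3, 'retrieval' => 1.0 );
#
# A term missing from idf.txt gets a query weight of 0, so it
# cannot add anything to any document's score.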

# initialize HTTP state

use CGI;
$query = new CGI;

print "Content-type: text/html\n\n";

my $searchlist = $query->param("TextLine");
my @SearchSet = split / /, $searchlist;

# LOAD UP the IDF hash - word and IDF

open (IDF, "idf.txt") || die "Cannot open idf.txt $! \n";
while (<IDF>) {
    chomp;
    ($myidf, $mykeyword) = split(/, /, $_, 2);
    # print "loading IDF: $myidf, $mykeyword<br>\n";
    $idf{ $mykeyword } = $myidf;
}

# Find the idf value for each input keyword and put into
# qvector array

for (@SearchSet) {
    $word = lc ($_);
    if ($idf{$word} > 0) { $qvector{$word} = $idf{$word}; }
    else { $qvector{$word} = 0; }
}

# Print out the query vector just for debugging:

print "<h3>Query vector</h3>\n";
for (sort keys %qvector) {
    print "QUERY VECTOR: $qvector{$_}, $_ <br>\n";
}

# Load up WEIGHTS (weight, word, filename)
open (WEIGHTS, "weight.txt") || die "Cannot open weight.txt $! \n";
@weights = <WEIGHTS>;

# For debugging just print out a few lines to see what we have to work with.
#
#print "<h3>Weights vector (small sample)</h3>";
#$i = 1500;
#while ($i < 1515) {
# print "WEIGHTS ARRAY: $weights[$i]<br>\n ";
# ++$i;
#}
#
# For each element in the query list, we want to:
# 1) extract all the lines of weights that contain the query term
# 2) add them to an array (@allmatches) for later processing

@allmatches = ();
for (sort keys %qvector) {
    $keyword = $_;
    @onematch = grep /\b$keyword\b/, @weights;
    push(@allmatches, @onematch);
}


# debug that step
print "<h3>Matching query vector to document vectors </h3>";
for (@allmatches) {
    print "MATCH: $_ <br>\n";
}

# so now we are almost done!
# Our hash "scores" will have the filename as the key
# and the running total as the value.

print "<h3>Adding products of query * doc weights </h3>";

for (@allmatches) {
    chomp;
    ($weight, $keyword, $filename) = split /, /;
    print "ADDING: $keyword in $filename w= $weight * qv= $qvector{$keyword}<br>\n";
    $scores{$filename} += $weight * $qvector{$keyword};
}

# finally output the rankings:

print "<h3>RANKINGS (alphabetic)</h3>\n";
for (sort keys %scores) {
    $me = "$scores{$_}, $_ \n";
    print "$me<br>\n";
    push @listing, $me;
}
print "<h3>\nRANKINGS BY VALUE</h3>";
@newlist = sort numericallydown @listing;
for (@newlist) {
    print "$_<br>\n";
}

print "<p><a href='http://www.hray.com/5264/find.htm'>Play some more.</a>";

sub numericallydown {
    $b <=> $a;
}
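
Putting the pieces together with the same made-up numbers: if the query is "information retrieval" (query weights 0.3 and 1.0) and the document drj01.txt carries weights 0.2 for "information" and 0.5 for "retrieval", its score is 0.2*0.3 + 0.5*1.0 = 0.56, while a document matching only "information" at 0.2 scores just 0.06 and ranks below it. This is the plain dot product of the query and document vectors; the vectors are not length-normalized here, so the score is a similarity measure rather than a true cosine.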


util.pl



sub clean {
    local($word) = @_;
    $word = lc($word);   # convert to lower case
    $word =~ s/\W//g;    # strip out non-word characters (keep 0-9a-zA-Z_)
    return $word;
}

return 1;