Download Inverted Index for Web Documents PDF

Title: Inverted Index for Web Documents
Tags: Computer Programming, Computer Data, Letter Case, Search Engine Indexing
File Size: 191.2 KB
Total Pages: 12
Document Text Contents
Page 6

5

/**
 * Serialization version identifier for this class. Java serialization only
 * consults it if the class is Serializable (directly or through a supertype).
 */
private static final long serialVersionUID = 1L;

/**
 * Creates an empty frame; no entries are added here.
 */
public DataFrame() {
}

/**
 * Picks whichever of the two writables wraps the larger int.
 *
 * <p>The returned object is one of the arguments themselves, not a copy.
 * When both wrap the same value, {@code b} is returned.
 *
 * @param a first candidate
 * @param b second candidate
 * @return {@code a} if its value is strictly greater than {@code b}'s,
 *         otherwise {@code b}
 */
public static IntWritable max(IntWritable a, IntWritable b)
{
    if (a.get() > b.get()) {
        return a;
    }
    return b;
}

/**
 * Records {@code offset} against {@code fileName} in this frame.
 *
 * <p>If the frame already has an entry for {@code fileName}, the offset is
 * appended to that entry's list. Otherwise a new entry is created whose first
 * element is an {@code IntWritable} holding 0 and whose list contains only
 * {@code offset}; it is stored under a fresh copy of {@code fileName}.
 *
 * <p>NOTE(review): {@code offset} is added to the list as-is, without copying,
 * whereas the file name is copied. If callers reuse one IntWritable instance
 * across calls, the stored offsets would alias each other. Confirm against
 * the caller.
 *
 * @param fileName name of the file the offset belongs to
 * @param offset   offset to record for that file
 */
public void addWordEntrt(Text fileName, IntWritable offset) {
    // Unknown file: build a brand-new entry and stop.
    if (!this.containsKey(fileName)) {
        ArrayListWritable<IntWritable> offsets =
                new ArrayListWritable<IntWritable>();
        offsets.add(offset);

        PairOfWritables<IntWritable, ArrayListWritable<IntWritable>> entry =
                new PairOfWritables<IntWritable, ArrayListWritable<IntWritable>>();
        entry.set(new IntWritable(0), offsets);

        this.put(new Text(fileName), entry);
        return;
    }

    // Known file: extend the offset list already stored for it.
    this.get(fileName).getValue().add(offset);
}

public void mergeFrame(DataFrame other) {
Iterator<Entry<Text, PairOfWritables<IntWritable,

ArrayListWritable<IntWritable>>>> it = other.entrySet().iterator();
while (it.hasNext()) {

Entry<Text, PairOfWritables<IntWritable,
ArrayListWritable<IntWritable>>> pairs = (Entry<Text,
PairOfWritables<IntWritable, ArrayListWritable<IntWritable>>>)
it.next();

Text fileName = new Text();
fileName.set(pairs.getKey().toString());

PairOfWritables<IntWritable,
ArrayListWritable<IntWritable>> p = pairs.getValue();

if (this.containsKey(fileName)) {
PairOfWritables<IntWritable,

ArrayListWritable<IntWritable>> p1 = this.get(fileName);
PairOfWritables<IntWritable,

ArrayListWritable<IntWritable>> p3 = new PairOfWritables<IntWritable,
ArrayListWritable<IntWritable>>();

ArrayListWritable<IntWritable> l =
p1.getValue();

l.addAll(p.getValue());
HashSet hs = new HashSet();
hs.addAll(l);
l.clear();
l.addAll(hs);
p3.set(max(p1.getKey(), p.getKey()), l);
this.remove(fileName);
this.put(fileName, p3);

Page 11

10

job.setMapOutputValueClass(DataFrame.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DataFrame.class);

// Set the input and output file paths
FileInputFormat.setInputPaths(job, new Path(inputPath));
FileOutputFormat.setOutputPath(job, new Path(outputPath));

// Time the job whilst it is running
long startTime = System.currentTimeMillis();
job.waitForCompletion(true);
LOG.info("Job Finished in " + (System.currentTimeMillis() -

startTime)
/ 1000.0 + " seconds");

// Returning 0 lets everyone know the job was successful
return 0;

}

/**
 * Command-line entry point: runs a {@code BasicInvertedIndex} through
 * {@code ToolRunner} and terminates the JVM with the status code it returns.
 *
 * @param args command-line arguments, handed unchanged to {@code ToolRunner.run}
 * @throws Exception if {@code ToolRunner.run} throws
 */
public static void main(String[] args) throws Exception
{
    // Disabled manual check of the stemmer, kept for reference:
    // char[] p = {'n', 'o', 't', 'h', 'i', 'n', 'g'};
    // Stemmer s = new Stemmer();
    // s.add(p, 7);
    // s.stem();
    // System.out.println(s.toString());

    // Propagate the tool's status to the calling shell instead of discarding
    // it, so a non-zero result from the tool is visible as the exit code.
    int exitCode = ToolRunner.run(new BasicInvertedIndex(), args);
    System.exit(exitCode);
}
}

Page 12

11

5. Output samples

Bart {Bart_the_Mother.txt.gz=(1, [820, 1, 548, 408, 2067, 1914, 10, 2074,

674, 1774, 1226, 1234, 144, 574, 571, 2061, 1894, 1614, 1600, 1872, 1256,

2083, 669, 531, 53, 2091, 1757, 539, 2212, 610, 204, 1849, 1022, 1038,

1704, 631, 2114, 321, 1587, 1961, 450, 878, 508, 1343, 381, 1813, 109,

2149, 1561, 1206, 354, 1798]), Bart_the_General.txt.gz=(1, [1, 10, 59, 69,

107, 131, 181, 207, 225, 243, 262, 277, 301, 304, 335, 435, 641, 758, 834,

879, 1001, 1262, 1284, 1418, 1506, 1512, 1519, 1528, 1550, 1560, 1601]),

Bart_the_Murderer.txt.gz=(1, [1, 136, 1096, 142, 10, 1365, 1225, 286,

2078, 2017, 2050, 2023, 1660, 264, 1790, 389, 2001, 2103, 1130, 161, 2008,

437, 523, 1871, 2084, 1265, 1995, 179, 207, 477, 1574, 619, 79, 1439,

1169, 1839, 457, 1293, 2124, 633, 1720, 582, 233, 707, 108, 1537, 250,

1206, 1084, 247, 602, 723, 1077]), Bart_the_Genius.txt.gz=(1, [1, 883,

2035, 141, 544, 412, 1309, 677, 10, 1363, 742, 2045, 1772, 194, 461, 1958,

1005, 1712, 1174, 1353, 93, 390, 1420, 329, 373, 1135, 1067, 103, 308,

2007, 506, 2015, 1985, 1747, 1991, 2086, 355, 58, 486, 1277, 1998, 240]),

Bart_the_Lover.txt.gz=(1, [136, 1, 2308, 2071, 2314, 1160, 10, 950, 2391,

2117, 2321, 154, 1782, 2053, 2385, 83, 1661, 2330, 2028, 1284, 926, 369,

919, 111, 1333, 2410, 1196, 117, 185, 2357, 963, 2431, 724, 301, 2093]),

Bart_the_Fink.txt.gz=(1, [479, 274, 204, 1, 1975, 1917, 1849, 1699, 748,

10, 1841, 407, 1832, 1413, 1648, 1825, 149, 1822, 1551, 1665, 507, 232,

108, 167, 1530, 1876, 1321, 670, 1627, 528, 1682, 1935, 1451])}

Similar Documents