/**
 * Decodes a serialized postings list into (docID, value) pairs.
 *
 * <p>The payload is read as a sequence of VInt pairs {@code (left, right)}. {@code left} is a
 * gap that is added to the running docID; {@code right} is stored unchanged as the pair's second
 * element. Pairs whose {@code right} is 0 are skipped and do not advance the running docID.
 *
 * @param inputBytes the serialized postings; only its first {@code getLength()} bytes are read
 * @return the decoded postings, in the order they were read (empty if the payload is empty)
 * @throws IllegalStateException if the payload cannot be decoded for a reason other than
 *     reaching the end of the data
 */
private static ArrayListWritable<PairOfInts> deserializePosting(BytesWritable inputBytes) {
  ArrayListWritable<PairOfInts> posting = new ArrayListWritable<PairOfInts>();

  // getBytes() exposes the whole backing array, which can be larger than the valid payload.
  // Bound the stream by getLength() so capacity padding is never decoded as postings.
  DataInputStream dataIn =
      new DataInputStream(
          new ByteArrayInputStream(inputBytes.getBytes(), 0, inputBytes.getLength()));

  int prevDocID = 0;
  try {
    while (true) {
      int left = WritableUtils.readVInt(dataIn);
      int right = WritableUtils.readVInt(dataIn);

      if (right != 0) {
        prevDocID += left;
        posting.add(new PairOfInts(prevDocID, right));
      }
    }
  } catch (EOFException endOfPostings) {
    // Expected: running out of bytes is how the end of the list is detected.
  } catch (IOException e) {
    // Anything else means the bytes are not a valid postings list; do not silently return a
    // truncated result.
    throw new IllegalStateException("Unable to decode compressed postings list", e);
  } finally {
    try {
      dataIn.close();
    } catch (IOException ignored) {
      // Nothing useful can be done if closing the in-memory stream fails.
    }
  }

  return posting;
}
/**
 * Builds the inverted index with {@code BuildInvertedIndex} via the command line and spot-checks
 * the entry stored for the term "gold".
 *
 * @throws Exception if the job cannot be launched or the index cannot be read
 */
@Test
public void testInvertedIndexing() throws Exception {
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);

  assertTrue(fs.exists(collectionPath));

  String[] args =
      new String[] {
        "hadoop jar",
        IntegrationUtils.getJar("dist", "cloud9"),
        edu.umd.cloud9.example.ir.BuildInvertedIndex.class.getCanonicalName(),
        IntegrationUtils.LOCAL_ARGS,
        "-libjars=" + IntegrationUtils.getJar("lib", "guava"),
        "-input",
        collectionPath.toString(),
        "-output",
        tmpPrefix,
        "-numReducers",
        "1"
      };

  IntegrationUtils.exec(Joiner.on(" ").join(args));

  MapFile.Reader reader = new MapFile.Reader(new Path(tmpPrefix + "/part-r-00000"), conf);
  try {
    Text key = new Text();
    PairOfWritables<IntWritable, ArrayListWritable<PairOfInts>> value =
        new PairOfWritables<IntWritable, ArrayListWritable<PairOfInts>>();

    key.set("gold");
    // A missing key would otherwise surface later as a confusing value mismatch.
    assertTrue("term 'gold' not found in the index", reader.get(key, value) != null);

    assertEquals(584, value.getLeftElement().get());

    ArrayListWritable<PairOfInts> postings = value.getRightElement();
    // The original repeated the assertion above here; check the postings list itself instead.
    assertEquals(584, postings.size());
    assertEquals(5303, postings.get(0).getLeftElement());
    assertEquals(684030, postings.get(100).getLeftElement());
    assertEquals(1634312, postings.get(200).getLeftElement());
  } finally {
    // Release the index reader even when an assertion fails.
    reader.close();
  }
}
public void map( ArrayListOfLongsWritable key, ArrayListWritable<IntWritable> sentenceList, OutputCollector<PairOfInts, IntWritable> output, Reporter reporter) throws IOException { IntWritable s1; IntWritable s2; int s1line; int s2line; for (int i = 0; i < sentenceList.size() && i < MAXSIZE; i++) { s1 = sentenceList.get(i); s1line = s1.get() / nSamples; for (int j = i + 1; j < sentenceList.size() && j < MAXSIZE; j++) { s2 = sentenceList.get(j); s2line = s2.get() / nSamples; pairOut = new PairOfInts(); if (s1line == s2line) { continue; } /* if(((s1.get()/nSamples)%2 == 0 && (s2.get()/nSamples)%2 == 0) || ((s1.get()/nSamples)%2 != 0 && (s2.get()/nSamples)%2 != 0)){ continue; } */ if (s1.get() < s2.get()) { // pairOut.set(s1.get(), s2.get()); pairOut.set(s1line, s2line); } else { // pairOut.set(s2.get(), s1.get()); pairOut.set(s2line, s1line); } output.collect(pairOut, ONE); } } }
/** Runs this tool. */ @SuppressWarnings({"static-access"}) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INDEX)); options.addOption( OptionBuilder.withArgName("path") .hasArg() .withDescription("output path") .create(COLLECTION)); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(INDEX) || !cmdline.hasOption(COLLECTION)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(LookupPostingsCompressed.class.getName(), options); ToolRunner.printGenericCommandUsage(System.out); System.exit(-1); } String indexPath = cmdline.getOptionValue(INDEX); String collectionPath = cmdline.getOptionValue(COLLECTION); if (collectionPath.endsWith(".gz")) { System.out.println("gzipped collection is not seekable: use compressed version!"); System.exit(-1); } Configuration config = new Configuration(); FileSystem fs = FileSystem.get(config); MapFile.Reader reader = new MapFile.Reader(new Path(indexPath + "/part-r-00000"), config); FSDataInputStream collection = fs.open(new Path(collectionPath)); BufferedReader d = new BufferedReader(new InputStreamReader(collection)); Text key = new Text(); ArrayListWritable<PairOfInts> postings; BytesWritable bytesValue = new BytesWritable(); System.out.println("Looking up postings for the term \"starcross'd\""); key.set("starcross'd"); reader.get(key, bytesValue); postings = deserializePosting(bytesValue); // ArrayListWritable<PairOfVInts> postings = value; for (PairOfInts pair : postings) { System.out.println(pair); collection.seek(pair.getLeftElement()); System.out.println(d.readLine()); } bytesValue = new 
BytesWritable(); key.set("gold"); reader.get(key, bytesValue); postings = deserializePosting(bytesValue); System.out.println( "Complete postings list for 'gold': (" + postings.size() + ", " + postings + ")"); Int2IntFrequencyDistribution goldHist = new Int2IntFrequencyDistributionEntry(); // postings = value; for (PairOfInts pair : postings) { goldHist.increment(pair.getRightElement()); } System.out.println("histogram of tf values for gold"); for (PairOfInts pair : goldHist) { System.out.println(pair.getLeftElement() + "\t" + pair.getRightElement()); } bytesValue = new BytesWritable(); key.set("silver"); reader.get(key, bytesValue); postings = deserializePosting(bytesValue); System.out.println( "Complete postings list for 'silver': (" + postings.size() + ", " + postings + ")"); Int2IntFrequencyDistribution silverHist = new Int2IntFrequencyDistributionEntry(); // postings = value; for (PairOfInts pair : postings) { silverHist.increment(pair.getRightElement()); } System.out.println("histogram of tf values for silver"); for (PairOfInts pair : silverHist) { System.out.println(pair.getLeftElement() + "\t" + pair.getRightElement()); } bytesValue = new BytesWritable(); key.set("bronze"); Writable w = reader.get(key, bytesValue); if (w == null) { System.out.println("the term bronze does not appear in the collection"); } collection.close(); reader.close(); return 0; }