@Test public void get() throws Exception { // vv MapFileSeekTest Text value = new Text(); reader.get(new IntWritable(496), value); assertThat(value.toString(), is("One, two, buckle my shoe")); // ^^ MapFileSeekTest }
/**
 * Prints one entry of the partition comparison MapFile to standard output.
 *
 * <p>Opens the MapFile named by {@code partitionComparisonList.getName()}, performs a single
 * lookup, prints the key followed by {@code " neighbors: "} and then every element of the
 * returned array separated by {@code ", "}. The reader is always closed, even when the lookup
 * or the printing fails.
 *
 * @param job job configuration handed to the MapFile reader
 * @param hdfs file system the MapFile is read from
 * @throws IOException if the MapFile cannot be opened, read or closed
 */
public static void printComparisonList(JobConf job, FileSystem hdfs) throws IOException {
  MapFile.Reader partCompListReader =
      new MapFile.Reader(hdfs, partitionComparisonList.getName(), job);
  try {
    // NOTE(review): the key is never set before the lookup, so this asks for the empty key.
    // Confirm that is the intended entry.
    Text part = new Text();
    TextArrayWritable array = new TextArrayWritable();
    partCompListReader.get(part, array);

    System.out.println(part.toString() + " neighbors: ");

    // Fetch the backing array once instead of on every loop iteration.
    Object[] neighbors = array.get();
    if (neighbors == null) {
      // Presumably the lookup found nothing and the array was never populated; nothing to list.
      return;
    }
    for (int i = 0; i < neighbors.length; i++) {
      System.out.print(neighbors[i] + ", ");
    }
  } finally {
    partCompListReader.close();
  }
}
/**
 * Runs the BuildInvertedIndex example through the command line and spot-checks the postings
 * written for the term "gold".
 *
 * <p>The index reader is closed in a {@code finally} block so a failing assertion does not leak
 * it.
 *
 * @throws Exception if the job cannot be launched or the index cannot be read
 */
@Test
public void testInvertedIndexing() throws Exception {
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);

  assertTrue(fs.exists(collectionPath));

  String[] args =
      new String[] {
        "hadoop jar",
        IntegrationUtils.getJar("dist", "cloud9"),
        edu.umd.cloud9.example.ir.BuildInvertedIndex.class.getCanonicalName(),
        IntegrationUtils.LOCAL_ARGS,
        "-libjars=" + IntegrationUtils.getJar("lib", "guava"),
        "-input",
        collectionPath.toString(),
        "-output",
        tmpPrefix,
        "-numReducers",
        "1"
      };

  IntegrationUtils.exec(Joiner.on(" ").join(args));

  MapFile.Reader reader = new MapFile.Reader(new Path(tmpPrefix + "/part-r-00000"), conf);
  try {
    Text key = new Text();
    PairOfWritables<IntWritable, ArrayListWritable<PairOfInts>> value =
        new PairOfWritables<IntWritable, ArrayListWritable<PairOfInts>>();

    key.set("gold");
    // Fail with a readable message rather than an NPE further down when the term is absent.
    assertTrue("term 'gold' missing from index", reader.get(key, value) != null);

    assertEquals(584, value.getLeftElement().get());

    ArrayListWritable<PairOfInts> postings = value.getRightElement();
    assertEquals(5303, postings.get(0).getLeftElement());
    assertEquals(684030, postings.get(100).getLeftElement());
    assertEquals(1634312, postings.get(200).getLeftElement());
  } finally {
    reader.close();
  }
}
@Override public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { context.getCounter(MYCOUNTER.RECORD_COUNT).increment(1); if (value.toString().length() > 0) { String arrEmpAttributes[] = value.toString().split("\\t"); txtMapLookupKey.set(arrEmpAttributes[6].toString()); try { // txtMapLookupKey = deptNo // txtMapLookupValue = deptName deptMapReader.get(txtMapLookupKey, txtMapLookupValue); } finally { txtMapLookupValue.set( (txtMapLookupValue.equals(null) || txtMapLookupValue.equals("")) ? "NOT-FOUND" : txtMapLookupValue.toString()); } txtMapOutputKey.set(arrEmpAttributes[0].toString()); // empNo --> joinKey txtMapOutputValue.set( arrEmpAttributes[1].toString() + "\t" + arrEmpAttributes[1].toString() + "\t" + arrEmpAttributes[2].toString() + "\t" + arrEmpAttributes[3].toString() + "\t" + arrEmpAttributes[4].toString() + "\t" + arrEmpAttributes[5].toString() + "\t" + arrEmpAttributes[6].toString() + "\t" // deptNo + txtMapLookupValue.toString()); // deptName } context.write(txtMapOutputKey, txtMapOutputValue); txtMapLookupValue.set(""); txtMapLookupKey.set(""); }
/** Runs this tool. */ @SuppressWarnings({"static-access"}) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INDEX)); options.addOption( OptionBuilder.withArgName("path") .hasArg() .withDescription("output path") .create(COLLECTION)); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(INDEX) || !cmdline.hasOption(COLLECTION)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(LookupPostingsCompressed.class.getName(), options); ToolRunner.printGenericCommandUsage(System.out); System.exit(-1); } String indexPath = cmdline.getOptionValue(INDEX); String collectionPath = cmdline.getOptionValue(COLLECTION); if (collectionPath.endsWith(".gz")) { System.out.println("gzipped collection is not seekable: use compressed version!"); System.exit(-1); } Configuration config = new Configuration(); FileSystem fs = FileSystem.get(config); MapFile.Reader reader = new MapFile.Reader(new Path(indexPath + "/part-r-00000"), config); FSDataInputStream collection = fs.open(new Path(collectionPath)); BufferedReader d = new BufferedReader(new InputStreamReader(collection)); Text key = new Text(); ArrayListWritable<PairOfInts> postings; BytesWritable bytesValue = new BytesWritable(); System.out.println("Looking up postings for the term \"starcross'd\""); key.set("starcross'd"); reader.get(key, bytesValue); postings = deserializePosting(bytesValue); // ArrayListWritable<PairOfVInts> postings = value; for (PairOfInts pair : postings) { System.out.println(pair); collection.seek(pair.getLeftElement()); System.out.println(d.readLine()); } bytesValue = new 
BytesWritable(); key.set("gold"); reader.get(key, bytesValue); postings = deserializePosting(bytesValue); System.out.println( "Complete postings list for 'gold': (" + postings.size() + ", " + postings + ")"); Int2IntFrequencyDistribution goldHist = new Int2IntFrequencyDistributionEntry(); // postings = value; for (PairOfInts pair : postings) { goldHist.increment(pair.getRightElement()); } System.out.println("histogram of tf values for gold"); for (PairOfInts pair : goldHist) { System.out.println(pair.getLeftElement() + "\t" + pair.getRightElement()); } bytesValue = new BytesWritable(); key.set("silver"); reader.get(key, bytesValue); postings = deserializePosting(bytesValue); System.out.println( "Complete postings list for 'silver': (" + postings.size() + ", " + postings + ")"); Int2IntFrequencyDistribution silverHist = new Int2IntFrequencyDistributionEntry(); // postings = value; for (PairOfInts pair : postings) { silverHist.increment(pair.getRightElement()); } System.out.println("histogram of tf values for silver"); for (PairOfInts pair : silverHist) { System.out.println(pair.getLeftElement() + "\t" + pair.getRightElement()); } bytesValue = new BytesWritable(); key.set("bronze"); Writable w = reader.get(key, bytesValue); if (w == null) { System.out.println("the term bronze does not appear in the collection"); } collection.close(); reader.close(); return 0; }