/**
 * Seeks the shared reader to key 496 and verifies that the subsequent {@code next} call returns
 * the entry whose key is 497.
 */
@Test
public void seek() throws Exception {
  IntWritable target = new IntWritable(496);
  boolean positioned = reader.seek(target);
  assertThat(positioned, is(true));

  // After the seek, next() is expected to hand back the entry following the sought key.
  boolean advanced = reader.next(key, value);
  assertThat(advanced, is(true));

  int actualKey = ((IntWritable) key).get();
  assertThat(actualKey, is(497));

  String actualValue = ((Text) value).toString();
  assertThat(actualValue, is("Three, four, shut the door"));
}
/**
 * Looks up one entry in the partition comparison-list MapFile and prints it to standard output
 * as {@code "<partition> neighbors: "} followed by the comma-separated array elements.
 *
 * <p>The reader is always closed before returning, including when the lookup or printing fails.
 *
 * @param job configuration used to open the MapFile
 * @param hdfs file system that holds the MapFile
 * @throws IOException if the MapFile cannot be opened, read, or closed
 */
public static void printComparisonList(JobConf job, FileSystem hdfs) throws IOException {
  MapFile.Reader partCompListReader =
      new MapFile.Reader(hdfs, partitionComparisonList.getName(), job);
  try {
    // NOTE(review): the lookup key is a freshly created (empty) Text and the result of get() is
    // not checked — confirm this is the intended entry to print.
    Text part = new Text();
    TextArrayWritable array = new TextArrayWritable();
    partCompListReader.get(part, array);

    System.out.println(part.toString() + " neighbors: ");
    for (int i = 0; i < array.get().length; i++) {
      System.out.print(array.get()[i] + ", ");
    }
  } finally {
    // Previously the reader was never closed, leaking its underlying streams.
    partCompListReader.close();
  }
}
/**
 * Prepares the fixture for each test: runs {@code MapFileDemo} against {@code MAP_URI}, opens a
 * {@code MapFile.Reader} on that URI, and instantiates reusable key/value holders of the types
 * the reader reports.
 *
 * @throws IOException if the demo, the file system lookup, or opening the reader fails
 */
@Before
public void setUp() throws IOException {
  // Presumably writes the MapFile that the reader below opens — verify against MapFileDemo.
  MapFileDemo.main(new String[] {MAP_URI});

  Configuration configuration = new Configuration();
  URI mapLocation = URI.create(MAP_URI);
  fs = FileSystem.get(mapLocation, configuration);
  reader = new MapFile.Reader(fs, MAP_URI, configuration);

  // Build empty key/value instances reflectively so the tests do not hard-code the types.
  Class<?> keyType = reader.getKeyClass();
  Class<?> valueType = reader.getValueClass();
  key = (WritableComparable<?>) ReflectionUtils.newInstance(keyType, configuration);
  value = (Writable) ReflectionUtils.newInstance(valueType, configuration);
}
public static void main(int step, Path inputDir, JobConf job) throws IOException { FileSystem hdfs = inputDir.getFileSystem(job); if (!hdfs.exists(Collector.partitionSizesPath)) { System.out.println("Partition sizes file does not exists!"); return; } debugStages = job.getBoolean(Config.DEBUG_STAGES_PROPERTY, Config.DEBUG_STAGES_VALUE); MapFile.Reader partitionSizeReader = new MapFile.Reader(hdfs, Collector.partitionSizesPath.getName(), new JobConf()); Text partitionK = new Text(); LongWritable partSizeV = new LongWritable(); try { while (partitionSizeReader.next(partitionK, partSizeV)) { partitionsNames.add(partitionK.toString()); // useless? partitionsSizes.put(partitionK.toString(), partSizeV.get()); } } catch (Exception e) {; } for (int i = 0; i < partitionsNames.size(); i++) { System.out.println( "Partition " + partitionsNames.get(i) + " has " + partitionsSizes.get(partitionsNames.get(i)) + " vectors."); } if (partitionsNames.size() <= 1) return; stage0(); printUndirectedNeighbors("Stage0"); printPartitionsStat("Stage0"); printCircularPartitionsWeight("\nCircular"); calcCWStandardDeviation(); stage1(); printDirectedNeighbors("Stage1"); System.out.println("Stage 1 final weights: "); printPartitionsWeights("Stage1"); if ((step == 2) || (step == 12)) { stage2(); printDirectedNeighbors("Stage2"); System.out.println("Stage 2 final weights: "); printPartitionsWeights("Stage2"); } // stage3(job, hdfs); writeComparisonList(job, hdfs); // printComparisonList(job, hdfs);// remove }
@Test public void get() throws Exception { // vv MapFileSeekTest Text value = new Text(); reader.get(new IntWritable(496), value); assertThat(value.toString(), is("One, two, buckle my shoe")); // ^^ MapFileSeekTest }
/**
 * Builds the in-memory bucket cache by reading every entry of the bucket MapFile found under each
 * cache path returned by {@code PathUtils.getCachePaths(conf)}, then prints one line per loaded
 * bucket.
 *
 * <p>Each reader is closed as soon as its file has been consumed, even if reading fails.
 *
 * @param conf configuration used to resolve the cache paths and to open the MapFiles
 * @throws IOException if a MapFile cannot be opened, read, or closed
 */
public BucketCache(Configuration conf) throws IOException {
  bucketCache = new HashMap<IntWritable, Bucket>();

  for (String cachePath : PathUtils.getCachePaths(conf)) {
    String bucketCachePath = cachePath + BUCKET_CACHE_FOLDER;
    MapFile.Reader reader = new MapFile.Reader(new Path(bucketCachePath), conf);
    try {
      IntWritable key = new IntWritable();
      Bucket value = new Bucket();
      while (reader.next(key, value)) {
        // The reader reuses `key` and `value`, so store copies rather than the shared instances.
        bucketCache.put(new IntWritable(key.get()), new Bucket(value));
      }
    } finally {
      // Previously one reader per cache path was leaked.
      reader.close();
    }
  }

  for (IntWritable i : bucketCache.keySet()) {
    System.out.println("Loaded bucket from cache:" + i.get() + ":" + bucketCache.get(i));
  }
}
/**
 * Runs {@code BuildInvertedIndex} through {@code IntegrationUtils.exec} and then verifies the
 * postings stored for the term "gold" in the resulting MapFile.
 *
 * <p>The MapFile reader is closed in a {@code finally} block so that a failing assertion does
 * not leak it.
 */
@Test
public void testInvertedIndexing() throws Exception {
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);

  assertTrue(fs.exists(collectionPath));

  String[] args =
      new String[] {
        "hadoop jar",
        IntegrationUtils.getJar("dist", "cloud9"),
        edu.umd.cloud9.example.ir.BuildInvertedIndex.class.getCanonicalName(),
        IntegrationUtils.LOCAL_ARGS,
        "-libjars=" + IntegrationUtils.getJar("lib", "guava"),
        "-input",
        collectionPath.toString(),
        "-output",
        tmpPrefix,
        "-numReducers",
        "1"
      };
  IntegrationUtils.exec(Joiner.on(" ").join(args));

  MapFile.Reader reader = new MapFile.Reader(new Path(tmpPrefix + "/part-r-00000"), conf);
  try {
    Text key = new Text();
    PairOfWritables<IntWritable, ArrayListWritable<PairOfInts>> value =
        new PairOfWritables<IntWritable, ArrayListWritable<PairOfInts>>();

    key.set("gold");
    reader.get(key, value);

    // The left element is asserted once; the original repeated this identical check.
    assertEquals(584, value.getLeftElement().get());

    ArrayListWritable<PairOfInts> postings = value.getRightElement();
    assertEquals(5303, postings.get(0).getLeftElement());
    assertEquals(684030, postings.get(100).getLeftElement());
    assertEquals(1634312, postings.get(200).getLeftElement());
  } finally {
    reader.close();
  }
}
/**
 * Reads the next key/value pair from the {@code BloomMapFile.Reader} wrapped by the given stream.
 *
 * @param hdfsistr stream whose {@code getIn()} is cast to a {@code BloomMapFile.Reader}
 * @param key holder that receives the converted key when a record was read
 * @param value holder that receives the converted value when a record was read
 * @return the sum of the values left in the two size holders passed to {@code getObject}, or 0
 *     when the reader has no further record
 * @throws RuntimeCamelException wrapping any exception raised while reading or converting
 */
@Override
public long next(HdfsInputStream hdfsistr, Holder<Object> key, Holder<Object> value) {
  try {
    MapFile.Reader bloomReader = (BloomMapFile.Reader) hdfsistr.getIn();

    // Fresh writables of the types the reader reports; each gets its own Configuration.
    WritableComparable<?> currentKey =
        (WritableComparable<?>)
            ReflectionUtils.newInstance(bloomReader.getKeyClass(), new Configuration());
    Writable currentValue =
        (Writable) ReflectionUtils.newInstance(bloomReader.getValueClass(), new Configuration());

    if (!bloomReader.next(currentKey, currentValue)) {
      return 0;
    }

    Holder<Integer> keyBytes = new Holder<Integer>();
    Holder<Integer> valueBytes = new Holder<Integer>();
    key.value = getObject(currentKey, keyBytes);
    value.value = getObject(currentValue, valueBytes);
    return keyBytes.value + valueBytes.value;
  } catch (Exception ex) {
    throw new RuntimeCamelException(ex);
  }
}
@Override public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { context.getCounter(MYCOUNTER.RECORD_COUNT).increment(1); if (value.toString().length() > 0) { String arrEmpAttributes[] = value.toString().split("\\t"); txtMapLookupKey.set(arrEmpAttributes[6].toString()); try { // txtMapLookupKey = deptNo // txtMapLookupValue = deptName deptMapReader.get(txtMapLookupKey, txtMapLookupValue); } finally { txtMapLookupValue.set( (txtMapLookupValue.equals(null) || txtMapLookupValue.equals("")) ? "NOT-FOUND" : txtMapLookupValue.toString()); } txtMapOutputKey.set(arrEmpAttributes[0].toString()); // empNo --> joinKey txtMapOutputValue.set( arrEmpAttributes[1].toString() + "\t" + arrEmpAttributes[1].toString() + "\t" + arrEmpAttributes[2].toString() + "\t" + arrEmpAttributes[3].toString() + "\t" + arrEmpAttributes[4].toString() + "\t" + arrEmpAttributes[5].toString() + "\t" + arrEmpAttributes[6].toString() + "\t" // deptNo + txtMapLookupValue.toString()); // deptName } context.write(txtMapOutputKey, txtMapOutputValue); txtMapLookupValue.set(""); txtMapLookupKey.set(""); }
/** Runs this tool. */ @SuppressWarnings({"static-access"}) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INDEX)); options.addOption( OptionBuilder.withArgName("path") .hasArg() .withDescription("output path") .create(COLLECTION)); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(INDEX) || !cmdline.hasOption(COLLECTION)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(LookupPostingsCompressed.class.getName(), options); ToolRunner.printGenericCommandUsage(System.out); System.exit(-1); } String indexPath = cmdline.getOptionValue(INDEX); String collectionPath = cmdline.getOptionValue(COLLECTION); if (collectionPath.endsWith(".gz")) { System.out.println("gzipped collection is not seekable: use compressed version!"); System.exit(-1); } Configuration config = new Configuration(); FileSystem fs = FileSystem.get(config); MapFile.Reader reader = new MapFile.Reader(new Path(indexPath + "/part-r-00000"), config); FSDataInputStream collection = fs.open(new Path(collectionPath)); BufferedReader d = new BufferedReader(new InputStreamReader(collection)); Text key = new Text(); ArrayListWritable<PairOfInts> postings; BytesWritable bytesValue = new BytesWritable(); System.out.println("Looking up postings for the term \"starcross'd\""); key.set("starcross'd"); reader.get(key, bytesValue); postings = deserializePosting(bytesValue); // ArrayListWritable<PairOfVInts> postings = value; for (PairOfInts pair : postings) { System.out.println(pair); collection.seek(pair.getLeftElement()); System.out.println(d.readLine()); } bytesValue = new 
BytesWritable(); key.set("gold"); reader.get(key, bytesValue); postings = deserializePosting(bytesValue); System.out.println( "Complete postings list for 'gold': (" + postings.size() + ", " + postings + ")"); Int2IntFrequencyDistribution goldHist = new Int2IntFrequencyDistributionEntry(); // postings = value; for (PairOfInts pair : postings) { goldHist.increment(pair.getRightElement()); } System.out.println("histogram of tf values for gold"); for (PairOfInts pair : goldHist) { System.out.println(pair.getLeftElement() + "\t" + pair.getRightElement()); } bytesValue = new BytesWritable(); key.set("silver"); reader.get(key, bytesValue); postings = deserializePosting(bytesValue); System.out.println( "Complete postings list for 'silver': (" + postings.size() + ", " + postings + ")"); Int2IntFrequencyDistribution silverHist = new Int2IntFrequencyDistributionEntry(); // postings = value; for (PairOfInts pair : postings) { silverHist.increment(pair.getRightElement()); } System.out.println("histogram of tf values for silver"); for (PairOfInts pair : silverHist) { System.out.println(pair.getLeftElement() + "\t" + pair.getRightElement()); } bytesValue = new BytesWritable(); key.set("bronze"); Writable w = reader.get(key, bytesValue); if (w == null) { System.out.println("the term bronze does not appear in the collection"); } collection.close(); reader.close(); return 0; }
/**
 * Releases the department lookup reader when the framework calls this hook.
 *
 * @param context task context supplied by the framework (not used)
 * @throws IOException if closing the reader fails
 * @throws InterruptedException declared by the overridden method
 */
@Override
protected void cleanup(Context context)
    throws IOException, InterruptedException {
  deptMapReader.close();
}
@SuppressWarnings("static-access") @Override public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path") .hasArg() .withDescription("output path") .create(matchOutput)); options.addOption( OptionBuilder.withArgName("path") .hasArg() .withDescription("output path") .create(nomatchOutput)); options.addOption( OptionBuilder.withArgName("integer") .hasArg() .withDescription("number of samples") .create(nSamplesOption)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(matchOutput) || !cmdline.hasOption(nomatchOutput) || !cmdline.hasOption(nSamplesOption)) { HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String matchOutputPath = cmdline.getOptionValue(matchOutput); String nomatchOutputPath = cmdline.getOptionValue(nomatchOutput); String nSamplesIn = cmdline.getOptionValue(nSamplesOption); LOG.info("Tool name: " + this.getClass().getName()); // LOG.info(" - input file: " + inputPath); // LOG.info(" - output file: " + outputPath); JobConf conf = new JobConf(getConf(), JaccardCompare.class); conf.setJobName(String.format("JaccardCompare")); // FileInputFormat.setInputPaths(conf, new Path(inputPath)); // FileOutputFormat.setOutputPath(conf, new Path(outputPath)); int nSentences = 1000; int nSamples = Integer.parseInt(nSamplesIn); try { File matchFile = new File(matchOutputPath); File nomatchFile = new File(nomatchOutputPath); FileOutputStream fosM = null, fosNM = null; BufferedWriter dosM = null, dosNM = null; fosM = new FileOutputStream(matchFile); fosNM = new FileOutputStream(nomatchFile); dosM = new BufferedWriter(new OutputStreamWriter(fosM)); dosNM = new 
BufferedWriter(new OutputStreamWriter(fosNM)); MapFile.Reader id2sentenceReader = new MapFile.Reader(new Path("id2sentence.map/part-00000"), conf); HashMap<Integer, ArrayListWritable<Text>> id2sentence = new HashMap<Integer, ArrayListWritable<Text>>(); IntWritable key = new IntWritable(); ArrayListWritable<Text> val = new ArrayListWritable<Text>(); while (id2sentenceReader.next(key, val)) { id2sentence.put(key.get(), val); val = new ArrayListWritable<Text>(); } MapFile.Reader sentence2translationReader = new MapFile.Reader(new Path("sentence2translation.map/part-00000"), conf); HashMap<Integer, ArrayListOfIntsWritable> sentence2translation = new HashMap<Integer, ArrayListOfIntsWritable>(); IntWritable key2 = new IntWritable(); ArrayListOfIntsWritable val2 = new ArrayListOfIntsWritable(); while (sentence2translationReader.next(key2, val2)) { sentence2translation.put(key2.get(), val2); val2 = new ArrayListOfIntsWritable(); } MapFile.Reader sentencematchReader = new MapFile.Reader(new Path("sentencematchpairs.map/part-00000"), conf); HashSet<PairOfInts> sentencematchpairs = new HashSet<PairOfInts>(); PairOfInts key3 = new PairOfInts(); IntWritable val3 = new IntWritable(); while (sentencematchReader.next(key3, val3)) { sentencematchpairs.add(key3); key3 = new PairOfInts(); } System.out.println("Done reading"); PairOfInts p = new PairOfInts(); IntWritable match; IntWritable eLineNum = new IntWritable(); IntWritable eLineId = new IntWritable(); ArrayListWritable<Text> eSentence = new ArrayListWritable<Text>(); for (int i = 0; i < nSentences; i++) { if (i % 100 == 0) System.out.println("eLine " + i); // eLineNum.set(2*i); ArrayListOfIntsWritable transIdList = sentence2translation.get(2 * i); // ArrayListOfIntsWritable transIdList = new ArrayListOfIntsWritable(); // sentence2translationReader.get(eLineNum, transIdList); // System.out.println("transIdList " + transIdList); for (int j = 0; j < nSentences; j++) { // System.out.println("fLine " + j); ArrayListWritable<Text> 
fSentence = id2sentence.get((2 * j + 1) * nSamples); // ArrayListWritable<Text> fSentence = new ArrayListWritable<Text>(); // IntWritable fLineId = new IntWritable(); // fLineId.set((2*j+1)*nSamples); // id2sentenceReader.get(fLineId, fSentence); // System.out.println("fLineId " + (2*j+1)*nSamples + " FSentence " + fSentence); float jsimMax = -1.0f; float jsimAvg = 0.0f; for (int id : transIdList) { eSentence = id2sentence.get(id); // eLineId.set(id); // id2sentenceReader.get(eLineId, eSentence); float jsim = JaccardSim.jaccardSim(eSentence, fSentence); // System.out.println("\teSentence " + eSentence + " " + jsim); jsimAvg += jsim; if (jsim > jsimMax) { jsimMax = jsim; } } jsimAvg = jsimAvg / transIdList.size(); if (2 * i < 2 * j + 1) { p.set(2 * i, 2 * j + 1); } else { p.set(2 * j + 1, 2 * i); } // match = new IntWritable(); // match = (IntWritable) sentencematchReader.get(p, match); // if(match != null){ if (sentencematchpairs.contains(p)) { if (jsimMax < .5) { System.out.println("Low match: "); System.out.println("\teSentence: " + i + " " + eSentence); System.out.println("\tfSentence: " + j + " " + fSentence); } // System.out.println("match"); dosM.write(Float.toString(jsimMax)); // dosM.write(Float.toString(jsimAvg)); dosM.write("\n"); } else { // System.out.println("no match"); dosNM.write(Float.toString(jsimMax)); // dosNM.write(Float.toString(jsimAvg)); dosNM.write("\n"); } } } sentencematchReader.close(); sentence2translationReader.close(); id2sentenceReader.close(); dosM.close(); dosNM.close(); } catch (IOException e2) { // TODO Auto-generated catch block e2.printStackTrace(); } // Delete the output directory if it exists already. // Path outputDir = new Path(outputPath); // FileSystem.get(conf).delete(outputDir, true); // JobClient.runJob(conf); return 0; }