private static void loadPairs( HMapIV<ArrayListOfIntsWritable> pwsimMapping, int langID, JobConf job, Reporter reporter) { try { Path[] localFiles = DistributedCache.getLocalCacheFiles(job); String pwsimFile = job.get("PwsimPairs"); for (Path localFile : localFiles) { if (localFile.toString().contains(getFilename(pwsimFile))) { SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.getLocal(job), localFile, job); PairOfInts key = (PairOfInts) reader.getKeyClass().newInstance(); IntWritable value = (IntWritable) reader.getValueClass().newInstance(); int cnt = 0; while (reader.next(key, value)) { int fDocno = key.getRightElement(); int eDocno = key.getLeftElement(); if ((eDocno == 6127 && fDocno == 1000000074) || (eDocno == 6127 && fDocno == 1000000071)) { sLogger.info(key); } if (langID == CLIRUtils.E) { if (!pwsimMapping.containsKey(eDocno)) { pwsimMapping.put(eDocno, new ArrayListOfIntsWritable()); } pwsimMapping .get(eDocno) .add( fDocno); // we add 1000000000 to foreign docnos to distinguish them during // pwsim algo } else { if (!pwsimMapping.containsKey(fDocno)) { pwsimMapping.put(fDocno, new ArrayListOfIntsWritable()); } pwsimMapping .get(fDocno) .add( eDocno); // we add 1000000000 to foreign docnos to distinguish them during // pwsim algo } cnt++; key = (PairOfInts) reader.getKeyClass().newInstance(); value = (IntWritable) reader.getValueClass().newInstance(); } reader.close(); sLogger.info(pwsimMapping.size() + "," + cnt + " pairs loaded from " + localFile); } } } catch (Exception e) { throw new RuntimeException(e); } }
/**
 * Checks that iterating over an {@code Int2IntFrequencyDistributionOpen} yields every
 * (event, count) pair that was set, by collecting the pairs into a sorted set and comparing them
 * against the expected sequence in ascending order of the left element.
 */
@Test
public void testIterable() {
  Int2IntFrequencyDistribution dist = new Int2IntFrequencyDistributionOpen();

  dist.set(1, 1);
  dist.set(4, 3);
  dist.set(2, 4);
  dist.set(5, 7);
  dist.set(6, 9);
  dist.set(3, 2);

  assertEquals(6, dist.getNumberOfEvents());
  assertEquals(26, dist.getSumOfCounts());

  // Each pair is cloned before being stored (presumably the iterator may reuse its pair object).
  SortedSet<PairOfInts> sorted = new TreeSet<PairOfInts>();
  Iterator<PairOfInts> source = dist.iterator();
  while (source.hasNext()) {
    sorted.add(source.next().clone());
  }
  assertEquals(6, sorted.size());

  // Expected {left, right} values in the order the sorted set should return them.
  int[][] expected = {{1, 1}, {2, 4}, {3, 2}, {4, 3}, {5, 7}, {6, 9}};

  Iterator<PairOfInts> cursor = sorted.iterator();
  for (int[] want : expected) {
    PairOfInts got = cursor.next();
    assertEquals(want[0], got.getLeftElement());
    assertEquals(want[1], got.getRightElement());
  }
}
/** Runs this tool. */ @SuppressWarnings({"static-access"}) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INDEX)); options.addOption( OptionBuilder.withArgName("path") .hasArg() .withDescription("output path") .create(COLLECTION)); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(INDEX) || !cmdline.hasOption(COLLECTION)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(LookupPostingsCompressed.class.getName(), options); ToolRunner.printGenericCommandUsage(System.out); System.exit(-1); } String indexPath = cmdline.getOptionValue(INDEX); String collectionPath = cmdline.getOptionValue(COLLECTION); if (collectionPath.endsWith(".gz")) { System.out.println("gzipped collection is not seekable: use compressed version!"); System.exit(-1); } Configuration config = new Configuration(); FileSystem fs = FileSystem.get(config); MapFile.Reader reader = new MapFile.Reader(new Path(indexPath + "/part-r-00000"), config); FSDataInputStream collection = fs.open(new Path(collectionPath)); BufferedReader d = new BufferedReader(new InputStreamReader(collection)); Text key = new Text(); ArrayListWritable<PairOfInts> postings; BytesWritable bytesValue = new BytesWritable(); System.out.println("Looking up postings for the term \"starcross'd\""); key.set("starcross'd"); reader.get(key, bytesValue); postings = deserializePosting(bytesValue); // ArrayListWritable<PairOfVInts> postings = value; for (PairOfInts pair : postings) { System.out.println(pair); collection.seek(pair.getLeftElement()); System.out.println(d.readLine()); } bytesValue = new 
BytesWritable(); key.set("gold"); reader.get(key, bytesValue); postings = deserializePosting(bytesValue); System.out.println( "Complete postings list for 'gold': (" + postings.size() + ", " + postings + ")"); Int2IntFrequencyDistribution goldHist = new Int2IntFrequencyDistributionEntry(); // postings = value; for (PairOfInts pair : postings) { goldHist.increment(pair.getRightElement()); } System.out.println("histogram of tf values for gold"); for (PairOfInts pair : goldHist) { System.out.println(pair.getLeftElement() + "\t" + pair.getRightElement()); } bytesValue = new BytesWritable(); key.set("silver"); reader.get(key, bytesValue); postings = deserializePosting(bytesValue); System.out.println( "Complete postings list for 'silver': (" + postings.size() + ", " + postings + ")"); Int2IntFrequencyDistribution silverHist = new Int2IntFrequencyDistributionEntry(); // postings = value; for (PairOfInts pair : postings) { silverHist.increment(pair.getRightElement()); } System.out.println("histogram of tf values for silver"); for (PairOfInts pair : silverHist) { System.out.println(pair.getLeftElement() + "\t" + pair.getRightElement()); } bytesValue = new BytesWritable(); key.set("bronze"); Writable w = reader.get(key, bytesValue); if (w == null) { System.out.println("the term bronze does not appear in the collection"); } collection.close(); reader.close(); return 0; }