private static void loadPairs( HMapIV<ArrayListOfIntsWritable> pwsimMapping, int langID, JobConf job, Reporter reporter) { try { Path[] localFiles = DistributedCache.getLocalCacheFiles(job); String pwsimFile = job.get("PwsimPairs"); for (Path localFile : localFiles) { if (localFile.toString().contains(getFilename(pwsimFile))) { SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.getLocal(job), localFile, job); PairOfInts key = (PairOfInts) reader.getKeyClass().newInstance(); IntWritable value = (IntWritable) reader.getValueClass().newInstance(); int cnt = 0; while (reader.next(key, value)) { int fDocno = key.getRightElement(); int eDocno = key.getLeftElement(); if ((eDocno == 6127 && fDocno == 1000000074) || (eDocno == 6127 && fDocno == 1000000071)) { sLogger.info(key); } if (langID == CLIRUtils.E) { if (!pwsimMapping.containsKey(eDocno)) { pwsimMapping.put(eDocno, new ArrayListOfIntsWritable()); } pwsimMapping .get(eDocno) .add( fDocno); // we add 1000000000 to foreign docnos to distinguish them during // pwsim algo } else { if (!pwsimMapping.containsKey(fDocno)) { pwsimMapping.put(fDocno, new ArrayListOfIntsWritable()); } pwsimMapping .get(fDocno) .add( eDocno); // we add 1000000000 to foreign docnos to distinguish them during // pwsim algo } cnt++; key = (PairOfInts) reader.getKeyClass().newInstance(); value = (IntWritable) reader.getValueClass().newInstance(); } reader.close(); sLogger.info(pwsimMapping.size() + "," + cnt + " pairs loaded from " + localFile); } } } catch (Exception e) { throw new RuntimeException(e); } }
public void map( IntWritable docnoKey, WikipediaPage p, OutputCollector<PairOfInts, PairOfIntString> output, Reporter reporter) throws IOException { int docno = docnoKey.get(); String title = p.getTitle(); String lang = p.getLanguage(); int langID = lang.equals(srcLang) ? CLIRUtils.F : CLIRUtils.E; if (langID == CLIRUtils.F) { docno += 1000000000; if (samplesMap != null && !samplesMap.containsKey(docno)) { return; } } // we only load the mapping once, during the first map() call of a mapper. // this works b/c all input kv pairs of a given mapper will have same lang id (reason // explained above) if (pwsimMapping.isEmpty()) { loadPairs(pwsimMapping, langID, mJob, reporter); sLogger.info("Mapping loaded: " + pwsimMapping.size()); } // if no similar docs for docno, return if (pwsimMapping.containsKey(docno)) { similarDocnos = pwsimMapping.get(docno); } else { return; } for (int similarDocno : similarDocnos) { if (langID == CLIRUtils.E) { if (samplesMap != null && !samplesMap.containsKey(similarDocno)) { continue; } keyOut.set(similarDocno, docno); } else { keyOut.set(docno, similarDocno); } valOut.set(langID, title); output.collect(keyOut, valOut); } }
public void map( ArrayListOfLongsWritable key, ArrayListWritable<IntWritable> sentenceList, OutputCollector<PairOfInts, IntWritable> output, Reporter reporter) throws IOException { IntWritable s1; IntWritable s2; int s1line; int s2line; for (int i = 0; i < sentenceList.size() && i < MAXSIZE; i++) { s1 = sentenceList.get(i); s1line = s1.get() / nSamples; for (int j = i + 1; j < sentenceList.size() && j < MAXSIZE; j++) { s2 = sentenceList.get(j); s2line = s2.get() / nSamples; pairOut = new PairOfInts(); if (s1line == s2line) { continue; } /* if(((s1.get()/nSamples)%2 == 0 && (s2.get()/nSamples)%2 == 0) || ((s1.get()/nSamples)%2 != 0 && (s2.get()/nSamples)%2 != 0)){ continue; } */ if (s1.get() < s2.get()) { // pairOut.set(s1.get(), s2.get()); pairOut.set(s1line, s2line); } else { // pairOut.set(s2.get(), s1.get()); pairOut.set(s2line, s1line); } output.collect(pairOut, ONE); } } }
/** Runs this tool. */ @SuppressWarnings({"static-access"}) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INDEX)); options.addOption( OptionBuilder.withArgName("path") .hasArg() .withDescription("output path") .create(COLLECTION)); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(INDEX) || !cmdline.hasOption(COLLECTION)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(LookupPostingsCompressed.class.getName(), options); ToolRunner.printGenericCommandUsage(System.out); System.exit(-1); } String indexPath = cmdline.getOptionValue(INDEX); String collectionPath = cmdline.getOptionValue(COLLECTION); if (collectionPath.endsWith(".gz")) { System.out.println("gzipped collection is not seekable: use compressed version!"); System.exit(-1); } Configuration config = new Configuration(); FileSystem fs = FileSystem.get(config); MapFile.Reader reader = new MapFile.Reader(new Path(indexPath + "/part-r-00000"), config); FSDataInputStream collection = fs.open(new Path(collectionPath)); BufferedReader d = new BufferedReader(new InputStreamReader(collection)); Text key = new Text(); ArrayListWritable<PairOfInts> postings; BytesWritable bytesValue = new BytesWritable(); System.out.println("Looking up postings for the term \"starcross'd\""); key.set("starcross'd"); reader.get(key, bytesValue); postings = deserializePosting(bytesValue); // ArrayListWritable<PairOfVInts> postings = value; for (PairOfInts pair : postings) { System.out.println(pair); collection.seek(pair.getLeftElement()); System.out.println(d.readLine()); } bytesValue = new 
BytesWritable(); key.set("gold"); reader.get(key, bytesValue); postings = deserializePosting(bytesValue); System.out.println( "Complete postings list for 'gold': (" + postings.size() + ", " + postings + ")"); Int2IntFrequencyDistribution goldHist = new Int2IntFrequencyDistributionEntry(); // postings = value; for (PairOfInts pair : postings) { goldHist.increment(pair.getRightElement()); } System.out.println("histogram of tf values for gold"); for (PairOfInts pair : goldHist) { System.out.println(pair.getLeftElement() + "\t" + pair.getRightElement()); } bytesValue = new BytesWritable(); key.set("silver"); reader.get(key, bytesValue); postings = deserializePosting(bytesValue); System.out.println( "Complete postings list for 'silver': (" + postings.size() + ", " + postings + ")"); Int2IntFrequencyDistribution silverHist = new Int2IntFrequencyDistributionEntry(); // postings = value; for (PairOfInts pair : postings) { silverHist.increment(pair.getRightElement()); } System.out.println("histogram of tf values for silver"); for (PairOfInts pair : silverHist) { System.out.println(pair.getLeftElement() + "\t" + pair.getRightElement()); } bytesValue = new BytesWritable(); key.set("bronze"); Writable w = reader.get(key, bytesValue); if (w == null) { System.out.println("the term bronze does not appear in the collection"); } collection.close(); reader.close(); return 0; }
/**
 * Checks that iterating over a frequency distribution yields every (event, count) pair that
 * was set, by collecting clones of the pairs into a sorted set and comparing them in order.
 */
@Test
public void testIterable() {
  Int2IntFrequencyDistribution fd = new Int2IntFrequencyDistributionOpen();

  fd.set(1, 1);
  fd.set(4, 3);
  fd.set(2, 4);
  fd.set(5, 7);
  fd.set(6, 9);
  fd.set(3, 2);

  assertEquals(6, fd.getNumberOfEvents());
  assertEquals(26, fd.getSumOfCounts());

  // The iterator's pair is cloned before being stored, as in the original test.
  SortedSet<PairOfInts> sorted = new TreeSet<PairOfInts>();
  for (PairOfInts pair : fd) {
    sorted.add(pair.clone());
  }
  assertEquals(6, sorted.size());

  // Expected (event, count) pairs in the sorted set's iteration order.
  int[][] expected = {{1, 1}, {2, 4}, {3, 2}, {4, 3}, {5, 7}, {6, 9}};

  Iterator<PairOfInts> iter = sorted.iterator();
  for (int[] want : expected) {
    PairOfInts got = iter.next();
    assertEquals(want[0], got.getLeftElement());
    assertEquals(want[1], got.getRightElement());
  }
}
@SuppressWarnings("static-access") @Override public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path") .hasArg() .withDescription("output path") .create(matchOutput)); options.addOption( OptionBuilder.withArgName("path") .hasArg() .withDescription("output path") .create(nomatchOutput)); options.addOption( OptionBuilder.withArgName("integer") .hasArg() .withDescription("number of samples") .create(nSamplesOption)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(matchOutput) || !cmdline.hasOption(nomatchOutput) || !cmdline.hasOption(nSamplesOption)) { HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String matchOutputPath = cmdline.getOptionValue(matchOutput); String nomatchOutputPath = cmdline.getOptionValue(nomatchOutput); String nSamplesIn = cmdline.getOptionValue(nSamplesOption); LOG.info("Tool name: " + this.getClass().getName()); // LOG.info(" - input file: " + inputPath); // LOG.info(" - output file: " + outputPath); JobConf conf = new JobConf(getConf(), JaccardCompare.class); conf.setJobName(String.format("JaccardCompare")); // FileInputFormat.setInputPaths(conf, new Path(inputPath)); // FileOutputFormat.setOutputPath(conf, new Path(outputPath)); int nSentences = 1000; int nSamples = Integer.parseInt(nSamplesIn); try { File matchFile = new File(matchOutputPath); File nomatchFile = new File(nomatchOutputPath); FileOutputStream fosM = null, fosNM = null; BufferedWriter dosM = null, dosNM = null; fosM = new FileOutputStream(matchFile); fosNM = new FileOutputStream(nomatchFile); dosM = new BufferedWriter(new OutputStreamWriter(fosM)); dosNM = new 
BufferedWriter(new OutputStreamWriter(fosNM)); MapFile.Reader id2sentenceReader = new MapFile.Reader(new Path("id2sentence.map/part-00000"), conf); HashMap<Integer, ArrayListWritable<Text>> id2sentence = new HashMap<Integer, ArrayListWritable<Text>>(); IntWritable key = new IntWritable(); ArrayListWritable<Text> val = new ArrayListWritable<Text>(); while (id2sentenceReader.next(key, val)) { id2sentence.put(key.get(), val); val = new ArrayListWritable<Text>(); } MapFile.Reader sentence2translationReader = new MapFile.Reader(new Path("sentence2translation.map/part-00000"), conf); HashMap<Integer, ArrayListOfIntsWritable> sentence2translation = new HashMap<Integer, ArrayListOfIntsWritable>(); IntWritable key2 = new IntWritable(); ArrayListOfIntsWritable val2 = new ArrayListOfIntsWritable(); while (sentence2translationReader.next(key2, val2)) { sentence2translation.put(key2.get(), val2); val2 = new ArrayListOfIntsWritable(); } MapFile.Reader sentencematchReader = new MapFile.Reader(new Path("sentencematchpairs.map/part-00000"), conf); HashSet<PairOfInts> sentencematchpairs = new HashSet<PairOfInts>(); PairOfInts key3 = new PairOfInts(); IntWritable val3 = new IntWritable(); while (sentencematchReader.next(key3, val3)) { sentencematchpairs.add(key3); key3 = new PairOfInts(); } System.out.println("Done reading"); PairOfInts p = new PairOfInts(); IntWritable match; IntWritable eLineNum = new IntWritable(); IntWritable eLineId = new IntWritable(); ArrayListWritable<Text> eSentence = new ArrayListWritable<Text>(); for (int i = 0; i < nSentences; i++) { if (i % 100 == 0) System.out.println("eLine " + i); // eLineNum.set(2*i); ArrayListOfIntsWritable transIdList = sentence2translation.get(2 * i); // ArrayListOfIntsWritable transIdList = new ArrayListOfIntsWritable(); // sentence2translationReader.get(eLineNum, transIdList); // System.out.println("transIdList " + transIdList); for (int j = 0; j < nSentences; j++) { // System.out.println("fLine " + j); ArrayListWritable<Text> 
fSentence = id2sentence.get((2 * j + 1) * nSamples); // ArrayListWritable<Text> fSentence = new ArrayListWritable<Text>(); // IntWritable fLineId = new IntWritable(); // fLineId.set((2*j+1)*nSamples); // id2sentenceReader.get(fLineId, fSentence); // System.out.println("fLineId " + (2*j+1)*nSamples + " FSentence " + fSentence); float jsimMax = -1.0f; float jsimAvg = 0.0f; for (int id : transIdList) { eSentence = id2sentence.get(id); // eLineId.set(id); // id2sentenceReader.get(eLineId, eSentence); float jsim = JaccardSim.jaccardSim(eSentence, fSentence); // System.out.println("\teSentence " + eSentence + " " + jsim); jsimAvg += jsim; if (jsim > jsimMax) { jsimMax = jsim; } } jsimAvg = jsimAvg / transIdList.size(); if (2 * i < 2 * j + 1) { p.set(2 * i, 2 * j + 1); } else { p.set(2 * j + 1, 2 * i); } // match = new IntWritable(); // match = (IntWritable) sentencematchReader.get(p, match); // if(match != null){ if (sentencematchpairs.contains(p)) { if (jsimMax < .5) { System.out.println("Low match: "); System.out.println("\teSentence: " + i + " " + eSentence); System.out.println("\tfSentence: " + j + " " + fSentence); } // System.out.println("match"); dosM.write(Float.toString(jsimMax)); // dosM.write(Float.toString(jsimAvg)); dosM.write("\n"); } else { // System.out.println("no match"); dosNM.write(Float.toString(jsimMax)); // dosNM.write(Float.toString(jsimAvg)); dosNM.write("\n"); } } } sentencematchReader.close(); sentence2translationReader.close(); id2sentenceReader.close(); dosM.close(); dosNM.close(); } catch (IOException e2) { // TODO Auto-generated catch block e2.printStackTrace(); } // Delete the output directory if it exists already. // Path outputDir = new Path(outputPath); // FileSystem.get(conf).delete(outputDir, true); // JobClient.runJob(conf); return 0; }