/**
 * Joins the titles received for one document-number pair and emits them as a
 * (foreign title, English title) record.
 *
 * <p>Each incoming value is a (language id, title) pair. A record is written only when
 * exactly one title tagged {@code CLIRUtils.E} and exactly one title tagged
 * {@code CLIRUtils.F} were received. Counting the two languages separately (rather than
 * counting values overall) prevents two titles of the same language from being mistaken
 * for a complete pair, which would emit an empty title for the missing side.
 *
 * @param docnoPair the document-number pair that keys this group of titles
 * @param titles    the (language id, title) values grouped under {@code docnoPair}
 * @param output    collector receiving the (foreign title, English title) record
 * @param reporter  Hadoop progress reporter (not used by this method)
 * @throws IOException      if the collector fails to write the record
 * @throws RuntimeException if a value carries a language id other than
 *                          {@code CLIRUtils.E} or {@code CLIRUtils.F}
 */
@Override
public void reduce(
    PairOfInts docnoPair,
    Iterator<PairOfIntString> titles,
    OutputCollector<Text, Text> output,
    Reporter reporter) throws IOException {
  // The title holders are reused across calls, so reset them before reading this group.
  eTitle.clear();
  fTitle.clear();
  sLogger.info(docnoPair);

  int eCount = 0;
  int fCount = 0;
  while (titles.hasNext()) {
    PairOfIntString title = titles.next();
    sLogger.info(title);

    int langID = title.getLeftElement();
    if (langID == CLIRUtils.E) {
      eTitle.set(title.getRightElement());
      eCount++;
    } else if (langID == CLIRUtils.F) {
      fTitle.set(title.getRightElement());
      fCount++;
    } else {
      throw new RuntimeException("Unknown language ID: " + langID);
    }
  }

  // Emit only for a genuine pair: one title per language, no duplicates on either side.
  if (eCount == 1 && fCount == 1) {
    output.collect(fTitle, eTitle);
  } else {
    sLogger.info("Incomplete data for " + docnoPair + ":" + fTitle + "," + eTitle);
  }
}
public void map( IntWritable docnoKey, WikipediaPage p, OutputCollector<PairOfInts, PairOfIntString> output, Reporter reporter) throws IOException { int docno = docnoKey.get(); String title = p.getTitle(); String lang = p.getLanguage(); int langID = lang.equals(srcLang) ? CLIRUtils.F : CLIRUtils.E; if (langID == CLIRUtils.F) { docno += 1000000000; if (samplesMap != null && !samplesMap.containsKey(docno)) { return; } } // we only load the mapping once, during the first map() call of a mapper. // this works b/c all input kv pairs of a given mapper will have same lang id (reason // explained above) if (pwsimMapping.isEmpty()) { loadPairs(pwsimMapping, langID, mJob, reporter); sLogger.info("Mapping loaded: " + pwsimMapping.size()); } // if no similar docs for docno, return if (pwsimMapping.containsKey(docno)) { similarDocnos = pwsimMapping.get(docno); } else { return; } for (int similarDocno : similarDocnos) { if (langID == CLIRUtils.E) { if (samplesMap != null && !samplesMap.containsKey(similarDocno)) { continue; } keyOut.set(similarDocno, docno); } else { keyOut.set(docno, similarDocno); } valOut.set(langID, title); output.collect(keyOut, valOut); } }