/**
 * Deserializes this object from the given input.
 *
 * <p>The expected layout is: a variable-length int giving the byte count, that many bytes holding
 * the page content encoded as UTF-8, and finally the language written in the format consumed by
 * {@code DataInput.readUTF()}. The decoded page string is handed to
 * {@code WikipediaPage.readPage} with this object as the target.
 *
 * @param in the input to read the serialized fields from
 * @throws IOException if reading from {@code in} fails, or if the encoded byte count is negative
 *     (which indicates corrupt or misaligned input)
 */
public void readFields(DataInput in) throws IOException {
  int length = WritableUtils.readVInt(in);
  // A negative count can only come from corrupt input; report it as an I/O problem rather than
  // letting the array allocation below fail with an unchecked NegativeArraySizeException.
  if (length < 0) {
    throw new IOException("Invalid serialized page length: " + length);
  }

  byte[] bytes = new byte[length];
  in.readFully(bytes, 0, length);
  WikipediaPage.readPage(this, new String(bytes, "UTF-8"));

  language = in.readUTF();
}
public void map( IntWritable docnoKey, WikipediaPage p, OutputCollector<PairOfInts, PairOfIntString> output, Reporter reporter) throws IOException { int docno = docnoKey.get(); String title = p.getTitle(); String lang = p.getLanguage(); int langID = lang.equals(srcLang) ? CLIRUtils.F : CLIRUtils.E; if (langID == CLIRUtils.F) { docno += 1000000000; if (samplesMap != null && !samplesMap.containsKey(docno)) { return; } } // we only load the mapping once, during the first map() call of a mapper. // this works b/c all input kv pairs of a given mapper will have same lang id (reason // explained above) if (pwsimMapping.isEmpty()) { loadPairs(pwsimMapping, langID, mJob, reporter); sLogger.info("Mapping loaded: " + pwsimMapping.size()); } // if no similar docs for docno, return if (pwsimMapping.containsKey(docno)) { similarDocnos = pwsimMapping.get(docno); } else { return; } for (int similarDocno : similarDocnos) { if (langID == CLIRUtils.E) { if (samplesMap != null && !samplesMap.containsKey(similarDocno)) { continue; } keyOut.set(similarDocno, docno); } else { keyOut.set(docno, similarDocno); } valOut.set(langID, title); output.collect(keyOut, valOut); } }
/**
 * Populates a <code>WikipediaPage</code> from a raw XML string.
 *
 * <p>The string is stored in the page's <code>page</code> field and then passed to the page's
 * <code>processPage</code> method.
 *
 * @param page the <code>WikipediaPage</code> object to populate
 * @param s the raw XML string
 */
public static void readPage(WikipediaPage page, String s) {
  // Store the raw text first, then let the page process it.
  page.page = s;
  page.processPage(s);
}