Beispiel #1
0
 /**
  * Deserializes this object, restoring the page from its binary form.
  *
  * <p>Reads, in order: a variable-length int giving a byte count, that many bytes (decoded as
  * UTF-8 and passed to {@link WikipediaPage#readPage(WikipediaPage, String)}), and finally the
  * language via {@link DataInput#readUTF()}.
  *
  * @param in input to read the serialized page from
  * @throws IOException if the input cannot be read, or if it declares a negative byte count
  */
 public void readFields(DataInput in) throws IOException {
   int length = WritableUtils.readVInt(in);
   if (length < 0) {
     // A negative value is not a valid byte count; report it as an I/O problem instead of
     // letting the array allocation below fail with NegativeArraySizeException.
     throw new IOException("Invalid serialized page length: " + length);
   }
   byte[] bytes = new byte[length];
   in.readFully(bytes, 0, length);
   WikipediaPage.readPage(this, new String(bytes, "UTF-8"));
   language = in.readUTF();
 }
Beispiel #2
0
    /**
     * Emits one record per document that the pairwise-similarity mapping lists for the incoming
     * page. The output key is a docno pair and the value carries this page's language id and
     * title.
     *
     * <p>A page whose language equals {@code srcLang} is treated as {@code CLIRUtils.F} and its
     * docno is shifted by 1000000000; every other page is treated as {@code CLIRUtils.E}. When
     * {@code samplesMap} is non-null, the F-side docno of each pair must be one of its keys or
     * the pair is skipped.
     *
     * @param docnoKey docno of the incoming page
     * @param p the page; only its title and language are read here
     * @param output collector receiving the (docno pair, (language id, title)) records
     * @param reporter passed through to {@code loadPairs} when the mapping is loaded
     * @throws IOException declared by the mapper contract; NOTE(review): presumably raised by
     *     {@code loadPairs} or {@code output.collect} — confirm
     */
    public void map(
        IntWritable docnoKey,
        WikipediaPage p,
        OutputCollector<PairOfInts, PairOfIntString> output,
        Reporter reporter)
        throws IOException {
      int docno = docnoKey.get();
      final String title = p.getTitle();
      final int langID = p.getLanguage().equals(srcLang) ? CLIRUtils.F : CLIRUtils.E;

      if (langID == CLIRUtils.F) {
        // F-side docnos are offset by this constant before any lookup.
        docno += 1000000000;
        final boolean sampled = (samplesMap == null) || samplesMap.containsKey(docno);
        if (!sampled) {
          return;
        }
      }

      // The mapping is loaded lazily, on the first map() call that finds it empty. Per the
      // original author's note, this relies on all input pairs of one mapper sharing the same
      // language id (the rationale is documented outside this method).
      if (pwsimMapping.isEmpty()) {
        loadPairs(pwsimMapping, langID, mJob, reporter);
        sLogger.info("Mapping loaded: " + pwsimMapping.size());
      }

      // Nothing to emit when the mapping has no entry for this docno.
      if (!pwsimMapping.containsKey(docno)) {
        return;
      }
      similarDocnos = pwsimMapping.get(docno);

      final boolean incomingIsE = (langID == CLIRUtils.E);
      for (int candidate : similarDocnos) {
        if (!incomingIsE) {
          keyOut.set(docno, candidate);
        } else if (samplesMap == null || samplesMap.containsKey(candidate)) {
          // For an E page the similar docno goes first in the key.
          keyOut.set(candidate, docno);
        } else {
          // The similar docno is not among the sampled ones: skip this pair.
          continue;
        }
        valOut.set(langID, title);
        output.collect(keyOut, valOut);
      }
    }
Beispiel #3
0
 /**
  * Populates a <code>WikipediaPage</code> from a raw XML string: stores the string in the
  * page's <code>page</code> field and then calls <code>processPage</code> on it.
  *
  * @param page the <code>WikipediaPage</code> object to populate
  * @param s raw XML string
  */
 public static void readPage(WikipediaPage page, String s) {
   final String rawXml = s;
   // Store the raw text before processing; processPage may rely on the field being set.
   page.page = rawXml;
   page.processPage(rawXml);
 }