Example #1
0
    /**
     * Joins the language-tagged titles received for one document pair and emits them as a
     * single (fTitle, eTitle) record.
     *
     * <p>A record is emitted only when exactly one title tagged {@code CLIRUtils.E} and exactly
     * one title tagged {@code CLIRUtils.F} were received. Counting the two languages separately
     * matters: a single combined counter would also reach 2 for two titles of the same language,
     * in which case the other title would still be the empty value left by {@code clear()}.
     * Any other combination is logged as incomplete and nothing is emitted.
     *
     * @param docnoPair key of this reduce group; used here only for logging
     * @param titles (language ID, title) values grouped under {@code docnoPair}
     * @param output collector that receives the (fTitle, eTitle) record
     * @param reporter not used by this method
     * @throws IOException propagated from {@code output.collect}
     * @throws RuntimeException if a value carries a language ID other than
     *     {@code CLIRUtils.E} or {@code CLIRUtils.F}
     */
    @Override
    public void reduce(
        PairOfInts docnoPair,
        Iterator<PairOfIntString> titles,
        OutputCollector<Text, Text> output,
        Reporter reporter)
        throws IOException {
      // eTitle/fTitle are reused across calls, so reset them before reading this group.
      eTitle.clear();
      fTitle.clear();
      sLogger.info(docnoPair);

      // Track each language separately so that two same-language titles are not
      // mistaken for a complete pair.
      int eCount = 0;
      int fCount = 0;
      while (titles.hasNext()) {
        PairOfIntString title = titles.next();
        sLogger.info(title);
        if (title.getLeftElement() == CLIRUtils.E) {
          eTitle.set(title.getRightElement());
          eCount++;
        } else if (title.getLeftElement() == CLIRUtils.F) {
          fTitle.set(title.getRightElement());
          fCount++;
        } else {
          throw new RuntimeException("Unknown language ID: " + title.getLeftElement());
        }
      }

      if (eCount == 1 && fCount == 1) {
        output.collect(fTitle, eTitle);
      } else {
        sLogger.info("Incomplete data for " + docnoPair + ":" + fTitle + "," + eTitle);
      }
    }
Example #2
0
    /**
     * Emits one (document pair, (language ID, title)) record for every document listed as
     * similar to the input page.
     *
     * <p>The page is tagged {@code CLIRUtils.F} when its language equals {@code srcLang} and
     * {@code CLIRUtils.E} otherwise. F-side docnos are shifted by 1000000000 before any lookup.
     * When {@code samplesMap} is non-null it acts as a filter on the F-side docno: an F page
     * whose shifted docno is absent is dropped entirely, and for an E page each similar docno
     * absent from the map is skipped. The output key always carries the F-side docno first.
     *
     * @param docnoKey docno of the input page
     * @param p page supplying the title and language
     * @param output collector receiving the (keyOut, valOut) records
     * @param reporter passed through to {@code loadPairs}
     * @throws IOException declared by the method; the visible calls that could raise it are
     *     {@code loadPairs} and {@code output.collect}
     */
    public void map(
        IntWritable docnoKey,
        WikipediaPage p,
        OutputCollector<PairOfInts, PairOfIntString> output,
        Reporter reporter)
        throws IOException {
      int docno = docnoKey.get();
      String title = p.getTitle();
      String lang = p.getLanguage();

      int langID;
      if (lang.equals(srcLang)) {
        langID = CLIRUtils.F;
      } else {
        langID = CLIRUtils.E;
      }

      if (langID == CLIRUtils.F) {
        docno = docno + 1000000000;
        boolean filteredOut = samplesMap != null && !samplesMap.containsKey(docno);
        if (filteredOut) {
          return;
        }
      }

      // Load the mapping lazily on the first call that reaches this point. Per the original
      // author's note, this is sufficient because every input record seen by a given mapper
      // has the same language ID.
      if (pwsimMapping.isEmpty()) {
        loadPairs(pwsimMapping, langID, mJob, reporter);
        sLogger.info("Mapping loaded: " + pwsimMapping.size());
      }

      // Nothing to emit when this docno has no similar documents.
      if (!pwsimMapping.containsKey(docno)) {
        return;
      }
      similarDocnos = pwsimMapping.get(docno);

      for (int candidate : similarDocnos) {
        if (langID != CLIRUtils.E) {
          keyOut.set(docno, candidate);
        } else if (samplesMap == null || samplesMap.containsKey(candidate)) {
          keyOut.set(candidate, docno);
        } else {
          // E page whose similar docno is not part of the sample: skip it.
          continue;
        }
        valOut.set(langID, title);
        output.collect(keyOut, valOut);
      }
    }