/**
     * Turns one aligned parallel sentence into a {@link PhrasePair} and emits it under the
     * sentence's id.
     *
     * <p>Records that lack a chunk for the configured source or target language, or whose source or
     * target chunk is longer than 200, are counted and dropped without emitting anything.
     *
     * @param key a single sentence in both languages plus its alignment, in XML format
     * @param value ignored
     * @param oc collector that receives the (sentence id, phrase pair) record
     * @param reporter used to increment the {@code BitextCompilerCounters} counters
     * @throws IOException declared by the mapper contract; may be raised when collecting output
     */
    public void map(Text key, Text value, OutputCollector<Text, PhrasePair> oc, Reporter reporter)
        throws IOException {

      // Everything we need is in the key; the value is never read.
      final ParallelChunk parallel = pcr.parseString(key.toString());
      ok.set(parallel.idString());

      // A Chunk is the sentence as an array of tokens, without any special tokenization
      // (tokens are just separated by spaces).
      final Chunk sourceChunk = parallel.getChunk(src);
      final Chunk targetChunk = parallel.getChunk(tgt);
      if (sourceChunk == null || targetChunk == null) {
        reporter.incrCounter(BitextCompilerCounters.WRONG_LANGUAGE, 1);
        return;
      }

      // Sentences longer than this on either side are skipped.
      final int maxLength = 200;
      if (sourceChunk.getLength() > maxLength) {
        reporter.incrCounter(BitextCompilerCounters.SRC_TOO_LONG, 1);
        return;
      }
      if (targetChunk.getLength() > maxLength) {
        reporter.incrCounter(BitextCompilerCounters.TGT_TOO_LONG, 1);
        return;
      }

      // Map each preprocessed word to its integer id. The target side goes through vocE and the
      // source side through vocF; the target sentence is converted first.
      sLogger.debug("Target sentence:");
      final int[] targetIds =
          convertStrings(tawp.preprocessWordsForAlignment(targetChunk.getWords()), vocE);
      sLogger.debug("Source sentence:");
      final int[] sourceIds =
          convertStrings(sawp.preprocessWordsForAlignment(sourceChunk.getWords()), vocF);

      // Wrap each whole sentence as a single phrase.
      final Phrase targetPhrase = new Phrase(targetIds, 0);
      final Phrase sourcePhrase = new Phrase(sourceIds, 1);

      final PhrasePair pair = new PhrasePair(sourcePhrase, targetPhrase);

      // Attach the reference alignment only when the record carries one for this language pair.
      final ReferenceAlignment alignment = parallel.getReferenceAlignment(lp);
      if (alignment != null) {
        pair.setAlignment(alignment);
      }

      reporter.incrCounter(BitextCompilerCounters.EN_WORDS, targetPhrase.getWords().length);
      reporter.incrCounter(BitextCompilerCounters.FR_WORDS, sourcePhrase.getWords().length);
      reporter.incrCounter(BitextCompilerCounters.CHUNKS, 1);
      oc.collect(ok, pair);
    }
    /**
     * Initializes this mapper's state from the job configuration.
     *
     * <p>Reads the source and target language codes ({@code SRC_LANG}, {@code TGT_LANG}), makes
     * sure both vocabularies exist, derives the language pair, and creates the source and target
     * word preprocessors. When {@code ha.trunc.use} is true (the default) the preprocessors are
     * created for the language pair and the respective language; otherwise they are created with
     * {@code null} for both.
     *
     * @param job the job configuration; also stored in {@code job_}
     * @throws RuntimeException if a vocabulary server client cannot be created (only possible when
     *     the vocabulary server path is enabled)
     */
    public void configure(JobConf job) {
      // Logging for this class is switched off here, so the debug calls are not expected to print.
      sLogger.setLevel(Level.OFF);
      src = Language.languageForISO639_1(job.get(SRC_LANG));
      tgt = Language.languageForISO639_1(job.get(TGT_LANG));
      sLogger.debug("Source language: " + src.code());
      sLogger.debug("Target language: " + tgt.code());

      // Hard-wired switch: local in-memory vocabularies are used unless this is flipped.
      boolean useVocabServer = false;
      if (!useVocabServer) {
        // Keep any vocabulary that was already set; only create the missing ones.
        if (vocE == null) {
          vocE = new VocabularyWritable();
        }
        if (vocF == null) {
          vocF = new VocabularyWritable();
        }
      } else {
        String host = job.get("ha.vocabserver.host");
        try {
          vocE = new VocabServerClient(host, Integer.parseInt(job.get("ha.vocabserver.port1")));
          vocF = new VocabServerClient(host, Integer.parseInt(job.get("ha.vocabserver.port2")));
        } catch (IOException e) {
          // Rethrow once with the cause attached; printing the stack trace here as well would
          // report the same failure twice.
          throw new RuntimeException("Failed to create vocabulary server client for host " + host, e);
        }
      }
      lp = LanguagePair.languageForISO639_1Pair(src.code() + "-" + tgt.code());

      if (job.getBoolean("ha.trunc.use", true)) {
        sawp = AlignmentWordPreprocessor.CreatePreprocessor(lp, src, job);
        tawp = AlignmentWordPreprocessor.CreatePreprocessor(lp, tgt, job);
      } else {
        sawp = AlignmentWordPreprocessor.CreatePreprocessor(null, null, job);
        tawp = AlignmentWordPreprocessor.CreatePreprocessor(null, null, job);
      }
      job_ = job;
    }