public void map(Text key, Text value, OutputCollector<Text, PhrasePair> oc, Reporter reporter) throws IOException { // key: a single sentence in both languages and alignment // ignore value. each key is parallel sentence and its alignment, in xml format ParallelChunk c = pcr.parseString(key.toString()); ok.set(c.idString()); // Chunk is an array of tokens in the sentence, without any special tokenization (just // separated by spaces) Chunk fc = c.getChunk(src); Chunk ec = c.getChunk(tgt); if (fc == null || ec == null) { reporter.incrCounter(BitextCompilerCounters.WRONG_LANGUAGE, 1); return; } if (fc.getLength() > 200) { reporter.incrCounter(BitextCompilerCounters.SRC_TOO_LONG, 1); return; } if (ec.getLength() > 200) { reporter.incrCounter(BitextCompilerCounters.TGT_TOO_LONG, 1); return; } // ec,fc: English/French sentence represented as sequence of words // vocE,vocF: vocabularies for english and french, of type VocabularyWritable // ee,fe: integer representation of words in sentences ec and fc sLogger.debug("Target sentence:"); int[] ee = convertStrings(tawp.preprocessWordsForAlignment(ec.getWords()), vocE); sLogger.debug("Source sentence:"); int[] fe = convertStrings(sawp.preprocessWordsForAlignment(fc.getWords()), vocF); // e,f: phrase from whole sentence Phrase e = new Phrase(ee, 0); Phrase f = new Phrase(fe, 1); edu.umd.hooka.PhrasePair b = new PhrasePair(f, e); ReferenceAlignment ra = c.getReferenceAlignment(lp); if (ra != null) { b.setAlignment(ra); } reporter.incrCounter(BitextCompilerCounters.EN_WORDS, e.getWords().length); reporter.incrCounter(BitextCompilerCounters.FR_WORDS, f.getWords().length); reporter.incrCounter(BitextCompilerCounters.CHUNKS, 1); oc.collect(ok, b); }
/**
 * Initializes this task's state from the job configuration.
 *
 * <p>Reads the source and target languages from {@code SRC_LANG} / {@code TGT_LANG}, sets up
 * the two vocabularies, builds the language pair, and creates the source/target word
 * preprocessors. When {@code ha.trunc.use} is true (the default) the preprocessors are created
 * for the language pair and the respective language; otherwise they are created with
 * {@code null} for both.
 *
 * @param job job configuration; also stored in {@code job_}
 * @throws RuntimeException if a vocabulary server client cannot be created
 */
public void configure(JobConf job) {
  // Logging is switched off for this logger, which suppresses the debug messages below.
  sLogger.setLevel(Level.OFF);

  // NOTE(review): no check that SRC_LANG / TGT_LANG are set or resolve to a language;
  // src.code() / tgt.code() will fail if the lookup yields null -- confirm with callers.
  src = Language.languageForISO639_1(job.get(SRC_LANG));
  tgt = Language.languageForISO639_1(job.get(TGT_LANG));
  sLogger.debug("Source language: " + src.code());
  sLogger.debug("Target language: " + tgt.code());

  // Hard-wired switch: the vocabulary-server branch is currently never taken.
  boolean useVocabServer = false;
  if (!useVocabServer) {
    // Keep vocabularies that already exist; only create the missing ones.
    if (vocE == null) {
      vocE = new VocabularyWritable();
    }
    if (vocF == null) {
      vocF = new VocabularyWritable();
    }
  } else {
    try {
      vocE = new VocabServerClient(
          job.get("ha.vocabserver.host"),
          Integer.parseInt(job.get("ha.vocabserver.port1")));
      vocF = new VocabServerClient(
          job.get("ha.vocabserver.host"),
          Integer.parseInt(job.get("ha.vocabserver.port2")));
    } catch (IOException e) {
      // Rethrow with the cause attached instead of also dumping the stack trace to stderr;
      // the failure is reported once, by whoever handles the exception.
      throw new RuntimeException(
          "Failed to create vocabulary server client for host "
              + job.get("ha.vocabserver.host"),
          e);
    }
  }

  lp = LanguagePair.languageForISO639_1Pair(src.code() + "-" + tgt.code());

  if (job.getBoolean("ha.trunc.use", true)) {
    sawp = AlignmentWordPreprocessor.CreatePreprocessor(lp, src, job);
    tawp = AlignmentWordPreprocessor.CreatePreprocessor(lp, tgt, job);
  } else {
    sawp = AlignmentWordPreprocessor.CreatePreprocessor(null, null, job);
    tawp = AlignmentWordPreprocessor.CreatePreprocessor(null, null, job);
  }

  job_ = job;
}