Java AlignmentWordPreprocessor примеры использования

Язык программирования: Java

Примеров на hotexamples.com: 2

Java AlignmentWordPreprocessor — найдено 2 примера. Это лучшие примеры Java-кода для AlignmentWordPreprocessor, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество подборки.

Основные методы

Показать / Скрыть

CreatePreprocessor(1)

preprocessWordsForAlignment(1)

Пример #1

Показать файл

Файл: CorpusVocabNormalizerAndNumberizer.java Проект: rahulbhawsar/Cloud9

    /**
     * Converts one parallel sentence pair into a numberized {@code PhrasePair} and emits it.
     *
     * <p>The key carries a single sentence in both languages together with its alignment
     * (XML format); the value is ignored. Pairs that lack a chunk for either language, or whose
     * source or target chunk is longer than 200 tokens, are counted and dropped.
     *
     * @param key parallel sentence (both languages plus alignment) to parse
     * @param value unused
     * @param oc collector receiving the chunk id and the resulting phrase pair
     * @param reporter used to increment the {@code BitextCompilerCounters} counters
     * @throws IOException declared by the mapper contract; may be raised while collecting output
     */
    public void map(Text key, Text value, OutputCollector<Text, PhrasePair> oc, Reporter reporter)
        throws IOException {
      // Sentences longer than this many tokens (on either side) are skipped.
      final int maxSentenceLength = 200;

      ParallelChunk parallel = pcr.parseString(key.toString());
      ok.set(parallel.idString());

      // A chunk is the token array of a sentence, split on spaces only (no special tokenization).
      Chunk sourceChunk = parallel.getChunk(src);
      Chunk targetChunk = parallel.getChunk(tgt);
      if (sourceChunk == null || targetChunk == null) {
        reporter.incrCounter(BitextCompilerCounters.WRONG_LANGUAGE, 1);
        return;
      }
      if (sourceChunk.getLength() > maxSentenceLength) {
        reporter.incrCounter(BitextCompilerCounters.SRC_TOO_LONG, 1);
        return;
      }
      if (targetChunk.getLength() > maxSentenceLength) {
        reporter.incrCounter(BitextCompilerCounters.TGT_TOO_LONG, 1);
        return;
      }

      // Preprocess the words of each side and map them to integer ids through the
      // per-language vocabularies (vocE for the target side, vocF for the source side).
      sLogger.debug("Target sentence:");
      String[] targetWords = tawp.preprocessWordsForAlignment(targetChunk.getWords());
      int[] targetIds = convertStrings(targetWords, vocE);
      sLogger.debug("Source sentence:");
      String[] sourceWords = sawp.preprocessWordsForAlignment(sourceChunk.getWords());
      int[] sourceIds = convertStrings(sourceWords, vocF);

      // Each whole sentence becomes a single phrase.
      Phrase targetPhrase = new Phrase(targetIds, 0);
      Phrase sourcePhrase = new Phrase(sourceIds, 1);

      PhrasePair pair = new PhrasePair(sourcePhrase, targetPhrase);
      ReferenceAlignment reference = parallel.getReferenceAlignment(lp);
      if (reference != null) {
        pair.setAlignment(reference);
      }

      reporter.incrCounter(BitextCompilerCounters.EN_WORDS, targetPhrase.getWords().length);
      reporter.incrCounter(BitextCompilerCounters.FR_WORDS, sourcePhrase.getWords().length);
      reporter.incrCounter(BitextCompilerCounters.CHUNKS, 1);
      oc.collect(ok, pair);
    }

Пример #2

Показать файл

Файл: CorpusVocabNormalizerAndNumberizer.java Проект: rahulbhawsar/Cloud9

    /**
     * Initializes the mapper state from the job configuration.
     *
     * <p>Reads the source and target language codes, prepares the two vocabularies, builds the
     * language pair, and creates one word preprocessor per side. When {@code ha.trunc.use}
     * (default {@code true}) is false, the preprocessors are created without language
     * information.
     *
     * @param job job configuration supplying the language codes and preprocessing options
     * @throws RuntimeException if connecting to a vocabulary server fails (only reachable when
     *     the vocabulary server is enabled)
     */
    public void configure(JobConf job) {
      // Logging is switched off here, so the debug statements in this class produce no output.
      sLogger.setLevel(Level.OFF);
      src = Language.languageForISO639_1(job.get(SRC_LANG));
      tgt = Language.languageForISO639_1(job.get(TGT_LANG));
      sLogger.debug("Source language: " + src.code());
      sLogger.debug("Target language: " + tgt.code());

      // Hard-coded to false: the vocabulary-server branch below is currently never taken.
      boolean useVocabServer = false;
      if (!useVocabServer) {
        // Keep vocabularies that were already set; only create missing ones.
        if (vocE == null) {
          vocE = new VocabularyWritable();
        }
        if (vocF == null) {
          vocF = new VocabularyWritable();
        }
      } else {
        try {
          vocE =
              new VocabServerClient(
                  job.get("ha.vocabserver.host"),
                  Integer.parseInt(job.get("ha.vocabserver.port1")));
          vocF =
              new VocabServerClient(
                  job.get("ha.vocabserver.host"),
                  Integer.parseInt(job.get("ha.vocabserver.port2")));
        } catch (IOException e) {
          // Rethrow with context and the original cause; printing the stack trace here as well
          // would only report the same failure twice.
          throw new RuntimeException(
              "Unable to connect to vocabulary server at host "
                  + job.get("ha.vocabserver.host"),
              e);
        }
      }
      lp = LanguagePair.languageForISO639_1Pair(src.code() + "-" + tgt.code());

      if (job.getBoolean("ha.trunc.use", true)) {
        sawp = AlignmentWordPreprocessor.CreatePreprocessor(lp, src, job);
        tawp = AlignmentWordPreprocessor.CreatePreprocessor(lp, tgt, job);
      } else {
        sawp = AlignmentWordPreprocessor.CreatePreprocessor(null, null, job);
        tawp = AlignmentWordPreprocessor.CreatePreprocessor(null, null, job);
      }
      job_ = job;
    }