@Override
 public void globalProcessing(Context context) {
   assert abbreviationDictionary != null;
   Set<String> abbrevs = abbreviationDictionary.getAbbrevSet();
   BufferedReader br = null;
   try {
     br =
         new BufferedReader(
             new InputStreamReader(new FileInputStream(context.getTargetFilePath())));
     String strLine;
     int lineCount = 0;
     while ((strLine = br.readLine()) != null) {
       strLine = strLine.trim();
       for (String abbrev : abbrevs) {
         int pos = 0;
         for (String word : strLine.split("\\s+")) {
           if (word.equals(abbrev)) {
             String position = lineCount + "-" + pos;
             position2abbrev.put(position, abbrev);
           }
           pos++;
         }
       }
       lineCount++;
     }
   } catch (Exception e) {
     e.printStackTrace();
   } finally {
     if (br != null) {
       try {
         br.close();
       } catch (IOException e) {
         e.printStackTrace();
       }
     }
   }
 }
예제 #2
0
  /** runs the BB features */
  public void runBB() {
    File f = new File(sourceFile);
    String sourceFileName = f.getName();
    f = new File(targetFile);
    String targetFileName = f.getName();
    String outputFileName = sourceFileName + "_to_" + targetFileName + ".out";

    File file = new File(resourceManager.getString("output"));
    if (!file.exists()) {
      System.err.println("Creating dir: " + resourceManager.getString("output"));
      Logger.log("Creating dir: " + resourceManager.getString("output"));
      file.mkdirs();
    } else {
      Logger.log("output dir exists: " + resourceManager.getString("output"));
    }

    String out = resourceManager.getString("output") + File.separator + outputFileName;
    System.out.println("Output will be: " + out);

    String pplSourcePath =
        resourceManager.getString("input")
            + File.separator
            + sourceLang
            + File.separator
            + sourceFileName
            + resourceManager.getString("tools.ngram.output.ext");
    String pplTargetPath =
        resourceManager.getString("input")
            + File.separator
            + targetLang
            + File.separator
            + targetFileName
            + resourceManager.getString("tools.ngram.output.ext");

    String pplPOSTargetPath =
        resourceManager.getString("input")
            + File.separator
            + targetLang
            + File.separator
            + targetFileName
            + PosTagger.getXPOS()
            + resourceManager.getString("tools.ngram.output.ext");
    runNGramPPL();

    FileModel fm = new FileModel(sourceFile, resourceManager.getString(sourceLang + ".corpus"));

    // FileModel fm = new FileModel(sourceFile,
    //     resourceManager.getString("source" + ".corpus"));

    PPLProcessor pplProcSource =
        new PPLProcessor(pplSourcePath, new String[] {"logprob", "ppl", "ppl1"});
    PPLProcessor pplProcTarget =
        new PPLProcessor(pplTargetPath, new String[] {"logprob", "ppl", "ppl1"});

    String sourcePosOutput = null;
    String targetPosOutput = null;
    PPLProcessor pplPosTarget = null;
    if (!isBaseline) {
      sourcePosOutput = runPOS(sourceFile, sourceLang, "source");
      targetPosOutput = runPOS(targetFile, targetLang, "target");

      String targetPPLPos = runNGramPPLPos(targetPosOutput + PosTagger.getXPOS());
      System.out.println("---------TARGET PPLPOS: " + targetPPLPos);
      pplPosTarget =
          new PPLProcessor(targetPPLPos, new String[] {"poslogprob", "posppl", "posppl1"});
    }

    loadGiza();
    processNGrams();
    boolean gl = false;
    String temp0 = resourceManager.getString("GL");
    if (null != temp0 && temp0.equals("1")) {
      gl = true;
    }

    if (gl) {
      loadGlobalLexicon();
    }

    // Preparing the indices for IR_similarity_features
    Lucene sourceLuc = null;
    Lucene targetLuc = null;
    if (featureManager.hasFeature("1700")) {
      // The indices reside under lang_resources path
      String lang_resources = workDir + File.separator + "lang_resources";
      // Indices are saved under: luceneIndex folder
      String source_lucene_path =
          lang_resources + File.separator + sourceLang + File.separator + "luceneIndex";
      // The corpus to index
      String source_lucene_corpus = source_lucene_path + File.separator + sourceLang + ".corpus";
      //			System.out.println("SOURCE: " + source_lucene_path + " ||| " + source_lucene_corpus);
      try {
        sourceLuc = new Lucene(source_lucene_path, source_lucene_corpus, true, true, "Source");
      } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
      }
      String target_lucene_path =
          lang_resources + File.separator + targetLang + File.separator + "luceneIndex";
      String target_lucene_corpus = target_lucene_path + File.separator + targetLang + ".corpus";
      //			System.out.println("TARGET: " + target_lucene_path + " ||| " + target_lucene_corpus);
      try {
        targetLuc = new Lucene(target_lucene_path, target_lucene_corpus, true, true, "Target");
      } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
      }
    }

    // MQM kicks in
    MQMManager.getInstance().initialize(resourceManager);
    Context context = new Context();
    context.setSourceFilePath(sourceFile);
    context.setTargetFilePath(targetFile);
    MQMManager.getInstance().globalProcessing(context);

    try {
      BufferedReader brSource = new BufferedReader(new FileReader(sourceFile));
      BufferedReader brTarget = new BufferedReader(new FileReader(targetFile));
      BufferedWriter output = new BufferedWriter(new FileWriter(out));
      BufferedReader posSource = null;
      BufferedReader posTarget = null;
      boolean posSourceExists = ResourceManager.isRegistered("sourcePosTagger");
      boolean posTargetExists = ResourceManager.isRegistered("targetPosTagger");
      POSProcessor posSourceProc = null;
      POSProcessor posTargetProc = null;

      // lefterav: Berkeley parser modifications start here
      // Check if user has defined the grammar files for source
      // and target language

      //   if ( ResourceManager.isRegistered("BParser")){
      boolean bp = false;
      String temp = resourceManager.getString("BP");
      if (null != temp && temp.equals("1")) {
        bp = true;
      }

      BParserProcessor sourceParserProcessor = null;
      BParserProcessor targetParserProcessor = null;

      if (bp) {
        sourceParserProcessor = new BParserProcessor();
        targetParserProcessor = new BParserProcessor();
        sourceParserProcessor.initialize(sourceFile, resourceManager, sourceLang);
        targetParserProcessor.initialize(targetFile, resourceManager, targetLang);
      }
      // }

      /** BEGIN: Added by Raphael Rubino for the Topic Model Features */
      boolean tm = false;
      String temp1 = resourceManager.getString("TM");
      if (temp1 != null && temp1.equals("1")) {
        tm = true;
      }
      TopicDistributionProcessor sourceTopicDistributionProcessor = null;
      TopicDistributionProcessor targetTopicDistributionProcessor = null;
      if (tm) {
        String sourceTopicDistributionFile =
            resourceManager.getString(sourceLang + ".topic.distribution");
        String targetTopicDistributionFile =
            resourceManager.getString(targetLang + ".topic.distribution");
        sourceTopicDistributionProcessor =
            new TopicDistributionProcessor(sourceTopicDistributionFile, "sourceTopicDistribution");
        targetTopicDistributionProcessor =
            new TopicDistributionProcessor(targetTopicDistributionFile, "targetTopicDistribution");
      }
      /* END: Added by Raphael Rubino for the Topic Model Features
       */

      if (!isBaseline) {
        if (posSourceExists) {
          posSourceProc = new POSProcessor(sourcePosOutput);
          posSource =
              new BufferedReader(
                  new InputStreamReader(new FileInputStream(sourcePosOutput), "utf-8"));
        }
        if (posTargetExists) {
          posTargetProc = new POSProcessor(targetPosOutput);
          posTarget =
              new BufferedReader(new InputStreamReader(new FileInputStream(targetPosOutput)));
        }
      }
      ResourceManager.printResources();
      Sentence sourceSent;
      Sentence targetSent;
      int sentCount = 0;

      String lineSource = brSource.readLine();
      String lineTarget = brTarget.readLine();

      /** Triggers (by David Langlois) */
      boolean tr = false;
      String temp2 = resourceManager.getString("TR");
      if (temp2 != null && temp2.equals("1")) {
        tr = true;
      }

      Triggers itl_target = null;
      TriggersProcessor itl_target_p = null;
      Triggers itl_source = null;
      TriggersProcessor itl_source_p = null;
      // TriggersProcessor itl_source_p = null;
      Triggers itl_source_target = null;
      TriggersProcessor itl_source_target_p = null;

      if (tr) {

        itl_target =
            new Triggers(
                resourceManager.getString("target.intra.triggers.file"),
                Integer.parseInt(resourceManager.getString("nb.max.triggers.target.intra")),
                resourceManager.getString("phrase.separator"));
        itl_target_p = new TriggersProcessor(itl_target);

        itl_source =
            new Triggers(
                resourceManager.getString("source.intra.triggers.file"),
                Integer.parseInt(resourceManager.getString("nb.max.triggers.source.intra")),
                resourceManager.getString("phrase.separator"));
        itl_source_p = new TriggersProcessor(itl_source);

        itl_source_target =
            new Triggers(
                resourceManager.getString("source.target.inter.triggers.file"),
                Integer.parseInt(resourceManager.getString("nb.max.triggers.source.target.inter")),
                resourceManager.getString("phrase.separator"));
        itl_source_target_p = new TriggersProcessor(itl_source_target);
      }
      /*
       * End modification for Triggers
       */

      // read in each line from the source and target files
      // create a sentence from each
      // process each sentence
      // run the features on the sentences
      while ((lineSource != null) && (lineTarget != null)) {

        // lineSource = lineSource.trim().substring(lineSource.indexOf(" ")).replace("+", "");
        sourceSent = new Sentence(lineSource, sentCount);
        targetSent = new Sentence(lineTarget, sentCount);

        //       System.out.println("Processing sentence "+sentCount);
        //     System.out.println("SORCE: " + sourceSent.getText());
        //   System.out.println("TARGET: " + targetSent.getText());

        if (posSourceExists) {
          posSourceProc.processSentence(sourceSent);
        }
        if (posTargetExists) {
          posTargetProc.processSentence(targetSent);
        }
        sourceSent.computeNGrams(3);
        targetSent.computeNGrams(3);
        pplProcSource.processNextSentence(sourceSent);
        pplProcTarget.processNextSentence(targetSent);
        if (!isBaseline) {
          pplPosTarget.processNextSentence(targetSent);
        }

        // lefterav: Parse code here

        if (bp) {
          sourceParserProcessor.processNextSentence(sourceSent);
          targetParserProcessor.processNextSentence(targetSent);
        }

        if (tm) {

          sourceTopicDistributionProcessor.processNextSentence(sourceSent);
          targetTopicDistributionProcessor.processNextSentence(targetSent);
        }

        // modified by David
        if (tr) {
          itl_source_p.processNextSentence(sourceSent);
          itl_target_p.processNextSentence(targetSent);
          itl_source_target_p.processNextParallelSentences(sourceSent, targetSent);
        }
        // end modification by David

        // MQM kicks in
        MQMManager.getInstance().processNextParallelSentences(sourceSent, targetSent);

        // Ergun
        if (featureManager.hasFeature("1700")) {
          sourceLuc.processNextSentence(sourceSent);
          targetLuc.processNextSentence(targetSent);
        }

        ++sentCount;
        output.write(featureManager.runFeatures(sourceSent, targetSent));
        output.newLine();
        lineSource = brSource.readLine();
        lineTarget = brTarget.readLine();
      }
      if (posSource != null) {
        posSource.close();
      }
      if (posTarget != null) {
        posTarget.close();
      }

      brSource.close();
      brTarget.close();
      output.close();
      Logger.close();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }