Exemple #1
0
 private void setFileText(File file, String text) throws Exception {
   BufferedWriter bw =
       new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "UTF-8"));
   bw.write(text, 0, text.length());
   bw.flush();
   bw.close();
 }
  /** runs the GB features */
  public void runGB() {
    MTOutputProcessor mtop = null;

    if (gbMode == 1) gbXML = initialiseGBResources();

    String nbestSentPath =
        resourceManager.getString("input") + File.separator + targetLang + File.separator + "temp";
    String ngramExecPath = resourceManager.getString("tools.ngram.path");

    mtop = new MTOutputProcessor(gbXML, nbestSentPath, ngramExecPath, ngramSize);
    //		MorphAnalysisProcessor map = new MorphAnalysisProcessor(madaFile);

    File f = new File(sourceFile);
    String sourceFileName = f.getName();
    f = new File(targetFile);
    String targetFileName = f.getName();

    String outputFileName = sourceFileName + "_to_" + targetFileName + ".out";

    String out = resourceManager.getString("output") + File.separator + getMod() + outputFileName;
    System.out.println("Output will be: " + out);

    String lineTarget;

    try {
      BufferedReader brSource = new BufferedReader(new FileReader(sourceFile));
      BufferedReader brTarget = new BufferedReader(new FileReader(targetFile));
      BufferedWriter output = new BufferedWriter(new FileWriter(out));

      ResourceManager.printResources();

      Sentence targetSent;
      Sentence sourceSent;
      int sentCount = 0;

      String lineSource;

      while (((lineSource = brSource.readLine()) != null)
          && ((lineTarget = brTarget.readLine()) != null)) {

        lineSource = lineSource.trim().substring(lineSource.indexOf(" "));
        sourceSent = new Sentence(lineSource, sentCount);
        targetSent = new Sentence(lineTarget, sentCount);

        // map.processNextSentence(sourceSent);
        mtop.processNextSentence(sourceSent);

        ++sentCount;
        output.write(featureManager.runFeatures(sourceSent, targetSent));
        output.write("\r\n");
      }
      brSource.close();
      brTarget.close();
      output.close();
      featureManager.printFeatureIndeces();
      Logger.close();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
  private void applyLabel(String inPointsFile, String outPointsFile, List<String> symbols) {
    System.out.println("Applying labels for points file: " + inPointsFile);
    FileReader input;
    BufferedWriter bufWriter = null;
    try {
      FileOutputStream fos = new FileOutputStream(outPointsFile);
      bufWriter = new BufferedWriter(new OutputStreamWriter(fos));

      File inFile = new File(inPointsFile);
      if (!inFile.exists()) {
        System.out.println("ERROR: In file doens't exist");
        return;
      }
      input = new FileReader(inPointsFile);
      BufferedReader bufRead = new BufferedReader(input);
      String inputLine;
      int index = 0;
      while ((inputLine = bufRead.readLine()) != null && index < symbols.size()) {
        Point p = Utils.readPoint(inputLine);
        String symbol = symbols.get(index);
        int clazz = 0;
        if (this.invertedFixedClases.containsKey(symbol)) {
          clazz = this.invertedFixedClases.get(symbol);
        } else {
          // get the corresponding symbol
          // get the class for this one
          String sector = invertedSectors.get(symbol);
          if (sector != null) {
            clazz = sectorToClazz.get(sector);
          } else {
            //                    System.out.println("No sector: " + symbol);
          }
        }
        p.setClazz(clazz);
        String s = p.serialize();
        bufWriter.write(s);
        bufWriter.newLine();
        index++;
      }
      System.out.println("Read lines: " + index);
    } catch (Exception e) {
      throw new RuntimeException("Failed to read/write file", e);
    } finally {
      if (bufWriter != null) {
        try {
          bufWriter.close();
        } catch (IOException ignore) {
        }
      }
    }
  }
  /** runs the BB features */
  public void runBB() {
    File f = new File(sourceFile);
    String sourceFileName = f.getName();
    f = new File(targetFile);
    String targetFileName = f.getName();
    String outputFileName = sourceFileName + "_to_" + targetFileName + ".out";

    File file = new File(resourceManager.getString("output"));
    if (!file.exists()) {
      System.err.println("Creating dir: " + resourceManager.getString("output"));
      Logger.log("Creating dir: " + resourceManager.getString("output"));
      file.mkdirs();
    } else {
      Logger.log("output dir exists: " + resourceManager.getString("output"));
    }

    String out = resourceManager.getString("output") + File.separator + outputFileName;
    System.out.println("Output will be: " + out);

    String pplSourcePath =
        resourceManager.getString("input")
            + File.separator
            + sourceLang
            + File.separator
            + sourceFileName
            + resourceManager.getString("tools.ngram.output.ext");
    String pplTargetPath =
        resourceManager.getString("input")
            + File.separator
            + targetLang
            + File.separator
            + targetFileName
            + resourceManager.getString("tools.ngram.output.ext");

    String pplPOSTargetPath =
        resourceManager.getString("input")
            + File.separator
            + targetLang
            + File.separator
            + targetFileName
            + PosTagger.getXPOS()
            + resourceManager.getString("tools.ngram.output.ext");
    runNGramPPL();

    FileModel fm = new FileModel(sourceFile, resourceManager.getString(sourceLang + ".corpus"));

    // FileModel fm = new FileModel(sourceFile,
    //     resourceManager.getString("source" + ".corpus"));

    PPLProcessor pplProcSource =
        new PPLProcessor(pplSourcePath, new String[] {"logprob", "ppl", "ppl1"});
    PPLProcessor pplProcTarget =
        new PPLProcessor(pplTargetPath, new String[] {"logprob", "ppl", "ppl1"});

    String sourcePosOutput = null;
    String targetPosOutput = null;
    PPLProcessor pplPosTarget = null;
    if (!isBaseline) {
      sourcePosOutput = runPOS(sourceFile, sourceLang, "source");
      targetPosOutput = runPOS(targetFile, targetLang, "target");

      String targetPPLPos = runNGramPPLPos(targetPosOutput + PosTagger.getXPOS());
      System.out.println("---------TARGET PPLPOS: " + targetPPLPos);
      pplPosTarget =
          new PPLProcessor(targetPPLPos, new String[] {"poslogprob", "posppl", "posppl1"});
    }

    loadGiza();
    processNGrams();
    boolean gl = false;
    String temp0 = resourceManager.getString("GL");
    if (null != temp0 && temp0.equals("1")) {
      gl = true;
    }

    if (gl) {
      loadGlobalLexicon();
    }

    // Preparing the indices for IR_similarity_features
    Lucene sourceLuc = null;
    Lucene targetLuc = null;
    if (featureManager.hasFeature("1700")) {
      // The indices reside under lang_resources path
      String lang_resources = workDir + File.separator + "lang_resources";
      // Indices are saved under: luceneIndex folder
      String source_lucene_path =
          lang_resources + File.separator + sourceLang + File.separator + "luceneIndex";
      // The corpus to index
      String source_lucene_corpus = source_lucene_path + File.separator + sourceLang + ".corpus";
      //			System.out.println("SOURCE: " + source_lucene_path + " ||| " + source_lucene_corpus);
      try {
        sourceLuc = new Lucene(source_lucene_path, source_lucene_corpus, true, true, "Source");
      } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
      }
      String target_lucene_path =
          lang_resources + File.separator + targetLang + File.separator + "luceneIndex";
      String target_lucene_corpus = target_lucene_path + File.separator + targetLang + ".corpus";
      //			System.out.println("TARGET: " + target_lucene_path + " ||| " + target_lucene_corpus);
      try {
        targetLuc = new Lucene(target_lucene_path, target_lucene_corpus, true, true, "Target");
      } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
      }
    }

    // MQM kicks in
    MQMManager.getInstance().initialize(resourceManager);
    Context context = new Context();
    context.setSourceFilePath(sourceFile);
    context.setTargetFilePath(targetFile);
    MQMManager.getInstance().globalProcessing(context);

    try {
      BufferedReader brSource = new BufferedReader(new FileReader(sourceFile));
      BufferedReader brTarget = new BufferedReader(new FileReader(targetFile));
      BufferedWriter output = new BufferedWriter(new FileWriter(out));
      BufferedReader posSource = null;
      BufferedReader posTarget = null;
      boolean posSourceExists = ResourceManager.isRegistered("sourcePosTagger");
      boolean posTargetExists = ResourceManager.isRegistered("targetPosTagger");
      POSProcessor posSourceProc = null;
      POSProcessor posTargetProc = null;

      // lefterav: Berkeley parser modifications start here
      // Check if user has defined the grammar files for source
      // and target language

      //   if ( ResourceManager.isRegistered("BParser")){
      boolean bp = false;
      String temp = resourceManager.getString("BP");
      if (null != temp && temp.equals("1")) {
        bp = true;
      }

      BParserProcessor sourceParserProcessor = null;
      BParserProcessor targetParserProcessor = null;

      if (bp) {
        sourceParserProcessor = new BParserProcessor();
        targetParserProcessor = new BParserProcessor();
        sourceParserProcessor.initialize(sourceFile, resourceManager, sourceLang);
        targetParserProcessor.initialize(targetFile, resourceManager, targetLang);
      }
      // }

      /** BEGIN: Added by Raphael Rubino for the Topic Model Features */
      boolean tm = false;
      String temp1 = resourceManager.getString("TM");
      if (temp1 != null && temp1.equals("1")) {
        tm = true;
      }
      TopicDistributionProcessor sourceTopicDistributionProcessor = null;
      TopicDistributionProcessor targetTopicDistributionProcessor = null;
      if (tm) {
        String sourceTopicDistributionFile =
            resourceManager.getString(sourceLang + ".topic.distribution");
        String targetTopicDistributionFile =
            resourceManager.getString(targetLang + ".topic.distribution");
        sourceTopicDistributionProcessor =
            new TopicDistributionProcessor(sourceTopicDistributionFile, "sourceTopicDistribution");
        targetTopicDistributionProcessor =
            new TopicDistributionProcessor(targetTopicDistributionFile, "targetTopicDistribution");
      }
      /* END: Added by Raphael Rubino for the Topic Model Features
       */

      if (!isBaseline) {
        if (posSourceExists) {
          posSourceProc = new POSProcessor(sourcePosOutput);
          posSource =
              new BufferedReader(
                  new InputStreamReader(new FileInputStream(sourcePosOutput), "utf-8"));
        }
        if (posTargetExists) {
          posTargetProc = new POSProcessor(targetPosOutput);
          posTarget =
              new BufferedReader(new InputStreamReader(new FileInputStream(targetPosOutput)));
        }
      }
      ResourceManager.printResources();
      Sentence sourceSent;
      Sentence targetSent;
      int sentCount = 0;

      String lineSource = brSource.readLine();
      String lineTarget = brTarget.readLine();

      /** Triggers (by David Langlois) */
      boolean tr = false;
      String temp2 = resourceManager.getString("TR");
      if (temp2 != null && temp2.equals("1")) {
        tr = true;
      }

      Triggers itl_target = null;
      TriggersProcessor itl_target_p = null;
      Triggers itl_source = null;
      TriggersProcessor itl_source_p = null;
      // TriggersProcessor itl_source_p = null;
      Triggers itl_source_target = null;
      TriggersProcessor itl_source_target_p = null;

      if (tr) {

        itl_target =
            new Triggers(
                resourceManager.getString("target.intra.triggers.file"),
                Integer.parseInt(resourceManager.getString("nb.max.triggers.target.intra")),
                resourceManager.getString("phrase.separator"));
        itl_target_p = new TriggersProcessor(itl_target);

        itl_source =
            new Triggers(
                resourceManager.getString("source.intra.triggers.file"),
                Integer.parseInt(resourceManager.getString("nb.max.triggers.source.intra")),
                resourceManager.getString("phrase.separator"));
        itl_source_p = new TriggersProcessor(itl_source);

        itl_source_target =
            new Triggers(
                resourceManager.getString("source.target.inter.triggers.file"),
                Integer.parseInt(resourceManager.getString("nb.max.triggers.source.target.inter")),
                resourceManager.getString("phrase.separator"));
        itl_source_target_p = new TriggersProcessor(itl_source_target);
      }
      /*
       * End modification for Triggers
       */

      // read in each line from the source and target files
      // create a sentence from each
      // process each sentence
      // run the features on the sentences
      while ((lineSource != null) && (lineTarget != null)) {

        // lineSource = lineSource.trim().substring(lineSource.indexOf(" ")).replace("+", "");
        sourceSent = new Sentence(lineSource, sentCount);
        targetSent = new Sentence(lineTarget, sentCount);

        //       System.out.println("Processing sentence "+sentCount);
        //     System.out.println("SORCE: " + sourceSent.getText());
        //   System.out.println("TARGET: " + targetSent.getText());

        if (posSourceExists) {
          posSourceProc.processSentence(sourceSent);
        }
        if (posTargetExists) {
          posTargetProc.processSentence(targetSent);
        }
        sourceSent.computeNGrams(3);
        targetSent.computeNGrams(3);
        pplProcSource.processNextSentence(sourceSent);
        pplProcTarget.processNextSentence(targetSent);
        if (!isBaseline) {
          pplPosTarget.processNextSentence(targetSent);
        }

        // lefterav: Parse code here

        if (bp) {
          sourceParserProcessor.processNextSentence(sourceSent);
          targetParserProcessor.processNextSentence(targetSent);
        }

        if (tm) {

          sourceTopicDistributionProcessor.processNextSentence(sourceSent);
          targetTopicDistributionProcessor.processNextSentence(targetSent);
        }

        // modified by David
        if (tr) {
          itl_source_p.processNextSentence(sourceSent);
          itl_target_p.processNextSentence(targetSent);
          itl_source_target_p.processNextParallelSentences(sourceSent, targetSent);
        }
        // end modification by David

        // MQM kicks in
        MQMManager.getInstance().processNextParallelSentences(sourceSent, targetSent);

        // Ergun
        if (featureManager.hasFeature("1700")) {
          sourceLuc.processNextSentence(sourceSent);
          targetLuc.processNextSentence(targetSent);
        }

        ++sentCount;
        output.write(featureManager.runFeatures(sourceSent, targetSent));
        output.newLine();
        lineSource = brSource.readLine();
        lineTarget = brTarget.readLine();
      }
      if (posSource != null) {
        posSource.close();
      }
      if (posTarget != null) {
        posTarget.close();
      }

      brSource.close();
      brTarget.close();
      output.close();
      Logger.close();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
  public void runAll() {
    File f = new File(sourceFile);
    String sourceFileName = f.getName();
    f = new File(targetFile);
    String targetFileName = f.getName();
    String outputFileName = sourceFileName + "_to_" + targetFileName + ".out";
    String out = resourceManager.getString("output") + File.separator + outputFileName;
    System.out.println("Output will be: " + out);

    MTOutputProcessor mtop = null;

    if (gbMode == 1) gbXML = initialiseGBResources();

    String nbestSentPath =
        resourceManager.getString("input") + File.separator + targetLang + File.separator + "temp";
    String ngramExecPath = resourceManager.getString("tools.ngram.path");

    mtop = new MTOutputProcessor(gbXML, nbestSentPath, ngramExecPath, ngramSize);

    // wlv.mt.features.coherence.Coherence coh = new wlv.mt.features.coherence.Coherence(
    //		getTargetFile());

    String pplSourcePath =
        resourceManager.getString("input")
            + File.separator
            + sourceLang
            + File.separator
            + sourceFileName
            + resourceManager.getString("tools.ngram.output.ext");
    String pplTargetPath =
        resourceManager.getString("input")
            + File.separator
            + targetLang
            + File.separator
            + targetFileName
            + resourceManager.getString("tools.ngram.output.ext");

    String pplPOSTargetPath =
        resourceManager.getString("input")
            + File.separator
            + targetLang
            + File.separator
            + targetFileName
            + PosTagger.getXPOS()
            + resourceManager.getString("tools.ngram.output.ext");
    runNGramPPL();

    PPLProcessor pplProcSource =
        new PPLProcessor(pplSourcePath, new String[] {"logprob", "ppl", "ppl1"});
    PPLProcessor pplProcTarget =
        new PPLProcessor(pplTargetPath, new String[] {"logprob", "ppl", "ppl1"});

    FileModel fm = new FileModel(sourceFile, resourceManager.getString(sourceLang + ".corpus"));
    String sourcePosOutput = runPOS(sourceFile, sourceLang, "source");
    String targetPosOutput = runPOS(targetFile, targetLang, "target");

    String targetPPLPos = runNGramPPLPos(targetPosOutput + PosTagger.getXPOS());
    System.out.println("---------TARGET PPLPOS: " + targetPPLPos);
    PPLProcessor pplPosTarget =
        new PPLProcessor(targetPPLPos, new String[] {"poslogprob", "posppl", "posppl1"});

    loadGiza();
    processNGrams();

    try {
      BufferedReader brSource = new BufferedReader(new FileReader(sourceFileName));
      BufferedReader brTarget = new BufferedReader(new FileReader(targetFileName));
      BufferedWriter output = new BufferedWriter(new FileWriter(out));
      BufferedReader posSource = null;
      BufferedReader posTarget = null;
      boolean posSourceExists = ResourceManager.isRegistered("sourcePosTagger");
      boolean posTargetExists = ResourceManager.isRegistered("targetPosTagger");
      POSProcessor posSourceProc = null;
      POSProcessor posTargetProc = null;
      if (posSourceExists) {
        posSourceProc = new POSProcessor(sourcePosOutput);
        posSource =
            new BufferedReader(
                new InputStreamReader(new FileInputStream(sourcePosOutput), "utf-8"));
      }
      if (posTargetExists) {
        posTargetProc = new POSProcessor(targetPosOutput);
        posTarget = new BufferedReader(new InputStreamReader(new FileInputStream(targetPosOutput)));
      }
      ResourceManager.printResources();

      Sentence targetSent;
      // HACK
      Sentence sourceSent;
      int sentCount = 0;
      // HACK
      String lineSource = brSource.readLine();
      String lineTarget = brTarget.readLine();
      // HACK
      int result;

      while ((lineSource != null) && (lineTarget != null)) {

        // the MADA-tokenised files contain start each sentence with the setence ID. We put it there
        // (why?) - no we've got to remove it

        lineSource = lineSource.trim().substring(lineSource.indexOf(" ")).replace("+", "");
        sourceSent = new Sentence(lineSource, sentCount);
        targetSent = new Sentence(lineTarget, sentCount);
        System.out.println("Processing sentence " + sentCount);
        if (posSourceExists) {

          posSourceProc.processSentence(sourceSent);
        }
        if (posTargetExists) {

          posTargetProc.processSentence(targetSent);
        }

        sourceSent.computeNGrams(3);
        targetSent.computeNGrams(3);

        pplProcSource.processNextSentence(sourceSent);

        pplProcTarget.processNextSentence(targetSent);

        pplPosTarget.processNextSentence(targetSent);

        //				coh.processNextSentence(targetSent);

        mtop.processNextSentence(sourceSent);

        ++sentCount;
        output.write(featureManager.runFeatures(sourceSent, targetSent));
        output.write("\r\n");

        lineSource = brSource.readLine();
        lineTarget = brTarget.readLine();
      }
      //		featureManager.printFeatureIndeces();
      if (posSource != null) {
        posSource.close();
      }
      if (posTarget != null) {
        posTarget.close();
      }

      brSource.close();
      brTarget.close();
      output.close();

      Logger.close();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }