Пример #1
0
  /** runs the GB features */
  public void runGB() {
    MTOutputProcessor mtop = null;

    if (gbMode == 1) gbXML = initialiseGBResources();

    String nbestSentPath =
        resourceManager.getString("input") + File.separator + targetLang + File.separator + "temp";
    String ngramExecPath = resourceManager.getString("tools.ngram.path");

    mtop = new MTOutputProcessor(gbXML, nbestSentPath, ngramExecPath, ngramSize);
    //		MorphAnalysisProcessor map = new MorphAnalysisProcessor(madaFile);

    File f = new File(sourceFile);
    String sourceFileName = f.getName();
    f = new File(targetFile);
    String targetFileName = f.getName();

    String outputFileName = sourceFileName + "_to_" + targetFileName + ".out";

    String out = resourceManager.getString("output") + File.separator + getMod() + outputFileName;
    System.out.println("Output will be: " + out);

    String lineTarget;

    try {
      BufferedReader brSource = new BufferedReader(new FileReader(sourceFile));
      BufferedReader brTarget = new BufferedReader(new FileReader(targetFile));
      BufferedWriter output = new BufferedWriter(new FileWriter(out));

      ResourceManager.printResources();

      Sentence targetSent;
      Sentence sourceSent;
      int sentCount = 0;

      String lineSource;

      while (((lineSource = brSource.readLine()) != null)
          && ((lineTarget = brTarget.readLine()) != null)) {

        lineSource = lineSource.trim().substring(lineSource.indexOf(" "));
        sourceSent = new Sentence(lineSource, sentCount);
        targetSent = new Sentence(lineTarget, sentCount);

        // map.processNextSentence(sourceSent);
        mtop.processNextSentence(sourceSent);

        ++sentCount;
        output.write(featureManager.runFeatures(sourceSent, targetSent));
        output.write("\r\n");
      }
      brSource.close();
      brTarget.close();
      output.close();
      featureManager.printFeatureIndeces();
      Logger.close();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
Пример #2
0
  public static void main(String args[]) throws IOException, ParseException {
    Options options = new Options();
    options.addOption("u", "uniquehits", false, "only output hits with a single mapping");
    options.addOption(
        "s",
        "nosuboptimal",
        false,
        "do not include hits whose score is not equal to the best score for the read");
    CommandLineParser parser = new GnuParser();
    CommandLine cl = parser.parse(options, args, false);
    boolean uniqueOnly = cl.hasOption("uniquehits");
    boolean filterSubOpt = cl.hasOption("nosuboptimal");

    ArrayList<String[]> lines = new ArrayList<String[]>();

    String line;
    String lastRead = "";
    BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
    while ((line = reader.readLine()) != null) {
      String pieces[] = line.split("\t");
      if (!pieces[0].equals(lastRead)) {
        printLines(lines, uniqueOnly, filterSubOpt);
        lines.clear();
      }
      lines.add(pieces);
      lastRead = pieces[0];
    }
    printLines(lines, uniqueOnly, filterSubOpt);
  }
Пример #3
0
 public static String[] generateFileNamesFromOptions(CommandLine cli) {
   String[] filenames = cli.getArgs();
   List<String> fileList = new ArrayList<String>(filenames.length);
   boolean errors = false;
   for (String filename : filenames) {
     if (filename.startsWith("@")) {
       try {
         BufferedReader br = new BufferedReader(new FileReader(filename.substring(1)));
         String file;
         while ((file = br.readLine()) != null) {
           fileList.add(file);
         }
       } catch (IOException ioe) {
         System.err.println("error: file not readable: " + filename.substring(1));
         errors = true;
       }
     } else {
       fileList.add(filename);
     }
   }
   if (errors) {
     return null;
   } else {
     return fileList.toArray(new String[fileList.size()]);
   }
 }
Пример #4
0
  private Map<String, List<String>> loadHistoSectors(String sectorFile) {
    FileReader input;
    Map<String, List<String>> sectors = new HashMap<String, List<String>>();
    try {
      input = new FileReader(sectorFile);
      BufferedReader bufRead = new BufferedReader(input);
      String line;

      int i = 1;
      while ((line = bufRead.readLine()) != null) {
        Bin sectorRecord = Utils.readBin(line);
        List<String> stockList = sectorRecord.symbols;
        String startEnd = formatter.format(sectorRecord.end);
        String key = intgerFormaterr.format(i) + ":" + startEnd;
        sectors.put(key, stockList);
        for (String s : stockList) {
          invertedSectors.put(s, key);
        }
        i++;
      }
    } catch (IOException e) {
      throw new RuntimeException("Failed to load sector file", e);
    }
    return sectors;
  }
Пример #5
0
  private Map<String, Integer> loadFixedClasses(String file) {
    FileReader input;
    try {
      Map<Integer, List<String>> fixedClaszzes = new HashMap<Integer, List<String>>();
      Map<String, Integer> invertedFixedClasses = new HashMap<String, Integer>();
      File f = new File(file);
      if (!f.exists()) {
        System.out.println("Extra classes file doesn't exist: " + fixedClassesFile);
        return invertedFixedClasses;
      }
      input = new FileReader(f);
      BufferedReader bufRead = new BufferedReader(input);
      String line;
      while ((line = bufRead.readLine()) != null) {
        String parts[] = line.split(",");
        int clazz = Integer.parseInt(parts[0]);
        List<String> symbols = new ArrayList<String>();
        symbols.addAll(Arrays.asList(parts).subList(1, parts.length));
        fixedClaszzes.put(clazz, symbols);
      }

      for (Map.Entry<Integer, List<String>> e : fixedClaszzes.entrySet()) {
        for (String s : e.getValue()) {
          invertedFixedClasses.put(s, e.getKey());
        }
      }
      return invertedFixedClasses;
    } catch (IOException e) {
      e.printStackTrace();
    }
    return null;
  }
Пример #6
0
  private void applyLabel(String inPointsFile, String outPointsFile, List<String> symbols) {
    System.out.println("Applying labels for points file: " + inPointsFile);
    FileReader input;
    BufferedWriter bufWriter = null;
    try {
      FileOutputStream fos = new FileOutputStream(outPointsFile);
      bufWriter = new BufferedWriter(new OutputStreamWriter(fos));

      File inFile = new File(inPointsFile);
      if (!inFile.exists()) {
        System.out.println("ERROR: In file doens't exist");
        return;
      }
      input = new FileReader(inPointsFile);
      BufferedReader bufRead = new BufferedReader(input);
      String inputLine;
      int index = 0;
      while ((inputLine = bufRead.readLine()) != null && index < symbols.size()) {
        Point p = Utils.readPoint(inputLine);
        String symbol = symbols.get(index);
        int clazz = 0;
        if (this.invertedFixedClases.containsKey(symbol)) {
          clazz = this.invertedFixedClases.get(symbol);
        } else {
          // get the corresponding symbol
          // get the class for this one
          String sector = invertedSectors.get(symbol);
          if (sector != null) {
            clazz = sectorToClazz.get(sector);
          } else {
            //                    System.out.println("No sector: " + symbol);
          }
        }
        p.setClazz(clazz);
        String s = p.serialize();
        bufWriter.write(s);
        bufWriter.newLine();
        index++;
      }
      System.out.println("Read lines: " + index);
    } catch (Exception e) {
      throw new RuntimeException("Failed to read/write file", e);
    } finally {
      if (bufWriter != null) {
        try {
          bufWriter.close();
        } catch (IOException ignore) {
        }
      }
    }
  }
Пример #7
0
 public static String getLicensee() {
   InputStream is = Q2.class.getResourceAsStream(LICENSEE);
   ByteArrayOutputStream baos = new ByteArrayOutputStream();
   if (is != null) {
     BufferedReader br = new BufferedReader(new InputStreamReader(is));
     PrintStream p = new PrintStream(baos);
     p.println();
     p.println();
     try {
       while (br.ready()) p.println(br.readLine());
     } catch (Exception e) {
       e.printStackTrace(p);
     }
   }
   return baos.toString();
 }
Пример #8
0
  private Map<String, List<String>> loadStockSectors(String sectorFile) {
    FileReader input;
    Map<String, List<String>> sectors = new HashMap<String, List<String>>();
    try {
      input = new FileReader(sectorFile);
      BufferedReader bufRead = new BufferedReader(input);
      String line;

      while ((line = bufRead.readLine()) != null) {
        SectorRecord sectorRecord = Utils.readSectorRecord(line);
        List<String> stockList = sectors.get(sectorRecord.getSector());
        if (stockList == null) {
          stockList = new ArrayList<String>();
          sectors.put(sectorRecord.getSector(), stockList);
        }
        stockList.add(sectorRecord.getSymbol());

        invertedSectors.put(sectorRecord.getSymbol(), sectorRecord.getSector());
      }
    } catch (IOException e) {
      throw new RuntimeException("Failed to load sector file", e);
    }
    return sectors;
  }
Пример #9
0
  /** runs the BB features */
  public void runBB() {
    File f = new File(sourceFile);
    String sourceFileName = f.getName();
    f = new File(targetFile);
    String targetFileName = f.getName();
    String outputFileName = sourceFileName + "_to_" + targetFileName + ".out";

    File file = new File(resourceManager.getString("output"));
    if (!file.exists()) {
      System.err.println("Creating dir: " + resourceManager.getString("output"));
      Logger.log("Creating dir: " + resourceManager.getString("output"));
      file.mkdirs();
    } else {
      Logger.log("output dir exists: " + resourceManager.getString("output"));
    }

    String out = resourceManager.getString("output") + File.separator + outputFileName;
    System.out.println("Output will be: " + out);

    String pplSourcePath =
        resourceManager.getString("input")
            + File.separator
            + sourceLang
            + File.separator
            + sourceFileName
            + resourceManager.getString("tools.ngram.output.ext");
    String pplTargetPath =
        resourceManager.getString("input")
            + File.separator
            + targetLang
            + File.separator
            + targetFileName
            + resourceManager.getString("tools.ngram.output.ext");

    String pplPOSTargetPath =
        resourceManager.getString("input")
            + File.separator
            + targetLang
            + File.separator
            + targetFileName
            + PosTagger.getXPOS()
            + resourceManager.getString("tools.ngram.output.ext");
    runNGramPPL();

    FileModel fm = new FileModel(sourceFile, resourceManager.getString(sourceLang + ".corpus"));

    // FileModel fm = new FileModel(sourceFile,
    //     resourceManager.getString("source" + ".corpus"));

    PPLProcessor pplProcSource =
        new PPLProcessor(pplSourcePath, new String[] {"logprob", "ppl", "ppl1"});
    PPLProcessor pplProcTarget =
        new PPLProcessor(pplTargetPath, new String[] {"logprob", "ppl", "ppl1"});

    String sourcePosOutput = null;
    String targetPosOutput = null;
    PPLProcessor pplPosTarget = null;
    if (!isBaseline) {
      sourcePosOutput = runPOS(sourceFile, sourceLang, "source");
      targetPosOutput = runPOS(targetFile, targetLang, "target");

      String targetPPLPos = runNGramPPLPos(targetPosOutput + PosTagger.getXPOS());
      System.out.println("---------TARGET PPLPOS: " + targetPPLPos);
      pplPosTarget =
          new PPLProcessor(targetPPLPos, new String[] {"poslogprob", "posppl", "posppl1"});
    }

    loadGiza();
    processNGrams();
    boolean gl = false;
    String temp0 = resourceManager.getString("GL");
    if (null != temp0 && temp0.equals("1")) {
      gl = true;
    }

    if (gl) {
      loadGlobalLexicon();
    }

    // Preparing the indices for IR_similarity_features
    Lucene sourceLuc = null;
    Lucene targetLuc = null;
    if (featureManager.hasFeature("1700")) {
      // The indices reside under lang_resources path
      String lang_resources = workDir + File.separator + "lang_resources";
      // Indices are saved under: luceneIndex folder
      String source_lucene_path =
          lang_resources + File.separator + sourceLang + File.separator + "luceneIndex";
      // The corpus to index
      String source_lucene_corpus = source_lucene_path + File.separator + sourceLang + ".corpus";
      //			System.out.println("SOURCE: " + source_lucene_path + " ||| " + source_lucene_corpus);
      try {
        sourceLuc = new Lucene(source_lucene_path, source_lucene_corpus, true, true, "Source");
      } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
      }
      String target_lucene_path =
          lang_resources + File.separator + targetLang + File.separator + "luceneIndex";
      String target_lucene_corpus = target_lucene_path + File.separator + targetLang + ".corpus";
      //			System.out.println("TARGET: " + target_lucene_path + " ||| " + target_lucene_corpus);
      try {
        targetLuc = new Lucene(target_lucene_path, target_lucene_corpus, true, true, "Target");
      } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
      }
    }

    // MQM kicks in
    MQMManager.getInstance().initialize(resourceManager);
    Context context = new Context();
    context.setSourceFilePath(sourceFile);
    context.setTargetFilePath(targetFile);
    MQMManager.getInstance().globalProcessing(context);

    try {
      BufferedReader brSource = new BufferedReader(new FileReader(sourceFile));
      BufferedReader brTarget = new BufferedReader(new FileReader(targetFile));
      BufferedWriter output = new BufferedWriter(new FileWriter(out));
      BufferedReader posSource = null;
      BufferedReader posTarget = null;
      boolean posSourceExists = ResourceManager.isRegistered("sourcePosTagger");
      boolean posTargetExists = ResourceManager.isRegistered("targetPosTagger");
      POSProcessor posSourceProc = null;
      POSProcessor posTargetProc = null;

      // lefterav: Berkeley parser modifications start here
      // Check if user has defined the grammar files for source
      // and target language

      //   if ( ResourceManager.isRegistered("BParser")){
      boolean bp = false;
      String temp = resourceManager.getString("BP");
      if (null != temp && temp.equals("1")) {
        bp = true;
      }

      BParserProcessor sourceParserProcessor = null;
      BParserProcessor targetParserProcessor = null;

      if (bp) {
        sourceParserProcessor = new BParserProcessor();
        targetParserProcessor = new BParserProcessor();
        sourceParserProcessor.initialize(sourceFile, resourceManager, sourceLang);
        targetParserProcessor.initialize(targetFile, resourceManager, targetLang);
      }
      // }

      /** BEGIN: Added by Raphael Rubino for the Topic Model Features */
      boolean tm = false;
      String temp1 = resourceManager.getString("TM");
      if (temp1 != null && temp1.equals("1")) {
        tm = true;
      }
      TopicDistributionProcessor sourceTopicDistributionProcessor = null;
      TopicDistributionProcessor targetTopicDistributionProcessor = null;
      if (tm) {
        String sourceTopicDistributionFile =
            resourceManager.getString(sourceLang + ".topic.distribution");
        String targetTopicDistributionFile =
            resourceManager.getString(targetLang + ".topic.distribution");
        sourceTopicDistributionProcessor =
            new TopicDistributionProcessor(sourceTopicDistributionFile, "sourceTopicDistribution");
        targetTopicDistributionProcessor =
            new TopicDistributionProcessor(targetTopicDistributionFile, "targetTopicDistribution");
      }
      /* END: Added by Raphael Rubino for the Topic Model Features
       */

      if (!isBaseline) {
        if (posSourceExists) {
          posSourceProc = new POSProcessor(sourcePosOutput);
          posSource =
              new BufferedReader(
                  new InputStreamReader(new FileInputStream(sourcePosOutput), "utf-8"));
        }
        if (posTargetExists) {
          posTargetProc = new POSProcessor(targetPosOutput);
          posTarget =
              new BufferedReader(new InputStreamReader(new FileInputStream(targetPosOutput)));
        }
      }
      ResourceManager.printResources();
      Sentence sourceSent;
      Sentence targetSent;
      int sentCount = 0;

      String lineSource = brSource.readLine();
      String lineTarget = brTarget.readLine();

      /** Triggers (by David Langlois) */
      boolean tr = false;
      String temp2 = resourceManager.getString("TR");
      if (temp2 != null && temp2.equals("1")) {
        tr = true;
      }

      Triggers itl_target = null;
      TriggersProcessor itl_target_p = null;
      Triggers itl_source = null;
      TriggersProcessor itl_source_p = null;
      // TriggersProcessor itl_source_p = null;
      Triggers itl_source_target = null;
      TriggersProcessor itl_source_target_p = null;

      if (tr) {

        itl_target =
            new Triggers(
                resourceManager.getString("target.intra.triggers.file"),
                Integer.parseInt(resourceManager.getString("nb.max.triggers.target.intra")),
                resourceManager.getString("phrase.separator"));
        itl_target_p = new TriggersProcessor(itl_target);

        itl_source =
            new Triggers(
                resourceManager.getString("source.intra.triggers.file"),
                Integer.parseInt(resourceManager.getString("nb.max.triggers.source.intra")),
                resourceManager.getString("phrase.separator"));
        itl_source_p = new TriggersProcessor(itl_source);

        itl_source_target =
            new Triggers(
                resourceManager.getString("source.target.inter.triggers.file"),
                Integer.parseInt(resourceManager.getString("nb.max.triggers.source.target.inter")),
                resourceManager.getString("phrase.separator"));
        itl_source_target_p = new TriggersProcessor(itl_source_target);
      }
      /*
       * End modification for Triggers
       */

      // read in each line from the source and target files
      // create a sentence from each
      // process each sentence
      // run the features on the sentences
      while ((lineSource != null) && (lineTarget != null)) {

        // lineSource = lineSource.trim().substring(lineSource.indexOf(" ")).replace("+", "");
        sourceSent = new Sentence(lineSource, sentCount);
        targetSent = new Sentence(lineTarget, sentCount);

        //       System.out.println("Processing sentence "+sentCount);
        //     System.out.println("SORCE: " + sourceSent.getText());
        //   System.out.println("TARGET: " + targetSent.getText());

        if (posSourceExists) {
          posSourceProc.processSentence(sourceSent);
        }
        if (posTargetExists) {
          posTargetProc.processSentence(targetSent);
        }
        sourceSent.computeNGrams(3);
        targetSent.computeNGrams(3);
        pplProcSource.processNextSentence(sourceSent);
        pplProcTarget.processNextSentence(targetSent);
        if (!isBaseline) {
          pplPosTarget.processNextSentence(targetSent);
        }

        // lefterav: Parse code here

        if (bp) {
          sourceParserProcessor.processNextSentence(sourceSent);
          targetParserProcessor.processNextSentence(targetSent);
        }

        if (tm) {

          sourceTopicDistributionProcessor.processNextSentence(sourceSent);
          targetTopicDistributionProcessor.processNextSentence(targetSent);
        }

        // modified by David
        if (tr) {
          itl_source_p.processNextSentence(sourceSent);
          itl_target_p.processNextSentence(targetSent);
          itl_source_target_p.processNextParallelSentences(sourceSent, targetSent);
        }
        // end modification by David

        // MQM kicks in
        MQMManager.getInstance().processNextParallelSentences(sourceSent, targetSent);

        // Ergun
        if (featureManager.hasFeature("1700")) {
          sourceLuc.processNextSentence(sourceSent);
          targetLuc.processNextSentence(targetSent);
        }

        ++sentCount;
        output.write(featureManager.runFeatures(sourceSent, targetSent));
        output.newLine();
        lineSource = brSource.readLine();
        lineTarget = brTarget.readLine();
      }
      if (posSource != null) {
        posSource.close();
      }
      if (posTarget != null) {
        posTarget.close();
      }

      brSource.close();
      brTarget.close();
      output.close();
      Logger.close();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
Пример #10
0
  public void runAll() {
    File f = new File(sourceFile);
    String sourceFileName = f.getName();
    f = new File(targetFile);
    String targetFileName = f.getName();
    String outputFileName = sourceFileName + "_to_" + targetFileName + ".out";
    String out = resourceManager.getString("output") + File.separator + outputFileName;
    System.out.println("Output will be: " + out);

    MTOutputProcessor mtop = null;

    if (gbMode == 1) gbXML = initialiseGBResources();

    String nbestSentPath =
        resourceManager.getString("input") + File.separator + targetLang + File.separator + "temp";
    String ngramExecPath = resourceManager.getString("tools.ngram.path");

    mtop = new MTOutputProcessor(gbXML, nbestSentPath, ngramExecPath, ngramSize);

    // wlv.mt.features.coherence.Coherence coh = new wlv.mt.features.coherence.Coherence(
    //		getTargetFile());

    String pplSourcePath =
        resourceManager.getString("input")
            + File.separator
            + sourceLang
            + File.separator
            + sourceFileName
            + resourceManager.getString("tools.ngram.output.ext");
    String pplTargetPath =
        resourceManager.getString("input")
            + File.separator
            + targetLang
            + File.separator
            + targetFileName
            + resourceManager.getString("tools.ngram.output.ext");

    String pplPOSTargetPath =
        resourceManager.getString("input")
            + File.separator
            + targetLang
            + File.separator
            + targetFileName
            + PosTagger.getXPOS()
            + resourceManager.getString("tools.ngram.output.ext");
    runNGramPPL();

    PPLProcessor pplProcSource =
        new PPLProcessor(pplSourcePath, new String[] {"logprob", "ppl", "ppl1"});
    PPLProcessor pplProcTarget =
        new PPLProcessor(pplTargetPath, new String[] {"logprob", "ppl", "ppl1"});

    FileModel fm = new FileModel(sourceFile, resourceManager.getString(sourceLang + ".corpus"));
    String sourcePosOutput = runPOS(sourceFile, sourceLang, "source");
    String targetPosOutput = runPOS(targetFile, targetLang, "target");

    String targetPPLPos = runNGramPPLPos(targetPosOutput + PosTagger.getXPOS());
    System.out.println("---------TARGET PPLPOS: " + targetPPLPos);
    PPLProcessor pplPosTarget =
        new PPLProcessor(targetPPLPos, new String[] {"poslogprob", "posppl", "posppl1"});

    loadGiza();
    processNGrams();

    try {
      BufferedReader brSource = new BufferedReader(new FileReader(sourceFileName));
      BufferedReader brTarget = new BufferedReader(new FileReader(targetFileName));
      BufferedWriter output = new BufferedWriter(new FileWriter(out));
      BufferedReader posSource = null;
      BufferedReader posTarget = null;
      boolean posSourceExists = ResourceManager.isRegistered("sourcePosTagger");
      boolean posTargetExists = ResourceManager.isRegistered("targetPosTagger");
      POSProcessor posSourceProc = null;
      POSProcessor posTargetProc = null;
      if (posSourceExists) {
        posSourceProc = new POSProcessor(sourcePosOutput);
        posSource =
            new BufferedReader(
                new InputStreamReader(new FileInputStream(sourcePosOutput), "utf-8"));
      }
      if (posTargetExists) {
        posTargetProc = new POSProcessor(targetPosOutput);
        posTarget = new BufferedReader(new InputStreamReader(new FileInputStream(targetPosOutput)));
      }
      ResourceManager.printResources();

      Sentence targetSent;
      // HACK
      Sentence sourceSent;
      int sentCount = 0;
      // HACK
      String lineSource = brSource.readLine();
      String lineTarget = brTarget.readLine();
      // HACK
      int result;

      while ((lineSource != null) && (lineTarget != null)) {

        // the MADA-tokenised files contain start each sentence with the setence ID. We put it there
        // (why?) - no we've got to remove it

        lineSource = lineSource.trim().substring(lineSource.indexOf(" ")).replace("+", "");
        sourceSent = new Sentence(lineSource, sentCount);
        targetSent = new Sentence(lineTarget, sentCount);
        System.out.println("Processing sentence " + sentCount);
        if (posSourceExists) {

          posSourceProc.processSentence(sourceSent);
        }
        if (posTargetExists) {

          posTargetProc.processSentence(targetSent);
        }

        sourceSent.computeNGrams(3);
        targetSent.computeNGrams(3);

        pplProcSource.processNextSentence(sourceSent);

        pplProcTarget.processNextSentence(targetSent);

        pplPosTarget.processNextSentence(targetSent);

        //				coh.processNextSentence(targetSent);

        mtop.processNextSentence(sourceSent);

        ++sentCount;
        output.write(featureManager.runFeatures(sourceSent, targetSent));
        output.write("\r\n");

        lineSource = brSource.readLine();
        lineTarget = brTarget.readLine();
      }
      //		featureManager.printFeatureIndeces();
      if (posSource != null) {
        posSource.close();
      }
      if (posTarget != null) {
        posTarget.close();
      }

      brSource.close();
      brTarget.close();
      output.close();

      Logger.close();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }