Exemplo n.º 1
0
  private static void index_h(String prefix, File file, IndexWriter indexWriter)
      throws IOException {
    Document doc = null;

    if (file.isDirectory()) {
      File files[] = file.listFiles();
      for (File file1 : files) {
        index_h(prefix + FILE_SEPARATOR + file.getName(), file1, indexWriter);
      }
    } else {
      String content = FileUtils.readFileToString(file, "utf-8");

      System.out.println("==============================================================");
      System.out.println("index_h " + content);
      System.out.println("==============================================================");

      String filename = prefix + FILE_SEPARATOR + file.getName();
      String path = file.getAbsolutePath();

      doc = new Document();
      doc.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED));
      doc.add(new Field("relative_path", filename, Field.Store.YES, Field.Index.NOT_ANALYZED));
      indexWriter.addDocument(doc);
    }
  }
Exemplo n.º 2
0
  /** runs the GB features */
  public void runGB() {
    MTOutputProcessor mtop = null;

    if (gbMode == 1) gbXML = initialiseGBResources();

    String nbestSentPath =
        resourceManager.getString("input") + File.separator + targetLang + File.separator + "temp";
    String ngramExecPath = resourceManager.getString("tools.ngram.path");

    mtop = new MTOutputProcessor(gbXML, nbestSentPath, ngramExecPath, ngramSize);
    //		MorphAnalysisProcessor map = new MorphAnalysisProcessor(madaFile);

    File f = new File(sourceFile);
    String sourceFileName = f.getName();
    f = new File(targetFile);
    String targetFileName = f.getName();

    String outputFileName = sourceFileName + "_to_" + targetFileName + ".out";

    String out = resourceManager.getString("output") + File.separator + getMod() + outputFileName;
    System.out.println("Output will be: " + out);

    String lineTarget;

    try {
      BufferedReader brSource = new BufferedReader(new FileReader(sourceFile));
      BufferedReader brTarget = new BufferedReader(new FileReader(targetFile));
      BufferedWriter output = new BufferedWriter(new FileWriter(out));

      ResourceManager.printResources();

      Sentence targetSent;
      Sentence sourceSent;
      int sentCount = 0;

      String lineSource;

      while (((lineSource = brSource.readLine()) != null)
          && ((lineTarget = brTarget.readLine()) != null)) {

        lineSource = lineSource.trim().substring(lineSource.indexOf(" "));
        sourceSent = new Sentence(lineSource, sentCount);
        targetSent = new Sentence(lineTarget, sentCount);

        // map.processNextSentence(sourceSent);
        mtop.processNextSentence(sourceSent);

        ++sentCount;
        output.write(featureManager.runFeatures(sourceSent, targetSent));
        output.write("\r\n");
      }
      brSource.close();
      brTarget.close();
      output.close();
      featureManager.printFeatureIndeces();
      Logger.close();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
Exemplo n.º 3
0
 /*
  * Computes the perplexity and log probability for the source file Required
  * by features 8-13
  */
 private static void runNGramPPL() {
   // required by BB features 8-13
   NGramExec nge = new NGramExec(resourceManager.getString("tools.ngram.path"), forceRun);
   System.out.println("runNgramPPL");
   File f = new File(sourceFile);
   String sourceOutput =
       input + File.separator + sourceLang + File.separator + f.getName() + ".ppl";
   f = new File(targetFile);
   String targetOutput =
       input + File.separator + targetLang + File.separator + f.getName() + ".ppl";
   nge.runNGramPerplex(sourceFile, sourceOutput, resourceManager.getString(sourceLang + ".lm"));
   System.out.println(resourceManager.getString(targetLang + ".lm"));
   nge.runNGramPerplex(targetFile, targetOutput, resourceManager.getString(targetLang + ".lm"));
 }
Exemplo n.º 4
0
 // Take a tree of files starting in a directory in a zip file
 // and copy them to a disk directory, recreating the tree.
 private int unpackZipFile(
     File inZipFile, String directory, String parent, boolean suppressFirstPathElement) {
   int count = 0;
   if (!inZipFile.exists()) return count;
   parent = parent.trim();
   if (!parent.endsWith(File.separator)) parent += File.separator;
   if (!directory.endsWith(File.separator)) directory += File.separator;
   File outFile = null;
   try {
     ZipFile zipFile = new ZipFile(inZipFile);
     Enumeration zipEntries = zipFile.entries();
     while (zipEntries.hasMoreElements()) {
       ZipEntry entry = (ZipEntry) zipEntries.nextElement();
       String name = entry.getName().replace('/', File.separatorChar);
       if (name.startsWith(directory)) {
         if (suppressFirstPathElement) name = name.substring(directory.length());
         outFile = new File(parent + name);
         // Create the directory, just in case
         if (name.indexOf(File.separatorChar) >= 0) {
           String p = name.substring(0, name.lastIndexOf(File.separatorChar) + 1);
           File dirFile = new File(parent + p);
           dirFile.mkdirs();
         }
         if (!entry.isDirectory()) {
           System.out.println("Installing " + outFile);
           // Copy the file
           BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(outFile));
           BufferedInputStream in = new BufferedInputStream(zipFile.getInputStream(entry));
           int size = 1024;
           int n = 0;
           byte[] b = new byte[size];
           while ((n = in.read(b, 0, size)) != -1) out.write(b, 0, n);
           in.close();
           out.flush();
           out.close();
           // Count the file
           count++;
         }
       }
     }
     zipFile.close();
   } catch (Exception e) {
     System.err.println("...an error occured while installing " + outFile);
     e.printStackTrace();
     System.err.println("Error copying " + outFile.getName() + "\n" + e.getMessage());
     return -count;
   }
   System.out.println(count + " files were installed.");
   return count;
 }
Exemplo n.º 5
0
  private boolean deploy(File f) {
    try {
      if (log != null) log.info("deploy:" + f.getCanonicalPath());
      QEntry qentry = (QEntry) dirMap.get(f);
      SAXBuilder builder = createSAXBuilder();
      Document doc;
      if (decorator != null && !f.getName().equals(LOGGER_CONFIG)) {
        doc = decrypt(builder.build(new StringReader(decorator.decorateFile(f))));
      } else {
        doc = decrypt(builder.build(f));
      }

      Element rootElement = doc.getRootElement();
      String iuuid = rootElement.getAttributeValue("instance");
      if (iuuid != null) {
        UUID uuid = UUID.fromString(iuuid);
        if (!uuid.equals(getInstanceId())) {
          deleteFile(f, iuuid);
          return false;
        }
      }
      Object obj = factory.instantiate(this, rootElement);
      qentry.setObject(obj);

      ObjectInstance instance = factory.createQBean(this, doc.getRootElement(), obj);
      qentry.setInstance(instance);
    } catch (InstanceAlreadyExistsException e) {
      /*
       * Ok, the file we tried to deploy, holds an object
       *  that already has been deployed.
       *
       * Rename it out of the way.
       *
       */
      tidyFileAway(f, DUPLICATE_EXTENSION);
      getLog().warn("deploy", e);
      return false;
    } catch (Exception e) {
      getLog().warn("deploy", e);
      tidyFileAway(f, ERROR_EXTENSION);
      // This will also save deploy error repeats...
      return false;
    } catch (Error e) {
      getLog().warn("deploy", e);
      tidyFileAway(f, ENV_EXTENSION);
      // This will also save deploy error repeats...
      return false;
    }
    return true;
  }
Exemplo n.º 6
0
  /**
   * Computes the perplexity and log probability for the POS tagged target file<br>
   * Required by BB features 68-69<br>
   * This function could be merged with
   *
   * @seerunNGramPPL() but I separated them to make the code more readable
   * @param posFile file tagged with parts-of-speech
   */
  private String runNGramPPLPos(String posFile) {
    NGramExec nge = new NGramExec(resourceManager.getString("tools.ngram.path"), forceRun);

    File f = new File(posFile);
    String posTargetOutput =
        input
            + File.separator
            + targetLang
            + File.separator
            + f.getName()
            + resourceManager.getString("tools.ngram.output.ext");
    nge.runNGramPerplex(posFile, posTargetOutput, resourceManager.getString(targetLang + ".poslm"));
    return posTargetOutput;
  }
Exemplo n.º 7
0
  /**
   * Runs the Feature Extractor<br>
   *
   * <ul>
   *   <li>constructs the required folders
   *   <li>runs the pre-processing tools
   *   <li>runs the BB features, GB features or both according to the command line parameters
   * </ul>
   */
  public String initialiseGBResources() {
    // transform the m output to xml
    String xmlOut =
        resourceManager.getString("input") + File.separator + "systems" + File.separator;
    File f = new File(sourceFile);
    if (mtSys == MOSES) {
      xmlOut += "moses_" + f.getName() + ".xml";
      System.out.println(xmlOut);
      MOSES_XMLWrapper cmuwrap =
          new MOSES_XMLWrapper(nbestInput, xmlOut, onebestPhrases, onebestLog);
      cmuwrap.run();

      // now send the xml output from cmuwrap to be processed
    }

    return xmlOut;
  }
Exemplo n.º 8
0
  public void process() {
    File inFolder = new File(vectorFolder);
    if (!inFolder.isDirectory()) {
      System.out.println("In should be a folder: " + vectorFolder);
      return;
    }
    boolean clusterSaved = false;
    this.invertedFixedClases = loadFixedClasses(fixedClassesFile);
    if (!histogram) {
      Map<String, List<String>> sectors = loadStockSectors(sectorFile);
      sectorToClazz = convertSectorsToClazz(sectors);
      if (!clusterSaved) {
        changeClassLabels();
        clusterSaved = true;
      }
      for (Map.Entry<String, Integer> entry : sectorToClazz.entrySet()) {
        System.out.println(entry.getKey() + " : " + entry.getValue());
      }
    }

    for (File inFile : inFolder.listFiles()) {
      String fileName = inFile.getName();
      String fileNameWithOutExt = FilenameUtils.removeExtension(fileName);
      if (histogram) {
        sectorToClazz.clear();
        invertedSectors.clear();

        Map<String, List<String>> sectors =
            loadHistoSectors(sectorFile + "/" + fileNameWithOutExt + ".csv");
        sectorToClazz = convertSectorsToClazz(sectors);
        if (!clusterSaved) {
          changeClassLabels();
          clusterSaved = true;
        }
        for (Map.Entry<String, Integer> entry : sectorToClazz.entrySet()) {
          System.out.println(entry.getKey() + " : " + entry.getValue());
        }
      }
      processFile(fileNameWithOutExt);
    }
  }
Exemplo n.º 9
0
 /**
  * runs the part of speech tagger
  *
  * @param file input file
  * @param lang language
  * @param type source or target
  * @return path to the output file of the POS tagger
  */
 public String runPOS(String file, String lang, String type) {
   String posName = resourceManager.getString(lang + ".postagger");
   String langResPath = input + File.separator + lang;
   File f = new File(file);
   String absoluteSourceFilePath = f.getAbsolutePath();
   String fileName = f.getName();
   String relativeFilePath = langResPath + File.separator + fileName + ".pos";
   String absoluteOutputFilePath = (new File(relativeFilePath)).getAbsolutePath();
   String posSourceTaggerPath = resourceManager.getString(lang + ".postagger.exePath");
   String outPath = "";
   try {
     Class c = Class.forName(posName);
     PosTagger tagger = (PosTagger) c.newInstance();
     tagger.setParameters(
         type, posName, posSourceTaggerPath, absoluteSourceFilePath, absoluteOutputFilePath);
     PosTagger.ForceRun(forceRun);
     outPath = tagger.run();
   } catch (Exception e) {
     e.printStackTrace();
   }
   // returns the path of the output file; this is for convenience only so
   // we do't have to calculate it again
   return outPath;
 }
Exemplo n.º 10
0
  /**
   * Class constructor; creates a new Installer object, displays a JFrame introducing the program,
   * allows the user to select an install directory, and copies files from the jar into the
   * directory.
   */
  public Installer(String args[]) {
    // Inputs are --install-dir INSTALL_DIR
    for (int k = 0; k < args.length; k = k + 2) {

      switch (args[k]) {
        case "--install-dir":
          directory = new File(args[k + 1]);
          System.out.println(directory);
          break;
        case "--port":
          port = Integer.parseInt(args[k + 1]);
          break;
      }
    }

    cp = new Stream();

    // Find the installer program so we can get to the files.
    installer = getInstallerProgramFile();
    String name = installer.getName();
    programName = (name.substring(0, name.indexOf("-"))).toUpperCase();

    // Get the installation information
    thisJava = System.getProperty("java.version");
    thisJavaBits = System.getProperty("sun.arch.data.model") + " bits";

    // Find the ImageIO Tools and get the version
    String javaHome = System.getProperty("java.home");
    File extDir = new File(javaHome);
    extDir = new File(extDir, "lib");
    extDir = new File(extDir, "ext");
    File clib = getFile(extDir, "clibwrapper_jiio", ".jar");
    File jai = getFile(extDir, "jai_imageio", ".jar");
    imageIOTools = (clib != null) && clib.exists() && (jai != null) && jai.exists();
    if (imageIOTools) {
      Hashtable<String, String> jaiManifest = getManifestAttributes(jai);
      imageIOVersion = jaiManifest.get("Implementation-Version");
    }

    // Get the CTP.jar parameters
    Hashtable<String, String> manifest = getJarManifestAttributes("/CTP/libraries/CTP.jar");
    programDate = manifest.get("Date");
    buildJava = manifest.get("Java-Version");

    // Get the util.jar parameters
    Hashtable<String, String> utilManifest = getJarManifestAttributes("/CTP/libraries/util.jar");
    utilJava = utilManifest.get("Java-Version");

    // Get the MIRC.jar parameters (if the plugin is present)
    Hashtable<String, String> mircManifest = getJarManifestAttributes("/CTP/libraries/MIRC.jar");
    if (mircManifest != null) {
      mircJava = mircManifest.get("Java-Version");
      mircDate = mircManifest.get("Date");
      mircVersion = mircManifest.get("Version");
    }

    // Set up the installation information for display
    if (imageIOVersion.equals("")) {
      imageIOVersion = "<b><font color=\"red\">not installed</font></b>";
    } else if (imageIOVersion.startsWith("1.0")) {
      imageIOVersion = "<b><font color=\"red\">" + imageIOVersion + "</font></b>";
    }
    if (thisJavaBits.startsWith("64")) {
      thisJavaBits = "<b><font color=\"red\">" + thisJavaBits + "</font></b>";
    }
    boolean javaOK = (thisJava.compareTo(buildJava) >= 0);
    javaOK &= (thisJava.compareTo(utilJava) >= 0);
    javaOK &= (thisJava.compareTo(mircJava) >= 0);
    if (!javaOK) {
      thisJava = "<b><font color=\"red\">" + thisJava + "</font></b>";
    }

    if (directory == null) exit();

    // Point to the parent of the directory in which to install the program.
    // so the copy process works correctly for directory trees.
    //
    // If the user has selected a directory named "CTP",
    // then assume that this is the directory in which
    // to install the program.
    //
    // If the directory is not CTP, see if it is called "RSNA" and contains
    // the Launcher program, in which case we can assume that it is an
    // installation that was done with Bill Weadock's all-in-one installer for Windows.
    //
    // If neither of those cases is true, then this is already the parent of the
    // directory in which to install the program
    if (directory.getName().equals("CTP")) {
      directory = directory.getParentFile();
    } else if (directory.getName().equals("RSNA")
        && (new File(directory, "Launcher.jar")).exists()) {
      suppressFirstPathElement = true;
    }

    // Cleanup old releases
    cleanup(directory);

    // Get a port for the server.
    if (port < 0) {
      if (checkServer(-port, false)) {
        System.err.println(
            "CTP appears to be running.\nPlease stop CTP and run the installer again.");
        System.exit(0);
      }
    }

    // Now install the files and report the results.
    int count =
        unpackZipFile(installer, "CTP", directory.getAbsolutePath(), suppressFirstPathElement);
    if (count > 0) {
      // Create the service installer batch files.
      updateWindowsServiceInstaller();
      updateLinuxServiceInstaller();

      // If this was a new installation, set up the config file and set the port
      installConfigFile(port);

      // Make any necessary changes in the config file to reflect schema evolution
      fixConfigSchema();

      System.out.println("Installation complete.");
      System.out.println(programName + " has been installed successfully.");
      System.out.println(count + " files were installed.");
    } else {
      System.err.println("Installation failed.");
      System.err.println(programName + " could not be fully installed.");
    }
    if (!programName.equals("ISN") && startRunner(new File(directory, "CTP"))) System.exit(0);
  }
Exemplo n.º 11
0
  private void cleanup(File directory) {
    // Clean up from old installations, removing or renaming files.
    // Note that directory is the parent of the CTP directory
    // unless the original installation was done by Bill Weadock's
    // all-in-one installer for Windows.

    // Get a file pointing to the CTP directory.
    // This might be the current directory, or
    // it might be the CTP child.
    File dir;
    if (directory.getName().equals("RSNA")) dir = directory;
    else dir = new File(directory, "CTP");

    // If CTP.jar exists in this directory, it is a really
    // old CTP main file - not used anymore
    File ctp = new File(dir, "CTP.jar");
    if (ctp.exists()) ctp.delete();

    // These are old names for the Launcher.jar file
    File launcher = new File(dir, "CTP-launcher.jar");
    if (launcher.exists()) launcher.delete();
    launcher = new File(dir, "TFS-launcher.jar");
    if (launcher.exists()) launcher.delete();

    // Delete the obsolete CTP-runner.jar file
    File runner = new File(dir, "CTP-runner.jar");
    if (runner.exists()) runner.delete();

    // Delete the obsolete MIRC-copier.jar file
    File copier = new File(dir, "MIRC-copier.jar");
    if (copier.exists()) copier.delete();

    // Rename the old versions of the properties files
    File oldprops = new File(dir, "CTP-startup.properties");
    File newprops = new File(dir, "CTP-launcher.properties");
    File correctprops = new File(dir, "Launcher.properties");
    if (oldprops.exists()) {
      if (newprops.exists() || correctprops.exists()) oldprops.delete();
      else oldprops.renameTo(correctprops);
    }
    if (newprops.exists()) {
      if (correctprops.exists()) newprops.delete();
      else newprops.renameTo(correctprops);
    }

    // Get rid of obsolete startup and shutdown programs
    File startup = new File(dir, "CTP-startup.jar");
    if (startup.exists()) startup.delete();
    File shutdown = new File(dir, "CTP-shutdown.jar");
    if (shutdown.exists()) shutdown.delete();

    // Get rid of the obsolete linux directory
    File linux = new File(dir, "linux");
    if (linux.exists()) {
      startup = new File(linux, "CTP-startup.jar");
      if (startup.exists()) startup.delete();
      shutdown = new File(linux, "CTP-shutdown.jar");
      if (shutdown.exists()) shutdown.delete();
      linux.delete();
    }

    // clean up the libraries directory
    File libraries = new File(dir, "libraries");
    if (libraries.exists()) {
      // remove obsolete versions of the slf4j libraries
      // and the dcm4che-imageio libraries
      File[] files = libraries.listFiles();
      for (File file : files) {
        if (file.isFile()) {
          String name = file.getName();
          if (name.startsWith("slf4j-") || name.startsWith("dcm4che-imageio-rle")) {
            file.delete();
          }
        }
      }
      // remove the email subdirectory
      File email = new File(libraries, "email");
      deleteAll(email);
      // remove the xml subdirectory
      File xml = new File(libraries, "xml");
      deleteAll(xml);
      // remove the sftp subdirectory
      File sftp = new File(libraries, "sftp");
      deleteAll(xml);
      // move edtftpj.jar to the ftp directory
      File edtftpj = new File(libraries, "edtftpj.jar");
      if (edtftpj.exists()) {
        File ftp = new File(libraries, "ftp");
        ftp.mkdirs();
        File ftpedtftpj = new File(ftp, "edtftpj.jar");
        edtftpj.renameTo(ftpedtftpj);
      }
    }

    // remove the obsolete xml library under dir
    File xml = new File(dir, "xml");
    deleteAll(xml);

    // remove the dicom profiles so any
    // obsolete files will disappear
    File profiles = new File(dir, "profiles");
    File dicom = new File(profiles, "dicom");
    deleteAll(dicom);
    dicom.mkdirs();

    // Remove the index.html file so it will be rebuilt from
    // example-index.html when the system next starts.
    File root = new File(dir, "ROOT");
    if (root.exists()) {
      File index = new File(root, "index.html");
      index.delete();
    }
  }
Exemplo n.º 12
0
 public boolean accept(File file) {
   String name = file.getName();
   return name.startsWith(nameStart) && name.endsWith(nameEnd);
 }
Exemplo n.º 13
0
 public boolean accept(File f) {
   return f.canRead()
       && (f.getName().endsWith(".xml")
           || (recursive && f.isDirectory() && !"lib".equalsIgnoreCase(f.getName())));
 }
Exemplo n.º 14
0
  /** runs the BB features */
  public void runBB() {
    File f = new File(sourceFile);
    String sourceFileName = f.getName();
    f = new File(targetFile);
    String targetFileName = f.getName();
    String outputFileName = sourceFileName + "_to_" + targetFileName + ".out";

    File file = new File(resourceManager.getString("output"));
    if (!file.exists()) {
      System.err.println("Creating dir: " + resourceManager.getString("output"));
      Logger.log("Creating dir: " + resourceManager.getString("output"));
      file.mkdirs();
    } else {
      Logger.log("output dir exists: " + resourceManager.getString("output"));
    }

    String out = resourceManager.getString("output") + File.separator + outputFileName;
    System.out.println("Output will be: " + out);

    String pplSourcePath =
        resourceManager.getString("input")
            + File.separator
            + sourceLang
            + File.separator
            + sourceFileName
            + resourceManager.getString("tools.ngram.output.ext");
    String pplTargetPath =
        resourceManager.getString("input")
            + File.separator
            + targetLang
            + File.separator
            + targetFileName
            + resourceManager.getString("tools.ngram.output.ext");

    String pplPOSTargetPath =
        resourceManager.getString("input")
            + File.separator
            + targetLang
            + File.separator
            + targetFileName
            + PosTagger.getXPOS()
            + resourceManager.getString("tools.ngram.output.ext");
    runNGramPPL();

    FileModel fm = new FileModel(sourceFile, resourceManager.getString(sourceLang + ".corpus"));

    // FileModel fm = new FileModel(sourceFile,
    //     resourceManager.getString("source" + ".corpus"));

    PPLProcessor pplProcSource =
        new PPLProcessor(pplSourcePath, new String[] {"logprob", "ppl", "ppl1"});
    PPLProcessor pplProcTarget =
        new PPLProcessor(pplTargetPath, new String[] {"logprob", "ppl", "ppl1"});

    String sourcePosOutput = null;
    String targetPosOutput = null;
    PPLProcessor pplPosTarget = null;
    if (!isBaseline) {
      sourcePosOutput = runPOS(sourceFile, sourceLang, "source");
      targetPosOutput = runPOS(targetFile, targetLang, "target");

      String targetPPLPos = runNGramPPLPos(targetPosOutput + PosTagger.getXPOS());
      System.out.println("---------TARGET PPLPOS: " + targetPPLPos);
      pplPosTarget =
          new PPLProcessor(targetPPLPos, new String[] {"poslogprob", "posppl", "posppl1"});
    }

    loadGiza();
    processNGrams();
    boolean gl = false;
    String temp0 = resourceManager.getString("GL");
    if (null != temp0 && temp0.equals("1")) {
      gl = true;
    }

    if (gl) {
      loadGlobalLexicon();
    }

    // Preparing the indices for IR_similarity_features
    Lucene sourceLuc = null;
    Lucene targetLuc = null;
    if (featureManager.hasFeature("1700")) {
      // The indices reside under lang_resources path
      String lang_resources = workDir + File.separator + "lang_resources";
      // Indices are saved under: luceneIndex folder
      String source_lucene_path =
          lang_resources + File.separator + sourceLang + File.separator + "luceneIndex";
      // The corpus to index
      String source_lucene_corpus = source_lucene_path + File.separator + sourceLang + ".corpus";
      //			System.out.println("SOURCE: " + source_lucene_path + " ||| " + source_lucene_corpus);
      try {
        sourceLuc = new Lucene(source_lucene_path, source_lucene_corpus, true, true, "Source");
      } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
      }
      String target_lucene_path =
          lang_resources + File.separator + targetLang + File.separator + "luceneIndex";
      String target_lucene_corpus = target_lucene_path + File.separator + targetLang + ".corpus";
      //			System.out.println("TARGET: " + target_lucene_path + " ||| " + target_lucene_corpus);
      try {
        targetLuc = new Lucene(target_lucene_path, target_lucene_corpus, true, true, "Target");
      } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
      }
    }

    // MQM kicks in
    MQMManager.getInstance().initialize(resourceManager);
    Context context = new Context();
    context.setSourceFilePath(sourceFile);
    context.setTargetFilePath(targetFile);
    MQMManager.getInstance().globalProcessing(context);

    try {
      BufferedReader brSource = new BufferedReader(new FileReader(sourceFile));
      BufferedReader brTarget = new BufferedReader(new FileReader(targetFile));
      BufferedWriter output = new BufferedWriter(new FileWriter(out));
      BufferedReader posSource = null;
      BufferedReader posTarget = null;
      boolean posSourceExists = ResourceManager.isRegistered("sourcePosTagger");
      boolean posTargetExists = ResourceManager.isRegistered("targetPosTagger");
      POSProcessor posSourceProc = null;
      POSProcessor posTargetProc = null;

      // lefterav: Berkeley parser modifications start here
      // Check if user has defined the grammar files for source
      // and target language

      //   if ( ResourceManager.isRegistered("BParser")){
      boolean bp = false;
      String temp = resourceManager.getString("BP");
      if (null != temp && temp.equals("1")) {
        bp = true;
      }

      BParserProcessor sourceParserProcessor = null;
      BParserProcessor targetParserProcessor = null;

      if (bp) {
        sourceParserProcessor = new BParserProcessor();
        targetParserProcessor = new BParserProcessor();
        sourceParserProcessor.initialize(sourceFile, resourceManager, sourceLang);
        targetParserProcessor.initialize(targetFile, resourceManager, targetLang);
      }
      // }

      /** BEGIN: Added by Raphael Rubino for the Topic Model Features */
      boolean tm = false;
      String temp1 = resourceManager.getString("TM");
      if (temp1 != null && temp1.equals("1")) {
        tm = true;
      }
      TopicDistributionProcessor sourceTopicDistributionProcessor = null;
      TopicDistributionProcessor targetTopicDistributionProcessor = null;
      if (tm) {
        String sourceTopicDistributionFile =
            resourceManager.getString(sourceLang + ".topic.distribution");
        String targetTopicDistributionFile =
            resourceManager.getString(targetLang + ".topic.distribution");
        sourceTopicDistributionProcessor =
            new TopicDistributionProcessor(sourceTopicDistributionFile, "sourceTopicDistribution");
        targetTopicDistributionProcessor =
            new TopicDistributionProcessor(targetTopicDistributionFile, "targetTopicDistribution");
      }
      /* END: Added by Raphael Rubino for the Topic Model Features
       */

      if (!isBaseline) {
        if (posSourceExists) {
          posSourceProc = new POSProcessor(sourcePosOutput);
          posSource =
              new BufferedReader(
                  new InputStreamReader(new FileInputStream(sourcePosOutput), "utf-8"));
        }
        if (posTargetExists) {
          posTargetProc = new POSProcessor(targetPosOutput);
          posTarget =
              new BufferedReader(new InputStreamReader(new FileInputStream(targetPosOutput)));
        }
      }
      ResourceManager.printResources();
      Sentence sourceSent;
      Sentence targetSent;
      int sentCount = 0;

      String lineSource = brSource.readLine();
      String lineTarget = brTarget.readLine();

      /** Triggers (by David Langlois) */
      boolean tr = false;
      String temp2 = resourceManager.getString("TR");
      if (temp2 != null && temp2.equals("1")) {
        tr = true;
      }

      Triggers itl_target = null;
      TriggersProcessor itl_target_p = null;
      Triggers itl_source = null;
      TriggersProcessor itl_source_p = null;
      // TriggersProcessor itl_source_p = null;
      Triggers itl_source_target = null;
      TriggersProcessor itl_source_target_p = null;

      if (tr) {

        itl_target =
            new Triggers(
                resourceManager.getString("target.intra.triggers.file"),
                Integer.parseInt(resourceManager.getString("nb.max.triggers.target.intra")),
                resourceManager.getString("phrase.separator"));
        itl_target_p = new TriggersProcessor(itl_target);

        itl_source =
            new Triggers(
                resourceManager.getString("source.intra.triggers.file"),
                Integer.parseInt(resourceManager.getString("nb.max.triggers.source.intra")),
                resourceManager.getString("phrase.separator"));
        itl_source_p = new TriggersProcessor(itl_source);

        itl_source_target =
            new Triggers(
                resourceManager.getString("source.target.inter.triggers.file"),
                Integer.parseInt(resourceManager.getString("nb.max.triggers.source.target.inter")),
                resourceManager.getString("phrase.separator"));
        itl_source_target_p = new TriggersProcessor(itl_source_target);
      }
      /*
       * End modification for Triggers
       */

      // read in each line from the source and target files
      // create a sentence from each
      // process each sentence
      // run the features on the sentences
      while ((lineSource != null) && (lineTarget != null)) {

        // lineSource = lineSource.trim().substring(lineSource.indexOf(" ")).replace("+", "");
        sourceSent = new Sentence(lineSource, sentCount);
        targetSent = new Sentence(lineTarget, sentCount);

        //       System.out.println("Processing sentence "+sentCount);
        //     System.out.println("SORCE: " + sourceSent.getText());
        //   System.out.println("TARGET: " + targetSent.getText());

        if (posSourceExists) {
          posSourceProc.processSentence(sourceSent);
        }
        if (posTargetExists) {
          posTargetProc.processSentence(targetSent);
        }
        sourceSent.computeNGrams(3);
        targetSent.computeNGrams(3);
        pplProcSource.processNextSentence(sourceSent);
        pplProcTarget.processNextSentence(targetSent);
        if (!isBaseline) {
          pplPosTarget.processNextSentence(targetSent);
        }

        // lefterav: Parse code here

        if (bp) {
          sourceParserProcessor.processNextSentence(sourceSent);
          targetParserProcessor.processNextSentence(targetSent);
        }

        if (tm) {

          sourceTopicDistributionProcessor.processNextSentence(sourceSent);
          targetTopicDistributionProcessor.processNextSentence(targetSent);
        }

        // modified by David
        if (tr) {
          itl_source_p.processNextSentence(sourceSent);
          itl_target_p.processNextSentence(targetSent);
          itl_source_target_p.processNextParallelSentences(sourceSent, targetSent);
        }
        // end modification by David

        // MQM kicks in
        MQMManager.getInstance().processNextParallelSentences(sourceSent, targetSent);

        // Ergun
        if (featureManager.hasFeature("1700")) {
          sourceLuc.processNextSentence(sourceSent);
          targetLuc.processNextSentence(targetSent);
        }

        ++sentCount;
        output.write(featureManager.runFeatures(sourceSent, targetSent));
        output.newLine();
        lineSource = brSource.readLine();
        lineTarget = brTarget.readLine();
      }
      if (posSource != null) {
        posSource.close();
      }
      if (posTarget != null) {
        posTarget.close();
      }

      brSource.close();
      brTarget.close();
      output.close();
      Logger.close();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
Exemplo n.º 15
0
  /**
   * Performs some basic processing of the input source and target files For English, this consists
   * of converting the input to lower case and tokenizing For Arabic, this consists of
   * transliteration and tokenization. Please note that the current tools used for tokenizing Arabic
   * also perform POS tagging and morphological analysis Although we could separate the tokenization
   * process from the more in-depth text analysis performed by these tools, for efficiency reasons
   * this is not desirable The input files are also copied to the /input folder. This is necessary
   * because the MADA analyser produces its output in the same folder as the input file, which may
   * cause problems if the right access rights are not available for that particular folder
   */
  private static void preprocessing() {
    String sourceInputFolder = input + File.separator + sourceLang;
    String targetInputFolder = input + File.separator + targetLang;
    File origSourceFile = new File(sourceFile);
    File inputSourceFile = new File(sourceInputFolder + File.separator + origSourceFile.getName());

    System.out.println("source input:" + sourceFile);
    System.out.println("target input:" + targetFile);
    File origTargetFile = new File(targetFile);
    File inputTargetFile = new File(targetInputFolder + File.separator + origTargetFile.getName());
    try {
      System.out.println("copying input to " + inputSourceFile.getPath());
      copyFile(origSourceFile, inputSourceFile);
      System.out.println("copying input to " + inputTargetFile.getPath());
      copyFile(origTargetFile, inputTargetFile);
    } catch (Exception e) {
      e.printStackTrace();
    }

    // run tokenizer for source (English)
    System.out.println("running tokenizer");

    String src_abbr = "";
    if (sourceLang.equalsIgnoreCase("english")) src_abbr = "en";
    else if (sourceLang.equalsIgnoreCase("spanish")) src_abbr = "es";
    else if (sourceLang.equalsIgnoreCase("french")) src_abbr = "fr";
    else if (sourceLang.equalsIgnoreCase("german")) src_abbr = "de";
    else if (targetLang.equalsIgnoreCase("dutch")) src_abbr = "nl";
    else if (targetLang.equalsIgnoreCase("portuguese")) src_abbr = "pt";
    else if (targetLang.equalsIgnoreCase("czech")) tgt_abbr = "cs";
    else System.out.println("Don't recognise the source language");

    String tgt_abbr = "";
    if (targetLang.equalsIgnoreCase("english")) tgt_abbr = "en";
    else if (targetLang.equalsIgnoreCase("spanish")) tgt_abbr = "es";
    else if (targetLang.equalsIgnoreCase("french")) tgt_abbr = "fr";
    else if (targetLang.equalsIgnoreCase("german")) tgt_abbr = "de";
    else if (targetLang.equalsIgnoreCase("dutch")) tgt_abbr = "nl";
    else if (targetLang.equalsIgnoreCase("portuguese")) tgt_abbr = "pt";
    else if (targetLang.equalsIgnoreCase("czech")) tgt_abbr = "cs";
    else System.out.println("Don't recognise the target language");

    String truecasePath = "";
    if (null != resourceManager.getProperty(sourceLang + ".lowercase")) {
      truecasePath = resourceManager.getProperty(sourceLang + ".lowercase") + " -q ";
    } else {
      truecasePath =
          resourceManager.getString(sourceLang + ".truecase")
              + " --model "
              + resourceManager.getString(sourceLang + ".truecase.model");
    }
    Tokenizer enTok =
        new Tokenizer(
            inputSourceFile.getPath(),
            inputSourceFile.getPath() + ".tok",
            truecasePath,
            resourceManager.getString(sourceLang + ".tokenizer"),
            src_abbr,
            forceRun);

    // Tokenizer enTok = new Tokenizer(inputSourceFile.getPath(), inputSourceFile.getPath() +
    // ".tok", resourceManager.getString("english.lowercase"),
    // resourceManager.getString("english.tokenizer"), "en", forceRun);
    enTok.run();
    sourceFile = enTok.getTok();
    System.out.println(sourceFile);
    // run tokenizer for target (Spanish)
    System.out.println("running tokenizer");
    //        Tokenizer esTok = new Tokenizer(inputTargetFile.getPath(), inputTargetFile.getPath() +
    // ".tok", resourceManager.getString("spanish.lowercase"),
    // resourceManager.getString("spanish.tokenizer"), "es", forceRun);

    if (null != resourceManager.getProperty(targetLang + ".lowercase")) {
      truecasePath = resourceManager.getProperty(targetLang + ".lowercase") + " -q ";
    } else {
      truecasePath =
          resourceManager.getString(targetLang + ".truecase")
              + " --model "
              + resourceManager.getString(targetLang + ".truecase.model");
    }
    Tokenizer esTok =
        new Tokenizer(
            inputTargetFile.getPath(),
            inputTargetFile.getPath() + ".tok",
            truecasePath,
            resourceManager.getString(targetLang + ".tokenizer"),
            tgt_abbr,
            forceRun);

    esTok.run();
    targetFile = esTok.getTok();
    System.out.println(targetFile);

    // Normalize files to avoid strange characters in UTF-8 that may break the PoS tagger
    // normalize_utf8();
  }
Exemplo n.º 16
0
  public void runAll() {
    File f = new File(sourceFile);
    String sourceFileName = f.getName();
    f = new File(targetFile);
    String targetFileName = f.getName();
    String outputFileName = sourceFileName + "_to_" + targetFileName + ".out";
    String out = resourceManager.getString("output") + File.separator + outputFileName;
    System.out.println("Output will be: " + out);

    MTOutputProcessor mtop = null;

    if (gbMode == 1) gbXML = initialiseGBResources();

    String nbestSentPath =
        resourceManager.getString("input") + File.separator + targetLang + File.separator + "temp";
    String ngramExecPath = resourceManager.getString("tools.ngram.path");

    mtop = new MTOutputProcessor(gbXML, nbestSentPath, ngramExecPath, ngramSize);

    // wlv.mt.features.coherence.Coherence coh = new wlv.mt.features.coherence.Coherence(
    //		getTargetFile());

    String pplSourcePath =
        resourceManager.getString("input")
            + File.separator
            + sourceLang
            + File.separator
            + sourceFileName
            + resourceManager.getString("tools.ngram.output.ext");
    String pplTargetPath =
        resourceManager.getString("input")
            + File.separator
            + targetLang
            + File.separator
            + targetFileName
            + resourceManager.getString("tools.ngram.output.ext");

    String pplPOSTargetPath =
        resourceManager.getString("input")
            + File.separator
            + targetLang
            + File.separator
            + targetFileName
            + PosTagger.getXPOS()
            + resourceManager.getString("tools.ngram.output.ext");
    runNGramPPL();

    PPLProcessor pplProcSource =
        new PPLProcessor(pplSourcePath, new String[] {"logprob", "ppl", "ppl1"});
    PPLProcessor pplProcTarget =
        new PPLProcessor(pplTargetPath, new String[] {"logprob", "ppl", "ppl1"});

    FileModel fm = new FileModel(sourceFile, resourceManager.getString(sourceLang + ".corpus"));
    String sourcePosOutput = runPOS(sourceFile, sourceLang, "source");
    String targetPosOutput = runPOS(targetFile, targetLang, "target");

    String targetPPLPos = runNGramPPLPos(targetPosOutput + PosTagger.getXPOS());
    System.out.println("---------TARGET PPLPOS: " + targetPPLPos);
    PPLProcessor pplPosTarget =
        new PPLProcessor(targetPPLPos, new String[] {"poslogprob", "posppl", "posppl1"});

    loadGiza();
    processNGrams();

    try {
      BufferedReader brSource = new BufferedReader(new FileReader(sourceFileName));
      BufferedReader brTarget = new BufferedReader(new FileReader(targetFileName));
      BufferedWriter output = new BufferedWriter(new FileWriter(out));
      BufferedReader posSource = null;
      BufferedReader posTarget = null;
      boolean posSourceExists = ResourceManager.isRegistered("sourcePosTagger");
      boolean posTargetExists = ResourceManager.isRegistered("targetPosTagger");
      POSProcessor posSourceProc = null;
      POSProcessor posTargetProc = null;
      if (posSourceExists) {
        posSourceProc = new POSProcessor(sourcePosOutput);
        posSource =
            new BufferedReader(
                new InputStreamReader(new FileInputStream(sourcePosOutput), "utf-8"));
      }
      if (posTargetExists) {
        posTargetProc = new POSProcessor(targetPosOutput);
        posTarget = new BufferedReader(new InputStreamReader(new FileInputStream(targetPosOutput)));
      }
      ResourceManager.printResources();

      Sentence targetSent;
      // HACK
      Sentence sourceSent;
      int sentCount = 0;
      // HACK
      String lineSource = brSource.readLine();
      String lineTarget = brTarget.readLine();
      // HACK
      int result;

      while ((lineSource != null) && (lineTarget != null)) {

        // the MADA-tokenised files contain start each sentence with the setence ID. We put it there
        // (why?) - no we've got to remove it

        lineSource = lineSource.trim().substring(lineSource.indexOf(" ")).replace("+", "");
        sourceSent = new Sentence(lineSource, sentCount);
        targetSent = new Sentence(lineTarget, sentCount);
        System.out.println("Processing sentence " + sentCount);
        if (posSourceExists) {

          posSourceProc.processSentence(sourceSent);
        }
        if (posTargetExists) {

          posTargetProc.processSentence(targetSent);
        }

        sourceSent.computeNGrams(3);
        targetSent.computeNGrams(3);

        pplProcSource.processNextSentence(sourceSent);

        pplProcTarget.processNextSentence(targetSent);

        pplPosTarget.processNextSentence(targetSent);

        //				coh.processNextSentence(targetSent);

        mtop.processNextSentence(sourceSent);

        ++sentCount;
        output.write(featureManager.runFeatures(sourceSent, targetSent));
        output.write("\r\n");

        lineSource = brSource.readLine();
        lineTarget = brTarget.readLine();
      }
      //		featureManager.printFeatureIndeces();
      if (posSource != null) {
        posSource.close();
      }
      if (posTarget != null) {
        posTarget.close();
      }

      brSource.close();
      brTarget.close();
      output.close();

      Logger.close();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }