private static void index_h(String prefix, File file, IndexWriter indexWriter) throws IOException { Document doc = null; if (file.isDirectory()) { File files[] = file.listFiles(); for (File file1 : files) { index_h(prefix + FILE_SEPARATOR + file.getName(), file1, indexWriter); } } else { String content = FileUtils.readFileToString(file, "utf-8"); System.out.println("=============================================================="); System.out.println("index_h " + content); System.out.println("=============================================================="); String filename = prefix + FILE_SEPARATOR + file.getName(); String path = file.getAbsolutePath(); doc = new Document(); doc.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("relative_path", filename, Field.Store.YES, Field.Index.NOT_ANALYZED)); indexWriter.addDocument(doc); } }
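For context, index_h stores two fields per file: the analysed "content" field for full-text search and the stored, non-analysed "relative_path" field for retrieval. Below is a minimal, self-contained sketch (not part of the original class) of how such an index could be queried; it assumes Lucene 3.x, which matches the Field.Store/Field.Index API used above, and the index directory and query string are placeholders.

import java.io.File;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class IndexQueryExample {
  public static void main(String[] args) throws Exception {
    // Open the index directory written by the IndexWriter passed to index_h (path is a placeholder)
    IndexReader reader = IndexReader.open(FSDirectory.open(new File("/path/to/index")));
    IndexSearcher searcher = new IndexSearcher(reader);
    // Search the analysed "content" field
    QueryParser parser =
        new QueryParser(Version.LUCENE_36, "content", new StandardAnalyzer(Version.LUCENE_36));
    TopDocs hits = searcher.search(parser.parse("some query text"), 10);
    for (ScoreDoc sd : hits.scoreDocs) {
      // "relative_path" was stored with Field.Store.YES, so it can be read back here
      System.out.println(searcher.doc(sd.doc).get("relative_path") + " score=" + sd.score);
    }
    searcher.close();
    reader.close();
  }
}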
/** runs the GB features */ public void runGB() { MTOutputProcessor mtop = null; if (gbMode == 1) gbXML = initialiseGBResources(); String nbestSentPath = resourceManager.getString("input") + File.separator + targetLang + File.separator + "temp"; String ngramExecPath = resourceManager.getString("tools.ngram.path"); mtop = new MTOutputProcessor(gbXML, nbestSentPath, ngramExecPath, ngramSize); // MorphAnalysisProcessor map = new MorphAnalysisProcessor(madaFile); File f = new File(sourceFile); String sourceFileName = f.getName(); f = new File(targetFile); String targetFileName = f.getName(); String outputFileName = sourceFileName + "_to_" + targetFileName + ".out"; String out = resourceManager.getString("output") + File.separator + getMod() + outputFileName; System.out.println("Output will be: " + out); String lineTarget; try { BufferedReader brSource = new BufferedReader(new FileReader(sourceFile)); BufferedReader brTarget = new BufferedReader(new FileReader(targetFile)); BufferedWriter output = new BufferedWriter(new FileWriter(out)); ResourceManager.printResources(); Sentence targetSent; Sentence sourceSent; int sentCount = 0; String lineSource; while (((lineSource = brSource.readLine()) != null) && ((lineTarget = brTarget.readLine()) != null)) { lineSource = lineSource.trim().substring(lineSource.indexOf(" ")); sourceSent = new Sentence(lineSource, sentCount); targetSent = new Sentence(lineTarget, sentCount); // map.processNextSentence(sourceSent); mtop.processNextSentence(sourceSent); ++sentCount; output.write(featureManager.runFeatures(sourceSent, targetSent)); output.write("\r\n"); } brSource.close(); brTarget.close(); output.close(); featureManager.printFeatureIndeces(); Logger.close(); } catch (Exception e) { e.printStackTrace(); } }
/* * Computes the perplexity and log probability for the source and target files. * Required by BB features 8-13 */ private static void runNGramPPL() { // required by BB features 8-13 NGramExec nge = new NGramExec(resourceManager.getString("tools.ngram.path"), forceRun); System.out.println("runNgramPPL"); File f = new File(sourceFile); String sourceOutput = input + File.separator + sourceLang + File.separator + f.getName() + ".ppl"; f = new File(targetFile); String targetOutput = input + File.separator + targetLang + File.separator + f.getName() + ".ppl"; nge.runNGramPerplex(sourceFile, sourceOutput, resourceManager.getString(sourceLang + ".lm")); System.out.println(resourceManager.getString(targetLang + ".lm")); nge.runNGramPerplex(targetFile, targetOutput, resourceManager.getString(targetLang + ".lm")); }
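NGramExec itself is not shown in this file; it wraps the SRILM ngram tool pointed to by tools.ngram.path. Purely as a hedged illustration of the kind of command a call like runNGramPerplex(input, output, lm) presumably issues (the real wrapper may build it differently, and all paths below are placeholders):

import java.io.File;

public class NgramPerplexitySketch {
  public static void main(String[] args) throws Exception {
    // Shell out to SRILM's ngram tool: score input.txt against lang.lm and
    // write the log-probability/perplexity report to input.ppl
    ProcessBuilder pb = new ProcessBuilder(
        "/path/to/srilm/bin/ngram",   // tools.ngram.path (assumed location)
        "-lm", "/path/to/lang.lm",    // language model named in the resource file
        "-ppl", "/path/to/input.txt", // file to score
        "-debug", "1");               // sentence-level output
    pb.redirectErrorStream(true);
    pb.redirectOutput(new File("/path/to/input.ppl"));
    int exit = pb.start().waitFor();
    System.out.println("ngram exited with code " + exit);
  }
}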
// Take a tree of files starting in a directory in a zip file // and copy them to a disk directory, recreating the tree. private int unpackZipFile( File inZipFile, String directory, String parent, boolean suppressFirstPathElement) { int count = 0; if (!inZipFile.exists()) return count; parent = parent.trim(); if (!parent.endsWith(File.separator)) parent += File.separator; if (!directory.endsWith(File.separator)) directory += File.separator; File outFile = null; try { ZipFile zipFile = new ZipFile(inZipFile); Enumeration zipEntries = zipFile.entries(); while (zipEntries.hasMoreElements()) { ZipEntry entry = (ZipEntry) zipEntries.nextElement(); String name = entry.getName().replace('/', File.separatorChar); if (name.startsWith(directory)) { if (suppressFirstPathElement) name = name.substring(directory.length()); outFile = new File(parent + name); // Create the directory, just in case if (name.indexOf(File.separatorChar) >= 0) { String p = name.substring(0, name.lastIndexOf(File.separatorChar) + 1); File dirFile = new File(parent + p); dirFile.mkdirs(); } if (!entry.isDirectory()) { System.out.println("Installing " + outFile); // Copy the file BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(outFile)); BufferedInputStream in = new BufferedInputStream(zipFile.getInputStream(entry)); int size = 1024; int n = 0; byte[] b = new byte[size]; while ((n = in.read(b, 0, size)) != -1) out.write(b, 0, n); in.close(); out.flush(); out.close(); // Count the file count++; } } } zipFile.close(); } catch (Exception e) { System.err.println("...an error occurred while installing " + outFile); e.printStackTrace(); System.err.println("Error copying " + outFile.getName() + "\n" + e.getMessage()); return -count; } System.out.println(count + " files were installed."); return count; }
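For orientation, a hypothetical call site for unpackZipFile, mirroring the way the Installer constructor further down uses it; the file names here are placeholders.

// Hypothetical usage only; the real call appears in the Installer constructor below.
File installerJar = new File("CTP-installer.jar");   // placeholder installer file
File installRoot = new File("/opt/RSNA");            // placeholder install directory
int installed = unpackZipFile(installerJar, "CTP", installRoot.getAbsolutePath(), false);
if (installed <= 0) System.err.println("Nothing was installed from " + installerJar);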
private boolean deploy(File f) { try { if (log != null) log.info("deploy:" + f.getCanonicalPath()); QEntry qentry = (QEntry) dirMap.get(f); SAXBuilder builder = createSAXBuilder(); Document doc; if (decorator != null && !f.getName().equals(LOGGER_CONFIG)) { doc = decrypt(builder.build(new StringReader(decorator.decorateFile(f)))); } else { doc = decrypt(builder.build(f)); } Element rootElement = doc.getRootElement(); String iuuid = rootElement.getAttributeValue("instance"); if (iuuid != null) { UUID uuid = UUID.fromString(iuuid); if (!uuid.equals(getInstanceId())) { deleteFile(f, iuuid); return false; } } Object obj = factory.instantiate(this, rootElement); qentry.setObject(obj); ObjectInstance instance = factory.createQBean(this, doc.getRootElement(), obj); qentry.setInstance(instance); } catch (InstanceAlreadyExistsException e) { /* * Ok, the file we tried to deploy, holds an object * that already has been deployed. * * Rename it out of the way. * */ tidyFileAway(f, DUPLICATE_EXTENSION); getLog().warn("deploy", e); return false; } catch (Exception e) { getLog().warn("deploy", e); tidyFileAway(f, ERROR_EXTENSION); // This will also save deploy error repeats... return false; } catch (Error e) { getLog().warn("deploy", e); tidyFileAway(f, ENV_EXTENSION); // This will also save deploy error repeats... return false; } return true; }
/** * Computes the perplexity and log probability for the POS-tagged target file<br> * Required by BB features 68-69<br> * This function could be merged with {@link #runNGramPPL()}, but they are kept separate to make the code more readable * * @param posFile file tagged with parts-of-speech * @return path to the perplexity output file produced for posFile */ private String runNGramPPLPos(String posFile) { NGramExec nge = new NGramExec(resourceManager.getString("tools.ngram.path"), forceRun); File f = new File(posFile); String posTargetOutput = input + File.separator + targetLang + File.separator + f.getName() + resourceManager.getString("tools.ngram.output.ext"); nge.runNGramPerplex(posFile, posTargetOutput, resourceManager.getString(targetLang + ".poslm")); return posTargetOutput; }
/** * Initialises the resources needed by the GB features: converts the MT system output (the Moses * n-best list when mtSys == MOSES) into the XML file that the GB feature processing reads * * @return path to the generated XML file */ public String initialiseGBResources() { // transform the MT output to xml String xmlOut = resourceManager.getString("input") + File.separator + "systems" + File.separator; File f = new File(sourceFile); if (mtSys == MOSES) { xmlOut += "moses_" + f.getName() + ".xml"; System.out.println(xmlOut); MOSES_XMLWrapper cmuwrap = new MOSES_XMLWrapper(nbestInput, xmlOut, onebestPhrases, onebestLog); cmuwrap.run(); // now send the xml output from cmuwrap to be processed } return xmlOut; }
public void process() { File inFolder = new File(vectorFolder); if (!inFolder.isDirectory()) { System.out.println("In should be a folder: " + vectorFolder); return; } boolean clusterSaved = false; this.invertedFixedClases = loadFixedClasses(fixedClassesFile); if (!histogram) { Map<String, List<String>> sectors = loadStockSectors(sectorFile); sectorToClazz = convertSectorsToClazz(sectors); if (!clusterSaved) { changeClassLabels(); clusterSaved = true; } for (Map.Entry<String, Integer> entry : sectorToClazz.entrySet()) { System.out.println(entry.getKey() + " : " + entry.getValue()); } } for (File inFile : inFolder.listFiles()) { String fileName = inFile.getName(); String fileNameWithOutExt = FilenameUtils.removeExtension(fileName); if (histogram) { sectorToClazz.clear(); invertedSectors.clear(); Map<String, List<String>> sectors = loadHistoSectors(sectorFile + "/" + fileNameWithOutExt + ".csv"); sectorToClazz = convertSectorsToClazz(sectors); if (!clusterSaved) { changeClassLabels(); clusterSaved = true; } for (Map.Entry<String, Integer> entry : sectorToClazz.entrySet()) { System.out.println(entry.getKey() + " : " + entry.getValue()); } } processFile(fileNameWithOutExt); } }
/** * Runs the part-of-speech tagger * * @param file input file * @param lang language * @param type source or target * @return path to the output file of the POS tagger */ public String runPOS(String file, String lang, String type) { String posName = resourceManager.getString(lang + ".postagger"); String langResPath = input + File.separator + lang; File f = new File(file); String absoluteSourceFilePath = f.getAbsolutePath(); String fileName = f.getName(); String relativeFilePath = langResPath + File.separator + fileName + ".pos"; String absoluteOutputFilePath = (new File(relativeFilePath)).getAbsolutePath(); String posSourceTaggerPath = resourceManager.getString(lang + ".postagger.exePath"); String outPath = ""; try { Class c = Class.forName(posName); PosTagger tagger = (PosTagger) c.newInstance(); tagger.setParameters( type, posName, posSourceTaggerPath, absoluteSourceFilePath, absoluteOutputFilePath); PosTagger.ForceRun(forceRun); outPath = tagger.run(); } catch (Exception e) { e.printStackTrace(); } // returns the path of the output file; this is for convenience only so // we don't have to calculate it again return outPath; }
/** * Class constructor; creates a new Installer object, displays a JFrame introducing the program, * allows the user to select an install directory, and copies files from the jar into the * directory. */ public Installer(String args[]) { // Inputs are --install-dir INSTALL_DIR for (int k = 0; k < args.length; k = k + 2) { switch (args[k]) { case "--install-dir": directory = new File(args[k + 1]); System.out.println(directory); break; case "--port": port = Integer.parseInt(args[k + 1]); break; } } cp = new Stream(); // Find the installer program so we can get to the files. installer = getInstallerProgramFile(); String name = installer.getName(); programName = (name.substring(0, name.indexOf("-"))).toUpperCase(); // Get the installation information thisJava = System.getProperty("java.version"); thisJavaBits = System.getProperty("sun.arch.data.model") + " bits"; // Find the ImageIO Tools and get the version String javaHome = System.getProperty("java.home"); File extDir = new File(javaHome); extDir = new File(extDir, "lib"); extDir = new File(extDir, "ext"); File clib = getFile(extDir, "clibwrapper_jiio", ".jar"); File jai = getFile(extDir, "jai_imageio", ".jar"); imageIOTools = (clib != null) && clib.exists() && (jai != null) && jai.exists(); if (imageIOTools) { Hashtable<String, String> jaiManifest = getManifestAttributes(jai); imageIOVersion = jaiManifest.get("Implementation-Version"); } // Get the CTP.jar parameters Hashtable<String, String> manifest = getJarManifestAttributes("/CTP/libraries/CTP.jar"); programDate = manifest.get("Date"); buildJava = manifest.get("Java-Version"); // Get the util.jar parameters Hashtable<String, String> utilManifest = getJarManifestAttributes("/CTP/libraries/util.jar"); utilJava = utilManifest.get("Java-Version"); // Get the MIRC.jar parameters (if the plugin is present) Hashtable<String, String> mircManifest = getJarManifestAttributes("/CTP/libraries/MIRC.jar"); if (mircManifest != null) { mircJava = mircManifest.get("Java-Version"); mircDate = mircManifest.get("Date"); mircVersion = mircManifest.get("Version"); } // Set up the installation information for display if (imageIOVersion.equals("")) { imageIOVersion = "<b><font color=\"red\">not installed</font></b>"; } else if (imageIOVersion.startsWith("1.0")) { imageIOVersion = "<b><font color=\"red\">" + imageIOVersion + "</font></b>"; } if (thisJavaBits.startsWith("64")) { thisJavaBits = "<b><font color=\"red\">" + thisJavaBits + "</font></b>"; } boolean javaOK = (thisJava.compareTo(buildJava) >= 0); javaOK &= (thisJava.compareTo(utilJava) >= 0); javaOK &= (thisJava.compareTo(mircJava) >= 0); if (!javaOK) { thisJava = "<b><font color=\"red\">" + thisJava + "</font></b>"; } if (directory == null) exit(); // Point to the parent of the directory in which to install the program. // so the copy process works correctly for directory trees. // // If the user has selected a directory named "CTP", // then assume that this is the directory in which // to install the program. // // If the directory is not CTP, see if it is called "RSNA" and contains // the Launcher program, in which case we can assume that it is an // installation that was done with Bill Weadock's all-in-one installer for Windows. 
// // If neither of those cases is true, then this is already the parent of the // directory in which to install the program if (directory.getName().equals("CTP")) { directory = directory.getParentFile(); } else if (directory.getName().equals("RSNA") && (new File(directory, "Launcher.jar")).exists()) { suppressFirstPathElement = true; } // Cleanup old releases cleanup(directory); // Get a port for the server. if (port < 0) { if (checkServer(-port, false)) { System.err.println( "CTP appears to be running.\nPlease stop CTP and run the installer again."); System.exit(0); } } // Now install the files and report the results. int count = unpackZipFile(installer, "CTP", directory.getAbsolutePath(), suppressFirstPathElement); if (count > 0) { // Create the service installer batch files. updateWindowsServiceInstaller(); updateLinuxServiceInstaller(); // If this was a new installation, set up the config file and set the port installConfigFile(port); // Make any necessary changes in the config file to reflect schema evolution fixConfigSchema(); System.out.println("Installation complete."); System.out.println(programName + " has been installed successfully."); System.out.println(count + " files were installed."); } else { System.err.println("Installation failed."); System.err.println(programName + " could not be fully installed."); } if (!programName.equals("ISN") && startRunner(new File(directory, "CTP"))) System.exit(0); }
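getJarManifestAttributes(...) and getManifestAttributes(...) are used in the constructor above but are not shown in this file. A minimal sketch, assuming they read the main manifest attributes of a jar bundled as a resource, of what such a helper could look like (this is not the original implementation):

import java.util.Hashtable;
import java.util.Map;
import java.util.jar.JarInputStream;
import java.util.jar.Manifest;

public class ManifestExample {
  // Read the main manifest attributes of a jar that is itself packaged as a resource.
  static Hashtable<String, String> readManifest(String jarResource) {
    Hashtable<String, String> table = new Hashtable<String, String>();
    try (JarInputStream jis =
        new JarInputStream(ManifestExample.class.getResourceAsStream(jarResource))) {
      Manifest mf = jis.getManifest();
      if (mf == null) return null;
      for (Map.Entry<Object, Object> e : mf.getMainAttributes().entrySet()) {
        table.put(e.getKey().toString(), e.getValue().toString());
      }
    } catch (Exception e) {
      return null; // resource missing or not a jar
    }
    return table;
  }

  public static void main(String[] args) {
    // Resource path taken from the constructor above; may be absent when run standalone
    System.out.println(readManifest("/CTP/libraries/CTP.jar"));
  }
}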
private void cleanup(File directory) { // Clean up from old installations, removing or renaming files. // Note that directory is the parent of the CTP directory // unless the original installation was done by Bill Weadock's // all-in-one installer for Windows. // Get a file pointing to the CTP directory. // This might be the current directory, or // it might be the CTP child. File dir; if (directory.getName().equals("RSNA")) dir = directory; else dir = new File(directory, "CTP"); // If CTP.jar exists in this directory, it is a really // old CTP main file - not used anymore File ctp = new File(dir, "CTP.jar"); if (ctp.exists()) ctp.delete(); // These are old names for the Launcher.jar file File launcher = new File(dir, "CTP-launcher.jar"); if (launcher.exists()) launcher.delete(); launcher = new File(dir, "TFS-launcher.jar"); if (launcher.exists()) launcher.delete(); // Delete the obsolete CTP-runner.jar file File runner = new File(dir, "CTP-runner.jar"); if (runner.exists()) runner.delete(); // Delete the obsolete MIRC-copier.jar file File copier = new File(dir, "MIRC-copier.jar"); if (copier.exists()) copier.delete(); // Rename the old versions of the properties files File oldprops = new File(dir, "CTP-startup.properties"); File newprops = new File(dir, "CTP-launcher.properties"); File correctprops = new File(dir, "Launcher.properties"); if (oldprops.exists()) { if (newprops.exists() || correctprops.exists()) oldprops.delete(); else oldprops.renameTo(correctprops); } if (newprops.exists()) { if (correctprops.exists()) newprops.delete(); else newprops.renameTo(correctprops); } // Get rid of obsolete startup and shutdown programs File startup = new File(dir, "CTP-startup.jar"); if (startup.exists()) startup.delete(); File shutdown = new File(dir, "CTP-shutdown.jar"); if (shutdown.exists()) shutdown.delete(); // Get rid of the obsolete linux directory File linux = new File(dir, "linux"); if (linux.exists()) { startup = new File(linux, "CTP-startup.jar"); if (startup.exists()) startup.delete(); shutdown = new File(linux, "CTP-shutdown.jar"); if (shutdown.exists()) shutdown.delete(); linux.delete(); } // clean up the libraries directory File libraries = new File(dir, "libraries"); if (libraries.exists()) { // remove obsolete versions of the slf4j libraries // and the dcm4che-imageio libraries File[] files = libraries.listFiles(); for (File file : files) { if (file.isFile()) { String name = file.getName(); if (name.startsWith("slf4j-") || name.startsWith("dcm4che-imageio-rle")) { file.delete(); } } } // remove the email subdirectory File email = new File(libraries, "email"); deleteAll(email); // remove the xml subdirectory File xml = new File(libraries, "xml"); deleteAll(xml); // remove the sftp subdirectory File sftp = new File(libraries, "sftp"); deleteAll(sftp); // move edtftpj.jar to the ftp directory File edtftpj = new File(libraries, "edtftpj.jar"); if (edtftpj.exists()) { File ftp = new File(libraries, "ftp"); ftp.mkdirs(); File ftpedtftpj = new File(ftp, "edtftpj.jar"); edtftpj.renameTo(ftpedtftpj); } } // remove the obsolete xml library under dir File xml = new File(dir, "xml"); deleteAll(xml); // remove the dicom profiles so any // obsolete files will disappear File profiles = new File(dir, "profiles"); File dicom = new File(profiles, "dicom"); deleteAll(dicom); dicom.mkdirs(); // Remove the index.html file so it will be rebuilt from // example-index.html when the system next starts. File root = new File(dir, "ROOT"); if (root.exists()) { File index = new File(root, "index.html"); index.delete(); } }
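deleteAll(...) is called repeatedly in cleanup(...) but is not shown in this file. A minimal sketch of the recursive delete it plausibly corresponds to (an assumption, not the original implementation):

// Recursively delete a file or directory tree; returns true if everything was removed.
private static boolean deleteAll(File file) {
  if (file == null || !file.exists()) return true;
  boolean ok = true;
  if (file.isDirectory()) {
    File[] children = file.listFiles();
    if (children != null) {
      for (File child : children) ok &= deleteAll(child);
    }
  }
  return file.delete() && ok;
}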
public boolean accept(File file) { String name = file.getName(); return name.startsWith(nameStart) && name.endsWith(nameEnd); }
public boolean accept(File f) { return f.canRead() && (f.getName().endsWith(".xml") || (recursive && f.isDirectory() && !"lib".equalsIgnoreCase(f.getName()))); }
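Both accept(File) methods above implement the java.io.FileFilter contract, so they are typically handed to File.listFiles. A short usage sketch (the directory path is a placeholder, and the anonymous filter simply restates the predicate of the second accept above without the recursive flag):

import java.io.File;
import java.io.FileFilter;

public class FilterUsageExample {
  public static void main(String[] args) {
    File deployDir = new File("/path/to/deploy"); // placeholder directory
    // Readable .xml files, plus subdirectories other than "lib"
    File[] candidates = deployDir.listFiles(new FileFilter() {
      @Override
      public boolean accept(File f) {
        return f.canRead()
            && (f.getName().endsWith(".xml")
                || (f.isDirectory() && !"lib".equalsIgnoreCase(f.getName())));
      }
    });
    if (candidates != null) {
      for (File f : candidates) System.out.println(f);
    }
  }
}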
/** runs the BB features */ public void runBB() { File f = new File(sourceFile); String sourceFileName = f.getName(); f = new File(targetFile); String targetFileName = f.getName(); String outputFileName = sourceFileName + "_to_" + targetFileName + ".out"; File file = new File(resourceManager.getString("output")); if (!file.exists()) { System.err.println("Creating dir: " + resourceManager.getString("output")); Logger.log("Creating dir: " + resourceManager.getString("output")); file.mkdirs(); } else { Logger.log("output dir exists: " + resourceManager.getString("output")); } String out = resourceManager.getString("output") + File.separator + outputFileName; System.out.println("Output will be: " + out); String pplSourcePath = resourceManager.getString("input") + File.separator + sourceLang + File.separator + sourceFileName + resourceManager.getString("tools.ngram.output.ext"); String pplTargetPath = resourceManager.getString("input") + File.separator + targetLang + File.separator + targetFileName + resourceManager.getString("tools.ngram.output.ext"); String pplPOSTargetPath = resourceManager.getString("input") + File.separator + targetLang + File.separator + targetFileName + PosTagger.getXPOS() + resourceManager.getString("tools.ngram.output.ext"); runNGramPPL(); FileModel fm = new FileModel(sourceFile, resourceManager.getString(sourceLang + ".corpus")); // FileModel fm = new FileModel(sourceFile, // resourceManager.getString("source" + ".corpus")); PPLProcessor pplProcSource = new PPLProcessor(pplSourcePath, new String[] {"logprob", "ppl", "ppl1"}); PPLProcessor pplProcTarget = new PPLProcessor(pplTargetPath, new String[] {"logprob", "ppl", "ppl1"}); String sourcePosOutput = null; String targetPosOutput = null; PPLProcessor pplPosTarget = null; if (!isBaseline) { sourcePosOutput = runPOS(sourceFile, sourceLang, "source"); targetPosOutput = runPOS(targetFile, targetLang, "target"); String targetPPLPos = runNGramPPLPos(targetPosOutput + PosTagger.getXPOS()); System.out.println("---------TARGET PPLPOS: " + targetPPLPos); pplPosTarget = new PPLProcessor(targetPPLPos, new String[] {"poslogprob", "posppl", "posppl1"}); } loadGiza(); processNGrams(); boolean gl = false; String temp0 = resourceManager.getString("GL"); if (null != temp0 && temp0.equals("1")) { gl = true; } if (gl) { loadGlobalLexicon(); } // Preparing the indices for IR_similarity_features Lucene sourceLuc = null; Lucene targetLuc = null; if (featureManager.hasFeature("1700")) { // The indices reside under lang_resources path String lang_resources = workDir + File.separator + "lang_resources"; // Indices are saved under: luceneIndex folder String source_lucene_path = lang_resources + File.separator + sourceLang + File.separator + "luceneIndex"; // The corpus to index String source_lucene_corpus = source_lucene_path + File.separator + sourceLang + ".corpus"; // System.out.println("SOURCE: " + source_lucene_path + " ||| " + source_lucene_corpus); try { sourceLuc = new Lucene(source_lucene_path, source_lucene_corpus, true, true, "Source"); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } String target_lucene_path = lang_resources + File.separator + targetLang + File.separator + "luceneIndex"; String target_lucene_corpus = target_lucene_path + File.separator + targetLang + ".corpus"; // System.out.println("TARGET: " + target_lucene_path + " ||| " + target_lucene_corpus); try { targetLuc = new Lucene(target_lucene_path, target_lucene_corpus, true, true, "Target"); } catch (Exception e) { // TODO 
Auto-generated catch block e.printStackTrace(); } } // MQM kicks in MQMManager.getInstance().initialize(resourceManager); Context context = new Context(); context.setSourceFilePath(sourceFile); context.setTargetFilePath(targetFile); MQMManager.getInstance().globalProcessing(context); try { BufferedReader brSource = new BufferedReader(new FileReader(sourceFile)); BufferedReader brTarget = new BufferedReader(new FileReader(targetFile)); BufferedWriter output = new BufferedWriter(new FileWriter(out)); BufferedReader posSource = null; BufferedReader posTarget = null; boolean posSourceExists = ResourceManager.isRegistered("sourcePosTagger"); boolean posTargetExists = ResourceManager.isRegistered("targetPosTagger"); POSProcessor posSourceProc = null; POSProcessor posTargetProc = null; // lefterav: Berkeley parser modifications start here // Check if user has defined the grammar files for source // and target language // if ( ResourceManager.isRegistered("BParser")){ boolean bp = false; String temp = resourceManager.getString("BP"); if (null != temp && temp.equals("1")) { bp = true; } BParserProcessor sourceParserProcessor = null; BParserProcessor targetParserProcessor = null; if (bp) { sourceParserProcessor = new BParserProcessor(); targetParserProcessor = new BParserProcessor(); sourceParserProcessor.initialize(sourceFile, resourceManager, sourceLang); targetParserProcessor.initialize(targetFile, resourceManager, targetLang); } // } /** BEGIN: Added by Raphael Rubino for the Topic Model Features */ boolean tm = false; String temp1 = resourceManager.getString("TM"); if (temp1 != null && temp1.equals("1")) { tm = true; } TopicDistributionProcessor sourceTopicDistributionProcessor = null; TopicDistributionProcessor targetTopicDistributionProcessor = null; if (tm) { String sourceTopicDistributionFile = resourceManager.getString(sourceLang + ".topic.distribution"); String targetTopicDistributionFile = resourceManager.getString(targetLang + ".topic.distribution"); sourceTopicDistributionProcessor = new TopicDistributionProcessor(sourceTopicDistributionFile, "sourceTopicDistribution"); targetTopicDistributionProcessor = new TopicDistributionProcessor(targetTopicDistributionFile, "targetTopicDistribution"); } /* END: Added by Raphael Rubino for the Topic Model Features */ if (!isBaseline) { if (posSourceExists) { posSourceProc = new POSProcessor(sourcePosOutput); posSource = new BufferedReader( new InputStreamReader(new FileInputStream(sourcePosOutput), "utf-8")); } if (posTargetExists) { posTargetProc = new POSProcessor(targetPosOutput); posTarget = new BufferedReader(new InputStreamReader(new FileInputStream(targetPosOutput))); } } ResourceManager.printResources(); Sentence sourceSent; Sentence targetSent; int sentCount = 0; String lineSource = brSource.readLine(); String lineTarget = brTarget.readLine(); /** Triggers (by David Langlois) */ boolean tr = false; String temp2 = resourceManager.getString("TR"); if (temp2 != null && temp2.equals("1")) { tr = true; } Triggers itl_target = null; TriggersProcessor itl_target_p = null; Triggers itl_source = null; TriggersProcessor itl_source_p = null; // TriggersProcessor itl_source_p = null; Triggers itl_source_target = null; TriggersProcessor itl_source_target_p = null; if (tr) { itl_target = new Triggers( resourceManager.getString("target.intra.triggers.file"), Integer.parseInt(resourceManager.getString("nb.max.triggers.target.intra")), resourceManager.getString("phrase.separator")); itl_target_p = new TriggersProcessor(itl_target); itl_source = new 
Triggers( resourceManager.getString("source.intra.triggers.file"), Integer.parseInt(resourceManager.getString("nb.max.triggers.source.intra")), resourceManager.getString("phrase.separator")); itl_source_p = new TriggersProcessor(itl_source); itl_source_target = new Triggers( resourceManager.getString("source.target.inter.triggers.file"), Integer.parseInt(resourceManager.getString("nb.max.triggers.source.target.inter")), resourceManager.getString("phrase.separator")); itl_source_target_p = new TriggersProcessor(itl_source_target); } /* * End modification for Triggers */ // read in each line from the source and target files // create a sentence from each // process each sentence // run the features on the sentences while ((lineSource != null) && (lineTarget != null)) { // lineSource = lineSource.trim().substring(lineSource.indexOf(" ")).replace("+", ""); sourceSent = new Sentence(lineSource, sentCount); targetSent = new Sentence(lineTarget, sentCount); // System.out.println("Processing sentence "+sentCount); // System.out.println("SORCE: " + sourceSent.getText()); // System.out.println("TARGET: " + targetSent.getText()); if (posSourceExists) { posSourceProc.processSentence(sourceSent); } if (posTargetExists) { posTargetProc.processSentence(targetSent); } sourceSent.computeNGrams(3); targetSent.computeNGrams(3); pplProcSource.processNextSentence(sourceSent); pplProcTarget.processNextSentence(targetSent); if (!isBaseline) { pplPosTarget.processNextSentence(targetSent); } // lefterav: Parse code here if (bp) { sourceParserProcessor.processNextSentence(sourceSent); targetParserProcessor.processNextSentence(targetSent); } if (tm) { sourceTopicDistributionProcessor.processNextSentence(sourceSent); targetTopicDistributionProcessor.processNextSentence(targetSent); } // modified by David if (tr) { itl_source_p.processNextSentence(sourceSent); itl_target_p.processNextSentence(targetSent); itl_source_target_p.processNextParallelSentences(sourceSent, targetSent); } // end modification by David // MQM kicks in MQMManager.getInstance().processNextParallelSentences(sourceSent, targetSent); // Ergun if (featureManager.hasFeature("1700")) { sourceLuc.processNextSentence(sourceSent); targetLuc.processNextSentence(targetSent); } ++sentCount; output.write(featureManager.runFeatures(sourceSent, targetSent)); output.newLine(); lineSource = brSource.readLine(); lineTarget = brTarget.readLine(); } if (posSource != null) { posSource.close(); } if (posTarget != null) { posTarget.close(); } brSource.close(); brTarget.close(); output.close(); Logger.close(); } catch (Exception e) { e.printStackTrace(); } }
/** * Performs some basic processing of the input source and target files. For English, this consists * of converting the input to lower case and tokenizing. For Arabic, this consists of * transliteration and tokenization. Please note that the current tools used for tokenizing Arabic * also perform POS tagging and morphological analysis. Although we could separate the tokenization * process from the more in-depth text analysis performed by these tools, for efficiency reasons * this is not desirable. The input files are also copied to the /input folder. This is necessary * because the MADA analyser produces its output in the same folder as the input file, which may * cause problems if the right access rights are not available for that particular folder */ private static void preprocessing() { String sourceInputFolder = input + File.separator + sourceLang; String targetInputFolder = input + File.separator + targetLang; File origSourceFile = new File(sourceFile); File inputSourceFile = new File(sourceInputFolder + File.separator + origSourceFile.getName()); System.out.println("source input:" + sourceFile); System.out.println("target input:" + targetFile); File origTargetFile = new File(targetFile); File inputTargetFile = new File(targetInputFolder + File.separator + origTargetFile.getName()); try { System.out.println("copying input to " + inputSourceFile.getPath()); copyFile(origSourceFile, inputSourceFile); System.out.println("copying input to " + inputTargetFile.getPath()); copyFile(origTargetFile, inputTargetFile); } catch (Exception e) { e.printStackTrace(); } // run tokenizer for source (English) System.out.println("running tokenizer"); String src_abbr = ""; if (sourceLang.equalsIgnoreCase("english")) src_abbr = "en"; else if (sourceLang.equalsIgnoreCase("spanish")) src_abbr = "es"; else if (sourceLang.equalsIgnoreCase("french")) src_abbr = "fr"; else if (sourceLang.equalsIgnoreCase("german")) src_abbr = "de"; else if (sourceLang.equalsIgnoreCase("dutch")) src_abbr = "nl"; else if (sourceLang.equalsIgnoreCase("portuguese")) src_abbr = "pt"; else if (sourceLang.equalsIgnoreCase("czech")) src_abbr = "cs"; else System.out.println("Don't recognise the source language"); String tgt_abbr = ""; if (targetLang.equalsIgnoreCase("english")) tgt_abbr = "en"; else if (targetLang.equalsIgnoreCase("spanish")) tgt_abbr = "es"; else if (targetLang.equalsIgnoreCase("french")) tgt_abbr = "fr"; else if (targetLang.equalsIgnoreCase("german")) tgt_abbr = "de"; else if (targetLang.equalsIgnoreCase("dutch")) tgt_abbr = "nl"; else if (targetLang.equalsIgnoreCase("portuguese")) tgt_abbr = "pt"; else if (targetLang.equalsIgnoreCase("czech")) tgt_abbr = "cs"; else System.out.println("Don't recognise the target language"); String truecasePath = ""; if (null != resourceManager.getProperty(sourceLang + ".lowercase")) { truecasePath = resourceManager.getProperty(sourceLang + ".lowercase") + " -q "; } else { truecasePath = resourceManager.getString(sourceLang + ".truecase") + " --model " + resourceManager.getString(sourceLang + ".truecase.model"); } Tokenizer enTok = new Tokenizer( inputSourceFile.getPath(), inputSourceFile.getPath() + ".tok", truecasePath, resourceManager.getString(sourceLang + ".tokenizer"), src_abbr, forceRun); // Tokenizer enTok = new Tokenizer(inputSourceFile.getPath(), inputSourceFile.getPath() + // ".tok", resourceManager.getString("english.lowercase"), // resourceManager.getString("english.tokenizer"), "en", forceRun); enTok.run(); sourceFile = enTok.getTok(); System.out.println(sourceFile); // run tokenizer for target (Spanish) System.out.println("running tokenizer"); // Tokenizer esTok = new Tokenizer(inputTargetFile.getPath(), inputTargetFile.getPath() + // ".tok", resourceManager.getString("spanish.lowercase"), // resourceManager.getString("spanish.tokenizer"), "es", forceRun); if (null != resourceManager.getProperty(targetLang + ".lowercase")) { truecasePath = resourceManager.getProperty(targetLang + ".lowercase") + " -q "; } else { truecasePath = resourceManager.getString(targetLang + ".truecase") + " --model " + resourceManager.getString(targetLang + ".truecase.model"); } Tokenizer esTok = new Tokenizer( inputTargetFile.getPath(), inputTargetFile.getPath() + ".tok", truecasePath, resourceManager.getString(targetLang + ".tokenizer"), tgt_abbr, forceRun); esTok.run(); targetFile = esTok.getTok(); System.out.println(targetFile); // Normalize files to avoid strange characters in UTF-8 that may break the PoS tagger // normalize_utf8(); }
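A hedged sketch of a table-driven alternative to the two if/else language-code chains in preprocessing(); it only encodes the languages already named there and would avoid the kind of copy-and-paste slip corrected above.

import java.util.HashMap;
import java.util.Map;

public class LangCodes {
  private static final Map<String, String> CODES = new HashMap<String, String>();
  static {
    CODES.put("english", "en");
    CODES.put("spanish", "es");
    CODES.put("french", "fr");
    CODES.put("german", "de");
    CODES.put("dutch", "nl");
    CODES.put("portuguese", "pt");
    CODES.put("czech", "cs");
  }

  // Returns the two-letter code for a language name, or "" with a warning if unknown.
  public static String abbreviate(String language) {
    String code = CODES.get(language.toLowerCase());
    if (code == null) {
      System.out.println("Don't recognise the language: " + language);
      return "";
    }
    return code;
  }

  public static void main(String[] args) {
    System.out.println(abbreviate("English")); // en
    System.out.println(abbreviate("Czech"));   // cs
  }
}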
public void runAll() { File f = new File(sourceFile); String sourceFileName = f.getName(); f = new File(targetFile); String targetFileName = f.getName(); String outputFileName = sourceFileName + "_to_" + targetFileName + ".out"; String out = resourceManager.getString("output") + File.separator + outputFileName; System.out.println("Output will be: " + out); MTOutputProcessor mtop = null; if (gbMode == 1) gbXML = initialiseGBResources(); String nbestSentPath = resourceManager.getString("input") + File.separator + targetLang + File.separator + "temp"; String ngramExecPath = resourceManager.getString("tools.ngram.path"); mtop = new MTOutputProcessor(gbXML, nbestSentPath, ngramExecPath, ngramSize); // wlv.mt.features.coherence.Coherence coh = new wlv.mt.features.coherence.Coherence( // getTargetFile()); String pplSourcePath = resourceManager.getString("input") + File.separator + sourceLang + File.separator + sourceFileName + resourceManager.getString("tools.ngram.output.ext"); String pplTargetPath = resourceManager.getString("input") + File.separator + targetLang + File.separator + targetFileName + resourceManager.getString("tools.ngram.output.ext"); String pplPOSTargetPath = resourceManager.getString("input") + File.separator + targetLang + File.separator + targetFileName + PosTagger.getXPOS() + resourceManager.getString("tools.ngram.output.ext"); runNGramPPL(); PPLProcessor pplProcSource = new PPLProcessor(pplSourcePath, new String[] {"logprob", "ppl", "ppl1"}); PPLProcessor pplProcTarget = new PPLProcessor(pplTargetPath, new String[] {"logprob", "ppl", "ppl1"}); FileModel fm = new FileModel(sourceFile, resourceManager.getString(sourceLang + ".corpus")); String sourcePosOutput = runPOS(sourceFile, sourceLang, "source"); String targetPosOutput = runPOS(targetFile, targetLang, "target"); String targetPPLPos = runNGramPPLPos(targetPosOutput + PosTagger.getXPOS()); System.out.println("---------TARGET PPLPOS: " + targetPPLPos); PPLProcessor pplPosTarget = new PPLProcessor(targetPPLPos, new String[] {"poslogprob", "posppl", "posppl1"}); loadGiza(); processNGrams(); try { BufferedReader brSource = new BufferedReader(new FileReader(sourceFileName)); BufferedReader brTarget = new BufferedReader(new FileReader(targetFileName)); BufferedWriter output = new BufferedWriter(new FileWriter(out)); BufferedReader posSource = null; BufferedReader posTarget = null; boolean posSourceExists = ResourceManager.isRegistered("sourcePosTagger"); boolean posTargetExists = ResourceManager.isRegistered("targetPosTagger"); POSProcessor posSourceProc = null; POSProcessor posTargetProc = null; if (posSourceExists) { posSourceProc = new POSProcessor(sourcePosOutput); posSource = new BufferedReader( new InputStreamReader(new FileInputStream(sourcePosOutput), "utf-8")); } if (posTargetExists) { posTargetProc = new POSProcessor(targetPosOutput); posTarget = new BufferedReader(new InputStreamReader(new FileInputStream(targetPosOutput))); } ResourceManager.printResources(); Sentence targetSent; // HACK Sentence sourceSent; int sentCount = 0; // HACK String lineSource = brSource.readLine(); String lineTarget = brTarget.readLine(); // HACK int result; while ((lineSource != null) && (lineTarget != null)) { // the MADA-tokenised files start each sentence with the sentence ID. We put it there // (why?) - now we've got to remove it lineSource = lineSource.trim().substring(lineSource.indexOf(" ")).replace("+", ""); sourceSent = new Sentence(lineSource, sentCount); targetSent = new Sentence(lineTarget, sentCount); System.out.println("Processing sentence " + sentCount); if (posSourceExists) { posSourceProc.processSentence(sourceSent); } if (posTargetExists) { posTargetProc.processSentence(targetSent); } sourceSent.computeNGrams(3); targetSent.computeNGrams(3); pplProcSource.processNextSentence(sourceSent); pplProcTarget.processNextSentence(targetSent); pplPosTarget.processNextSentence(targetSent); // coh.processNextSentence(targetSent); mtop.processNextSentence(sourceSent); ++sentCount; output.write(featureManager.runFeatures(sourceSent, targetSent)); output.write("\r\n"); lineSource = brSource.readLine(); lineTarget = brTarget.readLine(); } // featureManager.printFeatureIndeces(); if (posSource != null) { posSource.close(); } if (posTarget != null) { posTarget.close(); } brSource.close(); brTarget.close(); output.close(); Logger.close(); } catch (Exception e) { e.printStackTrace(); } }