public void initialize(PropertiesManager propertiesManager, FeatureManager featureManager) { // load monolingual terminology, if it is given this.monoTerms = null; String monoTerminologyPath = propertiesManager.getProperty("terminology.monolingual-mwu"); if (monoTerminologyPath != null) { this.monoTerms = new Terminology(monoTerminologyPath); } // load bilingual terminology, if it is given this.biTerms = null; String biTerminologyPath = propertiesManager.getProperty("terminology.bilingual-mwu"); if (biTerminologyPath != null) { this.biTerms = new Terminology(biTerminologyPath); } }
/** * Performs some basic processing of the input source and target files For English, this consists * of converting the input to lower case and tokenizing For Arabic, this consists of * transliteration and tokenization. Please note that the current tools used for tokenizing Arabic * also perform POS tagging and morphological analysis Although we could separate the tokenization * process from the more in-depth text analysis performed by these tools, for efficiency reasons * this is not desirable The input files are also copied to the /input folder. This is necessary * because the MADA analyser produces its output in the same folder as the input file, which may * cause problems if the right access rights are not available for that particular folder */ private static void preprocessing() { String sourceInputFolder = input + File.separator + sourceLang; String targetInputFolder = input + File.separator + targetLang; File origSourceFile = new File(sourceFile); File inputSourceFile = new File(sourceInputFolder + File.separator + origSourceFile.getName()); System.out.println("source input:" + sourceFile); System.out.println("target input:" + targetFile); File origTargetFile = new File(targetFile); File inputTargetFile = new File(targetInputFolder + File.separator + origTargetFile.getName()); try { System.out.println("copying input to " + inputSourceFile.getPath()); copyFile(origSourceFile, inputSourceFile); System.out.println("copying input to " + inputTargetFile.getPath()); copyFile(origTargetFile, inputTargetFile); } catch (Exception e) { e.printStackTrace(); } // run tokenizer for source (English) System.out.println("running tokenizer"); String src_abbr = ""; if (sourceLang.equalsIgnoreCase("english")) src_abbr = "en"; else if (sourceLang.equalsIgnoreCase("spanish")) src_abbr = "es"; else if (sourceLang.equalsIgnoreCase("french")) src_abbr = "fr"; else if (sourceLang.equalsIgnoreCase("german")) src_abbr = "de"; else if (targetLang.equalsIgnoreCase("dutch")) src_abbr = "nl"; else if (targetLang.equalsIgnoreCase("portuguese")) src_abbr = "pt"; else if (targetLang.equalsIgnoreCase("czech")) tgt_abbr = "cs"; else System.out.println("Don't recognise the source language"); String tgt_abbr = ""; if (targetLang.equalsIgnoreCase("english")) tgt_abbr = "en"; else if (targetLang.equalsIgnoreCase("spanish")) tgt_abbr = "es"; else if (targetLang.equalsIgnoreCase("french")) tgt_abbr = "fr"; else if (targetLang.equalsIgnoreCase("german")) tgt_abbr = "de"; else if (targetLang.equalsIgnoreCase("dutch")) tgt_abbr = "nl"; else if (targetLang.equalsIgnoreCase("portuguese")) tgt_abbr = "pt"; else if (targetLang.equalsIgnoreCase("czech")) tgt_abbr = "cs"; else System.out.println("Don't recognise the target language"); String truecasePath = ""; if (null != resourceManager.getProperty(sourceLang + ".lowercase")) { truecasePath = resourceManager.getProperty(sourceLang + ".lowercase") + " -q "; } else { truecasePath = resourceManager.getString(sourceLang + ".truecase") + " --model " + resourceManager.getString(sourceLang + ".truecase.model"); } Tokenizer enTok = new Tokenizer( inputSourceFile.getPath(), inputSourceFile.getPath() + ".tok", truecasePath, resourceManager.getString(sourceLang + ".tokenizer"), src_abbr, forceRun); // Tokenizer enTok = new Tokenizer(inputSourceFile.getPath(), inputSourceFile.getPath() + // ".tok", resourceManager.getString("english.lowercase"), // resourceManager.getString("english.tokenizer"), "en", forceRun); enTok.run(); sourceFile = enTok.getTok(); System.out.println(sourceFile); // run tokenizer for target (Spanish) System.out.println("running tokenizer"); // Tokenizer esTok = new Tokenizer(inputTargetFile.getPath(), inputTargetFile.getPath() + // ".tok", resourceManager.getString("spanish.lowercase"), // resourceManager.getString("spanish.tokenizer"), "es", forceRun); if (null != resourceManager.getProperty(targetLang + ".lowercase")) { truecasePath = resourceManager.getProperty(targetLang + ".lowercase") + " -q "; } else { truecasePath = resourceManager.getString(targetLang + ".truecase") + " --model " + resourceManager.getString(targetLang + ".truecase.model"); } Tokenizer esTok = new Tokenizer( inputTargetFile.getPath(), inputTargetFile.getPath() + ".tok", truecasePath, resourceManager.getString(targetLang + ".tokenizer"), tgt_abbr, forceRun); esTok.run(); targetFile = esTok.getTok(); System.out.println(targetFile); // Normalize files to avoid strange characters in UTF-8 that may break the PoS tagger // normalize_utf8(); }