public void initialize(PropertiesManager propertiesManager, FeatureManager featureManager) {
    // load monolingual terminology, if it is given
    this.monoTerms = null;

    String monoTerminologyPath = propertiesManager.getProperty("terminology.monolingual-mwu");

    if (monoTerminologyPath != null) {
      this.monoTerms = new Terminology(monoTerminologyPath);
    }

    // load bilingual terminology, if it is given
    this.biTerms = null;

    String biTerminologyPath = propertiesManager.getProperty("terminology.bilingual-mwu");

    if (biTerminologyPath != null) {
      this.biTerms = new Terminology(biTerminologyPath);
    }
  }
예제 #2
0
  /**
   * Performs some basic processing of the input source and target files For English, this consists
   * of converting the input to lower case and tokenizing For Arabic, this consists of
   * transliteration and tokenization. Please note that the current tools used for tokenizing Arabic
   * also perform POS tagging and morphological analysis Although we could separate the tokenization
   * process from the more in-depth text analysis performed by these tools, for efficiency reasons
   * this is not desirable The input files are also copied to the /input folder. This is necessary
   * because the MADA analyser produces its output in the same folder as the input file, which may
   * cause problems if the right access rights are not available for that particular folder
   */
  private static void preprocessing() {
    String sourceInputFolder = input + File.separator + sourceLang;
    String targetInputFolder = input + File.separator + targetLang;
    File origSourceFile = new File(sourceFile);
    File inputSourceFile = new File(sourceInputFolder + File.separator + origSourceFile.getName());

    System.out.println("source input:" + sourceFile);
    System.out.println("target input:" + targetFile);
    File origTargetFile = new File(targetFile);
    File inputTargetFile = new File(targetInputFolder + File.separator + origTargetFile.getName());
    try {
      System.out.println("copying input to " + inputSourceFile.getPath());
      copyFile(origSourceFile, inputSourceFile);
      System.out.println("copying input to " + inputTargetFile.getPath());
      copyFile(origTargetFile, inputTargetFile);
    } catch (Exception e) {
      e.printStackTrace();
    }

    // run tokenizer for source (English)
    System.out.println("running tokenizer");

    String src_abbr = "";
    if (sourceLang.equalsIgnoreCase("english")) src_abbr = "en";
    else if (sourceLang.equalsIgnoreCase("spanish")) src_abbr = "es";
    else if (sourceLang.equalsIgnoreCase("french")) src_abbr = "fr";
    else if (sourceLang.equalsIgnoreCase("german")) src_abbr = "de";
    else if (targetLang.equalsIgnoreCase("dutch")) src_abbr = "nl";
    else if (targetLang.equalsIgnoreCase("portuguese")) src_abbr = "pt";
    else if (targetLang.equalsIgnoreCase("czech")) tgt_abbr = "cs";
    else System.out.println("Don't recognise the source language");

    String tgt_abbr = "";
    if (targetLang.equalsIgnoreCase("english")) tgt_abbr = "en";
    else if (targetLang.equalsIgnoreCase("spanish")) tgt_abbr = "es";
    else if (targetLang.equalsIgnoreCase("french")) tgt_abbr = "fr";
    else if (targetLang.equalsIgnoreCase("german")) tgt_abbr = "de";
    else if (targetLang.equalsIgnoreCase("dutch")) tgt_abbr = "nl";
    else if (targetLang.equalsIgnoreCase("portuguese")) tgt_abbr = "pt";
    else if (targetLang.equalsIgnoreCase("czech")) tgt_abbr = "cs";
    else System.out.println("Don't recognise the target language");

    String truecasePath = "";
    if (null != resourceManager.getProperty(sourceLang + ".lowercase")) {
      truecasePath = resourceManager.getProperty(sourceLang + ".lowercase") + " -q ";
    } else {
      truecasePath =
          resourceManager.getString(sourceLang + ".truecase")
              + " --model "
              + resourceManager.getString(sourceLang + ".truecase.model");
    }
    Tokenizer enTok =
        new Tokenizer(
            inputSourceFile.getPath(),
            inputSourceFile.getPath() + ".tok",
            truecasePath,
            resourceManager.getString(sourceLang + ".tokenizer"),
            src_abbr,
            forceRun);

    // Tokenizer enTok = new Tokenizer(inputSourceFile.getPath(), inputSourceFile.getPath() +
    // ".tok", resourceManager.getString("english.lowercase"),
    // resourceManager.getString("english.tokenizer"), "en", forceRun);
    enTok.run();
    sourceFile = enTok.getTok();
    System.out.println(sourceFile);
    // run tokenizer for target (Spanish)
    System.out.println("running tokenizer");
    //        Tokenizer esTok = new Tokenizer(inputTargetFile.getPath(), inputTargetFile.getPath() +
    // ".tok", resourceManager.getString("spanish.lowercase"),
    // resourceManager.getString("spanish.tokenizer"), "es", forceRun);

    if (null != resourceManager.getProperty(targetLang + ".lowercase")) {
      truecasePath = resourceManager.getProperty(targetLang + ".lowercase") + " -q ";
    } else {
      truecasePath =
          resourceManager.getString(targetLang + ".truecase")
              + " --model "
              + resourceManager.getString(targetLang + ".truecase.model");
    }
    Tokenizer esTok =
        new Tokenizer(
            inputTargetFile.getPath(),
            inputTargetFile.getPath() + ".tok",
            truecasePath,
            resourceManager.getString(targetLang + ".tokenizer"),
            tgt_abbr,
            forceRun);

    esTok.run();
    targetFile = esTok.getTok();
    System.out.println(targetFile);

    // Normalize files to avoid strange characters in UTF-8 that may break the PoS tagger
    // normalize_utf8();
  }