Пример #1
0
  /**
   * Runs the preliminary analysis on a sentence and returns {@code emptyCas} populated with the
   * result, using the normalized text as its document text.
   *
   * <p>The sentence is first analyzed into the {@code preliminaryCas} field (which is reset on
   * every call); the outcome is then copied into {@code emptyCas}.
   *
   * @param analyzer analyzer applied to the cleaned sentence
   * @param emptyCas CAS that is reset, filled and returned by this method
   * @param sentenceId identifier attached to the content being analyzed
   * @param sentence raw sentence; every {@code /} and {@code ~} is removed before analysis
   * @return the same {@code emptyCas} instance, after it has been populated
   */
  public JCas getPreliminarCas(
      Analyzer analyzer, JCas emptyCas, String sentenceId, String sentence) {
    this.preliminaryCas.reset();

    // The annotator fails badly when these two characters are present, so drop them up front.
    String cleaned = sentence.replace("/", "").replace("~", "");

    // Preliminary analysis is written into the scratch CAS held by this object.
    Analyzable content = new SimpleContent(sentenceId, cleaned, ArabicAnalyzer.ARABIC_LAN);
    analyzer.analyze(this.preliminaryCas, content);

    // Prepare the target CAS and copy the analysis over. The last copyCas argument is false;
    // presumably that leaves the document text out of the copy, which is why it is set
    // explicitly below -- confirm against the CasCopier documentation.
    emptyCas.reset();
    emptyCas.setDocumentLanguage(ArabicAnalyzer.ARABIC_LAN);
    CasCopier.copyCas(this.preliminaryCas.getCas(), emptyCas.getCas(), false);

    // The normalized text produced by the analysis becomes the document text of the result.
    NormalizedText normalized = JCasUtil.selectSingle(this.preliminaryCas, NormalizedText.class);
    emptyCas.setDocumentText(normalized.getText());

    return emptyCas;
  }
  /**
   * Sets up the English-language tooling: stopwords, pair-feature factory, subtask labels, the
   * analysis engines and the analyzer that runs them.
   *
   * @throws IOException declared by the signature; which call raises it is not visible here
   * @throws UIMAException declared by the signature; presumably raised when an analysis engine
   *     cannot be created -- confirm
   */
  private void setupUimaTools() throws IOException, UIMAException {
    // Disabled: user profiles built on the training + dev + test SemEval-2015 datasets.
    //   Document docTrain = JsoupUtils.getDoc(CQA_QL_TRAIN_EN);
    //   Document docDevel = JsoupUtils.getDoc(CQA_QL_DEV_EN);
    //   Document docTest = JsoupUtils.getDoc(CQA_QL_TEST_EN);
    //   this.userProfiles = UserProfile.createUserProfiles(docTrain, docDevel, docTest);
    // A debug loop that printed each user's signatures (recovered through
    // JsoupUtils.recoverOriginalText) was disabled here as well.

    this.stopwords = new Stopwords(Stopwords.STOPWORD_EN);

    this.pfEnglish = new PairFeatureFactoryEnglish(this.alphabet);
    this.pfEnglish.setupMeasures(RichNode.OUTPUT_PAR_LEMMA, this.stopwords);

    // Extend the stopword list with punctuation. Splitting on the pipe character yields:
    // ".", "...", a lone backslash, ",", "?", "!", "#", "(", ")", "$", "%" and "&".
    for (String punctuation : ".|...|\\|,|?|!|#|(|)|$|%|&".split("\\|")) {
      this.stopwords.add(punctuation);
    }

    // Labels of subtask A.
    for (String label : new String[] {"Not English", "Good", "Potential", "Dialogue", "Bad"}) {
      this.a_labels.add(label);
    }

    // Labels of subtask B.
    for (String label : new String[] {"No", "Yes", "Unsure"}) {
      this.b_labels.add(label);
    }

    // Analysis pipeline: segmenter, POS tagger, chunker, lemmatizer (created in this order).
    this.analysisEngineList =
        new AnalysisEngine[] {
          createEngine(createEngineDescription(OpenNlpSegmenter.class)),
          createEngine(createEngineDescription(OpenNlpPosTagger.class)),
          createEngine(createEngineDescription(OpenNlpChunker.class)),
          createEngine(createEngineDescription(StanfordLemmatizer.class))
        };

    // The analyzer is constructed with file persistence rooted at "CASes/semeval".
    this.analyzer = new Analyzer(new UIMAFilePersistence("CASes/semeval"));
    for (AnalysisEngine engine : this.analysisEngineList) {
      this.analyzer.addAE(engine);
    }
  }
Пример #3
0
  /**
   * Configures the Arabic run (stopwords, pair-feature factory, labels, analysis tools) and then
   * processes the Arabic train and dev files.
   *
   * <p>A {@code SimilarityException} or {@code IOException} raised while processing the files is
   * caught and its stack trace printed, so the method still returns normally in that case.
   *
   * @throws UIMAException declared by the signature; presumably raised by CAS or engine creation
   *     -- confirm
   */
  public void runForArabic() throws UIMAException {
    // NOTE(review): the first Stopwords instance is replaced right away by the second one, so
    // only the corpus-specific list stays in effect -- confirm whether both lists were meant to
    // be combined.
    this.stopwords = new Stopwords(Stopwords.STOPWORD_AR);
    this.stopwords = new Stopwords("semeval2015-3/arabic-corpus-specific-stopwords.txt");

    this.pfArabic = new PairFeatureFactoryArabic(this.alphabet);
    this.pfArabic.setupMeasures(RichNode.OUTPUT_PAR_TOKEN_LOWERCASE, this.stopwords);

    this.language = LANG_ARABIC;

    this.preliminaryCas = JCasFactory.createJCas();

    // For Arabic there is just one task, hence a single label set.
    for (String label : new String[] {"direct", "related", "irrelevant"}) {
      this.a_labels.add(label);
    }

    if (USE_QCRI_ALT_TOOLS) {
      // QCRI analyzer without persistence, driven by the ArabicAnalyzer engine.
      this.analyzer = new Analyzer(new UIMANoPersistence());
      AnalysisEngine arabicEngine =
          AnalysisEngineFactory.createEngine(createEngineDescription(ArabicAnalyzer.class));
      this.analyzer.addAE(arabicEngine);
    } else {
      // UIMA DKPro annotators:
      //  - whitespace tokenizer, because the Stanford Segmenter for Arabic has a very bad bug
      //    and its tokenization is completely wrong;
      //  - Stanford POS tagger, language "ar", variant "accurate".
      // NOTE(review): this branch does not assign the analyzer field, yet that field is passed
      // to processArabicFile below -- confirm it is initialized elsewhere or unused there.
      this.analysisEngineList =
          new AnalysisEngine[] {
            createEngine(createEngineDescription(WhitespaceTokenizer.class)),
            createEngine(
                createEngineDescription(
                    StanfordPosTagger.class,
                    StanfordPosTagger.PARAM_LANGUAGE,
                    "ar",
                    StanfordPosTagger.PARAM_VARIANT,
                    "accurate"))
          };
    }

    try {
      processArabicFile(this.analyzer, CQA_QL_TRAIN_AR, "train");
      processArabicFile(this.analyzer, CQA_QL_DEV_AR, "dev");
    } catch (SimilarityException | IOException e) {
      e.printStackTrace();
    }
  }
Пример #4
0
  /**
   * Configures the English run (stopwords, pair-feature factory, subtask labels, analysis
   * pipeline) and then processes the English train and dev files.
   *
   * <p>A {@code UIMAException}, {@code IOException} or {@code SimilarityException} raised while
   * processing the files is caught and its stack trace printed, so the method still returns
   * normally in that case.
   *
   * @throws UIMAException declared by the signature; presumably raised when an analysis engine
   *     cannot be created -- confirm
   */
  public void runForEnglish() throws UIMAException {
    this.stopwords = new Stopwords(Stopwords.STOPWORD_EN);

    this.pfEnglish = new PairFeatureFactoryEnglish(this.alphabet);
    this.pfEnglish.setupMeasures(RichNode.OUTPUT_PAR_LEMMA, this.stopwords);

    this.language = LANG_ENGLISH;

    // Extend the stopword list with punctuation. Splitting on the pipe character yields:
    // ".", "...", a lone backslash, ",", "?", "!", "#", "(", ")", "$", "%" and "&".
    for (String punctuation : ".|...|\\|,|?|!|#|(|)|$|%|&".split("\\|")) {
      this.stopwords.add(punctuation);
    }

    // Labels of subtask A.
    for (String label : new String[] {"Not English", "Good", "Potential", "Dialogue", "Bad"}) {
      this.a_labels.add(label);
    }

    // Labels of subtask B.
    for (String label : new String[] {"No", "Yes", "Unsure"}) {
      this.b_labels.add(label);
    }

    // Analysis pipeline: segmenter, POS tagger, chunker, lemmatizer (created in this order).
    this.analysisEngineList =
        new AnalysisEngine[] {
          createEngine(createEngineDescription(OpenNlpSegmenter.class)),
          createEngine(createEngineDescription(OpenNlpPosTagger.class)),
          createEngine(createEngineDescription(OpenNlpChunker.class)),
          createEngine(createEngineDescription(StanfordLemmatizer.class))
        };

    // The analyzer is constructed with file persistence rooted at "CASes/semeval".
    this.analyzer = new Analyzer(new UIMAFilePersistence("CASes/semeval"));
    for (AnalysisEngine engine : this.analysisEngineList) {
      this.analyzer.addAE(engine);
    }

    try {
      this.processEnglishFile(CQA_QL_TRAIN_EN, "train");
      this.processEnglishFile(CQA_QL_DEV_EN, "dev");
    } catch (UIMAException | IOException | SimilarityException e) {
      e.printStackTrace();
    }
  }