/**
 * Builds a CAS for one comment element and runs the configured analysis engines on it.
 *
 * <p>The document text is taken from the comment's first {@code CSubject} and first {@code CBody}
 * descendants, merged by {@code SubjectBodyAggregator.getCommentText} and then passed through
 * {@code TextNormalizer.normalize}. The document language is set to {@code "en"}.
 *
 * @param comment element containing {@code CSubject} and {@code CBody} descendants
 * @return the CAS after {@code this.analysisEngineList} has been run on it
 * @throws UIMAException propagated from CAS creation or from running the pipeline
 */
private JCas computeCommentCas(Element comment) throws UIMAException {
    JCas cCas = JCasFactory.createJCas();

    // Only the first subject/body element is used; get(0) fails if none is present.
    String csubject = comment.getElementsByTag("CSubject").get(0).text();
    String cbody = comment.getElementsByTag("CBody").get(0).text();

    // Set up the comment CAS.
    cCas.reset();
    cCas.setDocumentLanguage("en");
    String commentText =
        TextNormalizer.normalize(SubjectBodyAggregator.getCommentText(csubject, cbody));
    cCas.setDocumentText(commentText);

    // Run the UIMA pipeline.
    SimplePipeline.runPipeline(cCas, this.analysisEngineList);

    return cCas;
  }
 /**
  * Creates a CAS whose language is {@code "en"} and whose text is the whole text of the question
  * held by the given instance. No analysis engine is run on it.
  *
  * @deprecated use {@link #cqaElementToCas} instead
  * @param cqa instance whose question supplies the document text
  * @return a new CAS containing the question text
  * @throws UIMAException propagated from CAS creation
  */
 @Deprecated
 private JCas questionToCas(CQAinstance cqa) throws UIMAException {
   final JCas cas = JCasFactory.createJCas();
   cas.setDocumentLanguage("en");

   final String wholeQuestionText = cqa.getQuestion().getWholeText();
   cas.setDocumentText(wholeQuestionText);

   return cas;
 }
 /**
  * Generates a JCas from the whole text (subject + body) of an element, which may be either a
  * question or a comment. The document language is set to {@code "en"}.
  *
  * @param element either a question or a comment
  * @return a new JCas instance holding the text of the element
  * @throws UIMAException propagated from CAS creation
  */
 protected JCas cqaElementToCas(CQAabstractElement element) throws UIMAException {
   final JCas elementCas = JCasFactory.createJCas();
   elementCas.setDocumentLanguage("en");

   // FIXME use the normalized text instead (and make it work):
   // elementCas.setDocumentText(element.getWholeTextNormalized(userProfiles));
   final String wholeText = element.getWholeText();
   elementCas.setDocumentText(wholeText);

   return elementCas;
 }
Beispiel #4
0
  /**
   * Configures this instance for Arabic and processes the Arabic train and dev files.
   *
   * <p>Sets the stopwords, the Arabic pair-feature factory, the language, the preliminary CAS and
   * the three task labels. Depending on {@code USE_QCRI_ALT_TOOLS} it then either builds the QCRI
   * analyzer or fills {@code analysisEngineList} with a whitespace tokenizer and the Stanford
   * POS-tagger. Finally {@code processArabicFile} is called for {@code CQA_QL_TRAIN_AR} and
   * {@code CQA_QL_DEV_AR}.
   *
   * <p>A {@code SimilarityException} or {@code IOException} raised while processing the files is
   * only reported through {@code printStackTrace()}; it is not rethrown.
   *
   * @throws UIMAException propagated from CAS/engine creation or from file processing
   */
  public void runForArabic() throws UIMAException {
    // NOTE(review): the first Stopwords instance is overwritten by the next statement; both
    // constructions are kept because the constructor's side effects are not visible from here.
    this.stopwords = new Stopwords(Stopwords.STOPWORD_AR);
    this.stopwords = new Stopwords("semeval2015-3/arabic-corpus-specific-stopwords.txt");

    this.pfArabic = new PairFeatureFactoryArabic(this.alphabet);
    this.pfArabic.setupMeasures(RichNode.OUTPUT_PAR_TOKEN_LOWERCASE, this.stopwords);

    this.language = LANG_ARABIC;

    this.preliminaryCas = JCasFactory.createJCas();

    // Task labels; for Arabic there is just one task.
    for (String label : new String[] {"direct", "related", "irrelevant"}) {
      this.a_labels.add(label);
    }

    if (!USE_QCRI_ALT_TOOLS) {
      // Whitespace tokenizer. The Stanford Segmenter for Arabic has a very bad bug and the
      // tokenization is completely wrong.
      AnalysisEngine tokenizerEngine =
          createEngine(createEngineDescription(WhitespaceTokenizer.class));

      // Stanford POS-Tagger
      AnalysisEngine posTaggerEngine =
          createEngine(
              createEngineDescription(
                  StanfordPosTagger.class,
                  StanfordPosTagger.PARAM_LANGUAGE,
                  "ar",
                  StanfordPosTagger.PARAM_VARIANT,
                  "accurate"));

      // Putting together the UIMA DKPro annotators
      this.analysisEngineList = new AnalysisEngine[] {tokenizerEngine, posTaggerEngine};
    } else {
      // Instantiate the QCRI Analyzer, but for now we are using the analysis engines
      // instantiated in the other branch.
      this.analyzer = new Analyzer(new UIMANoPersistence());
      AnalysisEngine arabicEngine =
          AnalysisEngineFactory.createEngine(createEngineDescription(ArabicAnalyzer.class));
      this.analyzer.addAE(arabicEngine);
    }

    try {
      processArabicFile(this.analyzer, CQA_QL_TRAIN_AR, "train");
      processArabicFile(this.analyzer, CQA_QL_DEV_AR, "dev");
    } catch (SimilarityException similarityFailure) {
      similarityFailure.printStackTrace();
    } catch (IOException ioFailure) {
      ioFailure.printStackTrace();
    }
  }
 /**
  * Creates a CAS whose language is {@code "en"} and whose text is the comment's subject and body
  * as merged by {@code SubjectBodyAggregator.getCommentText}. No analysis engine is run on it.
  *
  * @deprecated use {@link #cqaElementToCas} instead
  * @param comment comment supplying the subject and the body
  * @return a new CAS containing the aggregated comment text
  * @throws UIMAException propagated from CAS creation
  */
 @Deprecated
 private JCas commentToCas(Comment comment) throws UIMAException {
   final JCas cas = JCasFactory.createJCas();
   cas.setDocumentLanguage("en");

   final String subject = comment.getCsubject();
   final String body = comment.getCbody();
   cas.setDocumentText(SubjectBodyAggregator.getCommentText(subject, body));

   return cas;
 }
Beispiel #6
0
  /**
   * Demo entry point: puts a fixed German paragraph into a CAS, runs a pipeline made of
   * {@code BreakIteratorSegmenter}, {@code StanfordNamedEntityRecognizer} and
   * {@code CasDumpWriter} on it, and prints every {@code NamedEntity} annotation found.
   *
   * @param args command-line arguments (not used)
   * @throws Exception propagated from CAS creation or from running the pipeline
   */
  public static void main(String[] args) throws Exception {
    final String germanText =
        "Die Fossillagerstätte Geiseltal befindet sich im ehemaligen Braunkohlerevier des Geiseltales südlich der Stadt Halle in Sachsen-Anhalt. Sie ist eine bedeutende Fundstelle heute ausgestorbener Pflanzen und Tiere aus der Zeit des Mittleren Eozäns vor 48 bis 41 Millionen Jahren. Im Geiseltal wurde nachweislich seit 1698 erstmals Kohle gefördert, die ersten Fossilien kamen aber erst Anfang des 20. Jahrhunderts eher zufällig zu Tage. Planmäßige wissenschaftliche Ausgrabungen begannen 1925 seitens der Martin-Luther-Universität Halle-Wittenberg. Unterbrochen durch den Zweiten Weltkrieg, können die Untersuchungen in zwei Forschungsphasen untergliedert werden. Aufgrund der zunehmenden Auskohlung der Rohstofflager kamen die Ausgrabungen Mitte der 1980er allmählich zum Erliegen und endeten endgültig zu Beginn des dritten Jahrtausends.";

    JCas documentCas = JCasFactory.createJCas();
    documentCas.setDocumentLanguage("de");
    documentCas.setDocumentText(germanText);

    // Segmentation first, then named-entity recognition, then a dump of the resulting CAS.
    SimplePipeline.runPipeline(
        documentCas,
        AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class),
        AnalysisEngineFactory.createEngineDescription(StanfordNamedEntityRecognizer.class),
        AnalysisEngineFactory.createEngineDescription(CasDumpWriter.class));

    for (NamedEntity entity : JCasUtil.select(documentCas, NamedEntity.class)) {
      String report = "Found NE: " + entity.getValue() + ", " + entity.getCoveredText();
      System.out.println(report);
    }
  }
Beispiel #7
0
  /**
   * Processes an English XML data file and writes a CSV file with the results in the same
   * directory (at {@code dataFile + ".csv"}).
   *
   * <p>Each {@code Question} element is analysed once; every {@code Comment} element below it is
   * analysed and paired with the question to obtain a feature vector, which becomes one CSV row.
   * The ids of comments whose {@code CUSERID} equals the question's {@code QUSERID} are passed to
   * {@code fm.writeLn} for {@code dataFile + ".dialogue.txt"} after all questions are processed.
   *
   * <p>The CSV writer is closed even when processing fails part-way.
   *
   * @param dataFile the xml file to process
   * @param suffix suffix for identifying the data file
   * @throws ResourceInitializationException
   * @throws UIMAException
   * @throws IOException
   * @throws AnalysisEngineProcessException
   * @throws SimilarityException
   */
  private void processEnglishFile(String dataFile, String suffix)
      throws ResourceInitializationException, UIMAException, IOException,
          AnalysisEngineProcessException, SimilarityException {

    // Parameters for matching tree structures
    String parameterList =
        Joiner.on(",")
            .join(new String[] {RichNode.OUTPUT_PAR_LEMMA, RichNode.OUTPUT_PAR_TOKEN_LOWERCASE});

    // Marker which adds relational information to a pair of trees
    MarkTreesOnRepresentation marker = new MarkTreesOnRepresentation(new MarkTwoAncestors());

    // Load stopwords for english
    marker.useStopwords(Stopwords.STOPWORD_EN);

    // Tree serializer for converting tree structures to string
    TreeSerializer ts = new TreeSerializer().enableRelationalTags().useRoundBrackets();

    // Instantiate CASes; they are reset and reused for every question/comment.
    JCas questionCas = JCasFactory.createJCas();
    JCas commentCas = JCasFactory.createJCas();

    WriteFile out = new WriteFile(dataFile + ".csv");
    try {
      Document doc = Jsoup.parse(new File(dataFile), "UTF-8");

      doc.select("QURAN").remove();
      doc.select("HADEETH").remove();

      boolean firstRow = true;

      // Consume data
      Elements questions = doc.getElementsByTag("Question");
      int numberOfQuestions = questions.size();
      int questionNumber = 1;

      // Used as a set: only the keys (comment ids) are read back below.
      Map<String, Boolean> commentIsDialogue = new HashMap<>();

      for (Element question : questions) {
        System.out.println(
            "[INFO]: Processing " + questionNumber++ + " out of " + numberOfQuestions);

        // Parse question node
        String qid = question.attr("QID");
        String quserid = question.attr("QUSERID");
        String qsubject = question.getElementsByTag("QSubject").get(0).text();
        String qbody = question.getElementsByTag("QBody").get(0).text();

        // Setup question CAS
        questionCas.reset();
        questionCas.setDocumentLanguage("en");
        questionCas.setDocumentText(qsubject + ". " + qbody);

        // Run the UIMA pipeline
        SimplePipeline.runPipeline(questionCas, this.analysisEngineList);

        // Parse comment nodes
        Elements comments = question.getElementsByTag("Comment");
        for (Element comment : comments) {
          String cid = comment.attr("CID");
          String cuserid = comment.attr("CUSERID");
          String cgold = comment.attr("CGOLD");
          String cgold_yn = comment.attr("CGOLD_YN");
          String csubject = comment.getElementsByTag("CSubject").get(0).text();
          String cbody = comment.getElementsByTag("CBody").get(0).text();

          // Setup comment CAS
          commentCas.reset();
          commentCas.setDocumentLanguage("en");
          commentCas.setDocumentText(csubject + ". " + cbody);

          // Run the UIMA pipeline
          SimplePipeline.runPipeline(commentCas, this.analysisEngineList);

          FeatureVector fv = pfEnglish.getPairFeatures(questionCas, commentCas, parameterList);

          /*
           * ************************************* * * * PLUG YOUR FEATURES HERE * * * *
           * *************************************
           *
           * fv is actually an AugmentableFeatureVector from the Mallet library.
           *
           * Internally the features are named so you must specify an unique identifier.
           *
           * An example:
           *
           *   ((AugmentableFeatureVector) fv).add("your_super_feature_id", 42);
           *
           * or:
           *
           *   AugmentableFeatureVector afv = (AugmentableFeatureVector) fv;
           *   afv.add("your_super_feature_id", 42);
           */
          boolean quseridEqCuserid = quserid.equals(cuserid);
          if (quseridEqCuserid) {
            commentIsDialogue.put(cid, true);
          }

          // ((AugmentableFeatureVector) fv).add("quseridEqCuserid", quseridEqCuserid);

          /*
           * ************************************* * * * THANKS! * * * *
           * *************************************
           */

          // Produce the header line once, sized from the first feature vector.
          // NOTE(review): the first header column is labelled "qid" while the rows below start
          // with cid; left unchanged because downstream readers may rely on the header.
          if (firstRow) {
            out.write("qid,cgold,cgold_yn");
            for (int i = 0; i < fv.numLocations(); i++) {
              int featureIndex = i + 1;
              out.write(",f" + featureIndex);
            }
            out.write("\n");

            firstRow = false;
          }

          List<Double> features = this.serializeFv(fv);

          // Produce output line
          out.writeLn(cid + "," + cgold + "," + cgold_yn + "," + Joiner.on(",").join(features));

          // Produce also the file needed to train structural models
          if (PRODUCE_SVMLIGHTTK_DATA) {
            produceSVMLightTKExample(
                questionCas, commentCas, suffix, ts, qid, cid, cgold, cgold_yn, features);
          }
        }
      }

      for (String commentId : commentIsDialogue.keySet()) {
        this.fm.writeLn(dataFile + ".dialogue.txt", commentId);
      }

      this.fm.closeFiles();
    } finally {
      // The CSV writer is owned by this method, so release it on failure as well.
      out.close();
    }
  }
Beispiel #8
0
  /**
   * Processes an Arabic XML data file and writes a CSV file with the results at
   * {@code dataFile + ".csv"}.
   *
   * <p>Each {@code Question} element is analysed once; every {@code Answer} element below it is
   * analysed and paired with the question to obtain a feature vector, which becomes one CSV row
   * identified by {@code qid-cid}. All {@code /} and {@code ~} characters are stripped from the
   * texts before analysis. When {@code USE_QCRI_ALT_TOOLS} is set the texts are analysed through
   * {@code getPreliminarCas}; otherwise {@code analysisEngineList} is run on them.
   *
   * <p>The CSV writer is closed even when processing fails part-way.
   *
   * @param analyzer analyzer handed to {@code getPreliminarCas}; only used when
   *     {@code USE_QCRI_ALT_TOOLS} is set
   * @param dataFile the xml file to process
   * @param suffix suffix for identifying the data file (not used by this method)
   * @throws SimilarityException
   * @throws UIMAException
   * @throws IOException
   */
  public void processArabicFile(Analyzer analyzer, String dataFile, String suffix)
      throws SimilarityException, UIMAException, IOException {
    // We do not have a lemmatizer so we work with tokens
    String parameterList = Joiner.on(",").join(new String[] {RichNode.OUTPUT_PAR_TOKEN_LOWERCASE});

    // Instantiate CASes; they are reset and reused for every question/answer.
    JCas questionCas = JCasFactory.createJCas();
    JCas commentCas = JCasFactory.createJCas();

    WriteFile out = new WriteFile(dataFile + ".csv");
    try {
      Document doc = Jsoup.parse(new File(dataFile), "UTF-8");

      boolean firstRow = true;

      // Consume data
      Elements questions = doc.getElementsByTag("Question");

      int numberOfQuestions = questions.size();
      int questionNumber = 1;

      for (Element question : questions) {
        System.out.println(
            "[INFO]: Processing " + questionNumber++ + " out of " + numberOfQuestions);

        // Parse question node; '/' and '~' are literal characters, so no regex is needed.
        String qid = question.attr("QID");
        String qsubject =
            question.getElementsByTag("QSubject").get(0).text().replace("/", "").replace("~", "");
        String qbody =
            question.getElementsByTag("QBody").get(0).text().replace("/", "").replace("~", "");

        // Get analyzed text for question
        if (USE_QCRI_ALT_TOOLS) {
          questionCas = this.getPreliminarCas(analyzer, questionCas, qid, qsubject + ". " + qbody);
        } else {
          questionCas.reset();
          questionCas.setDocumentLanguage("ar");
          questionCas.setDocumentText(qsubject + ". " + qbody);
          SimplePipeline.runPipeline(questionCas, this.analysisEngineList);
        }

        // Parse answer nodes
        Elements comments = question.getElementsByTag("Answer");
        for (Element comment : comments) {
          String cid = comment.attr("CID");
          String cgold = comment.attr("CGOLD");
          String cbody = comment.text().replace("/", "").replace("~", "");

          // Get analyzed text for comment
          if (USE_QCRI_ALT_TOOLS) {
            commentCas = this.getPreliminarCas(analyzer, commentCas, cid, cbody);
          } else {
            commentCas.reset();
            commentCas.setDocumentLanguage("ar");
            commentCas.setDocumentText(cbody);

            SimplePipeline.runPipeline(commentCas, this.analysisEngineList);
          }

          // Compute features between question and comment
          FeatureVector fv = pfArabic.getPairFeatures(questionCas, commentCas, parameterList);

          /*
           * ************************************* * * * PLUG YOUR FEATURES HERE * * * *
           * *************************************
           *
           * fv is actually an AugmentableFeatureVector from the Mallet library.
           *
           * Internally the features are named so you must specify an unique identifier.
           *
           * An example:
           *
           *   ((AugmentableFeatureVector) fv).add("your_super_feature_id", 42);
           *
           * or:
           *
           *   AugmentableFeatureVector afv = (AugmentableFeatureVector) fv;
           *   afv.add("your_super_feature_id", 42);
           *
           * ************************************* * * * THANKS! * * * *
           * *************************************
           */

          // Produce the header line once, sized from the first feature vector.
          if (firstRow) {
            out.write("cid,cgold");
            for (int i = 0; i < fv.numLocations(); i++) {
              int featureIndex = i + 1;
              out.write(",f" + featureIndex);
            }
            out.write("\n");

            firstRow = false;
          }

          List<Double> features = this.serializeFv(fv);

          // Produce output line
          out.writeLn(qid + "-" + cid + "," + cgold + "," + Joiner.on(",").join(features));
        }
      }

      this.fm.closeFiles();
    } finally {
      // The CSV writer is owned by this method, so release it on failure as well.
      out.close();
    }
  }