Beispiel #1
0
  /**
   * Processes an English xml data file and writes a csv file with the results next to it.
   *
   * <p>The text of every {@code Question} element and of each of its {@code Comment} elements is
   * run through the UIMA pipeline. The pair features computed for each question/comment pair
   * become one row of {@code dataFile + ".csv"}. The ids of comments whose {@code CUSERID} equals
   * the question's {@code QUSERID} are handed to {@code fm.writeLn} under the name {@code dataFile
   * + ".dialogue.txt"}.
   *
   * <p>The csv writer and the files held by {@code fm} are closed even when processing fails.
   *
   * @param dataFile the xml file to process
   * @param suffix suffix for identifying the data file; forwarded to {@code
   *     produceSVMLightTKExample} when {@code PRODUCE_SVMLIGHTTK_DATA} is set
   * @throws ResourceInitializationException declared by the original signature; presumably raised
   *     while creating the CASes
   * @throws UIMAException declared by the original signature; presumably raised by the UIMA
   *     pipeline
   * @throws IOException if the data file cannot be read
   * @throws AnalysisEngineProcessException declared by the original signature; presumably raised
   *     by an analysis engine
   * @throws SimilarityException declared by the original signature; presumably raised during
   *     feature computation
   */
  private void processEnglishFile(String dataFile, String suffix)
      throws ResourceInitializationException, UIMAException, IOException,
          AnalysisEngineProcessException, SimilarityException {

    // Parameters for matching tree structures
    String parameterList =
        Joiner.on(",")
            .join(new String[] {RichNode.OUTPUT_PAR_LEMMA, RichNode.OUTPUT_PAR_TOKEN_LOWERCASE});

    // Marker which adds relational information to a pair of trees, loaded with English stopwords.
    // NOTE(review): the marker is configured here but not used anywhere below - confirm whether
    // it is still needed.
    MarkTreesOnRepresentation marker = new MarkTreesOnRepresentation(new MarkTwoAncestors());
    marker.useStopwords(Stopwords.STOPWORD_EN);

    // Tree serializer for converting tree structures to string
    TreeSerializer ts = new TreeSerializer().enableRelationalTags().useRoundBrackets();

    // The two CASes are reset and reused for every question and every comment
    JCas questionCas = JCasFactory.createJCas();
    JCas commentCas = JCasFactory.createJCas();

    WriteFile out = new WriteFile(dataFile + ".csv");
    try {
      Document doc = Jsoup.parse(new File(dataFile), "UTF-8");

      doc.select("QURAN").remove();
      doc.select("HADEETH").remove();

      boolean firstRow = true;

      // Consume data
      Elements questions = doc.getElementsByTag("Question");
      int numberOfQuestions = questions.size();
      int questionNumber = 1;

      // Ids of the comments written by the author of the question they belong to
      Map<String, Boolean> commentIsDialogue = new HashMap<>();

      for (Element question : questions) {
        System.out.println(
            "[INFO]: Processing " + questionNumber++ + " out of " + numberOfQuestions);

        // Parse question node
        String qid = question.attr("QID");
        String quserid = question.attr("QUSERID");
        String qsubject = question.getElementsByTag("QSubject").get(0).text();
        String qbody = question.getElementsByTag("QBody").get(0).text();

        // Setup question CAS and run the UIMA pipeline on it
        questionCas.reset();
        questionCas.setDocumentLanguage("en");
        questionCas.setDocumentText(qsubject + ". " + qbody);
        SimplePipeline.runPipeline(questionCas, this.analysisEngineList);

        // Parse comment nodes
        Elements comments = question.getElementsByTag("Comment");
        for (Element comment : comments) {
          String cid = comment.attr("CID");
          String cuserid = comment.attr("CUSERID");
          String cgold = comment.attr("CGOLD");
          String cgold_yn = comment.attr("CGOLD_YN");
          String csubject = comment.getElementsByTag("CSubject").get(0).text();
          String cbody = comment.getElementsByTag("CBody").get(0).text();

          // Setup comment CAS and run the UIMA pipeline on it
          commentCas.reset();
          commentCas.setDocumentLanguage("en");
          commentCas.setDocumentText(csubject + ". " + cbody);
          SimplePipeline.runPipeline(commentCas, this.analysisEngineList);

          FeatureVector fv = pfEnglish.getPairFeatures(questionCas, commentCas, parameterList);

          /*
           * PLUG YOUR FEATURES HERE
           *
           * fv is actually an AugmentableFeatureVector from the Mallet library. Internally the
           * features are named, so you must specify a unique identifier. An example:
           *
           *   ((AugmentableFeatureVector) fv).add("your_super_feature_id", 42);
           *
           * or:
           *
           *   AugmentableFeatureVector afv = (AugmentableFeatureVector) fv;
           *   afv.add("your_super_feature_id", 42);
           */
          if (quserid.equals(cuserid)) {
            commentIsDialogue.put(cid, true);
          }

          // Produce the header once, sized after the first feature vector.
          // NOTE(review): the first header column is named "qid" although every row starts with
          // the comment id (cid) - confirm before renaming, consumers may rely on the header.
          if (firstRow) {
            out.write("qid,cgold,cgold_yn");
            for (int i = 0; i < fv.numLocations(); i++) {
              int featureIndex = i + 1;
              out.write(",f" + featureIndex);
            }
            out.write("\n");

            firstRow = false;
          }

          // Produce output line
          List<Double> features = this.serializeFv(fv);
          out.writeLn(cid + "," + cgold + "," + cgold_yn + "," + Joiner.on(",").join(features));

          // Produce also the file needed to train structural models
          if (PRODUCE_SVMLIGHTTK_DATA) {
            produceSVMLightTKExample(
                questionCas, commentCas, suffix, ts, qid, cid, cgold, cgold_yn, features);
          }
        }
      }

      for (String commentId : commentIsDialogue.keySet()) {
        this.fm.writeLn(dataFile + ".dialogue.txt", commentId);
      }
    } finally {
      // Release the output files on failure as well; the nested finally guarantees that the csv
      // writer is closed even if closing the fm files throws.
      try {
        this.fm.closeFiles();
      } finally {
        out.close();
      }
    }
  }
  /**
   * Processes an English xml data file and writes comment-to-comment pair features to a file in
   * the same directory.
   *
   * <p>For every {@code Question} element, a CAS is computed for each of its {@code Comment}
   * elements and pair features are computed for every unordered pair of comments of that
   * question. Each pair becomes one row of {@code dataFile + SUFFIX}. Questions are skipped when
   * {@code LIMIT_COMMENTS_PER_Q} is set and they have at least {@code LIMIT_COMMENTS} comments.
   *
   * <p>While processing, pairs are binned by their first feature value; at the end the ratio of
   * pairs with equal labels is printed per bin to standard output.
   *
   * <p>The output writer and the files held by {@code fm} are closed even when processing fails.
   *
   * @param dataFile the xml file to process
   * @throws ResourceInitializationException declared by the original signature; presumably raised
   *     while creating the CASes
   * @throws UIMAException declared by the original signature; presumably raised by the UIMA
   *     pipeline
   * @throws IOException if the data file cannot be read
   * @throws AnalysisEngineProcessException declared by the original signature; presumably raised
   *     by an analysis engine
   * @throws SimilarityException declared by the original signature; presumably raised during
   *     feature computation
   */
  private void processEnglishFile(String dataFile)
      throws ResourceInitializationException, UIMAException, IOException,
          AnalysisEngineProcessException, SimilarityException {

    // Parameters for matching tree structures
    String parameterList =
        Joiner.on(",")
            .join(new String[] {RichNode.OUTPUT_PAR_LEMMA, RichNode.OUTPUT_PAR_TOKEN_LOWERCASE});

    // Marker which adds relational information to a pair of trees, loaded with English stopwords.
    // NOTE(review): the marker is configured here but not used anywhere below - confirm whether
    // it is still needed.
    MarkTreesOnRepresentation marker = new MarkTreesOnRepresentation(new MarkTwoAncestors());
    marker.useStopwords(Stopwords.STOPWORD_EN);

    // Bins are indexed by round(value * 10), i.e. 0..10 for a value between 0 and 1
    final int binCount = 11;
    double[] matches = new double[binCount];
    int[] totals = new int[binCount];

    WriteFile out = new WriteFile(dataFile + SUFFIX);
    try {
      Document doc = Jsoup.parse(new File(dataFile), "UTF-8");

      doc.select("QURAN").remove();
      doc.select("HADEETH").remove();

      boolean firstRow = true;

      // Consume data
      Elements questions = doc.getElementsByTag("Question");
      int numberOfQuestions = questions.size();
      int questionNumber = 1;

      for (Element question : questions) {

        // NOTE(review): q is filled in below but never read inside this method - confirm
        // whether it is still needed.
        Question q = new Question();
        System.out.println(
            "[INFO]: Processing " + questionNumber++ + " out of " + numberOfQuestions);

        // Parse question node
        String qid = question.attr("QID");
        String qcategory = question.attr("QCATEGORY");
        String qdate = question.attr("QDATE");
        String quserid = question.attr("QUSERID");
        String qtype = question.attr("QTYPE");
        String qgold_yn = question.attr("QGOLD_YN");
        String qsubject = question.getElementsByTag("QSubject").get(0).text();
        String qbody = question.getElementsByTag("QBody").get(0).text();

        q.setQid(qid);
        q.setQcategory(qcategory);
        q.setQdate(qdate);
        q.setQuserId(quserid);
        q.setQtype(qtype);
        q.setQgoldYN(qgold_yn);
        q.setQsubject(qsubject);
        q.setQbody(qbody);

        // Parse comment nodes
        Elements comments = question.getElementsByTag("Comment");

        if (LIMIT_COMMENTS_PER_Q && comments.size() >= LIMIT_COMMENTS) {
          continue;
        }

        for (Element comment : comments) {
          String cid = comment.attr("CID");
          String cuserid = comment.attr("CUSERID");
          String cgold = comment.attr("CGOLD");
          String cgold_yn = comment.attr("CGOLD_YN");
          String csubject = comment.getElementsByTag("CSubject").get(0).text();
          String cbody = comment.getElementsByTag("CBody").get(0).text();
          q.addComment(cid, cuserid, cgold, cgold_yn, csubject, cbody);
        }

        // Parallel lists: CAS, id and label of every comment of this question
        List<JCas> allCommentsCas = new ArrayList<>();
        List<String> ids = new ArrayList<>();
        List<String> labels = new ArrayList<>();

        for (Element comment : comments) {
          allCommentsCas.add(computeCommentCas(comment));
          ids.add(comment.attr("CID"));
          labels.add(getgold(comment.attr("CGOLD")));
        }

        // Visit every unordered pair (i, j) with i < j
        for (int i = 0; i < allCommentsCas.size() - 1; i++) {
          for (int j = i + 1; j < allCommentsCas.size(); j++) {
            // COMPUTE THE SIMILARITY HERE
            // TODO where to assign this
            // Whether the CAS are exactly identical
            // how to store/display the output
            AugmentableFeatureVector fv =
                (AugmentableFeatureVector)
                    pfEnglish.getPairFeatures(
                        allCommentsCas.get(i), allCommentsCas.get(j), parameterList);

            // Bin the pair by its first feature value.
            // NOTE(review): assumes that value lies in [0, 1]; anything else yields an index
            // outside the bin arrays - confirm against the feature definition.
            int bin = (int) Math.round(fv.getValues()[0] * 10);
            if (labels.get(i).equals(labels.get(j))) {
              matches[bin]++;
            }
            totals[bin]++;

            // Produce the header once, sized after the first feature vector.
            // NOTE(review): the header names the columns "qid,cgold" although rows carry a pair
            // of comment ids and a pair of labels - confirm before renaming.
            if (firstRow) {
              out.write("qid,cgold");
              for (int c = 0; c < fv.numLocations(); c++) {
                int featureIndex = c + 1;
                out.write(",f" + featureIndex);
              }
              out.write("\n");
              firstRow = false;
            }

            // Produce output line
            out.writeLn(
                ids.get(i)
                    + "-"
                    + ids.get(j)
                    + ","
                    + labels.get(i)
                    + "-"
                    + labels.get(j)
                    + ","
                    + Joiner.on(",").join(this.serializeFv(fv)));
          }
        }
      }

      // A bin without any pair prints NaN (0.0 / 0)
      for (int i = 0; i < binCount; i++) {
        System.out.println("BIN: " + i + " pctge: " + matches[i] / totals[i]);
      }
    } finally {
      // Release the output files on failure as well; the nested finally guarantees that the
      // writer is closed even if closing the fm files throws.
      try {
        this.fm.closeFiles();
      } finally {
        out.close();
      }
    }
  }
Beispiel #3
0
  /**
   * Processes an Arabic xml data file and writes a csv file with the results next to it.
   *
   * <p>The text of every {@code Question} element and of each of its {@code Answer} elements is
   * analyzed, either through {@code getPreliminarCas} when {@code USE_QCRI_ALT_TOOLS} is set or
   * through the UIMA pipeline otherwise. The pair features computed for each question/answer pair
   * become one row of {@code dataFile + ".csv"}. The characters {@code /} and {@code ~} are
   * stripped from all texts before analysis.
   *
   * <p>The csv writer and the files held by {@code fm} are closed even when processing fails.
   *
   * @param analyzer analyzer handed to {@code getPreliminarCas}; only used when {@code
   *     USE_QCRI_ALT_TOOLS} is set
   * @param dataFile the xml file to process
   * @param suffix not used by this method
   * @throws SimilarityException declared by the original signature; presumably raised during
   *     feature computation
   * @throws UIMAException declared by the original signature; presumably raised while creating
   *     the CASes or running the pipeline
   * @throws IOException if the data file cannot be read
   */
  public void processArabicFile(Analyzer analyzer, String dataFile, String suffix)
      throws SimilarityException, UIMAException, IOException {
    // We do not have a lemmatizer so we work with tokens
    String parameterList = Joiner.on(",").join(new String[] {RichNode.OUTPUT_PAR_TOKEN_LOWERCASE});

    // Instantiate CASes
    JCas questionCas = JCasFactory.createJCas();
    JCas commentCas = JCasFactory.createJCas();

    WriteFile out = new WriteFile(dataFile + ".csv");
    try {
      Document doc = Jsoup.parse(new File(dataFile), "UTF-8");

      boolean firstRow = true;

      // Consume data
      Elements questions = doc.getElementsByTag("Question");

      int numberOfQuestions = questions.size();
      int questionNumber = 1;

      for (Element question : questions) {
        System.out.println(
            "[INFO]: Processing " + questionNumber++ + " out of " + numberOfQuestions);

        // Parse question node; "/" and "~" are removed as plain characters
        String qid = question.attr("QID");
        String qsubject =
            question.getElementsByTag("QSubject").get(0).text().replace("/", "").replace("~", "");
        String qbody =
            question.getElementsByTag("QBody").get(0).text().replace("/", "").replace("~", "");

        // Get analyzed text for question
        if (USE_QCRI_ALT_TOOLS) {
          questionCas = this.getPreliminarCas(analyzer, questionCas, qid, qsubject + ". " + qbody);
        } else {
          questionCas.reset();
          questionCas.setDocumentLanguage("ar");
          questionCas.setDocumentText(qsubject + ". " + qbody);
          SimplePipeline.runPipeline(questionCas, this.analysisEngineList);
        }

        // Parse answer nodes
        Elements comments = question.getElementsByTag("Answer");
        for (Element comment : comments) {
          String cid = comment.attr("CID");
          String cgold = comment.attr("CGOLD");
          String cbody = comment.text().replace("/", "").replace("~", "");

          // Get analyzed text for comment
          if (USE_QCRI_ALT_TOOLS) {
            commentCas = this.getPreliminarCas(analyzer, commentCas, cid, cbody);
          } else {
            commentCas.reset();
            commentCas.setDocumentLanguage("ar");
            commentCas.setDocumentText(cbody);
            SimplePipeline.runPipeline(commentCas, this.analysisEngineList);
          }

          // Compute features between question and comment
          FeatureVector fv = pfArabic.getPairFeatures(questionCas, commentCas, parameterList);

          /*
           * PLUG YOUR FEATURES HERE
           *
           * fv is actually an AugmentableFeatureVector from the Mallet library. Internally the
           * features are named, so you must specify a unique identifier. An example:
           *
           *   ((AugmentableFeatureVector) fv).add("your_super_feature_id", 42);
           *
           * or:
           *
           *   AugmentableFeatureVector afv = (AugmentableFeatureVector) fv;
           *   afv.add("your_super_feature_id", 42);
           */

          // Produce the header once, sized after the first feature vector.
          // NOTE(review): the first header column is named "cid" although every row starts with
          // "qid-cid" - confirm before renaming, consumers may rely on the header.
          if (firstRow) {
            out.write("cid,cgold");
            for (int i = 0; i < fv.numLocations(); i++) {
              int featureIndex = i + 1;
              out.write(",f" + featureIndex);
            }
            out.write("\n");

            firstRow = false;
          }

          // Produce output line
          List<Double> features = this.serializeFv(fv);
          out.writeLn(qid + "-" + cid + "," + cgold + "," + Joiner.on(",").join(features));
        }
      }
    } finally {
      // Release the output files on failure as well; the nested finally guarantees that the csv
      // writer is closed even if closing the fm files throws.
      try {
        this.fm.closeFiles();
      } finally {
        out.close();
      }
    }
  }