// TODO the method should be private
  public void processEnglishFile(Document doc, String dataFile, String suffix)
      throws ResourceInitializationException, UIMAException, IOException,
          AnalysisEngineProcessException, SimilarityException {
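    // Output files are derived from the input file name: a plain-text dump of the threads,
    // a Good-vs-Bad CSV, a pairwise-classification file, and a KeLP (.klp) file.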

    String plainTextOutputPath = dataFile + "plain.txt";
    String goodVSbadOutputPath = dataFile + ".csv";
    String pairwiseOutputPath = dataFile + getPairwiseSuffix();
    String kelpFilePath = dataFile + ".klp";

    /** Marker which adds relational information to a pair of trees */
    MarkTreesOnRepresentation marker = new MarkTreesOnRepresentation(new MarkTwoAncestors());

    /** Load stopwords for English */
    marker.useStopwords(Stopwords.STOPWORD_EN);

    /** Tree serializer for converting tree structures to strings */
    TreeSerializer ts = new TreeSerializer().enableRelationalTags().useRoundBrackets();

    /* Instantiate CASes: they are assigned inside the for loop */
    // JCas questionCas = JCasFactory.createJCas();

    // WriteFile out = new WriteFile(dataFile + ".csv");
    // TODO ABC, Sep 10th 2015. Do we really need this? It seems like a bad patch
    doc.select("QURAN").remove();
    doc.select("HADEETH").remove();

    boolean firstRow = true;

    /** Consume data */
    Elements questions = doc.getElementsByTag("Question");
    int numberOfQuestions = questions.size();
    int qNumber = 1;

    for (Element question : questions) {
      System.out.println("[INFO]: Processing " + qNumber++ + " out of " + numberOfQuestions);

      CQAinstance cqainstance = qElementToObject(question);

      getFeaturesFromThread(cqainstance);
      // TODO Move this block into getFeaturesFromThread.
      // For that, the printing operations have to be moved out, and
      // question and comment must offer a method to extract header+body
      // (move them from SubjectBodyAggregator).
      // CURRENTLY WORKING HERE.
      /** Set up the question CAS */

      // questionCas.reset();
      JCas questionCas = cqaElementToCas(cqainstance.getQuestion());

      fm.writeLn(
          plainTextOutputPath,
          "---------------------------- QID: "
              + cqainstance.getQuestion().getId()); // the original also printed a " USER:" field whose expression is masked in this snippet
      // this.analyzer.analyze(questionCas, new SimpleContent("q-" + qid, qsubject + ". " + qbody));

      /* Comment-level features to be combined */
      List<List<Double>> listFeatures = new ArrayList<List<Double>>();
      List<Map<String, Double>> albertoSimoneFeatures;
      if (GENERATE_ALBERTO_AND_SIMONE_FEATURES) { // TODO RENAME THIS PLEASE
        albertoSimoneFeatures = FeatureExtractor.extractFeatures(cqainstance);
      }

      int commentIndex = 0;
      List<JCas> allCommentsCas = new ArrayList<JCas>();
      for (CQAcomment c : cqainstance.getComments()) {
        /** Set up the comment CAS */
        JCas commentCas = cqaElementToCas(c);

        /** Run the UIMA pipeline */
        SimplePipeline.runPipeline(commentCas, this.analysisEngineList);
        // this.analyzer.analyze(commentCas, new SimpleContent("c-" + cid, csubject + ". " +
        // cbody));

        AugmentableFeatureVector fv;
        if (GENERATE_MASSIMO_FEATURES) {
          fv =
              (AugmentableFeatureVector)
                  pfEnglish.getPairFeatures(questionCas, commentCas, PARAMETER_LIST);
        } else {
          fv = new AugmentableFeatureVector(this.alphabet);
        }

        if (GENERATE_ALBERTO_AND_SIMONE_FEATURES) {
          Map<String, Double> featureVector = albertoSimoneFeatures.get(commentIndex);
          for (String featureName : FeatureExtractor.getAllFeatureNames()) {
            Double value = featureVector.get(featureName);
            double featureValue = 0;
            if (value != null) {
              featureValue = value;
            }
            fv.add(featureName, featureValue);
          }
        }
        commentIndex++;

        /* *************************************
         *       PLUG YOUR FEATURES HERE       *
         * ************************************* */

        /*
         * fv is actually an AugmentableFeatureVector from the Mallet library.
         *
         * Internally the features are named, so you must specify a unique identifier.
         *
         * An example:
         *
         *   ((AugmentableFeatureVector) fv).add("your_super_feature_id", 42);
         *
         * or:
         *
         *   AugmentableFeatureVector afv = (AugmentableFeatureVector) fv;
         *   afv.add("your_super_feature_id", 42);
         */

        // ((AugmentableFeatureVector) fv).add("quseridEqCuserid", quseridEqCuserid);
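        // A minimal sketch of plugging in a feature (hypothetical id; uses the standard
        // UIMA JCas text accessor):
        // fv.add("commentLengthChars", (double) commentCas.getDocumentText().length());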

        /* *************************************
         *               THANKS!               *
         * ************************************* */

        /** Produce outputs */
        writeToPlainTextOutput(plainTextOutputPath, c, commentCas);

        //				String goodVSbadOutputPath = dataFile + ".csv";
        //				String pairwiseOutputPath

        // FIXME Once the issue with the features is fixed, this information can be known
        // in advance and the headers written beforehand, probably outside of this method.
        if (firstRow) {
          // header for Good vs Bad
          this.fm.write(goodVSbadOutputPath, "qid,cgold,cgold_yn");
          for (int i = 0; i < fv.numLocations(); i++) {
            int featureIndex = i + 1;
            this.fm.write(goodVSbadOutputPath, ",f" + featureIndex);
          }
          this.fm.writeLn(goodVSbadOutputPath, "");

          // header for pairwise
          this.fm.write(pairwiseOutputPath, "qid,cgold");
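          // Pairwise rows are wider than Good-vs-Bad ones: the feature count is doubled when the
          // combination concatenates both vectors (COMBINATION_CONCAT) and grows by the
          // similarity features when INCLUDE_SIMILARITIES is set.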
          int numFeatures = fv.numLocations();
          if (COMBINATION_CONCAT) {
            numFeatures *= 2;
          }
          if (INCLUDE_SIMILARITIES) {
            numFeatures += PairFeatureFactoryEnglish.NUM_SIM_FEATURES;
          }

          for (int i = 0; i < numFeatures; i++) {
            int featureIndex = i + 1;
            this.fm.write(pairwiseOutputPath, ",f" + featureIndex);
          }
          this.fm.writeLn(pairwiseOutputPath, "");

          firstRow = false;
        }

        List<Double> features = this.serializeFv(fv);
        listFeatures.add(features);

        this.fm.writeLn(
            goodVSbadOutputPath,
            c.getId()
                + ","
                + c.getGold()
                + ","
                + c.getGold_yn()
                + ","
                + Joiner.on(",").join(features));

        /** Also produce the file needed to train structural models */
        if (PRODUCE_SVMLIGHTTK_DATA) {
          produceSVMLightTKExample(
              questionCas,
              commentCas,
              suffix,
              ts,
              cqainstance.getQuestion().getId(),
              c.getId(),
              c.getGold(),
              c.getGold_yn(),
              features);
        }
        if (PRODUCE_KELP_DATA) {
          produceKelpExample(
              questionCas,
              commentCas,
              kelpFilePath,
              ts,
              cqainstance.getQuestion().getId(),
              c.getId(),
              c.getGold(),
              c.getGold_yn(),
              features);
        }
        allCommentsCas.add(commentCas);
      }
      // TODO MOVE UP TO HERE

      this.fm.write(
          pairwiseOutputPath, computePairwiseFeatures(cqainstance, listFeatures, allCommentsCas));
      // out.writeLn(computePairwiseFeatures(q, listFeatures);
    }

    //		Iterator<String> iterator = questionCategories.iterator();
    //		while(iterator.hasNext()){
    //			System.out.println("CATEGORY_" + iterator.next());
    //		}

    this.fm.closeFiles();
  }
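  /*
   * Usage sketch (not part of the original class): assuming the Document comes from Jsoup and
   * the enclosing pipeline object ("task" is a made-up name here) has been fully initialized,
   * the method above would be driven roughly like this; the suffix value is also hypothetical.
   *
   *   Document doc = Jsoup.parse(new File(xmlPath), "UTF-8");
   *   task.processEnglishFile(doc, xmlPath, ".svm");
   *   // -> writes xmlPath + "plain.txt", xmlPath + ".csv", the pairwise file and xmlPath + ".klp"
   */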
Example #2
  public void printTopWords(int numWords, boolean useNewLines) {
    // Local helper pairing a word index with its probability; compareTo gives descending
    // order of probability, so Arrays.sort puts the most probable words first.
    class WordProb implements Comparable<WordProb> {
      int wi;
      double p;

      public WordProb(int wi, double p) {
        this.wi = wi;
        this.p = p;
      }

      public final int compareTo(WordProb o2) {
        if (p > o2.p) return -1;
        else if (p == o2.p) return 0;
        else return 1;
      }
    }

    for (int ti = 0; ti < numTopics; ti++) {
      // Unigrams
      WordProb[] wp = new WordProb[numTypes];
      for (int wi = 0; wi < numTypes; wi++)
        wp[wi] = new WordProb(wi, (double) unitypeTopicCounts[wi][ti]);
      Arrays.sort(wp);
      int numToPrint = Math.min(wp.length, numWords);
      if (useNewLines) {
        System.out.println("\nTopic " + ti + " unigrams");
        for (int i = 0; i < numToPrint; i++)
          System.out.println(
              uniAlphabet.lookupObject(wp[i].wi).toString() + " " + wp[i].p / tokensPerTopic[ti]);
      } else {
        System.out.print("Topic " + ti + ": ");
        for (int i = 0; i < numToPrint; i++)
          System.out.print(uniAlphabet.lookupObject(wp[i].wi).toString() + " ");
      }

      // Bigrams
      /*
      wp = new WordProb[numBitypes];
      int bisum = 0;
      for (int wi = 0; wi < numBitypes; wi++) {
      	wp[wi] = new WordProb (wi, ((double)bitypeTopicCounts[wi][ti]));
      	bisum += bitypeTopicCounts[wi][ti];
      }
      Arrays.sort (wp);
      numToPrint = Math.min(wp.length, numWords);
      if (useNewLines) {
      	System.out.println ("\nTopic "+ti+" bigrams");
      	for (int i = 0; i < numToPrint; i++)
      		System.out.println (biAlphabet.lookupObject(wp[i].wi).toString() + " " + wp[i].p/bisum);
      } else {
      	System.out.print ("          ");
      	for (int i = 0; i < numToPrint; i++)
      		System.out.print (biAlphabet.lookupObject(wp[i].wi).toString() + " ");
      	System.out.println();
      }
      */

      // Ngrams
      AugmentableFeatureVector afv = new AugmentableFeatureVector(new Alphabet(), 10000, false);
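      // Walk each document's token positions backwards: consecutive positions flagged as gram
      // continuations (grams[di][si] == 1) are glued right-to-left into a single "_"-joined
      // n-gram string and counted in afv.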
      for (int di = 0; di < topics.length; di++) {
        FeatureSequenceWithBigrams fs = (FeatureSequenceWithBigrams) ilist.get(di).getData();
        for (int si = topics[di].length - 1; si >= 0; si--) {
          if (topics[di][si] == ti && grams[di][si] == 1) {
            String gramString = uniAlphabet.lookupObject(fs.getIndexAtPosition(si)).toString();
            while (grams[di][si] == 1 && --si >= 0)
              gramString =
                  uniAlphabet.lookupObject(fs.getIndexAtPosition(si)).toString() + "_" + gramString;
            afv.add(gramString, 1.0);
          }
        }
      }
      // System.out.println ("pre-sorting");
      int numNgrams = afv.numLocations();
      // System.out.println ("post-sorting "+numNgrams);
      wp = new WordProb[numNgrams];
      int ngramSum = 0;
      for (int loc = 0; loc < numNgrams; loc++) {
        wp[loc] = new WordProb(afv.indexAtLocation(loc), afv.valueAtLocation(loc));
        ngramSum += wp[loc].p;
      }
      Arrays.sort(wp);
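      // Per-topic token and type counts for unigrams and bigrams, reported in the summary below.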
      int numUnitypeTokens = 0, numBitypeTokens = 0, numUnitypeTypes = 0, numBitypeTypes = 0;
      for (int fi = 0; fi < numTypes; fi++) {
        numUnitypeTokens += unitypeTopicCounts[fi][ti];
        if (unitypeTopicCounts[fi][ti] != 0) numUnitypeTypes++;
      }
      for (int fi = 0; fi < numBitypes; fi++) {
        numBitypeTokens += bitypeTopicCounts[fi][ti];
        if (bitypeTopicCounts[fi][ti] != 0) numBitypeTypes++;
      }

      if (useNewLines) {
        System.out.println(
            "\nTopic "
                + ti
                + " unigrams "
                + numUnitypeTokens
                + "/"
                + numUnitypeTypes
                + " bigrams "
                + numBitypeTokens
                + "/"
                + numBitypeTypes
                + " phrases "
                + Math.round(afv.oneNorm())
                + "/"
                + numNgrams);
        for (int i = 0; i < Math.min(numNgrams, numWords); i++)
          System.out.println(
              afv.getAlphabet().lookupObject(wp[i].wi).toString() + " " + wp[i].p / ngramSum);
      } else {
        System.out.print(
            " (unigrams "
                + numUnitypeTokens
                + "/"
                + numUnitypeTypes
                + " bigrams "
                + numBitypeTokens
                + "/"
                + numBitypeTypes
                + " phrases "
                + Math.round(afv.oneNorm())
                + "/"
                + numNgrams
                + ")\n         ");
        // System.out.print (" (unique-ngrams="+numNgrams+"
        // ngram-count="+Math.round(afv.oneNorm())+")\n         ");
        for (int i = 0; i < Math.min(numNgrams, numWords); i++)
          System.out.print(afv.getAlphabet().lookupObject(wp[i].wi).toString() + " ");
        System.out.println();
      }
    }
  }
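  // Illustrative sketch (not from the original source): the descending-order sort idiom used by
  // WordProb above, shown on its own with made-up names and values.
  static class ScoredWord implements Comparable<ScoredWord> {
    final String word;
    final double score;

    ScoredWord(String word, double score) {
      this.word = word;
      this.score = score;
    }

    @Override
    public int compareTo(ScoredWord other) {
      // Reversed comparison, so Arrays.sort puts the highest score first.
      return Double.compare(other.score, this.score);
    }

    public static void main(String[] args) {
      ScoredWord[] words = {
        new ScoredWord("topic", 0.12), new ScoredWord("model", 0.31), new ScoredWord("gram", 0.07)
      };
      java.util.Arrays.sort(words);
      for (ScoredWord w : words) {
        System.out.println(w.word + " " + w.score); // model 0.31, topic 0.12, gram 0.07
      }
    }
  }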