/**
 * Print the top words for each topic: first unigrams, then the n-gram
 * phrases built from maximal runs of tokens whose bigram indicator is set.
 *
 * @param numWords    maximum number of words/phrases to print per topic
 * @param useNewLines if true, print one word per line with its probability;
 *                    otherwise print a compact one-line summary per topic
 */
public void printTopWords(int numWords, boolean useNewLines) {

  // Helper for sorting word indices by probability, descending.
  class WordProb implements Comparable {
    int wi;
    double p;

    public WordProb(int wi, double p) {
      this.wi = wi;
      this.p = p;
    }

    public final int compareTo(Object o2) {
      if (p > ((WordProb) o2).p) return -1;
      else if (p == ((WordProb) o2).p) return 0;
      else return 1;
    }
  }

  for (int ti = 0; ti < numTopics; ti++) {

    // Unigrams
    WordProb[] wp = new WordProb[numTypes];
    for (int wi = 0; wi < numTypes; wi++)
      wp[wi] = new WordProb(wi, (double) unitypeTopicCounts[wi][ti]);
    Arrays.sort(wp);
    int numToPrint = Math.min(wp.length, numWords);
    if (useNewLines) {
      System.out.println("\nTopic " + ti + " unigrams");
      for (int i = 0; i < numToPrint; i++)
        System.out.println(uniAlphabet.lookupObject(wp[i].wi).toString()
            + " " + wp[i].p / tokensPerTopic[ti]);
    } else {
      System.out.print("Topic " + ti + ": ");
      for (int i = 0; i < numToPrint; i++)
        System.out.print(uniAlphabet.lookupObject(wp[i].wi).toString() + " ");
    }

    // Bigrams
    /*
    wp = new WordProb[numBitypes];
    int bisum = 0;
    for (int wi = 0; wi < numBitypes; wi++) {
      wp[wi] = new WordProb(wi, ((double) bitypeTopicCounts[wi][ti]));
      bisum += bitypeTopicCounts[wi][ti];
    }
    Arrays.sort(wp);
    numToPrint = Math.min(wp.length, numWords);
    if (useNewLines) {
      System.out.println("\nTopic " + ti + " bigrams");
      for (int i = 0; i < numToPrint; i++)
        System.out.println(biAlphabet.lookupObject(wp[i].wi).toString() + " " + wp[i].p / bisum);
    } else {
      System.out.print(" ");
      for (int i = 0; i < numToPrint; i++)
        System.out.print(biAlphabet.lookupObject(wp[i].wi).toString() + " ");
      System.out.println();
    }
    */

    // Ngrams: walk each document backwards and concatenate maximal runs of
    // tokens assigned to this topic whose bigram indicator is set.
    AugmentableFeatureVector afv = new AugmentableFeatureVector(new Alphabet(), 10000, false);
    for (int di = 0; di < topics.length; di++) {
      FeatureSequenceWithBigrams fs = (FeatureSequenceWithBigrams) ilist.get(di).getData();
      for (int si = topics[di].length - 1; si >= 0; si--) {
        if (topics[di][si] == ti && grams[di][si] == 1) {
          String gramString = uniAlphabet.lookupObject(fs.getIndexAtPosition(si)).toString();
          while (grams[di][si] == 1 && --si >= 0)
            gramString = uniAlphabet.lookupObject(fs.getIndexAtPosition(si)).toString()
                + "_" + gramString;
          afv.add(gramString, 1.0);
        }
      }
    }
    // System.out.println ("pre-sorting");
    int numNgrams = afv.numLocations();
    // System.out.println ("post-sorting "+numNgrams);
    wp = new WordProb[numNgrams];
    int ngramSum = 0;
    for (int loc = 0; loc < numNgrams; loc++) {
      wp[loc] = new WordProb(afv.indexAtLocation(loc), afv.valueAtLocation(loc));
      ngramSum += wp[loc].p;
    }
    Arrays.sort(wp);

    // Per-topic token/type counts for the summary line.
    int numUnitypeTokens = 0, numBitypeTokens = 0, numUnitypeTypes = 0, numBitypeTypes = 0;
    for (int fi = 0; fi < numTypes; fi++) {
      numUnitypeTokens += unitypeTopicCounts[fi][ti];
      if (unitypeTopicCounts[fi][ti] != 0) numUnitypeTypes++;
    }
    for (int fi = 0; fi < numBitypes; fi++) {
      numBitypeTokens += bitypeTopicCounts[fi][ti];
      if (bitypeTopicCounts[fi][ti] != 0) numBitypeTypes++;
    }

    if (useNewLines) {
      System.out.println("\nTopic " + ti
          + " unigrams " + numUnitypeTokens + "/" + numUnitypeTypes
          + " bigrams " + numBitypeTokens + "/" + numBitypeTypes
          + " phrases " + Math.round(afv.oneNorm()) + "/" + numNgrams);
      for (int i = 0; i < Math.min(numNgrams, numWords); i++)
        System.out.println(afv.getAlphabet().lookupObject(wp[i].wi).toString()
            + " " + wp[i].p / ngramSum);
    } else {
      System.out.print(" (unigrams " + numUnitypeTokens + "/" + numUnitypeTypes
          + " bigrams " + numBitypeTokens + "/" + numBitypeTypes
          + " phrases " + Math.round(afv.oneNorm()) + "/" + numNgrams + ")\n ");
      // System.out.print (" (unique-ngrams="+numNgrams+" ngram-count="+Math.round(afv.oneNorm())+")\n ");
      for (int i = 0; i < Math.min(numNgrams, numWords); i++)
        System.out.print(afv.getAlphabet().lookupObject(wp[i].wi).toString() + " ");
      System.out.println();
    }
  }
}
// TODO the method should be private
public void processEnglishFile(Document doc, String dataFile, String suffix)
    throws ResourceInitializationException, UIMAException, IOException,
        AnalysisEngineProcessException, SimilarityException {

  String plainTextOutputPath = dataFile + "plain.txt";
  String goodVSbadOutputPath = dataFile + ".csv";
  String pairwiseOutputPath = dataFile + getPairwiseSuffix();
  String kelpFilePath = dataFile + ".klp";

  /** Marker which adds relational information to a pair of trees */
  MarkTreesOnRepresentation marker = new MarkTreesOnRepresentation(new MarkTwoAncestors());

  /** Load stopwords for English */
  marker.useStopwords(Stopwords.STOPWORD_EN);

  /** Tree serializer for converting tree structures to string */
  TreeSerializer ts = new TreeSerializer().enableRelationalTags().useRoundBrackets();

  /** Instantiate CASes (assigned in the for loop) */
  // JCas questionCas = JCasFactory.createJCas();

  // WriteFile out = new WriteFile(dataFile + ".csv");

  // TODO ABC, Sep 10th 2015. Do we really need this? It seems like a bad patch
  doc.select("QURAN").remove();
  doc.select("HADEETH").remove();

  boolean firstRow = true;

  /** Consume data */
  Elements questions = doc.getElementsByTag("Question");
  int numberOfQuestions = questions.size();
  int qNumber = 1;

  for (Element question : questions) {
    System.out.println("[INFO]: Processing " + qNumber++ + " out of " + numberOfQuestions);
    CQAinstance cqainstance = qElementToObject(question);

    getFeaturesFromThread(cqainstance);
    // TODO MOVE FROM HERE TO getFeaturesFromThread.
    // FOR THAT the printing operations have to be moved out and
    // question and comment must have a method to extract header+body.
    // Move them from SubjectBodyAggregator
    // AQUI VOY ("this is where I left off" -- Spanish resume marker)

    /** Setup question CAS */
    // questionCas.reset();
    JCas questionCas = cqaElementToCas(cqainstance.getQuestion());

    fm.writeLn(plainTextOutputPath,
        "---------------------------- QID: " + cqainstance.getQuestion().getId()
            + " USER:" /* user-id expression redacted ("******") in the source */);
    // this.analyzer.analyze(questionCas, new SimpleContent("q-" + qid, qsubject + ". " + qbody));

    /* Comment-level features to be combined */
    List<List<Double>> listFeatures = new ArrayList<List<Double>>();
    List<Map<String, Double>> albertoSimoneFeatures = null;
    if (GENERATE_ALBERTO_AND_SIMONE_FEATURES) { // TODO RENAME THIS PLEASE
      albertoSimoneFeatures = FeatureExtractor.extractFeatures(cqainstance);
    }

    int commentIndex = 0;
    List<JCas> allCommentsCas = new ArrayList<JCas>();
    for (CQAcomment c : cqainstance.getComments()) {
      /** Setup comment CAS */
      JCas commentCas = cqaElementToCas(c);

      /** Run the UIMA pipeline */
      SimplePipeline.runPipeline(commentCas, this.analysisEngineList);
      // this.analyzer.analyze(commentCas, new SimpleContent("c-" + cid, csubject + ". " + cbody));

      AugmentableFeatureVector fv;
      if (GENERATE_MASSIMO_FEATURES) {
        fv = (AugmentableFeatureVector)
            pfEnglish.getPairFeatures(questionCas, commentCas, PARAMETER_LIST);
      } else {
        fv = new AugmentableFeatureVector(this.alphabet);
      }

      if (GENERATE_ALBERTO_AND_SIMONE_FEATURES) {
        Map<String, Double> featureVector = albertoSimoneFeatures.get(commentIndex);
        for (String featureName : FeatureExtractor.getAllFeatureNames()) {
          Double value = featureVector.get(featureName);
          // Missing features default to 0.
          double featureValue = 0;
          if (value != null) {
            featureValue = value;
          }
          fv.add(featureName, featureValue);
        }
      }
      commentIndex++;

      /*
       * *************************************
       * PLUG YOUR FEATURES HERE
       * *************************************
       */

      /*
       * fv is actually an AugmentableFeatureVector from the Mallet library.
       *
       * Internally the features are named, so you must specify a unique identifier.
       *
       * An example:
       *
       *   ((AugmentableFeatureVector) fv).add("your_super_feature_id", 42);
       *
       * or:
       *
       *   AugmentableFeatureVector afv = (AugmentableFeatureVector) fv;
       *   afv.add("your_super_feature_id", 42);
       */
      // ((AugmentableFeatureVector) fv).add("quseridEqCuserid", quseridEqCuserid);

      /*
       * *************************************
       * THANKS!
       * *************************************
       */

      /** Produce outputs */
      writeToPlainTextOutput(plainTextOutputPath, c, commentCas);
      // String goodVSbadOutputPath = dataFile + ".csv";
      // String pairwiseOutputPath

      // FIXME Once we fix that issue with the features, we can know this info
      // in advance and fix the output, probably out of the method
      if (firstRow) {
        // header for Good vs Bad
        this.fm.write(goodVSbadOutputPath, "qid,cgold,cgold_yn");
        for (int i = 0; i < fv.numLocations(); i++) {
          int featureIndex = i + 1;
          this.fm.write(goodVSbadOutputPath, ",f" + featureIndex);
        }
        this.fm.writeLn(goodVSbadOutputPath, "");

        // header for pairwise
        this.fm.write(pairwiseOutputPath, "qid,cgold");
        int numFeatures = fv.numLocations();
        if (COMBINATION_CONCAT) {
          numFeatures *= 2;
        }
        if (INCLUDE_SIMILARITIES) {
          numFeatures += PairFeatureFactoryEnglish.NUM_SIM_FEATURES;
        }
        for (int i = 0; i < numFeatures; i++) {
          int featureIndex = i + 1;
          this.fm.write(pairwiseOutputPath, ",f" + featureIndex);
        }
        this.fm.writeLn(pairwiseOutputPath, "");

        firstRow = false;
      }

      List<Double> features = this.serializeFv(fv);
      listFeatures.add(features);

      this.fm.writeLn(goodVSbadOutputPath,
          c.getId() + "," + c.getGold() + "," + c.getGold_yn() + ","
              + Joiner.on(",").join(features));

      /** Produce also the file needed to train structural models */
      if (PRODUCE_SVMLIGHTTK_DATA) {
        produceSVMLightTKExample(questionCas, commentCas, suffix, ts,
            cqainstance.getQuestion().getId(), c.getId(), c.getGold(),
            c.getGold_yn(), features);
      }
      if (PRODUCE_KELP_DATA) {
        produceKelpExample(questionCas, commentCas, kelpFilePath, ts,
            cqainstance.getQuestion().getId(), c.getId(), c.getGold(),
            c.getGold_yn(), features);
      }
      allCommentsCas.add(commentCas);
    }
    // TODO MOVE UP TO HERE

    this.fm.write(pairwiseOutputPath,
        computePairwiseFeatures(cqainstance, listFeatures, allCommentsCas));
    // out.writeLn(computePairwiseFeatures(q, listFeatures);
  }

  // Iterator<String> iterator = questionCategories.iterator();
  // while (iterator.hasNext()) {
  //   System.out.println("CATEGORY_" + iterator.next());
  // }

  this.fm.closeFiles();
}
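// Illustrative sketch of the Good-vs-Bad CSV this method emits (the values
// and comment ids below are invented; the number of fN columns depends on
// which feature groups are enabled via GENERATE_MASSIMO_FEATURES and
// GENERATE_ALBERTO_AND_SIMONE_FEATURES):
//
//   qid,cgold,cgold_yn,f1,f2,f3,...
//   Q1_C1,Good,Yes,0.8271,0.0045,1.0,...
//   Q1_C2,Bad,No,0.1033,0.2310,0.0,...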
/**
 * Process the xml file and output a csv file with the results in the same directory.
 *
 * @param dataFile the xml file to process
 * @throws ResourceInitializationException
 * @throws UIMAException
 * @throws IOException
 * @throws AnalysisEngineProcessException
 * @throws SimilarityException
 */
private void processEnglishFile(String dataFile)
    throws ResourceInitializationException, UIMAException, IOException,
        AnalysisEngineProcessException, SimilarityException {

  /** Parameters for matching tree structures */
  String parameterList = Joiner.on(",")
      .join(new String[] {RichNode.OUTPUT_PAR_LEMMA, RichNode.OUTPUT_PAR_TOKEN_LOWERCASE});

  /** Marker which adds relational information to a pair of trees */
  MarkTreesOnRepresentation marker = new MarkTreesOnRepresentation(new MarkTwoAncestors());

  /** Load stopwords for English */
  marker.useStopwords(Stopwords.STOPWORD_EN);

  /** Tree serializer for converting tree structures to string */
  // TreeSerializer ts = new TreeSerializer().enableRelationalTags().useRoundBrackets();

  WriteFile out = new WriteFile(dataFile + SUFFIX);

  Document doc = Jsoup.parse(new File(dataFile), "UTF-8");
  doc.select("QURAN").remove();
  doc.select("HADEETH").remove();

  boolean firstRow = true;

  /** Consume data */
  Elements questions = doc.getElementsByTag("Question");
  int numberOfQuestions = questions.size();
  int questionNumber = 1;

  // Map<String, Boolean> commentIsDialogue = new HashMap<>();
  // HashSet<String> questionCategories = new HashSet<String>();

  // Agreement statistics per similarity bin: matches[b] counts label-agreeing
  // pairs whose first feature value falls in bin b; totals[b] counts all pairs.
  double[] matches = new double[11];
  int[] totals = new int[11];
  int bin;

  for (Element question : questions) {
    Question q = new Question();
    System.out.println("[INFO]: Processing " + questionNumber++ + " out of " + numberOfQuestions);

    /** Parse question node */
    String qid = question.attr("QID");
    String qcategory = question.attr("QCATEGORY");
    String qdate = question.attr("QDATE");
    String quserid = question.attr("QUSERID");
    String qtype = question.attr("QTYPE");
    String qgold_yn = question.attr("QGOLD_YN");
    String qsubject = question.getElementsByTag("QSubject").get(0).text();
    String qbody = question.getElementsByTag("QBody").get(0).text();

    // questionCategories.add(qcategory);

    q.setQid(qid);
    q.setQcategory(qcategory);
    q.setQdate(qdate);
    q.setQuserId(quserid);
    q.setQtype(qtype);
    q.setQgoldYN(qgold_yn);
    q.setQsubject(qsubject);
    q.setQbody(qbody);

    // this.analyzer.analyze(questionCas, new SimpleContent("q-" + qid, qsubject + ". " + qbody));

    /** Parse comment nodes */
    Elements comments = question.getElementsByTag("Comment");
    if (LIMIT_COMMENTS_PER_Q && comments.size() >= LIMIT_COMMENTS) {
      continue;
    }

    for (Element comment : comments) {
      String cid = comment.attr("CID");
      String cuserid = comment.attr("CUSERID");
      String cgold = comment.attr("CGOLD");
      String cgold_yn = comment.attr("CGOLD_YN");
      String csubject = comment.getElementsByTag("CSubject").get(0).text();
      String cbody = comment.getElementsByTag("CBody").get(0).text();
      q.addComment(cid, cuserid, cgold, cgold_yn, csubject, cbody);
    }

    List<JCas> allCommentsCas = new ArrayList<JCas>();
    List<String> ids = new ArrayList<String>();
    List<String> labels = new ArrayList<String>();
    for (Element comment : comments) {
      allCommentsCas.add(computeCommentCas(comment));
      ids.add(comment.attr("CID"));
      labels.add(getgold(comment.attr("CGOLD")));
    }

    // Compare every unordered pair of comments in the thread.
    for (int i = 0; i < allCommentsCas.size() - 1; i++) {
      for (int j = i + 1; j < allCommentsCas.size(); j++) {
        AugmentableFeatureVector fv;
        // COMPUTE THE SIMILARITY HERE
        // TODO where to assign this
        // Whether the CAS are exactly identical
        // how to store/display the output
        fv = (AugmentableFeatureVector) pfEnglish.getPairFeatures(
            allCommentsCas.get(i), allCommentsCas.get(j), parameterList);

        // System.out.println(
        //     ids.get(i) + "," +
        //     labels.get(i) + "," +
        //     ids.get(j) + "," +
        //     labels.get(j) + "," +
        //     Joiner.on(",").join(this.serializeFv(fv)));

        bin = (int) Math.round(fv.getValues()[0] * 10);
        if (labels.get(i).equals(labels.get(j))) {
          matches[bin]++;
        }
        totals[bin]++;

        /** Produce output line */
        if (firstRow) {
          out.write("qid,cgold");
          for (int c = 0; c < fv.numLocations(); c++) {
            int featureIndex = c + 1;
            out.write(",f" + featureIndex);
          }
          out.write("\n");
          firstRow = false;
        }

        // System.out.println(bin);
        out.writeLn(ids.get(i) + "-" + ids.get(j) + ","
            + labels.get(i) + "-" + labels.get(j) + ","
            + Joiner.on(",").join(this.serializeFv(fv)));
      }
    }
  }

  for (int i = 0; i < 11; i++)
    System.out.println("BIN: " + i + " pctge: " + matches[i] / totals[i]);

  this.fm.closeFiles();
  out.close();
}