/**
 * Converts the question of a CQA instance into a plain-text CAS.
 *
 * @deprecated use {@link #cqaElementToCas} instead
 * @param cqa the instance whose question text is wrapped in a CAS
 * @return a JCas holding the question text, with the document language set to English
 * @throws UIMAException if the JCas cannot be created
 */
@Deprecated
private JCas questionToCas(CQAinstance cqa) throws UIMAException {
  JCas questionCas = JCasFactory.createJCas();
  questionCas.setDocumentLanguage("en");
  questionCas.setDocumentText(cqa.getQuestion().getWholeText());
  return questionCas;
}
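// The replacement, cqaElementToCas, generalizes the method above to any thread
// element (it is called below with both the question and each comment). A minimal
// sketch of the assumed shape; the supertype name CQAabstractElement and the
// getWholeText() accessor are assumptions, not the actual implementation:
//
//   private JCas cqaElementToCas(CQAabstractElement element) throws UIMAException {
//     JCas cas = JCasFactory.createJCas();
//     cas.setDocumentLanguage("en");
//     cas.setDocumentText(element.getWholeText());
//     return cas;
//   }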
/**
 * Reads the contents of a question element and feeds them into a CQAinstance object.
 *
 * <p>TODO this method should be deprecated and moved into a class that just reads
 * the XML file and generates the object.
 *
 * @param qelement the XML element holding the question thread
 * @return object instance with the question data
 */
private CQAinstance qElementToObject(Element qelement) {
  String id = qelement.attr("QID");
  String category = qelement.attr("QCATEGORY");
  String date = qelement.attr("QDATE");
  String userid = qelement.attr("QUSERID");
  String type = qelement.attr("QTYPE");
  String goldYN = qelement.attr("QGOLD_YN");
  String subject = TextNormalizer.normalize(
      JsoupUtils.recoverOriginalText(qelement.getElementsByTag("QSubject").get(0).text()));
  // TODO unlike the subject, the body is not passed through recoverOriginalText
  String body = qelement.getElementsByTag("QBody").get(0).text();
  // FIXME make it use user profiles as below
  // body = JsoupUtils.recoverOriginalText(
  //     UserProfile.removeSignature(body, userProfiles.get(userid)));
  body = TextNormalizer.normalize(body);

  CQAquestion q = new CQAquestion(id, date, userid, type, goldYN, subject, body);
  CQAinstance cqa = new CQAinstance(q, category);

  /* Parse comment nodes */
  for (Element comment : qelement.getElementsByTag("Comment")) {
    String cid = comment.attr("CID");
    String cuserid = comment.attr("CUSERID");
    String cgold = comment.attr("CGOLD");
    if (ONLY_BAD_AND_GOOD_CLASSES) {
      cgold = cgold.equalsIgnoreCase("good") ? GOOD : BAD;
    }
    String cgold_yn = comment.attr("CGOLD_YN");
    String csubject =
        JsoupUtils.recoverOriginalText(comment.getElementsByTag("CSubject").get(0).text());
    csubject = TextNormalizer.normalize(csubject);
    String cbody = comment.getElementsByTag("CBody").get(0).text();
    // FIXME make the following line work
    // cbody = JsoupUtils.recoverOriginalText(
    //     UserProfile.removeSignature(cbody, userProfiles.get(cuserid)));
    cbody = TextNormalizer.normalize(cbody);
    cqa.addComment(cid, cuserid, cgold, cgold_yn, csubject, cbody);
  }
  return cqa;
}
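// For reference, the XML shape this method expects, reconstructed from the tag
// and attribute accesses above (values are illustrative):
//
//   <Question QID="Q1" QCATEGORY="..." QDATE="..." QUSERID="U1" QTYPE="..." QGOLD_YN="...">
//     <QSubject>...</QSubject>
//     <QBody>...</QBody>
//     <Comment CID="Q1_C1" CUSERID="U2" CGOLD="Good" CGOLD_YN="...">
//       <CSubject>...</CSubject>
//       <CBody>...</CBody>
//     </Comment>
//   </Question>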
private String standardCombination(
    CQAinstance cqainstance, List<List<Double>> features, List<JCas> allCommentsCas) {
  StringBuilder sb = new StringBuilder();
  List<CQAcomment> comments = cqainstance.getComments();
  // Emit one CSV row per unordered pair of comments (i, j) with i < j
  for (int i = 0; i < comments.size() - 1; i++) {
    CQAcomment comment1 = comments.get(i);
    for (int j = i + 1; j < comments.size(); j++) {
      CQAcomment comment2 = comments.get(j);
      sb.append(comment1.getId()).append("-").append(comment2.getId()).append(",");
      sb.append(getClassLabel(comment1.getGold(), comment2.getGold()));
      sb.append(",");
      if (COMBINATION_CONCAT) {
        sb.append(Joiner.on(",").join(concatVectors(features.get(i), features.get(j))));
      } else {
        sb.append(Joiner.on(",").join(absoluteDifference(features.get(i), features.get(j))));
      }
      // Similarities
      if (INCLUDE_SIMILARITIES) {
        AugmentableFeatureVector fv =
            (AugmentableFeatureVector) pfEnglish.getPairFeatures(
                allCommentsCas.get(i), allCommentsCas.get(j), PARAMETER_LIST);
        sb.append(",").append(Joiner.on(",").join(this.serializeFv(fv)));
      }
      sb.append("\n");
    }
  }
  return sb.toString();
}
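// concatVectors and absoluteDifference are the two pairwise combination schemes
// selected by COMBINATION_CONCAT (concatenation doubles the feature count, which
// matches the "numFeatures *= 2" header logic below). Minimal sketches of the
// assumed behavior; the real implementations live elsewhere in this class:
//
//   /** [a1..an], [b1..bn] -> [a1..an, b1..bn] */
//   private List<Double> concatVectors(List<Double> a, List<Double> b) {
//     List<Double> out = new ArrayList<Double>(a.size() + b.size());
//     out.addAll(a);
//     out.addAll(b);
//     return out;
//   }
//
//   /** [a1..an], [b1..bn] -> [|a1-b1| .. |an-bn|] */
//   private List<Double> absoluteDifference(List<Double> a, List<Double> b) {
//     List<Double> out = new ArrayList<Double>(a.size());
//     for (int k = 0; k < a.size(); k++) {
//       out.add(Math.abs(a.get(k) - b.get(k)));
//     }
//     return out;
//   }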
// TODO the method should be private
public void processEnglishFile(Document doc, String dataFile, String suffix)
    throws ResourceInitializationException, UIMAException, IOException,
        AnalysisEngineProcessException, SimilarityException {
  String plainTextOutputPath = dataFile + "plain.txt";
  String goodVSbadOutputPath = dataFile + ".csv";
  String pairwiseOutputPath = dataFile + getPairwiseSuffix();
  String kelpFilePath = dataFile + ".klp";

  /* Marker which adds relational information to a pair of trees */
  MarkTreesOnRepresentation marker = new MarkTreesOnRepresentation(new MarkTwoAncestors());

  /* Load stopwords for English */
  marker.useStopwords(Stopwords.STOPWORD_EN);

  /* Tree serializer for converting tree structures to strings */
  TreeSerializer ts = new TreeSerializer().enableRelationalTags().useRoundBrackets();

  // CASes are now instantiated inside the question loop.
  // JCas questionCas = JCasFactory.createJCas();
  // WriteFile out = new WriteFile(dataFile + ".csv");

  // TODO ABC, Sep 10th 2015. Do we really need this? It seems like a bad patch.
  doc.select("QURAN").remove();
  doc.select("HADEETH").remove();

  boolean firstRow = true;

  /* Consume data */
  Elements questions = doc.getElementsByTag("Question");
  int numberOfQuestions = questions.size();
  int qNumber = 1;

  for (Element question : questions) {
    System.out.println("[INFO]: Processing " + qNumber++ + " out of " + numberOfQuestions);
    CQAinstance cqainstance = qElementToObject(question);

    getFeaturesFromThread(cqainstance);
    // TODO MOVE FROM HERE TO getFeaturesFromThread.
    // For that, the printing operations have to be moved out, and
    // question and comment must have a method to extract header+body.
    // Move them from SubjectBodyAggregator.
    // CONTINUE HERE

    /* Setup question CAS */
    JCas questionCas = cqaElementToCas(cqainstance.getQuestion());

    fm.writeLn(
        plainTextOutputPath,
        "---------------------------- QID: " + cqainstance.getQuestion().getId()
            + " USER:"******"q-" + qid, qsubject + ". " + qbody));

    /* Comment-level features to be combined */
    List<List<Double>> listFeatures = new ArrayList<List<Double>>();
    List<Map<String, Double>> albertoSimoneFeatures = null; // initialized for definite assignment
    if (GENERATE_ALBERTO_AND_SIMONE_FEATURES) { // TODO RENAME THIS PLEASE
      albertoSimoneFeatures = FeatureExtractor.extractFeatures(cqainstance);
    }

    int commentIndex = 0;
    List<JCas> allCommentsCas = new ArrayList<JCas>();
    for (CQAcomment c : cqainstance.getComments()) {
      /* Setup comment CAS */
      JCas commentCas = cqaElementToCas(c);

      /* Run the UIMA pipeline */
      SimplePipeline.runPipeline(commentCas, this.analysisEngineList);
      // this.analyzer.analyze(commentCas, new SimpleContent("c-" + cid, csubject + ". " + cbody));

      AugmentableFeatureVector fv;
      if (GENERATE_MASSIMO_FEATURES) {
        fv = (AugmentableFeatureVector)
            pfEnglish.getPairFeatures(questionCas, commentCas, PARAMETER_LIST);
      } else {
        fv = new AugmentableFeatureVector(this.alphabet);
      }

      if (GENERATE_ALBERTO_AND_SIMONE_FEATURES) {
        Map<String, Double> featureVector = albertoSimoneFeatures.get(commentIndex);
        for (String featureName : FeatureExtractor.getAllFeatureNames()) {
          Double value = featureVector.get(featureName);
          double featureValue = (value != null) ? value : 0; // missing features default to 0
          fv.add(featureName, featureValue);
        }
      }
      commentIndex++;

      /*
       * *************************************
       * *     PLUG YOUR FEATURES HERE      *
       * *************************************
       *
       * fv is actually an AugmentableFeatureVector from the Mallet library.
       *
       * Internally the features are named, so you must specify a unique identifier.
       *
       * An example:
       *
       *   ((AugmentableFeatureVector) fv).add("your_super_feature_id", 42);
       *
       * or:
       *
       *   AugmentableFeatureVector afv = (AugmentableFeatureVector) fv;
       *   afv.add("your_super_feature_id", 42);
       */
      // ((AugmentableFeatureVector) fv).add("quseridEqCuserid", quseridEqCuserid);
      /*
       * *************************************
       * *             THANKS!              *
       * *************************************
       */

      /* Produce outputs */
      writeToPlainTextOutput(plainTextOutputPath, c, commentCas);

      // FIXME Once we fix that issue with the features, we can know this info
      // in advance and fix the output, probably out of the method.
      if (firstRow) {
        // Header for Good vs. Bad
        this.fm.write(goodVSbadOutputPath, "qid,cgold,cgold_yn");
        for (int i = 0; i < fv.numLocations(); i++) {
          int featureIndex = i + 1;
          this.fm.write(goodVSbadOutputPath, ",f" + featureIndex);
        }
        this.fm.writeLn(goodVSbadOutputPath, "");

        // Header for pairwise
        this.fm.write(pairwiseOutputPath, "qid,cgold");
        int numFeatures = fv.numLocations();
        if (COMBINATION_CONCAT) {
          numFeatures *= 2;
        }
        if (INCLUDE_SIMILARITIES) {
          numFeatures += PairFeatureFactoryEnglish.NUM_SIM_FEATURES;
        }
        for (int i = 0; i < numFeatures; i++) {
          int featureIndex = i + 1;
          this.fm.write(pairwiseOutputPath, ",f" + featureIndex);
        }
        this.fm.writeLn(pairwiseOutputPath, "");

        firstRow = false;
      }

      List<Double> features = this.serializeFv(fv);
      listFeatures.add(features);

      this.fm.writeLn(
          goodVSbadOutputPath,
          c.getId() + "," + c.getGold() + "," + c.getGold_yn() + ","
              + Joiner.on(",").join(features));

      /* Produce also the files needed to train structural models */
      if (PRODUCE_SVMLIGHTTK_DATA) {
        produceSVMLightTKExample(questionCas, commentCas, suffix, ts,
            cqainstance.getQuestion().getId(), c.getId(), c.getGold(), c.getGold_yn(), features);
      }
      if (PRODUCE_KELP_DATA) {
        produceKelpExample(questionCas, commentCas, kelpFilePath, ts,
            cqainstance.getQuestion().getId(), c.getId(), c.getGold(), c.getGold_yn(), features);
      }

      allCommentsCas.add(commentCas);
    }
    // TODO MOVE UP TO HERE

    this.fm.write(
        pairwiseOutputPath, computePairwiseFeatures(cqainstance, listFeatures, allCommentsCas));
  }

  this.fm.closeFiles();
}
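// serializeFv (used above and in standardCombination) flattens a Mallet
// AugmentableFeatureVector into a dense List<Double>, so every CSV row has the
// same arity. A minimal sketch of the assumed behavior using only Mallet's
// public FeatureVector API; the real implementation elsewhere in this class
// may differ:
//
//   private List<Double> serializeFv(AugmentableFeatureVector fv) {
//     // Dense vector over the whole alphabet of feature names
//     int size = fv.getAlphabet().size();
//     List<Double> out = new ArrayList<Double>(size);
//     for (int i = 0; i < size; i++) {
//       out.add(fv.value(i)); // 0.0 for features absent from this vector
//     }
//     return out;
//   }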