/**
 * Processes the XML file and outputs a CSV file with the results in the same directory.
 *
 * @param dataFile the XML file to process
 * @param suffix suffix for identifying the data file
 * @throws ResourceInitializationException
 * @throws UIMAException
 * @throws IOException
 * @throws AnalysisEngineProcessException
 * @throws SimilarityException
 */
private void processEnglishFile(String dataFile, String suffix)
    throws ResourceInitializationException, UIMAException, IOException,
        AnalysisEngineProcessException, SimilarityException {

  /** Parameters for matching tree structures */
  String parameterList =
      Joiner.on(",")
          .join(new String[] {RichNode.OUTPUT_PAR_LEMMA, RichNode.OUTPUT_PAR_TOKEN_LOWERCASE});

  /** Marker which adds relational information to a pair of trees */
  MarkTreesOnRepresentation marker = new MarkTreesOnRepresentation(new MarkTwoAncestors());

  /** Load stopwords for English */
  marker.useStopwords(Stopwords.STOPWORD_EN);

  /** Tree serializer for converting tree structures to string */
  TreeSerializer ts = new TreeSerializer().enableRelationalTags().useRoundBrackets();

  /** Instantiate CASes */
  JCas questionCas = JCasFactory.createJCas();
  JCas commentCas = JCasFactory.createJCas();

  WriteFile out = new WriteFile(dataFile + ".csv");

  Document doc = Jsoup.parse(new File(dataFile), "UTF-8");
  doc.select("QURAN").remove();
  doc.select("HADEETH").remove();

  boolean firstRow = true;

  /** Consume data */
  Elements questions = doc.getElementsByTag("Question");
  int numberOfQuestions = questions.size();
  int questionNumber = 1;

  Map<String, Boolean> commentIsDialogue = new HashMap<>();

  for (Element question : questions) {
    System.out.println("[INFO]: Processing " + questionNumber++ + " out of " + numberOfQuestions);

    /** Parse question node */
    String qid = question.attr("QID");
    String qcategory = question.attr("QCATEGORY");
    String qdate = question.attr("QDATE");
    String quserid = question.attr("QUSERID");
    String qtype = question.attr("QTYPE");
    String qgold_yn = question.attr("QGOLD_YN");
    String qsubject = question.getElementsByTag("QSubject").get(0).text();
    String qbody = question.getElementsByTag("QBody").get(0).text();

    /** Setup question CAS */
    questionCas.reset();
    questionCas.setDocumentLanguage("en");
    questionCas.setDocumentText(qsubject + ". " + qbody);

    /** Run the UIMA pipeline */
    SimplePipeline.runPipeline(questionCas, this.analysisEngineList);

    // this.analyzer.analyze(questionCas, new SimpleContent("q-" + qid, qsubject + ". " + qbody));

    /** Parse comment nodes */
    Elements comments = question.getElementsByTag("Comment");
    for (Element comment : comments) {
      String cid = comment.attr("CID");
      String cuserid = comment.attr("CUSERID");
      String cgold = comment.attr("CGOLD");
      String cgold_yn = comment.attr("CGOLD_YN");
      String csubject = comment.getElementsByTag("CSubject").get(0).text();
      String cbody = comment.getElementsByTag("CBody").get(0).text();

      /** Setup comment CAS */
      commentCas.reset();
      commentCas.setDocumentLanguage("en");
      commentCas.setDocumentText(csubject + ". " + cbody);

      /** Run the UIMA pipeline */
      SimplePipeline.runPipeline(commentCas, this.analysisEngineList);

      // this.analyzer.analyze(commentCas, new SimpleContent("c-" + cid, csubject + ". " + cbody));

      FeatureVector fv = pfEnglish.getPairFeatures(questionCas, commentCas, parameterList);

      /**
       * *************************************
       * PLUG YOUR FEATURES HERE
       * *************************************
       */

      /**
       * fv is actually an AugmentableFeatureVector from the Mallet library.
       *
       * <p>Internally the features are named, so you must specify a unique identifier.
       *
       * <p>An example:
       *
       * <p>((AugmentableFeatureVector) fv).add("your_super_feature_id", 42);
       *
       * <p>or:
       *
       * <p>AugmentableFeatureVector afv = (AugmentableFeatureVector) fv;
       * afv.add("your_super_feature_id", 42);
       */
      boolean quseridEqCuserid = quserid.equals(cuserid);
      if (quseridEqCuserid) {
        commentIsDialogue.put(cid, true);
      }
      // ((AugmentableFeatureVector) fv).add("quseridEqCuserid", quseridEqCuserid);

      /**
       * *************************************
       * THANKS!
       * *************************************
       */

      /** Produce output line */
      if (firstRow) {
        out.write("qid,cgold,cgold_yn");
        for (int i = 0; i < fv.numLocations(); i++) {
          int featureIndex = i + 1;
          out.write(",f" + featureIndex);
        }
        out.write("\n");
        firstRow = false;
      }

      List<Double> features = this.serializeFv(fv);

      out.writeLn(cid + "," + cgold + "," + cgold_yn + "," + Joiner.on(",").join(features));

      /** Produce also the file needed to train structural models */
      if (PRODUCE_SVMLIGHTTK_DATA) {
        produceSVMLightTKExample(
            questionCas, commentCas, suffix, ts, qid, cid, cgold, cgold_yn, features);
      }
    }
  }

  for (String commentId : commentIsDialogue.keySet()) {
    this.fm.writeLn(dataFile + ".dialogue.txt", commentId);
  }

  this.fm.closeFiles();
  out.close();
}
/**
 * Processes the XML file and outputs a CSV file with the results in the same directory.
 *
 * @param dataFile the XML file to process
 * @throws ResourceInitializationException
 * @throws UIMAException
 * @throws IOException
 * @throws AnalysisEngineProcessException
 * @throws SimilarityException
 */
private void processEnglishFile(String dataFile)
    throws ResourceInitializationException, UIMAException, IOException,
        AnalysisEngineProcessException, SimilarityException {

  /** Parameters for matching tree structures */
  String parameterList =
      Joiner.on(",")
          .join(new String[] {RichNode.OUTPUT_PAR_LEMMA, RichNode.OUTPUT_PAR_TOKEN_LOWERCASE});

  /** Marker which adds relational information to a pair of trees */
  MarkTreesOnRepresentation marker = new MarkTreesOnRepresentation(new MarkTwoAncestors());

  /** Load stopwords for English */
  marker.useStopwords(Stopwords.STOPWORD_EN);

  /** Tree serializer for converting tree structures to string */
  // TreeSerializer ts = new TreeSerializer().enableRelationalTags().useRoundBrackets();

  WriteFile out = new WriteFile(dataFile + SUFFIX);

  Document doc = Jsoup.parse(new File(dataFile), "UTF-8");
  doc.select("QURAN").remove();
  doc.select("HADEETH").remove();

  boolean firstRow = true;

  /** Consume data */
  Elements questions = doc.getElementsByTag("Question");
  int numberOfQuestions = questions.size();
  int questionNumber = 1;

  // Map<String, Boolean> commentIsDialogue = new HashMap<>();
  // HashSet<String> questionCategories = new HashSet<String>();

  double[] matches = new double[11];
  int[] totals = new int[11];
  int bin;

  for (Element question : questions) {
    Question q = new Question();
    System.out.println("[INFO]: Processing " + questionNumber++ + " out of " + numberOfQuestions);

    /** Parse question node */
    String qid = question.attr("QID");
    String qcategory = question.attr("QCATEGORY");
    String qdate = question.attr("QDATE");
    String quserid = question.attr("QUSERID");
    String qtype = question.attr("QTYPE");
    String qgold_yn = question.attr("QGOLD_YN");
    String qsubject = question.getElementsByTag("QSubject").get(0).text();
    String qbody = question.getElementsByTag("QBody").get(0).text();

    // questionCategories.add(qcategory);

    q.setQid(qid);
    q.setQcategory(qcategory);
    q.setQdate(qdate);
    q.setQuserId(quserid);
    q.setQtype(qtype);
    q.setQgoldYN(qgold_yn);
    q.setQsubject(qsubject);
    q.setQbody(qbody);

    // this.analyzer.analyze(questionCas, new SimpleContent("q-" + qid, qsubject + ". " + qbody));

    /** Parse comment nodes */
    Elements comments = question.getElementsByTag("Comment");
    if (LIMIT_COMMENTS_PER_Q && comments.size() >= LIMIT_COMMENTS) {
      continue;
    }

    for (Element comment : comments) {
      String cid = comment.attr("CID");
      String cuserid = comment.attr("CUSERID");
      String cgold = comment.attr("CGOLD");
      String cgold_yn = comment.attr("CGOLD_YN");
      String csubject = comment.getElementsByTag("CSubject").get(0).text();
      String cbody = comment.getElementsByTag("CBody").get(0).text();
      q.addComment(cid, cuserid, cgold, cgold_yn, csubject, cbody);
    }

    List<JCas> allCommentsCas = new ArrayList<JCas>();
    List<String> ids = new ArrayList<String>();
    List<String> labels = new ArrayList<String>();
    for (Element comment : comments) {
      allCommentsCas.add(computeCommentCas(comment));
      ids.add(comment.attr("CID"));
      labels.add(getgold(comment.attr("CGOLD")));
    }

    for (int i = 0; i < allCommentsCas.size() - 1; i++) {
      for (int j = i + 1; j <= allCommentsCas.size() - 1; j++) {
        AugmentableFeatureVector fv;
        // COMPUTE THE SIMILARITY HERE
        // TODO: where to assign this
        // Whether the CAS are exactly identical
        // how to store/display the output
        fv =
            (AugmentableFeatureVector)
                pfEnglish.getPairFeatures(
                    allCommentsCas.get(i), allCommentsCas.get(j), parameterList);

        // System.out.println(
        //     ids.get(i) + "," +
        //     labels.get(i) + "," +
        //     ids.get(j) + "," +
        //     labels.get(j) + "," +
        //     Joiner.on(",").join(this.serializeFv(fv)));

        bin = (int) Math.round(fv.getValues()[0] * 10);
        if (labels.get(i).equals(labels.get(j))) {
          matches[bin]++;
        }
        totals[bin]++;

        /** Produce output line */
        if (firstRow) {
          out.write("qid,cgold");
          for (int c = 0; c < fv.numLocations(); c++) {
            int featureIndex = c + 1;
            out.write(",f" + featureIndex);
          }
          out.write("\n");
          firstRow = false;
        }

        // System.out.println(bin);
        out.writeLn(
            ids.get(i) + "-" + ids.get(j) + ","
                + labels.get(i) + "-" + labels.get(j) + ","
                + Joiner.on(",").join(this.serializeFv(fv)));
      }
    }
  }

  for (int i = 0; i < 11; i++) {
    System.out.println("BIN: " + i + " pctge: " + matches[i] / totals[i]);
  }

  this.fm.closeFiles();
  out.close();
}
public void processArabicFile(Analyzer analyzer, String dataFile, String suffix)
    throws SimilarityException, UIMAException, IOException {

  /** We do not have a lemmatizer, so we work with tokens */
  String parameterList = Joiner.on(",").join(new String[] {RichNode.OUTPUT_PAR_TOKEN_LOWERCASE});

  /** Instantiate CASes */
  JCas questionCas = JCasFactory.createJCas();
  JCas commentCas = JCasFactory.createJCas();

  WriteFile out = new WriteFile(dataFile + ".csv");

  Document doc = Jsoup.parse(new File(dataFile), "UTF-8");

  boolean firstRow = true;

  /** Consume data */
  Elements questions = doc.getElementsByTag("Question");
  int numberOfQuestions = questions.size();
  int questionNumber = 1;

  for (Element question : questions) {
    System.out.println("[INFO]: Processing " + questionNumber++ + " out of " + numberOfQuestions);

    /** Parse question node */
    String qid = question.attr("QID");
    String qcategory = question.attr("QCATEGORY");
    String qdate = question.attr("QDATE");
    String qsubject =
        question
            .getElementsByTag("QSubject")
            .get(0)
            .text()
            .replaceAll("/", "")
            .replaceAll("~", "");
    String qbody =
        question.getElementsByTag("QBody").get(0).text().replaceAll("/", "").replaceAll("~", "");

    /** Get analyzed text for question */
    if (USE_QCRI_ALT_TOOLS) {
      questionCas = this.getPreliminarCas(analyzer, questionCas, qid, qsubject + ". " + qbody);
    } else {
      questionCas.reset();
      questionCas.setDocumentLanguage("ar");
      questionCas.setDocumentText(qsubject + ". " + qbody);
      SimplePipeline.runPipeline(questionCas, this.analysisEngineList);
    }

    /** Parse answer nodes */
    Elements comments = question.getElementsByTag("Answer");
    for (Element comment : comments) {
      String cid = comment.attr("CID");
      String cgold = comment.attr("CGOLD");
      String cbody = comment.text().replaceAll("/", "").replaceAll("~", "");

      /** Get analyzed text for comment */
      if (USE_QCRI_ALT_TOOLS) {
        commentCas = this.getPreliminarCas(analyzer, commentCas, cid, cbody);
      } else {
        commentCas.reset();
        commentCas.setDocumentLanguage("ar");
        commentCas.setDocumentText(cbody);
        SimplePipeline.runPipeline(commentCas, this.analysisEngineList);
      }

      /** Compute features between question and comment */
      FeatureVector fv = pfArabic.getPairFeatures(questionCas, commentCas, parameterList);

      /**
       * *************************************
       * PLUG YOUR FEATURES HERE
       * *************************************
       */

      /**
       * fv is actually an AugmentableFeatureVector from the Mallet library.
       *
       * <p>Internally the features are named, so you must specify a unique identifier.
       *
       * <p>An example:
       *
       * <p>((AugmentableFeatureVector) fv).add("your_super_feature_id", 42);
       *
       * <p>or:
       *
       * <p>AugmentableFeatureVector afv = (AugmentableFeatureVector) fv;
       * afv.add("your_super_feature_id", 42);
       */

      /**
       * *************************************
       * THANKS!
       * *************************************
       */

      /** Produce output header on the first row */
      if (firstRow) {
        out.write("cid,cgold");
        for (int i = 0; i < fv.numLocations(); i++) {
          int featureIndex = i + 1;
          out.write(",f" + featureIndex);
        }
        out.write("\n");
        firstRow = false;
      }

      List<Double> features = this.serializeFv(fv);

      /** Produce output line */
      out.writeLn(qid + "-" + cid + "," + cgold + "," + Joiner.on(",").join(features));
    }
  }

  this.fm.closeFiles();
  out.close();
}
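/*
 * Hedged sketch, not present in the original class: the Arabic branch strips "/" and "~"
 * inline from the question subject, question body, and each answer body. A helper like this
 * (hypothetical name) would keep that normalization in one place if the cleaning rules grow.
 */
private static String stripArabicMarkup(String text) {
  return text.replaceAll("/", "").replaceAll("~", "");
}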