public void runForEnglish() throws UIMAException, IOException { // TODO This should be changed to logger? System.out.println("EXTRACTING THE USER SIGNATURES"); Document docTrain = JsoupUtils.getDoc(CQA_QL_TRAIN_EN); Document docDevel = JsoupUtils.getDoc(CQA_QL_DEV_EN); Document docTest = JsoupUtils.getDoc(CQA_QL_TEST_EN); this.userProfiles = UserProfile.createUserProfiles(docTrain, docDevel, docTest); // for(Entry<String, UserProfile> entry: userProfiles.entrySet()){ // if(entry.getValue().getSignatures().size()>0){ // System.out.println("---------- SIGNATURES FOR USER: "******" ----------"); // for(String signature : entry.getValue().getSignatures()){ // System.out.println("____\n" + JsoupUtils.recoverOriginalText(signature)); // } // } // } setupUimaTools(); try { this.processEnglishFile(docTrain, CQA_QL_TRAIN_EN, "train"); this.processEnglishFile(docDevel, CQA_QL_DEV_EN, "devel"); LIMIT_COMMENTS_ACTIVE = false; this.processEnglishFile(docTest, CQA_QL_TEST_EN, "test"); } catch (UIMAException | IOException | SimilarityException e) { e.printStackTrace(); } System.out.println("TOTAL NUMBER OF REMOVED SIGNATURES: " + UserProfile.getRemovedSignatures()); }
/** * Gets the contents of the question and feed it into a Question object TODO this method should be * deprecated and moved into a class that just reads the XML file and generates the object * * @param qelement * @return object instance with the question data */ private CQAinstance qElementToObject(Element qelement) { String id = qelement.attr("QID"); String category = qelement.attr("QCATEGORY"); String date = qelement.attr("QDATE"); String userid = qelement.attr("QUSERID"); String type = qelement.attr("QTYPE"); String goldYN = qelement.attr("QGOLD_YN"); String subject = TextNormalizer.normalize( JsoupUtils.recoverOriginalText(qelement.getElementsByTag("QSubject").get(0).text())); // TODO we don't normalise the subject? String body = qelement.getElementsByTag("QBody").get(0).text(); // FIXME make it use useprofiles as below // body = JsoupUtils.recoverOriginalText( // UserProfile.removeSignature(body, // userProfiles.get(userid))); body = TextNormalizer.normalize(body); CQAquestion q = new CQAquestion(id, date, userid, type, goldYN, subject, body); CQAinstance cqa = new CQAinstance(q, category); /** Parse comment nodes */ for (Element comment : qelement.getElementsByTag("Comment")) { String cid = comment.attr("CID"); String cuserid = comment.attr("CUSERID"); String cgold = comment.attr("CGOLD"); if (ONLY_BAD_AND_GOOD_CLASSES) { cgold = (cgold.equalsIgnoreCase("good")) ? GOOD : BAD; } String cgold_yn = comment.attr("CGOLD_YN"); String csubject = JsoupUtils.recoverOriginalText(comment.getElementsByTag("CSubject").get(0).text()); csubject = TextNormalizer.normalize(csubject); String cbody = comment.getElementsByTag("CBody").get(0).text(); // FIXME make the following line work // cbody = JsoupUtils.recoverOriginalText( // UserProfile.removeSignature(cbody, userProfiles.get(cuserid))); cbody = TextNormalizer.normalize(cbody); cqa.addComment(cid, cuserid, cgold, cgold_yn, csubject, cbody); } return cqa; }