private JCas computeCommentCas(Element comment) throws UIMAException { JCas cCas = JCasFactory.createJCas(); String cid = comment.attr("CID"); String cuserid = comment.attr("CUSERID"); // String cgold = comment.attr("CGOLD"); // String cgold = getgold(comment.attr("CGOLD")); // String cgold_yn = comment.attr("CGOLD_YN"); String csubject = comment.getElementsByTag("CSubject").get(0).text(); String cbody = comment.getElementsByTag("CBody").get(0).text(); /** Setup comment CAS */ cCas.reset(); cCas.setDocumentLanguage("en"); String commentText = TextNormalizer.normalize(SubjectBodyAggregator.getCommentText(csubject, cbody)); cCas.setDocumentText(commentText); // cCas.setDocumentText(csubject + ". " + cbody); /** Run the UIMA pipeline */ SimplePipeline.runPipeline(cCas, this.analysisEngineList); // this.analyzer.analyze(commentCas, new SimpleContent("c-" + cid, csubject + ". " + cbody)); return cCas; }
/** * Gets the contents of the question and feed it into a Question object TODO this method should be * deprecated and moved into a class that just reads the XML file and generates the object * * @param qelement * @return object instance with the question data */ private CQAinstance qElementToObject(Element qelement) { String id = qelement.attr("QID"); String category = qelement.attr("QCATEGORY"); String date = qelement.attr("QDATE"); String userid = qelement.attr("QUSERID"); String type = qelement.attr("QTYPE"); String goldYN = qelement.attr("QGOLD_YN"); String subject = TextNormalizer.normalize( JsoupUtils.recoverOriginalText(qelement.getElementsByTag("QSubject").get(0).text())); // TODO we don't normalise the subject? String body = qelement.getElementsByTag("QBody").get(0).text(); // FIXME make it use useprofiles as below // body = JsoupUtils.recoverOriginalText( // UserProfile.removeSignature(body, // userProfiles.get(userid))); body = TextNormalizer.normalize(body); CQAquestion q = new CQAquestion(id, date, userid, type, goldYN, subject, body); CQAinstance cqa = new CQAinstance(q, category); /** Parse comment nodes */ for (Element comment : qelement.getElementsByTag("Comment")) { String cid = comment.attr("CID"); String cuserid = comment.attr("CUSERID"); String cgold = comment.attr("CGOLD"); if (ONLY_BAD_AND_GOOD_CLASSES) { cgold = (cgold.equalsIgnoreCase("good")) ? GOOD : BAD; } String cgold_yn = comment.attr("CGOLD_YN"); String csubject = JsoupUtils.recoverOriginalText(comment.getElementsByTag("CSubject").get(0).text()); csubject = TextNormalizer.normalize(csubject); String cbody = comment.getElementsByTag("CBody").get(0).text(); // FIXME make the following line work // cbody = JsoupUtils.recoverOriginalText( // UserProfile.removeSignature(cbody, userProfiles.get(cuserid))); cbody = TextNormalizer.normalize(cbody); cqa.addComment(cid, cuserid, cgold, cgold_yn, csubject, cbody); } return cqa; }