/* * Definition: parse the xml files in the Knowtator annotation output folder to build * CRF training file. */ public void buildCRFi2b2MedicationTrain( String crfTrainingFile, boolean bMakeLowerCase, boolean bLabelProblemList, boolean bAddReasonSingleton) throws Exception { String baseDirectory = KnowtatorXMLDirectory; RawInput rinput = new RawInput(); ArrayList<String> kxmlfileList = new ArrayList<String>(); rinput.getDirectoryFile(baseDirectory, kxmlfileList); BufferedWriter fTrain; try { fTrain = new BufferedWriter(new FileWriter(crfTrainingFile)); for (String knowtatorXmlFile : kxmlfileList) { System.out.println(knowtatorXmlFile); Parser p = new Parser(knowtatorXmlFile); String taggedArticle = p.getTaggedArticleForCRFModel(bMakeLowerCase, bLabelProblemList, bAddReasonSingleton); taggedArticle = taggedArticle.replaceAll("\\|\\|O", ""); // System.out.print(taggedArticle); fTrain.write(taggedArticle + "\n"); } fTrain.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } System.out.println(kxmlfileList.size()); }
public static void GenerateListNarrativeSentenceTrainingData( String listFile, String narrativeFile, String negativeFile) throws Exception { // TODO Auto-generated method stub String baseDirectory = KnowtatorXMLDirectory; RawInput rinput = new RawInput(); ArrayList<String> kxmlfileList = new ArrayList<String>(); rinput.getDirectoryFile(baseDirectory, kxmlfileList); BufferedWriter fList = null; BufferedWriter fNarrative = null; BufferedWriter fNegative = null; try { fList = new BufferedWriter(new FileWriter(listFile)); fNarrative = new BufferedWriter(new FileWriter(narrativeFile)); fNegative = new BufferedWriter(new FileWriter(negativeFile)); for (String knowtatorXmlFile : kxmlfileList) { System.out.println(knowtatorXmlFile); Parser p = new Parser(knowtatorXmlFile); ArrayList<String> narratives = p.GetListNarrativeSentences("narrative"); ArrayList<String> lists = p.GetListNarrativeSentences("list"); ArrayList<String> negatives = p.GetListNarrativeSentences("negative"); for (String sent : narratives) { fNarrative.write(knowtatorXmlFile + "\t" + sent + "\n"); } for (String sent : lists) { fList.write(knowtatorXmlFile + "\t" + sent + "\n"); } for (String sent : negatives) { fNegative.write(knowtatorXmlFile + "\t" + sent + "\n"); } } fList.close(); fNarrative.close(); fNegative.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } System.out.println(kxmlfileList.size()); }
/** * @param trainFile * @param makeLowerCase * @param labelProblemList * @param addReasonSingleton * @throws Exception */ private void buildCRFTrainBySentence( String trainFile, boolean makeLowerCase, boolean labelProblemList, boolean addReasonSingleton) throws Exception { String baseDirectory = KnowtatorXMLDirectory; RawInput rinput = new RawInput(); ArrayList<String> kxmlfileList = new ArrayList<String>(); rinput.getDirectoryFile(baseDirectory, kxmlfileList); BufferedWriter fTrain; try { fTrain = new BufferedWriter(new FileWriter(trainFile)); for (String knowtatorXmlFile : kxmlfileList) { System.out.println(knowtatorXmlFile); // if(!knowtatorXmlFile.contains("11995")) // continue; Parser p = new Parser(knowtatorXmlFile); // String taggedArticle = p.getTaggedArticleForCRFModel(makeLowerCase, // labelProblemList, addReasonSingleton); ArrayList<String> taggedSentences = p.getTaggedSentences(); for (String sentence : taggedSentences) { sentence = sentence.replaceAll("\\|\\|O", ""); // System.out.print(taggedArticle); fTrain.write(sentence + "\n\n"); } } fTrain.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } System.out.println(kxmlfileList.size()); }
/* * Definition: used to build ARFF training file for medication relationship * model. */ static void buildMedicationRelationshipTrainingData( String relationshipTrainfile, boolean addReasonSingleton) throws Exception { String baseDirectory = KnowtatorXMLDirectory; RawInput rinput = new RawInput(); ArrayList<String> kxmlfileList = new ArrayList<String>(); rinput.getDirectoryFile(baseDirectory, kxmlfileList); String head = ""; String dataSet = ""; BufferedWriter fTrain; JMerki jm = new JMerki(); jm.initializeParser(); for (String knowtatorXmlFile : kxmlfileList) { Parser p = new Parser(knowtatorXmlFile); String rawContent = p.getRawContent(); ArrayList<String> listedReason = new ArrayList<String>(); listedReason = jm.getListedReason(rawContent); HashMap<String, String> data = p.getARFFDataset(listedReason); if (head.isEmpty()) head = data.get("head"); System.out.println(data.get("data")); dataSet += data.get("data"); } try { fTrain = new BufferedWriter(new FileWriter(relationshipTrainfile)); fTrain.write(head + "@data\n" + dataSet); fTrain.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } System.out.println(kxmlfileList.size()); }
private static void GenerateLNTrainingFileForMalletDocumentClassification() throws Exception { // TODO Auto-generated method stub String baseDirectory = KnowtatorXMLDirectory; RawInput rinput = new RawInput(); ArrayList<String> kxmlfileList = new ArrayList<String>(); RawInput.getDirectoryFile(baseDirectory, kxmlfileList); try { for (String knowtatorXmlFile : kxmlfileList) { File fxml = new File(knowtatorXmlFile); String fileName = fxml.getName(); String listFile = Messages.getString("i2b2.mallet.document.train.file.folder.list") + fileName + ".txt"; String narrativeFile = Messages.getString("i2b2.mallet.document.train.file.folder.narrative") + fileName + ".txt"; File fl = new File(listFile); File fn = new File(narrativeFile); if (fl.exists() || fn.exists()) System.err.println( "error in generate traing data for mallet document classification: 0618 pm File exist!"); BufferedWriter fList = null; BufferedWriter fNarrative = null; fList = new BufferedWriter(new FileWriter(listFile)); fNarrative = new BufferedWriter(new FileWriter(narrativeFile)); Parser p = new Parser(knowtatorXmlFile); ArrayList<String> narratives = p.GetListNarrativeSentences("narrative"); ArrayList<String> lists = p.GetListNarrativeSentences("list"); for (String sent : narratives) { // fNarrative.write(knowtatorXmlFile + "\t" + sent + "\n"); fNarrative.write(sent + "\n"); } for (String sent : lists) { // fList.write(knowtatorXmlFile + "\t" + sent + "\n"); fList.write(sent + "\n"); } fList.close(); fNarrative.close(); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } System.out.println(kxmlfileList.size()); // generate mallet file File root = new File(Messages.getString("i2b2.mallet.document.train.file.folder")); String arffFile = Messages.getString("i2b2.mallet.ln.classification.arff.file"); MalletWekaInterface mwInterface = new MalletWekaInterface(); mwInterface.GenerateTrainingARFFfile(root, arffFile); }