/**
   * Parses the XML files in the Knowtator annotation output folder to build the CRF training
   * file.
   *
   * <p>Every file listed under {@code KnowtatorXMLDirectory} is parsed, each {@code ||O} marker
   * is removed from its tagged article, and the result is written to {@code crfTrainingFile}
   * followed by a newline. An {@code IOException} raised inside the write loop is printed and
   * not rethrown. The writer is closed on every exit path.
   *
   * @param crfTrainingFile path of the training file to write
   * @param bMakeLowerCase forwarded to {@code Parser.getTaggedArticleForCRFModel}
   * @param bLabelProblemList forwarded to {@code Parser.getTaggedArticleForCRFModel}
   * @param bAddReasonSingleton forwarded to {@code Parser.getTaggedArticleForCRFModel}
   * @throws Exception any non-I/O failure raised while listing or parsing the input files
   */
  public void buildCRFi2b2MedicationTrain(
      String crfTrainingFile,
      boolean bMakeLowerCase,
      boolean bLabelProblemList,
      boolean bAddReasonSingleton)
      throws Exception {
    String baseDirectory = KnowtatorXMLDirectory;

    RawInput rinput = new RawInput();
    ArrayList<String> kxmlfileList = new ArrayList<String>();
    rinput.getDirectoryFile(baseDirectory, kxmlfileList);

    BufferedWriter fTrain = null;
    try {
      fTrain = new BufferedWriter(new FileWriter(crfTrainingFile));

      for (String knowtatorXmlFile : kxmlfileList) {
        System.out.println(knowtatorXmlFile);
        Parser p = new Parser(knowtatorXmlFile);
        String taggedArticle =
            p.getTaggedArticleForCRFModel(bMakeLowerCase, bLabelProblemList, bAddReasonSingleton);

        taggedArticle = taggedArticle.replaceAll("\\|\\|O", "");
        fTrain.write(taggedArticle + "\n");
      }
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      // Close here rather than at the end of the try block so the file handle is released
      // (and buffered output flushed) even when the parser throws a non-I/O exception.
      if (fTrain != null) {
        try {
          fTrain.close();
        } catch (IOException closeFailure) {
          closeFailure.printStackTrace();
        }
      }
    }

    System.out.println(kxmlfileList.size());
  }
Beispiel #2
0
  /**
   * Opens a temporary XML file, wires an indenting UTF-8 {@code XMLSerializer} to it, and starts
   * the document on the serializer's content handler.
   *
   * <p>Sets the {@code tempFilepath}, {@code fos}, {@code serializer} and {@code hd} fields.
   *
   * @throws IllegalStateException if the temporary file cannot be opened or the document cannot
   *     be started; the underlying exception is kept as the cause
   */
  private void InitialBuilder() {
    //		root_id = 990000;
    tempFilepath = RawInput.getTemporaryFilePath("KnowtatorXmlBuilder", "xml");

    try {
      fos = new FileOutputStream(tempFilepath);
    } catch (FileNotFoundException e) {
      // Without an output stream nothing below can work, so fail immediately instead of
      // continuing with an unusable stream.
      throw new IllegalStateException("Cannot open temporary XML file: " + tempFilepath, e);
    }

    OutputFormat of = new OutputFormat("XML", "UTF-8", true);
    of.setIndent(1);
    of.setIndenting(true);

    serializer = new XMLSerializer(fos, of);

    boolean started = false;
    try {
      hd = serializer.asContentHandler();
      hd.startDocument();
      started = true;
    } catch (IOException e) {
      throw new IllegalStateException("Cannot obtain XML content handler for " + tempFilepath, e);
    } catch (SAXException e) {
      throw new IllegalStateException("Cannot start XML document in " + tempFilepath, e);
    } finally {
      if (!started) {
        // The builder is unusable; release the file handle opened above.
        try {
          fos.close();
        } catch (IOException ignored) {
          // The failure that brought us here is already being reported.
        }
      }
    }
  }
  /**
   * Writes the "list", "narrative" and "negative" sentences of every Knowtator XML file under
   * {@code KnowtatorXMLDirectory} to three separate training files.
   *
   * <p>Each output line has the form {@code <xml file path>\t<sentence>}. An
   * {@code IOException} raised while opening or writing the files is printed and not rethrown.
   * All writers that were opened are closed on every exit path.
   *
   * @param listFile output path for sentences returned for the "list" category
   * @param narrativeFile output path for sentences returned for the "narrative" category
   * @param negativeFile output path for sentences returned for the "negative" category
   * @throws Exception any non-I/O failure raised while listing or parsing the input files
   */
  public static void GenerateListNarrativeSentenceTrainingData(
      String listFile, String narrativeFile, String negativeFile) throws Exception {
    String baseDirectory = KnowtatorXMLDirectory;

    RawInput rinput = new RawInput();
    ArrayList<String> kxmlfileList = new ArrayList<String>();
    rinput.getDirectoryFile(baseDirectory, kxmlfileList);

    BufferedWriter fList = null;
    BufferedWriter fNarrative = null;
    BufferedWriter fNegative = null;
    try {
      fList = new BufferedWriter(new FileWriter(listFile));
      fNarrative = new BufferedWriter(new FileWriter(narrativeFile));
      fNegative = new BufferedWriter(new FileWriter(negativeFile));

      for (String knowtatorXmlFile : kxmlfileList) {
        System.out.println(knowtatorXmlFile);
        Parser p = new Parser(knowtatorXmlFile);

        ArrayList<String> narratives = p.GetListNarrativeSentences("narrative");
        ArrayList<String> lists = p.GetListNarrativeSentences("list");
        ArrayList<String> negatives = p.GetListNarrativeSentences("negative");

        for (String sent : narratives) {
          fNarrative.write(knowtatorXmlFile + "\t" + sent + "\n");
        }
        for (String sent : lists) {
          fList.write(knowtatorXmlFile + "\t" + sent + "\n");
        }
        for (String sent : negatives) {
          fNegative.write(knowtatorXmlFile + "\t" + sent + "\n");
        }
      }
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      // Close every writer that was opened, independently of the others, so one failure
      // (or an exception from the parser) cannot leak the remaining file handles.
      for (BufferedWriter writer : new BufferedWriter[] {fList, fNarrative, fNegative}) {
        if (writer != null) {
          try {
            writer.close();
          } catch (IOException closeFailure) {
            closeFailure.printStackTrace();
          }
        }
      }
    }

    System.out.println(kxmlfileList.size());
  }
  /**
   * Builds a sentence-level CRF training file from the Knowtator XML files under
   * {@code KnowtatorXMLDirectory}.
   *
   * <p>For every file, each tagged sentence has its {@code ||O} markers removed and is written
   * to {@code trainFile} followed by a blank line. An {@code IOException} raised inside the
   * write loop is printed and not rethrown. The writer is closed on every exit path.
   *
   * @param trainFile path of the training file to write
   * @param makeLowerCase currently unused; the call that consumed it is commented out
   * @param labelProblemList currently unused; the call that consumed it is commented out
   * @param addReasonSingleton currently unused; the call that consumed it is commented out
   * @throws Exception any non-I/O failure raised while listing or parsing the input files
   */
  private void buildCRFTrainBySentence(
      String trainFile, boolean makeLowerCase, boolean labelProblemList, boolean addReasonSingleton)
      throws Exception {
    String baseDirectory = KnowtatorXMLDirectory;

    RawInput rinput = new RawInput();
    ArrayList<String> kxmlfileList = new ArrayList<String>();
    rinput.getDirectoryFile(baseDirectory, kxmlfileList);

    BufferedWriter fTrain = null;
    try {
      fTrain = new BufferedWriter(new FileWriter(trainFile));

      for (String knowtatorXmlFile : kxmlfileList) {
        System.out.println(knowtatorXmlFile);
        //        		if(!knowtatorXmlFile.contains("11995"))
        //        			continue;

        Parser p = new Parser(knowtatorXmlFile);
        //            	String taggedArticle = p.getTaggedArticleForCRFModel(makeLowerCase,
        // labelProblemList, addReasonSingleton);

        ArrayList<String> taggedSentences = p.getTaggedSentences();

        for (String sentence : taggedSentences) {
          sentence = sentence.replaceAll("\\|\\|O", "");
          // The extra newline leaves a blank line between consecutive sentences.
          fTrain.write(sentence + "\n\n");
        }
      }
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      // Close here rather than at the end of the try block so the file handle is released
      // (and buffered output flushed) even when the parser throws a non-I/O exception.
      if (fTrain != null) {
        try {
          fTrain.close();
        } catch (IOException closeFailure) {
          closeFailure.printStackTrace();
        }
      }
    }

    System.out.println(kxmlfileList.size());
  }
  /**
   * Builds the ARFF training file for the medication relationship model.
   *
   * <p>For every Knowtator XML file under {@code KnowtatorXMLDirectory}, the listed reasons are
   * extracted from the raw content and passed to {@code Parser.getARFFDataset}. The "head"
   * entry of the first result with a non-empty header is used as the ARFF header, and the
   * "data" entries of all files are concatenated after an {@code @data} line. An
   * {@code IOException} raised while writing is printed and not rethrown.
   *
   * @param relationshipTrainfile path of the ARFF file to write
   * @param addReasonSingleton currently unused by this method
   * @throws Exception any failure raised while listing or parsing the input files
   */
  static void buildMedicationRelationshipTrainingData(
      String relationshipTrainfile, boolean addReasonSingleton) throws Exception {
    String baseDirectory = KnowtatorXMLDirectory;

    RawInput rinput = new RawInput();
    ArrayList<String> kxmlfileList = new ArrayList<String>();
    rinput.getDirectoryFile(baseDirectory, kxmlfileList);

    String head = "";
    // Accumulate in a builder: repeated String concatenation recopies the whole data set
    // on every file.
    StringBuilder dataSet = new StringBuilder();
    JMerki jm = new JMerki();

    jm.initializeParser();

    for (String knowtatorXmlFile : kxmlfileList) {
      Parser p = new Parser(knowtatorXmlFile);
      String rawContent = p.getRawContent();
      ArrayList<String> listedReason = jm.getListedReason(rawContent);
      HashMap<String, String> data = p.getARFFDataset(listedReason);
      if (head.isEmpty()) {
        head = data.get("head");
      }

      String rows = data.get("data");
      System.out.println(rows);
      dataSet.append(rows);
    }

    BufferedWriter fTrain = null;
    try {
      fTrain = new BufferedWriter(new FileWriter(relationshipTrainfile));
      fTrain.write(head + "@data\n");
      fTrain.write(dataSet.toString());
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      // Release the file handle even when the write itself failed.
      if (fTrain != null) {
        try {
          fTrain.close();
        } catch (IOException closeFailure) {
          closeFailure.printStackTrace();
        }
      }
    }

    System.out.println(kxmlfileList.size());
  }
Beispiel #6
0
  /**
   * Ends the {@code annotations} element and the document on the content handler, closes the
   * temporary output stream, and loads the generated file into {@code XMLContent}.
   *
   * <p>A {@code SAXException} or a close failure is printed and not rethrown. The stream is
   * closed even if the content handler throws an unchecked exception.
   */
  private void EndBuilder() {
    try {
      hd.endElement("", "", "annotations");
      hd.endDocument();
    } catch (SAXException e) {
      e.printStackTrace();
    } finally {
      // Closing in finally keeps the file handle from leaking when the handler throws
      // something other than SAXException.
      try {
        fos.close();
      } catch (IOException closeFailure) {
        closeFailure.printStackTrace();
      }
    }

    System.out.println(tempFilepath);
    XMLContent = RawInput.getFullText(tempFilepath);
  }
  /**
   * Generates per-document "list" and "narrative" sentence files for every Knowtator XML file
   * under {@code KnowtatorXMLDirectory}, then asks {@code MalletWekaInterface} to build the
   * training ARFF file from the configured document folder.
   *
   * <p>Output paths are the configured list/narrative folder prefixes followed by the XML file
   * name and {@code .txt}. If either target already exists an error is printed and the file is
   * still rewritten. An {@code IOException} stops the per-file loop, is printed, and the ARFF
   * generation step still runs.
   *
   * @throws Exception any non-I/O failure raised while listing or parsing the input files, or
   *     anything thrown by the ARFF generation step
   */
  private static void GenerateLNTrainingFileForMalletDocumentClassification() throws Exception {
    String baseDirectory = KnowtatorXMLDirectory;

    ArrayList<String> kxmlfileList = new ArrayList<String>();
    RawInput.getDirectoryFile(baseDirectory, kxmlfileList);

    try {

      for (String knowtatorXmlFile : kxmlfileList) {
        String fileName = new File(knowtatorXmlFile).getName();

        String listFile =
            Messages.getString("i2b2.mallet.document.train.file.folder.list") + fileName + ".txt";
        String narrativeFile =
            Messages.getString("i2b2.mallet.document.train.file.folder.narrative")
                + fileName
                + ".txt";

        if (new File(listFile).exists() || new File(narrativeFile).exists()) {
          System.err.println(
              "error in generate traing data for mallet document classification: 0618 pm File exist!");
        }

        BufferedWriter fList = null;
        BufferedWriter fNarrative = null;
        try {
          fList = new BufferedWriter(new FileWriter(listFile));
          fNarrative = new BufferedWriter(new FileWriter(narrativeFile));

          Parser p = new Parser(knowtatorXmlFile);

          ArrayList<String> narratives = p.GetListNarrativeSentences("narrative");
          ArrayList<String> lists = p.GetListNarrativeSentences("list");

          for (String sent : narratives) {
            fNarrative.write(sent + "\n");
          }
          for (String sent : lists) {
            fList.write(sent + "\n");
          }

          // Close on the normal path so a flush failure still reaches the IOException
          // handler below and stops the loop.
          fList.close();
          fNarrative.close();
        } finally {
          // Safety net for the failure paths (parser error, write error, second open failing).
          // Closing an already closed writer has no effect.
          for (BufferedWriter writer : new BufferedWriter[] {fList, fNarrative}) {
            if (writer != null) {
              try {
                writer.close();
              } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
              }
            }
          }
        }
      }

    } catch (IOException e) {
      e.printStackTrace();
    }

    System.out.println(kxmlfileList.size());

    // generate mallet file
    File root = new File(Messages.getString("i2b2.mallet.document.train.file.folder"));

    String arffFile = Messages.getString("i2b2.mallet.ln.classification.arff.file");

    MalletWekaInterface mwInterface = new MalletWekaInterface();

    mwInterface.GenerateTrainingARFFfile(root, arffFile);
  }