示例#1
0
 /**
  * Processes every document named in 'fileList', one document name per line.
  *
  * <p>For each name, the text file <code>ACEdir + name + ".sgm"</code> is opened as an SGML
  * document, the APF file <code>ACEdir + name + apfExtension</code> is loaded, mention tags
  * are added via 'addMentionTags', and the result is saved in 'outputDir' as
  * <code>name + ".co.txt"</code>.
  *
  * @param fileList name of the file listing the documents to process
  * @throws IOException if the file list cannot be opened or read
  */
 private static void processFileList(String fileList) throws IOException {
   // open list of files
   BufferedReader reader = new BufferedReader(new FileReader(fileList));
   try {
     int docCount = 0;
     String currentDoc;
     while ((currentDoc = reader.readLine()) != null) {
       // process file 'currentDoc'
       docCount++;
       System.out.println("\nProcessing document " + docCount + ": " + currentDoc);
       String textFileName = ACEdir + currentDoc + ".sgm";
       ExternalDocument doc = new ExternalDocument("sgml", textFileName);
       doc.setAllTags(true);
       doc.open();
       String APFfileName = ACEdir + currentDoc + apfExtension;
       AceDocument aceDoc = new AceDocument(textFileName, APFfileName);
       addMentionTags(doc, aceDoc);
       doc.setSGMLwrapMargin(0);
       doc.saveAs(outputDir, currentDoc + ".co.txt");
     }
   } finally {
     // close the list even when reading or processing a document throws,
     // so the file handle is never leaked
     reader.close();
   }
 }
示例#2
0
  /**
   * Trains the tagger using the DocumentCollection in file 'trainingCollection'.
   * 'trainingCollection' should consist of documents which have been explicitly tagged with
   * part-of-speech information, as "constit" annotations carrying a "cat" feature.
   *
   * <p>The method fills 'tagTable' with one entry per part of speech in 'posTable', builds an
   * ergodic HMM in 'posh' (a "start" state, an "end" state, and one state per part of speech),
   * and then trains it on each document of the collection. Before a document is used for
   * training it is divided into sentences: an "S" annotation is added from the previous
   * endmark up to the end of each "constit" whose "cat" is ".". Constituents following the
   * last endmark do not receive an "S" annotation.
   *
   * @param trainingCollection name of the file describing the training DocumentCollection
   */
  void train(String trainingCollection) {

    for (int i = 0; i < posTable.length; i++) {
      tagTable[i] = new String[] {"constit", "cat", posTable[i], posTable[i]};
    }

    // build ergodic HMM with one state for each POS (plus start and end states)

    HMMstate startState = new HMMstate("start", "", WordFeatureHMMemitter.class);
    posh.addState(startState);
    // the start state can move to any POS state
    for (int j = 0; j < posTable.length; j++) {
      startState.addArc(new HMMarc(posTable[j], 0));
    }
    HMMstate endState = new HMMstate("end", "", WordFeatureHMMemitter.class);
    posh.addState(endState);
    for (int i = 0; i < posTable.length; i++) {
      String pos = posTable[i];
      HMMstate state = new HMMstate(pos, pos, WordFeatureHMMemitter.class);
      posh.addState(state);
      // every POS state can move to any POS state (itself included) or to the end state
      for (int j = 0; j < posTable.length; j++) {
        state.addArc(new HMMarc(posTable[j], 0));
      }
      state.addArc(new HMMarc("end", 0));
    }
    posh.resolveNames();

    posh.resetForTraining();
    annotator = new HMMannotator(posh);
    annotator.setTagTable(tagTable);
    annotator.setBItag(false);

    DocumentCollection col = new DocumentCollection(trainingCollection);
    col.open();
    for (int i = 0; i < col.size(); i++) {
      ExternalDocument doc = col.get(i);
      doc.open();
      System.out.println("Training from " + doc.fileName());

      // divide at endmarks (constit cat="."), adding "S" marks

      int posn = 0;
      int start = posn;
      Vector anns;
      // walk the chain of "constit" annotations, each one starting where the last ended
      while ((anns = doc.annotationsAt(posn, "constit")) != null) {
        Annotation ann = (Annotation) anns.get(0);
        posn = ann.span().end();
        // compare with the literal as receiver: a constit lacking a "cat" feature (or one
        // holding a non-String value) is simply not an endmark, rather than a runtime error
        Object cat = ann.get("cat");
        if (".".equals(cat)) {
          doc.annotate("S", new Span(start, posn), new FeatureSet());
          start = posn;
        }
      }
      annotator.train(doc);
      //  free up space taken by annotations on document
      doc.clearAnnotations();
    }
    posh.computeProbabilities();
  }
示例#3
0
 /**
  * Processes the single document 'docName', generating a file with in-line XML.
  *
  * <p>The text is read from <code>ACEdir + docName + ".sgm"</code> and the APF data from
  * <code>ACEdir + docName + "." + apfExtension</code>; after 'addAnnotations' has been
  * applied, the document is saved in 'outputDir' as
  * <code>docName + "." + outputExtension</code>. The shared 'docCount' counter is
  * incremented on every call.
  */
 public static void processFile(String docName) {
   docCount++;
   System.out.println("\nProcessing document " + docCount + ": " + docName);

   // file names derived from the document name
   final String sgmPath = ACEdir + docName + ".sgm";
   final String apfPath = ACEdir + docName + "." + apfExtension;
   final String outputName = docName + "." + outputExtension;

   // configure and load the source text
   ExternalDocument text = new ExternalDocument("sgml", sgmPath);
   text.setAllTags(true);
   final boolean turnIsEmptyTag = year.equals("2003") || year.equals("2004");
   if (turnIsEmptyTag) {
     // for these two years TURN is declared as an empty tag before the document is opened
     text.setEmptyTags(new String[] {"TURN"});
   }
   text.open();

   // merge in the APF information and write the result
   AceDocument apf = new AceDocument(sgmPath, apfPath);
   addAnnotations(text, apf);
   text.setSGMLwrapMargin(0);
   text.saveAs(outputDir, outputName);
 }