Example #1
0
 /**
  * Command-line entry point: converts the APF files named in a file list into XML files carrying
  * in-line markup for sentences, timex, mentions, entity types, and names. The arguments are:
  *
  * <ul>
  *   <li>year: the year (2002, 2003, 2004, 2005) in which the APF file was created; this
  *       determines the APF format
  *   <li>inDir: the directory holding the text and APF files
  *   <li>outDir: the directory holding the NE files
  *   <li>filelist: a file listing the document names
  *   <li>apfExtension: the suffix appended to a document name to form the APF file name
  *   <li>outExtension: the suffix appended to a document name to form the output file name
  *   <li>gazetteer: (2004 files only) a Jet gazetteer, used to settle the name/noun distinction
  *       for GPE words categorized as PRE
  *   <li>PREdict: (2004 files only) a name vs. noun list, used to settle the name/noun
  *       distinction for non-GPE words categorized as PRE
  *   <li>flag ... : one or more of 'sentences', 'timex', 'mentions', 'extents', 'types' or
  *       'names', selecting the information to include in the output file
  * </ul>
  *
  * <p>When the year is 2004, the number of entries accumulated in {@code unknownPre} is printed
  * to standard output once all files have been processed, followed by the entries themselves,
  * one per line.
  *
  * @param args the command-line arguments listed above
  * @throws IOException propagated from {@code init} or {@code processFileList}
  */
 public static void main(String[] args) throws IOException {
   init(args);
   processFileList(fileList);
   // only 2004-format data produces a list of unclassified items
   if (!year.equals("2004")) {
     return;
   }
   System.out.println("\nUnclassified items:  " + unknownPre.size());
   for (String item : unknownPre) {
     System.out.println(item);
   }
 }
Example #2
0
 /**
  * Adds an ENAMEX annotation to 'doc' for each name of each entity in 'aceDoc'. Every annotation
  * carries a TYPE feature set to the type of the entity the name belongs to.
  *
  * <p>For 2004 data, the PRE mentions of each entity are examined as well. The head of a PRE
  * mention is annotated as a name when the gazetteer reports it as a nationality or a location,
  * or when 'preDict' maps its lower-cased, normalized head text to "N". A PRE head with any other
  * 'preDict' entry is left unannotated. A PRE head with no 'preDict' entry that the gazetteer
  * does not accept is reported on standard output and its lower-cased head text is added to
  * 'unknownPre'.
  *
  * @param doc the document that receives the ENAMEX annotations
  * @param aceDoc the ACE document whose entities, names and mentions are read
  */
 static void addENAMEXtags(Document doc, AceDocument aceDoc) {
   for (AceEntity entity : aceDoc.entities) {
     for (AceEntityName name : entity.names) {
       Span aceSpan = name.extent;
       // the annotated span ends one position past the ACE end offset
       // (presumably ACE ends are inclusive and Jet ends exclusive -- TODO confirm)
       Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1);
       doc.annotate("ENAMEX", jetSpan, new FeatureSet("TYPE", entity.type));
     }
     // for 2004 we have to examine PRE mentions and decide which are names
     if (!year.equals("2004")) {
       continue;
     }
     for (AceEntityMention mention : entity.mentions) {
       // only PRE mentions need resolving; skip the rest before doing any lookups
       if (!mention.type.equals("PRE")) {
         continue;
       }
       String htext = Resolve.normalizeName(mention.headText);
       String[] mentionName = Gazetteer.splitAtWS(htext);
       String key = htext.toLowerCase();
       String preClass = preDict.get(key);
       boolean isName =
           gazetteer.isNationality(mentionName)
               || gazetteer.isLocation(mentionName)
               || "N".equals(preClass);
       if (isName) {
         Span aceSpan = mention.head;
         Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1);
         doc.annotate("ENAMEX", jetSpan, new FeatureSet("TYPE", entity.type));
       } else if (preClass == null) {
         // no dictionary entry at all: report it and remember it for the final summary
         System.out.println(
             "Unclassified PRE: " + mention.text + " {" + mention.headText + "}");
         unknownPre.add(key);
       }
       // a non-null preClass other than "N" means the head is not a name: nothing to add
     }
   }
 }