Пример #1
0
 /**
  * Analyzes the command-line arguments to APFtoXML and stores them in static fields.
  *
  * <p>Arguments, in order:
  *
  * <ol>
  *   <li>{@code args[0]} - year, one of "2002", "2003", "2004", "2005"
  *   <li>{@code args[1]} - ACE directory (a trailing "/" is appended if absent)
  *   <li>{@code args[2]} - output directory (a trailing "/" is appended if absent)
  *   <li>{@code args[3]} - file list
  *   <li>{@code args[4]} - APF extension
  *   <li>{@code args[5]} - output extension
  *   <li>for year "2004" only: {@code args[6]} - gazetteer file, {@code args[7]} - PRE dictionary
  * </ol>
  *
  * <p>Every remaining argument is passed to {@code setFlag}. At least one such argument must be
  * present; otherwise {@code argErr()} is invoked. An unrecognized year prints a message to
  * standard error and exits with status 1.
  *
  * @param args the command-line arguments
  * @throws IOException declared by this method; presumably raised while loading the gazetteer or
  *     PRE dictionary for year "2004" (confirm against those loaders)
  */
 public static void init(String[] args) throws IOException {
   if (args.length == 0) {
     argErr();
   }
   JetTest.encoding = "UTF-8";
   year = args[0];
   AceDocument.ace2004 = false;
   AceDocument.ace2005 = false;
   int requiredArgs = 6;
   if (year.equals("2002") || year.equals("2003")) {
     // no year-specific setup; both format flags stay false
   } else if (year.equals("2004")) {
     requiredArgs = 8;
     // Validate before indexing so that a short argument list is reported through argErr()
     // rather than surfacing as an ArrayIndexOutOfBoundsException on args[6] / args[7].
     // NOTE(review): this relies on argErr() not returning normally -- confirm.
     if (args.length < requiredArgs) {
       argErr();
     }
     String gazFile = args[6];
     String preDict = args[7];
     gazetteer = new Gazetteer();
     gazetteer.load(gazFile);
     loadPreDict(preDict);
     AceDocument.ace2004 = true;
   } else if (year.equals("2005")) {
     AceDocument.ace2004 = true;
     AceDocument.ace2005 = true;
   } else {
     System.err.println("Invalid year:  must be 2002-2005");
     System.exit(1);
   }
   // '<=' (not '<'): at least one flag must follow the positional arguments.
   if (args.length <= requiredArgs) {
     argErr();
   }
   ACEdir = args[1];
   if (!ACEdir.endsWith("/")) {
     ACEdir += "/";
   }
   outputDir = args[2];
   if (!outputDir.endsWith("/")) {
     outputDir += "/";
   }
   fileList = args[3];
   apfExtension = args[4];
   outputExtension = args[5];
   // everything after the positional arguments is a flag
   for (int i = requiredArgs; i < args.length; i++) {
     setFlag(args[i]);
   }
 }
Пример #2
0
 /**
  * Adds to {@code doc} each kind of annotation whose name is present in {@code flags}.
  *
  * <p>The recognized flag names, applied in this order, are "sentences", "timex", "mentions" and
  * "names". When the year is "2004" the gazetteer's monocase setting is first updated from
  * whether the document is all lower case.
  *
  * @param doc the document to annotate
  * @param aceDoc the ACE document passed to the timex, mention and name taggers
  */
 public static void addAnnotations(Document doc, AceDocument aceDoc) {
   final boolean isAllLowerCase = Ace.allLowerCase(doc);
   if (year.equals("2004")) {
     gazetteer.setMonocase(isAllLowerCase);
   }

   if (flags.contains("sentences")) {
     addSentences(doc);
   }
   if (flags.contains("timex")) {
     addTimexTags(doc, aceDoc);
   }
   if (flags.contains("mentions")) {
     addMentionTags(doc, aceDoc);
   }
   if (flags.contains("names")) {
     addENAMEXtags(doc, aceDoc);
   }
 }
Пример #3
0
 /**
  * Adds ENAMEX annotations to {@code doc} for the names recorded in {@code aceDoc}.
  *
  * <p>Every name of every entity is annotated with an ENAMEX tag whose TYPE feature is the
  * entity's type. When the year is "2004", mentions of type "PRE" are examined as well: the head
  * of such a mention is annotated as a name if the gazetteer reports its normalized head text as
  * a nationality or a location, or if the PRE dictionary maps the lower-cased head text to "N".
  * A PRE mention with any other dictionary entry is skipped; one with no entry is reported on
  * standard output and its lower-cased head text is added to {@code unknownPre}.
  *
  * @param doc the document to annotate
  * @param aceDoc the ACE document supplying the entities, names and mentions
  */
 static void addENAMEXtags(Document doc, AceDocument aceDoc) {
   ArrayList<AceEntity> entities = aceDoc.entities;
   for (AceEntity entity : entities) {
     ArrayList<AceEntityName> names = entity.names;
     for (AceEntityName name : names) {
       annotateEnamexSpan(doc, name.extent, entity);
     }
     // for 2004 we have to examine PRE mentions and decide which are names
     if (!year.equals("2004")) {
       continue;
     }
     ArrayList<AceEntityMention> mentions = entity.mentions;
     for (AceEntityMention mention : mentions) {
       // only PRE mentions need classifying; skip the lookups for everything else
       if (!mention.type.equals("PRE")) {
         continue;
       }
       String htext = Resolve.normalizeName(mention.headText);
       String htextLower = htext.toLowerCase();
       String[] mentionName = Gazetteer.splitAtWS(htext);
       String preClass = preDict.get(htextLower);
       if (gazetteer.isNationality(mentionName)
           || gazetteer.isLocation(mentionName)
           || "N".equals(preClass)) {
         annotateEnamexSpan(doc, mention.head, entity);
       } else if (preClass == null) {
         // no dictionary entry at all: report it and remember it
         System.out.println(
             "Unclassified PRE: " + mention.text + " {" + mention.headText + ")");
         unknownPre.add(htextLower);
       }
       // a non-null preClass other than "N" means: deliberately not a name, do nothing
     }
   }
 }

 /**
  * Annotates one ACE span of {@code entity} in {@code doc} as an ENAMEX whose TYPE feature is
  * the entity's type. The annotation's end is the ACE span's end plus one (presumably because
  * ACE end offsets are inclusive -- confirm).
  */
 private static void annotateEnamexSpan(Document doc, Span aceSpan, AceEntity entity) {
   Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1);
   doc.annotate("ENAMEX", jetSpan, new FeatureSet("TYPE", entity.type));
 }