/**
 * Analyzes the command-line arguments to APFtoXML and stores them in the static configuration.
 *
 * <p>Arguments read here: {@code args[0]} year ("2002" through "2005"), {@code args[1]} ACE
 * directory, {@code args[2]} output directory, {@code args[3]} file list, {@code args[4]} APF
 * extension, {@code args[5]} output extension. For year "2004" only, {@code args[6]} is the
 * gazetteer file and {@code args[7]} the PRE dictionary. Every argument from index
 * {@code requiredArgs} onward is passed to {@code setFlag}.
 *
 * <p>An unrecognized year prints a message to standard error and exits with status 1. Too few
 * arguments is reported through {@code argErr()}.
 *
 * @param args the command-line arguments
 * @throws IOException declared by this method; presumably raised while loading the gazetteer or
 *     the PRE dictionary (the helpers are not visible here -- confirm)
 */
public static void init(String[] args) throws IOException {
    if (args.length == 0) {
        argErr();
    }
    JetTest.encoding = "UTF-8";
    year = args[0];
    AceDocument.ace2004 = false;
    AceDocument.ace2005 = false;
    int requiredArgs = 6;
    if (year.equals("2002") || year.equals("2003")) {
        // no year-specific setup
    } else if (year.equals("2004")) {
        requiredArgs = 8;
        // Validate the argument count BEFORE indexing args[6] / args[7]; otherwise a short
        // command line dies with ArrayIndexOutOfBoundsException instead of the usage error.
        // NOTE(review): like the args.length == 0 check above, this relies on argErr() not
        // returning normally -- confirm.
        if (args.length <= requiredArgs) {
            argErr();
        }
        String gazFile = args[6];
        String preDict = args[7];
        gazetteer = new Gazetteer();
        gazetteer.load(gazFile);
        loadPreDict(preDict);
        AceDocument.ace2004 = true;
    } else if (year.equals("2005")) {
        AceDocument.ace2004 = true;
        AceDocument.ace2005 = true;
    } else {
        System.err.println("Invalid year: must be 2002-2005");
        System.exit(1);
    }
    // '<=' (not '<') is kept from the original: at least one argument beyond the required
    // ones must be present.
    if (args.length <= requiredArgs) {
        argErr();
    }
    ACEdir = args[1];
    if (!ACEdir.endsWith("/")) {
        ACEdir += "/";
    }
    outputDir = args[2];
    if (!outputDir.endsWith("/")) {
        outputDir += "/";
    }
    fileList = args[3];
    apfExtension = args[4];
    outputExtension = args[5];
    // everything after the required arguments is treated as a flag
    for (int i = requiredArgs; i < args.length; i++) {
        setFlag(args[i]);
    }
}
/**
 * Adds to {@code doc} each kind of annotation whose name is present in {@code flags}.
 *
 * <p>The flag names are checked in this order: "sentences", "timex", "mentions", "names".
 * Before any annotation is added, the document is tested with {@code Ace.allLowerCase}; when
 * the year is "2004" the result is handed to the gazetteer via {@code setMonocase}.
 *
 * @param doc the document to annotate
 * @param aceDoc the ACE document passed to the timex, mention and name taggers
 */
public static void addAnnotations(Document doc, AceDocument aceDoc) {
    final boolean allLower = Ace.allLowerCase(doc);
    if (year.equals("2004")) {
        gazetteer.setMonocase(allLower);
    }

    if (flags.contains("sentences")) {
        addSentences(doc);
    }
    if (flags.contains("timex")) {
        addTimexTags(doc, aceDoc);
    }
    if (flags.contains("mentions")) {
        addMentionTags(doc, aceDoc);
    }
    if (flags.contains("names")) {
        addENAMEXtags(doc, aceDoc);
    }
}
/** write 'fileText' out as file 'XMLfileName' with ENAMEX tags for the names in the document */ static void addENAMEXtags(Document doc, AceDocument aceDoc) { ArrayList<AceEntity> entities = aceDoc.entities; for (int i = 0; i < entities.size(); i++) { AceEntity entity = entities.get(i); ArrayList<AceEntityName> names = entity.names; for (int j = 0; j < names.size(); j++) { AceEntityName name = names.get(j); Span aceSpan = name.extent; Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1); doc.annotate("ENAMEX", jetSpan, new FeatureSet("TYPE", entity.type)); } // for 2004 we have to examine PRE mentions and decide which are names if (year.equals("2004")) { ArrayList<AceEntityMention> mentions = entity.mentions; for (int j = 0; j < mentions.size(); j++) { AceEntityMention mention = mentions.get(j); String htext = Resolve.normalizeName(mention.headText); String[] mentionName = Gazetteer.splitAtWS(htext); String preClass = preDict.get(htext.toLowerCase()); if (mention.type.equals("PRE")) { if (gazetteer.isNationality(mentionName) || gazetteer.isLocation(mentionName) || "N".equals(preClass)) { Span aceSpan = mention.head; Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1); doc.annotate("ENAMEX", jetSpan, new FeatureSet("TYPE", entity.type)); } else if (preClass != null) { // do nothing } else { System.out.println( "Unclassified PRE: " + mention.text + " {" + mention.headText + ")"); unknownPre.add(htext.toLowerCase()); } } } } } }