/**
 * Converts a list of APF files into XML files carrying in-line markup for sentences, timex,
 * mentions, entity types, and names. The command line supplies the following arguments:
 *
 * <ul>
 *   <li>year: the year (2002, 2003, 2004, 2005) in which the APF file was created; this
 *       determines its format
 *   <li>inDir: the directory containing the text and APF files
 *   <li>outDir: the directory containing the NE files
 *   <li>filelist: a file containing a list of document names
 *   <li>apfExtension: the suffix appended to the doc name to form the APF file name
 *   <li>outExtension: the suffix appended to the doc name to form the output file name
 *   <li>gazetteer: (2004 files only) a Jet gazetteer, used to resolve the name/noun distinction
 *       for GPE words categorized as PRE
 *   <li>PREdict: (2004 files only) a name vs. noun list, used to resolve the name/noun
 *       distinction for non-GPE words categorized as PRE
 *   <li>flag ... : one or more of 'sentences', 'timex', 'mentions', 'extents', 'types' or
 *       'names', selecting the information to be included in the output file
 * </ul>
 *
 * <p>After the file list has been processed, a run for year 2004 also prints the number of
 * entries collected in {@code unknownPre}, followed by each entry on its own line.
 *
 * @param args the command-line arguments described above; handed unchanged to {@code init}
 * @throws IOException declared for the I/O performed by the calls below
 */
public static void main(String[] args) throws IOException {
  init(args);
  processFileList(fileList);

  // The PRE name/noun report only exists for the 2004 format; nothing more to do otherwise.
  if (!year.equals("2004")) {
    return;
  }

  System.out.println("\nUnclassified items: " + unknownPre.size());
  for (String unclassified : unknownPre) {
    System.out.println(unclassified);
  }
}
/** write 'fileText' out as file 'XMLfileName' with ENAMEX tags for the names in the document */ static void addENAMEXtags(Document doc, AceDocument aceDoc) { ArrayList<AceEntity> entities = aceDoc.entities; for (int i = 0; i < entities.size(); i++) { AceEntity entity = entities.get(i); ArrayList<AceEntityName> names = entity.names; for (int j = 0; j < names.size(); j++) { AceEntityName name = names.get(j); Span aceSpan = name.extent; Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1); doc.annotate("ENAMEX", jetSpan, new FeatureSet("TYPE", entity.type)); } // for 2004 we have to examine PRE mentions and decide which are names if (year.equals("2004")) { ArrayList<AceEntityMention> mentions = entity.mentions; for (int j = 0; j < mentions.size(); j++) { AceEntityMention mention = mentions.get(j); String htext = Resolve.normalizeName(mention.headText); String[] mentionName = Gazetteer.splitAtWS(htext); String preClass = preDict.get(htext.toLowerCase()); if (mention.type.equals("PRE")) { if (gazetteer.isNationality(mentionName) || gazetteer.isLocation(mentionName) || "N".equals(preClass)) { Span aceSpan = mention.head; Span jetSpan = new Span(aceSpan.start(), aceSpan.end() + 1); doc.annotate("ENAMEX", jetSpan, new FeatureSet("TYPE", entity.type)); } else if (preClass != null) { // do nothing } else { System.out.println( "Unclassified PRE: " + mention.text + " {" + mention.headText + ")"); unknownPre.add(htext.toLowerCase()); } } } } } }