@Override public void process(JCas jCas) throws AnalysisEngineProcessException { try { String apfUri = jCas.getView(Ace2005Constants.ACE_2005_APF_URI_VIEW).getSofaDataURI(); JCas initialView = jCas.getView(CAS.NAME_DEFAULT_SOFA); String documentText = initialView.getDocumentText(); SAXBuilder builder = new SAXBuilder(); builder.setDTDHandler(null); URI sofaDataURI = new URI(apfUri); Document doc = builder.build(new File(sofaDataURI)); Element apfSource = doc.getRootElement(); Element apfDocument = apfSource.getChild("document"); for (Element apfEntity : apfDocument.getChildren("entity")) { NamedEntity namedEntity = new NamedEntity(initialView); namedEntity.setEntityType(apfEntity.getAttributeValue("TYPE")); namedEntity.setEntitySubtype(apfEntity.getAttributeValue("SUBTYPE")); namedEntity.setEntityClass(apfEntity.getAttributeValue("CLASS")); namedEntity.setEntityId(apfEntity.getAttributeValue("ID")); namedEntity.addToIndexes(); List<NamedEntityMention> mentions = new ArrayList<NamedEntityMention>(); for (Element entityMention : apfEntity.getChildren("entity_mention")) { int start = Integer.parseInt( entityMention.getChild("extent").getChild("charseq").getAttributeValue("START")); int end = Integer.parseInt( entityMention.getChild("extent").getChild("charseq").getAttributeValue("END")); String givenText = entityMention.getChild("extent").getChild("charseq").getText(); String parsedText = documentText.substring(start, end + 1); Matcher ampMatcher = ampPattern.matcher(parsedText); parsedText = ampMatcher.replaceAll("&"); NamedEntityMention mention = new NamedEntityMention(initialView, start, end + 1); mention.setMentionId(entityMention.getAttributeValue("ID")); mention.setMentionType(entityMention.getAttributeValue("TYPE")); mention.setMentionedEntity(namedEntity); Chunk chunk = new Chunk(initialView, start, end + 1); mention.setAnnotation(chunk); int headStart = Integer.parseInt( entityMention.getChild("head").getChild("charseq").getAttributeValue("START")); int headEnd = Integer.parseInt( entityMention.getChild("head").getChild("charseq").getAttributeValue("END")); Chunk head = new Chunk(initialView, headStart, headEnd + 1); mention.setHead(head); mention.addToIndexes(); mentions.add(mention); givenText = givenText.replaceAll("\\s+", " "); parsedText = givenText.replaceAll("\\s+", " "); // if(!givenText.equals(parsedText)) // { // // System.out.println("given text and parsed text differ."); // System.out.println(givenText); // System.out.println(parsedText); // System.out.println(apfDocument.getAttributeValue("DOCID")); // System.out.println(apfEntity.getAttributeValue("ID")); // System.out.println(documentText); // } } namedEntity.setMentions(new FSArray(jCas, mentions.size())); FSCollectionFactory.fillArrayFS(namedEntity.getMentions(), mentions); } } catch (CASException ce) { throw new AnalysisEngineProcessException(ce); } catch (IOException ioe) { throw new AnalysisEngineProcessException(ioe); } catch (JDOMException je) { throw new AnalysisEngineProcessException(je); } catch (URISyntaxException use) { throw new AnalysisEngineProcessException(use); } }