private void addCoref(Communication comm) { AnalyticUUIDGeneratorFactory f = new AnalyticUUIDGeneratorFactory(comm); AnalyticUUIDGenerator g = f.create(); corefMentions = new EntityMentionSet(); corefMentions.setUuid(g.next()); corefMentions.setMetadata(Conll2011.META_COREF); corefMentions.setMentionList(new ArrayList<>()); Map<String, List<EntityMention>> clusters = new HashMap<>(); int addedMentions = 0; for (Conll2011Sentence s : sentences) { // Get the entity mentions in this sentence Map<String, List<EntityMention>> c = s.getCoref(g); // Add these mentions to the EntityMentionSet for (List<EntityMention> ems : c.values()) { for (EntityMention em : ems) { corefMentions.addToMentionList(em); addedMentions++; } } // Merge into the document-level view of the entities for (Map.Entry<String, List<EntityMention>> se : c.entrySet()) { String clustId = se.getKey(); List<EntityMention> existingMentions = clusters.get(clustId); if (existingMentions == null) { existingMentions = new ArrayList<>(); clusters.put(clustId, existingMentions); } existingMentions.addAll(se.getValue()); } } int addedEntities = 0; corefClusters = new EntitySet(); corefClusters.setUuid(g.next()); corefClusters.setMetadata(Conll2011.META_COREF); corefClusters.setMentionSetId(corefMentions.getUuid()); corefClusters.setEntityList(new ArrayList<>()); for (Map.Entry<String, List<EntityMention>> cluster : clusters.entrySet()) { addedEntities++; Entity ent = new Entity(); ent.setUuid(g.next()); ent.setConfidence(1); for (EntityMention em : cluster.getValue()) ent.addToMentionIdList(em.getUuid()); corefClusters.addToEntityList(ent); } comm.addToEntitySetList(corefClusters); comm.addToEntityMentionSetList(corefMentions); if (conll2011.warnOnEmptyCoref && (addedMentions == 0 || addedEntities == 0)) { LOGGER.warn( "addedMentions=" + addedMentions + " addedEntities=" + addedEntities + " communication=" + comm.getId()); } }
/**
 * Expects a {@link RebarException} when a document that already carries a section segmentation is
 * pushed through the ingester.
 *
 * <p>NOTE(review): the test passes if any statement in the body throws {@code RebarException};
 * presumably the {@code ci.ingest} call is the one meant to reject the annotated document, which
 * would make the final existence check unreachable on the passing path. Confirm.
 */
@Test(expected = RebarException.class)
public void exWithAnnotations()
    throws RebarException,
        TableNotFoundException,
        MutationsRejectedException,
        InvalidInputException {
  assertFalse(cr.exists("bar"));

  Communication annotatedDoc = generateMockDocument();
  annotatedDoc.addToSectionSegmentations(new SingleSectionSegmenter().annotateDiff(annotatedDoc));

  ci.ingest(annotatedDoc);
  assertTrue(cr.exists(annotatedDoc));
}
public Communication convertToConcrete() { if (comm != null) return comm; AnalyticUUIDGeneratorFactory f = new AnalyticUUIDGeneratorFactory(); AnalyticUUIDGenerator g = f.create(); comm = new Communication(); comm.setId(id); comm.setUuid(g.next()); comm.setType(communicationType); comm.setMetadata(Conll2011.META_GENERAL); // Tokenization for the words // TokenTagging for the POS tags // Parse for the constituency parse // TokenTagging for NER labels String sectionNum = null; Section section = null; for (Conll2011Sentence sent : sentences) { if (sectionNum == null || !sent.getPart().equals(sectionNum)) { if (section != null) comm.addToSectionList(section); section = new Section(); section.setUuid(g.next()); section.setKind(Conll2011.SECTION_TYPE); sectionNum = sent.getPart(); } section.addToSentenceList(sent.convertToConcrete(g)); } assert section != null; comm.addToSectionList(section); // SituationMentionSet for the SRL labels propBankSrlSituationMentions = new SituationMentionSet(); propBankSrlSituationMentions.setUuid(g.next()); propBankSrlSituationMentions.setMetadata(Conll2011.META_SRL); propBankSrlSituationMentions.setMentionList(new ArrayList<>()); for (Conll2011Sentence s : sentences) { for (int pai = 0; pai < s.getNumPredicates(); pai++) { SituationMention sm = s.getPredArg(pai, g); assert sm.getTokens() != null || sm.getConstituent() != null; propBankSrlSituationMentions.addToMentionList(sm); } } comm.addToSituationMentionSetList(propBankSrlSituationMentions); // EntitySet and EntityMentionSet for the coref labels addCoref(comm); // EntityMentionSet for the NER labels if (this.conll2011.addNerAsEntityMentionSet) { nerEms = new EntityMentionSet(); nerEms.setUuid(g.next()); nerEms.setMetadata(Conll2011.META_NER); nerEms.setMentionList(new ArrayList<>()); for (Conll2011Sentence s : sentences) for (EntityMention em : s.getNerEntityMentions()) nerEms.addToMentionList(em); comm.addToEntityMentionSetList(nerEms); } return comm; }
/**
 * Test method for {@link
 * edu.jhu.hlt.rebar.accumulo.CleanIngester#isDocumentIngested(edu.jhu.hlt.concrete.Communication)}.
 *
 * <p>Checks that a document is reported as absent before ingest and present afterwards, that an
 * unrelated id ({@code "bar"}) stays absent, and that re-ingesting the first document alongside a
 * second mock document with a different start time leaves both reported as present.
 *
 * @throws RebarException
 * @throws TableNotFoundException
 * @throws MutationsRejectedException
 */
@Test
public void testIsDocumentIngested()
    throws RebarException, TableNotFoundException, MutationsRejectedException {
  assertFalse(cr.exists("bar"));

  Communication first = generateMockDocument();
  Communication second = generateMockDocument();
  // Give the two mock documents distinct start times.
  first.startTime = 39595830;
  second.startTime = 395958301;

  // Nothing has been ingested yet.
  assertFalse(cr.exists(first));

  ci.ingest(first);
  assertTrue(cr.exists(first));
  assertFalse(cr.exists("bar"));

  // Ingest the first document again, then the second one.
  ci.ingest(first);
  ci.ingest(second);
  assertTrue(cr.exists(first));
  assertTrue(cr.exists(second));
}
/**
 * Builds the union of two communications section by section.
 *
 * <p>The union begins as a copy of {@code left} (made with the {@link Communication} copy
 * constructor), so it keeps left's id, metadata and anything else that cannot be unioned, such as
 * sections and sentences. Each section of the copy is then replaced by the result of merging the
 * corresponding sections of {@code left} and {@code right}, which brings in right's annotations.
 *
 * @param left the communication the union is based on
 * @param right the communication whose annotations are added; must have as many sections as
 *     {@code left}
 * @throws IllegalArgumentException if the section counts differ, or if merging a pair of sections
 *     fails (the original exception is kept as the cause and the section index is reported)
 */
public MergeTokenAlignedCommunications(Communication left, Communication right) {
  this.union = new Communication(left);

  int leftSections = left.getSectionListSize();
  int rightSections = right.getSectionListSize();
  if (leftSections != rightSections) {
    throw new IllegalArgumentException(
        "left has " + leftSections + " sections but right has " + rightSections);
  }

  for (int idx = 0; idx < leftSections; idx++) {
    Section leftSection = left.getSectionList().get(idx);
    Section rightSection = right.getSectionList().get(idx);

    Section mergedSection;
    try {
      mergedSection = new MergedSection(leftSection, rightSection).getUnion();
    } catch (IllegalArgumentException e) {
      // Add the section index so the caller can tell which pair failed to align.
      throw new IllegalArgumentException("in section " + idx, e);
    }
    union.getSectionList().set(idx, mergedSection);
  }
}