/** * @param dt * @param eBoundaries * @param listFeatIndsOfCurInp * @param listFeatCountOfCurInp * @throws IOException */ private void createNgramFeatures( DependencyTree dt, int weight, Entity ent, ArrayList<Integer> listFeatIndsOfCurInp, ArrayList<Integer> listFeatCountOfCurInp) throws IOException { String[] feature = new String[0]; // collecting the unigram within a window of {-x, +x} int x = ent.getStartWordIndex(); for (int i = x - 1; i >= x - 2 && i >= 0; i--) { feature = new String[] {dt.allNodesByWordIndex[i].lemma + "$" + (i - x)}; GenericFeatVect.addNewFeatureInList( feature, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, weight); } x = ent.getEndWordIndex(); for (int i = x + 1; i <= x + 2 && i < dt.allNodesByWordIndex.length; i++) { feature = new String[] {dt.allNodesByWordIndex[i].lemma + "$" + (x - i)}; GenericFeatVect.addNewFeatureInList( feature, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, weight); } }
/** * @param curSen * @param listFeatIndsOfCurInp * @param listFeatCountOfCurInp * @param e1 * @param e2 */ private void extractNonTargetEntityFeatures( Sentence curSen, ArrayList<Integer> listFeatIndsOfCurInp, ArrayList<Integer> listFeatCountOfCurInp, Entity e1, Entity e2) { for (int e = 0; e < curSen.listOfEntities.size(); e++) { Entity entOther = curSen.listOfEntities.get(e); if (entOther.getNEcategory().equalsIgnoreCase("DISO")) { GenericFeatVect.addNewFeatureInList( new String[] {"DISOinsideSentence"}, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, 1); // System.out.println(entOther.id); DepTreeNode headOfEnt = curSen.depTree.getHeadWordFromWordBoundaries( entOther.getAllWordIndexes(), true, curSen); int hwiOfEntOther = headOfEnt.wordIndex; int wiOfImmediateGovernor = getNonConjGovernorIndex(curSen, hwiOfEntOther); if (wiOfImmediateGovernor < 0) continue; int wiOfNearestVerbGovernor = getNearestVerbGovernor(wiOfImmediateGovernor, curSen); ArrayList<Integer> listOfWIofEntities = new ArrayList<Integer>(); listOfWIofEntities.addAll(e1.getAllWordIndexes()); listOfWIofEntities.addAll(e2.getAllWordIndexes()); // detect whether drugs are dependent of the verb governor if (wiOfNearestVerbGovernor > -1 && curSen.depTree.allNodesByWordIndex[wiOfNearestVerbGovernor].governAllWIsInList( listOfWIofEntities)) { /* GenericFeatVect.addNewFeatureInList( new String[]{"bothEntDependOnVerbGovernorOfTheDISO"}, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, 1); //*/ GenericFeatVect.addNewFeatureInList( new String[] { "nearestVerbGovernorOfTheDISO=" + curSen.arrLemmasByParser[wiOfNearestVerbGovernor].toLowerCase() }, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, 1); // */ } if (wiOfImmediateGovernor > -1 && curSen.depTree.allNodesByWordIndex[wiOfImmediateGovernor].governAllWIsInList( listOfWIofEntities)) { /* GenericFeatVect.addNewFeatureInList( new String[]{"bothEntDependOnImmediateGovernorOfTheDISO"}, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, 1); //*/ GenericFeatVect.addNewFeatureInList( new String[] { "immediateGovernorOfTheDISO=" + curSen.arrLemmasByParser[wiOfImmediateGovernor].toLowerCase() }, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, 1); // */ } // * if (wiOfImmediateGovernor == wiOfNearestVerbGovernor) GenericFeatVect.addNewFeatureInList( new String[] {"immediateGovernorIsVerbGovernorOfTheDISO"}, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, 1); // */ } } }
/** * @param curSen * @param listFeatIndsOfCurInp * @param listFeatCountOfCurInp * @param e1 * @param e2 * @param negWI * @param listOfReducedGraph */ private void extractNegationScopeFeatures( Sentence curSen, ArrayList<Integer> listFeatIndsOfCurInp, ArrayList<Integer> listFeatCountOfCurInp, Entity e1, Entity e2, int negWI, ArrayList<Integer> listOfReducedGraph) { int wiOfImmediateGovernor = getNonConjGovernorIndex(curSen, negWI); if (!listOfReducedGraph.contains(wiOfImmediateGovernor) || wiOfImmediateGovernor < 0) return; int wiOfNearestVerbGovernor = getNearestVerbGovernor(wiOfImmediateGovernor, curSen); // int dist = 0; // * if (!listOfReducedGraph.contains(wiOfNearestVerbGovernor) // || // dist > 3 ) wiOfNearestVerbGovernor = -1; /* if ( wiOfNearestVerbGovernor > -1 && curSen.depTree.allNodesByWordIndex[wiOfNearestVerbGovernor].getParentsWordIndexes().isEmpty() ) GenericFeatVect.addNewFeatureInList( new String[]{"verbGovernorIsRoot"}, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, 1); */ ArrayList<Integer> listOfWIofEntities = new ArrayList<Integer>(); listOfWIofEntities.addAll(e1.getAllWordIndexes()); listOfWIofEntities.addAll(e2.getAllWordIndexes()); /* // detect whether drugs are dependent of the verb governor if ( wiOfNearestVerbGovernor > -1 && curSen.depTree.allNodesByWordIndex[wiOfNearestVerbGovernor].governAllWIsInList(listOfWIofEntities) ) GenericFeatVect.addNewFeatureInList( new String[]{"bothEntDependOnVerbGovernor"}, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, 1); */ if (wiOfImmediateGovernor > -1 && curSen.depTree.allNodesByWordIndex[wiOfImmediateGovernor].governAllWIsInList( listOfWIofEntities)) GenericFeatVect.addNewFeatureInList( new String[] {"bothEntDependOnImmediateGovernor"}, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, 1); if (wiOfImmediateGovernor == wiOfNearestVerbGovernor) GenericFeatVect.addNewFeatureInList( new String[] {"immediateGovernorIsVerbGovernor"}, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, 1); /* GenericFeatVect.addNewFeatureInList( new String[]{"immediateGovernor=" + curSen.arrLemmasByParser[wiOfImmediateGovernor].toLowerCase()}, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, 1); //*/ if (wiOfNearestVerbGovernor > -1) GenericFeatVect.addNewFeatureInList( new String[] { "nearestVerbGovernor=" + curSen.arrLemmasByParser[wiOfNearestVerbGovernor].toLowerCase() }, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, 1); // */ }
/** * @param isSimplifyEntity * @param tokenWithPos * @param senID * @param sentence * @param listEnt * @param listRel * @param listDependencies * @param medtType * @param isResolveOverlappingEntities * @param relToBeConsidered * @param arrClauseBoundOfSen * @return * @throws IOException */ public boolean setInpVectFromDepGraphOfPairsAndTrigger( Relation objRel, Sentence objCurSen, int medtType, boolean discardDepRelUsingProbabilityInReducedGraph, boolean useWalkFeatures, boolean useRegExPatterns, boolean useDepPatterns, boolean useTriggers, boolean triggersFromWholeRGinsteadOfLCP, boolean useNegativeCues, Entity e1, Entity e2) throws IOException { ArrayList<Integer> listFeatIndsOfCurInp = new ArrayList<Integer>(), listFeatCountOfCurInp = new ArrayList<Integer>(); DepTreeNode headOfEnt1 = objCurSen.depTree.getHeadWordFromWordBoundaries( e1.getAllWordIndexes(), true, objCurSen), headOfEnt2 = objCurSen.depTree.getHeadWordFromWordBoundaries( e2.getAllWordIndexes(), true, objCurSen); DepTreeNode dn = null; // System.out.println(e1.id + " " + e2.id); // All nodes in the shortest path connecting the target pairs must be retained // All nodes satisfying the 3 rules of MEDT kernel must be retained dn = objCurSen .depTree .clone() .findMinimalSubTreeWithEntities(false, headOfEnt1, medtType, headOfEnt2); /* * If there is no minimal subtree/path between target entities then we do not consider to generate * instance for training/testing. */ if (dn == null) { FileUtility.writeInFile( GenericFeatVect.vectOutFile, e1.id + " " + e1.name + "\t" + e2.id + " " + e2.name + "\n\n", true); FileUtility.writeInFile(GenericFeatVect.vectOutFile, objCurSen.text + "\n\n", true); return false; } GenericFeatVect.listOfAllInstances.add( e1.id + " " + e1.name + "\t" + e2.id + " " + e2.name + "\n\n" + objCurSen.text + "\n\n"); // Construct feature set using e-walks and v-walks if (dn != null && useWalkFeatures) { createNgramFeatures(objCurSen.depTree, 1, e1, listFeatIndsOfCurInp, listFeatCountOfCurInp); createNgramFeatures(objCurSen.depTree, 1, e2, listFeatIndsOfCurInp, listFeatCountOfCurInp); // System.out.println(objCurSen.senID + " " + dn.wordIndex); objCurSen.depTree.replaceEntitiesWithDummies( dn, e1.boundaries, e2.boundaries, new ArrayList<Integer>()); createFeaturesFromInputGraph( objCurSen.depTree, dn, "", listFeatIndsOfCurInp, listFeatCountOfCurInp, new ArrayList<Integer>(), e1.boundaries, e2.boundaries); } if (useRegExPatterns) { matchPPIpatternOnSentence( objCurSen.text, e1.name, e2.name, listFeatIndsOfCurInp, listFeatCountOfCurInp); matchPPIpatternOnSentence( objCurSen.text, e2.name, e1.name, listFeatIndsOfCurInp, listFeatCountOfCurInp); } ArrayList<String> listOfDepRelsInReducedGraph = new ArrayList<String>(); ArrayList<ArrayList<Integer>> listNodesAndLCP = new PatternsDepRelFromGraph() .extractDepRelsInReducedGraph( objRel, objCurSen, listOfDepRelsInReducedGraph, discardDepRelUsingProbabilityInReducedGraph, false); if (useDepPatterns) extractDepPatternFeatures( listOfDepRelsInReducedGraph, listFeatIndsOfCurInp, listFeatCountOfCurInp); // NOTE: listNodesAndLCP has two elements - (0) all the nodes in Reduced graph, and (1) the // least common parents (LCPs) in Reduced graph. if (listNodesAndLCP == null) listNodesAndLCP = new ArrayList<ArrayList<Integer>>(); if (useTriggers && listNodesAndLCP.size() > 0) { if (triggersFromWholeRGinsteadOfLCP) for (int i = 0; i < listNodesAndLCP.get(0).size(); i++) addTriggerWordFeatures( objCurSen.depGraph.allNodesByWordIndex[listNodesAndLCP.get(0).get(i)].word, listFeatIndsOfCurInp, listFeatCountOfCurInp); else for (int i = 0; i < listNodesAndLCP.get(1).size(); i++) addTriggerWordFeatures( objCurSen.depGraph.allNodesByWordIndex[listNodesAndLCP.get(1).get(i)].word, listFeatIndsOfCurInp, listFeatCountOfCurInp); } // * if (useNegativeCues && listNodesAndLCP.size() > 0) { for (int i = 0; i < listNodesAndLCP.get(0).size(); i++) { // System.out.println(objCurSen.senID + " " + dn.wordIndex); // addNegativeWordFeatures( graph.allNodesByWordIndex[listNodesAndLCP.get(0).get(i)].word, // listFeatIndsOfCurInp, listFeatCountOfCurInp); if (Triggers.listOfNegativeWords.contains( objCurSen.depGraph.allNodesByWordIndex[listNodesAndLCP.get(0).get(i)].lemma)) { // add a feature indicating that there is a negative word in the reduced graph String[] feature = new String[] { "HasNegWord@$" + objCurSen.depGraph.allNodesByWordIndex[listNodesAndLCP.get(0).get(i)].lemma, }; GenericFeatVect.addNewFeatureInList( feature, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, 1); if (objCurSen.depGraph.allNodesByWordIndex[listNodesAndLCP.get(0).get(i) + 1].lemma .equalsIgnoreCase("be") || objCurSen.depGraph.allNodesByWordIndex[listNodesAndLCP.get(0).get(i) + 1].pos .equalsIgnoreCase("IN")) { feature = new String[] { "WordNextToNegCue@$" + objCurSen .depGraph .allNodesByWordIndex[listNodesAndLCP.get(0).get(i) + 2] .lemma, }; } else { feature = new String[] { "WordNextToNegCue@$" + objCurSen .depGraph .allNodesByWordIndex[listNodesAndLCP.get(0).get(i) + 1] .lemma, }; } GenericFeatVect.addNewFeatureInList( feature, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, 1); // * // extract negation scope features if (objCurSen.depGraph.allNodesByWordIndex[listNodesAndLCP.get(0).get(i)].lemma.matches( "(no|not)")) extractNegationScopeFeatures( objCurSen, listFeatIndsOfCurInp, listFeatCountOfCurInp, e1, e2, listNodesAndLCP.get(0).get(i), listNodesAndLCP.get(0)); // */ } } } // */ // add Zhou et al. 2005 features new ExtAceFeatVect() .getZhouEtAl2005FeatVal(objCurSen, e1, e2, listFeatIndsOfCurInp, listFeatCountOfCurInp); extractNonTargetEntityFeatures(objCurSen, listFeatIndsOfCurInp, listFeatCountOfCurInp, e1, e2); GenericFeatVect.sortFeatValByIndx(listFeatIndsOfCurInp, listFeatCountOfCurInp); GenericFeatVect.listOfAllInstancesWithFeat.add(listFeatIndsOfCurInp); GenericFeatVect.listOfAllInstancesWithFeatCount.add(listFeatCountOfCurInp); return true; }