/** * @param isSimplifyEntity * @param tokenWithPos * @param senID * @param sentence * @param listEnt * @param listRel * @param listDependencies * @param medtType * @param isResolveOverlappingEntities * @param relToBeConsidered * @param arrClauseBoundOfSen * @return * @throws IOException */ public boolean setInpVectFromDepGraphOfPairsAndTrigger( Relation objRel, Sentence objCurSen, int medtType, boolean discardDepRelUsingProbabilityInReducedGraph, boolean useWalkFeatures, boolean useRegExPatterns, boolean useDepPatterns, boolean useTriggers, boolean triggersFromWholeRGinsteadOfLCP, boolean useNegativeCues, Entity e1, Entity e2) throws IOException { ArrayList<Integer> listFeatIndsOfCurInp = new ArrayList<Integer>(), listFeatCountOfCurInp = new ArrayList<Integer>(); DepTreeNode headOfEnt1 = objCurSen.depTree.getHeadWordFromWordBoundaries( e1.getAllWordIndexes(), true, objCurSen), headOfEnt2 = objCurSen.depTree.getHeadWordFromWordBoundaries( e2.getAllWordIndexes(), true, objCurSen); DepTreeNode dn = null; // System.out.println(e1.id + " " + e2.id); // All nodes in the shortest path connecting the target pairs must be retained // All nodes satisfying the 3 rules of MEDT kernel must be retained dn = objCurSen .depTree .clone() .findMinimalSubTreeWithEntities(false, headOfEnt1, medtType, headOfEnt2); /* * If there is no minimal subtree/path between target entities then we do not consider to generate * instance for training/testing. */ if (dn == null) { FileUtility.writeInFile( GenericFeatVect.vectOutFile, e1.id + " " + e1.name + "\t" + e2.id + " " + e2.name + "\n\n", true); FileUtility.writeInFile(GenericFeatVect.vectOutFile, objCurSen.text + "\n\n", true); return false; } GenericFeatVect.listOfAllInstances.add( e1.id + " " + e1.name + "\t" + e2.id + " " + e2.name + "\n\n" + objCurSen.text + "\n\n"); // Construct feature set using e-walks and v-walks if (dn != null && useWalkFeatures) { createNgramFeatures(objCurSen.depTree, 1, e1, listFeatIndsOfCurInp, listFeatCountOfCurInp); createNgramFeatures(objCurSen.depTree, 1, e2, listFeatIndsOfCurInp, listFeatCountOfCurInp); // System.out.println(objCurSen.senID + " " + dn.wordIndex); objCurSen.depTree.replaceEntitiesWithDummies( dn, e1.boundaries, e2.boundaries, new ArrayList<Integer>()); createFeaturesFromInputGraph( objCurSen.depTree, dn, "", listFeatIndsOfCurInp, listFeatCountOfCurInp, new ArrayList<Integer>(), e1.boundaries, e2.boundaries); } if (useRegExPatterns) { matchPPIpatternOnSentence( objCurSen.text, e1.name, e2.name, listFeatIndsOfCurInp, listFeatCountOfCurInp); matchPPIpatternOnSentence( objCurSen.text, e2.name, e1.name, listFeatIndsOfCurInp, listFeatCountOfCurInp); } ArrayList<String> listOfDepRelsInReducedGraph = new ArrayList<String>(); ArrayList<ArrayList<Integer>> listNodesAndLCP = new PatternsDepRelFromGraph() .extractDepRelsInReducedGraph( objRel, objCurSen, listOfDepRelsInReducedGraph, discardDepRelUsingProbabilityInReducedGraph, false); if (useDepPatterns) extractDepPatternFeatures( listOfDepRelsInReducedGraph, listFeatIndsOfCurInp, listFeatCountOfCurInp); // NOTE: listNodesAndLCP has two elements - (0) all the nodes in Reduced graph, and (1) the // least common parents (LCPs) in Reduced graph. if (listNodesAndLCP == null) listNodesAndLCP = new ArrayList<ArrayList<Integer>>(); if (useTriggers && listNodesAndLCP.size() > 0) { if (triggersFromWholeRGinsteadOfLCP) for (int i = 0; i < listNodesAndLCP.get(0).size(); i++) addTriggerWordFeatures( objCurSen.depGraph.allNodesByWordIndex[listNodesAndLCP.get(0).get(i)].word, listFeatIndsOfCurInp, listFeatCountOfCurInp); else for (int i = 0; i < listNodesAndLCP.get(1).size(); i++) addTriggerWordFeatures( objCurSen.depGraph.allNodesByWordIndex[listNodesAndLCP.get(1).get(i)].word, listFeatIndsOfCurInp, listFeatCountOfCurInp); } // * if (useNegativeCues && listNodesAndLCP.size() > 0) { for (int i = 0; i < listNodesAndLCP.get(0).size(); i++) { // System.out.println(objCurSen.senID + " " + dn.wordIndex); // addNegativeWordFeatures( graph.allNodesByWordIndex[listNodesAndLCP.get(0).get(i)].word, // listFeatIndsOfCurInp, listFeatCountOfCurInp); if (Triggers.listOfNegativeWords.contains( objCurSen.depGraph.allNodesByWordIndex[listNodesAndLCP.get(0).get(i)].lemma)) { // add a feature indicating that there is a negative word in the reduced graph String[] feature = new String[] { "HasNegWord@$" + objCurSen.depGraph.allNodesByWordIndex[listNodesAndLCP.get(0).get(i)].lemma, }; GenericFeatVect.addNewFeatureInList( feature, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, 1); if (objCurSen.depGraph.allNodesByWordIndex[listNodesAndLCP.get(0).get(i) + 1].lemma .equalsIgnoreCase("be") || objCurSen.depGraph.allNodesByWordIndex[listNodesAndLCP.get(0).get(i) + 1].pos .equalsIgnoreCase("IN")) { feature = new String[] { "WordNextToNegCue@$" + objCurSen .depGraph .allNodesByWordIndex[listNodesAndLCP.get(0).get(i) + 2] .lemma, }; } else { feature = new String[] { "WordNextToNegCue@$" + objCurSen .depGraph .allNodesByWordIndex[listNodesAndLCP.get(0).get(i) + 1] .lemma, }; } GenericFeatVect.addNewFeatureInList( feature, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, 1); // * // extract negation scope features if (objCurSen.depGraph.allNodesByWordIndex[listNodesAndLCP.get(0).get(i)].lemma.matches( "(no|not)")) extractNegationScopeFeatures( objCurSen, listFeatIndsOfCurInp, listFeatCountOfCurInp, e1, e2, listNodesAndLCP.get(0).get(i), listNodesAndLCP.get(0)); // */ } } } // */ // add Zhou et al. 2005 features new ExtAceFeatVect() .getZhouEtAl2005FeatVal(objCurSen, e1, e2, listFeatIndsOfCurInp, listFeatCountOfCurInp); extractNonTargetEntityFeatures(objCurSen, listFeatIndsOfCurInp, listFeatCountOfCurInp, e1, e2); GenericFeatVect.sortFeatValByIndx(listFeatIndsOfCurInp, listFeatCountOfCurInp); GenericFeatVect.listOfAllInstancesWithFeat.add(listFeatIndsOfCurInp); GenericFeatVect.listOfAllInstancesWithFeatCount.add(listFeatCountOfCurInp); return true; }
/** * @param isSimplifyEntity * @param parsedFileName * @param aimedDataFileName * @param outputFile * @param medtType * @param isRemoveOverlappingEntities * @throws Exception */ public void generateTPWFvectorOutput( ArrayList<Sentence> listSentence, String outputFile, int medtType, String entPairFileName, ClauseAnalyser.eDataFilterOption relToBeConsidered, String inClauseBoundFileName) throws Exception { boolean useWalkFeatures = true, useRegExPatterns = false, useDepPatterns = true, useTriggers = true, useNegativeCues = true, discardDepRelUsingProbabilityInReducedGraph = false, triggersFromWholeRGinsteadOfLCP = true; if (TextUtility.isEmptyString(TKOutputGenerator.triggerFileName)) useTriggers = false; String str = ""; if (discardDepRelUsingProbabilityInReducedGraph) str += "discardDepRelUsingProbabilityInReducedGraph "; if (useWalkFeatures) str += "WalkFeatures "; if (useRegExPatterns) str += "RegExPatterns "; if (useDepPatterns) str += "DepPatterns "; if (useTriggers) str += "Triggers "; if (triggersFromWholeRGinsteadOfLCP) str += "TriggersFromWholeRGinsteadOfLCP "; if (useNegativeCues) str += "NegativeCues "; System.out.println(str); PatternsDepRelFromGraph clsWVG = new PatternsDepRelFromGraph(); if (PatternsDepRelFromGraph.listOfAllPatterns.size() == 0) { clsWVG.collectAllDepRelPatternsFromTrainData( listSentence, discardDepRelUsingProbabilityInReducedGraph); } int[][] arrClauseBoundOfSen = new TKOutputPST().getClauseBoundOfAllSen(inClauseBoundFileName); FileUtility.writeInFile(outputFile, "", false); // read trigger word list Triggers.readTriggersAndNegativeWord(); for (int s = 0; s < listSentence.size(); s++) { Sentence objCurSen = listSentence.get(s); int senIndex = TKOutputPST.listAllSenIDs.indexOf(objCurSen.senID); // only those sentences are taken into account which has more than one entity annotations if (objCurSen.listOfEntities.size() > 1) { generateVectorForSen( objCurSen, medtType, entPairFileName, discardDepRelUsingProbabilityInReducedGraph, useWalkFeatures, useRegExPatterns, useDepPatterns, useTriggers, triggersFromWholeRGinsteadOfLCP, useNegativeCues, relToBeConsidered, senIndex > 0 ? arrClauseBoundOfSen[senIndex] : null); } } FileUtility.writeInFile(outputFile, GenericFeatVect.getInstanceVectors(), false); }
/** * @param tokenWithPos * @param senID * @param sentence * @param listEnt * @param listRel * @param listDependencies * @param medtType * @param entPairFileName * @return * @throws IOException */ private void generateVectorForSen( Sentence objCurSen, int medtType, String entPairFileName, boolean discardDepRelUsingProbabilityInReducedGraph, boolean useWalkFeatures, boolean useRegExPatterns, boolean useDepPatterns, boolean useTriggers, boolean triggersFromWholeRGinsteadOfLCP, boolean useNegativeCues, ClauseAnalyser.eDataFilterOption relToBeConsidered, int[] arrClauseBoundOfSen) throws IOException { // for each pair of entities, find minimal subtrees and output it with 1 or 0 // 1 represents there exists a relation between those entities for (int r = 0; r < objCurSen.listRels.size(); r++) { Entity e1 = objCurSen.getEntityById(objCurSen.listRels.get(r).arg1); Entity e2 = objCurSen.getEntityById(objCurSen.listRels.get(r).arg2); // checking relation type if (TKOutputPST.skipInstance(arrClauseBoundOfSen, relToBeConsidered, e1, e2, objCurSen, r)) continue; if (!objCurSen.listRels.get(r).isPositive) TKOutputPST.totalRelNeg++; else TKOutputPST.totalRelPos++; boolean isSet = setInpVectFromDepGraphOfPairsAndTrigger( objCurSen.listRels.get(r), objCurSen, medtType, discardDepRelUsingProbabilityInReducedGraph, useWalkFeatures, useRegExPatterns, useDepPatterns, useTriggers, triggersFromWholeRGinsteadOfLCP, useNegativeCues, e1, e2); if (isSet) GenericFeatVect.listOfAllInstancePolarity.add( objCurSen.listRels.get(r).isPositive ? 1 : -1); // */ if (!TextUtility.isEmptyString(entPairFileName)) { if (isSet) FileUtility.writeInFile(entPairFileName, e1.id + "\t" + e2.id + "\n", true); /* //if ( !str.isEmpty() ) if ( !isSet ) FileUtility.writeInFile(entPairFileName, objCurSen.listRels.get(r).printString() + "\tFOUND\n", true); else { FileUtility.writeInFile(entPairFileName, objCurSen.listRels.get(r).printString() + "\tNOT_FOUND\n", true); } */ } } }