/**
 * Generates the TPWF feature vector output for all sentences and writes it to a file.
 *
 * @param listSentence the parsed input sentences
 * @param outputFile file the instance feature vectors are written to
 * @param medtType MEDT kernel type used when extracting the minimal subtree
 * @param entPairFileName optional file logging the entity pairs that produced an instance;
 *     pass an empty string to disable
 * @param relToBeConsidered filter deciding which relation instances are considered
 * @param inClauseBoundFileName file containing the clause boundaries of every sentence
 * @throws Exception
 */
public void generateTPWFvectorOutput(
    ArrayList<Sentence> listSentence,
    String outputFile,
    int medtType,
    String entPairFileName,
    ClauseAnalyser.eDataFilterOption relToBeConsidered,
    String inClauseBoundFileName)
    throws Exception {

  boolean useWalkFeatures = true,
      useRegExPatterns = false,
      useDepPatterns = true,
      useTriggers = true,
      useNegativeCues = true,
      discardDepRelUsingProbabilityInReducedGraph = false,
      triggersFromWholeRGinsteadOfLCP = true;

  // Trigger features require a configured trigger file.
  if (TextUtility.isEmptyString(TKOutputGenerator.triggerFileName)) useTriggers = false;

  // Log which feature groups are active for this run.
  String str = "";
  if (discardDepRelUsingProbabilityInReducedGraph)
    str += "discardDepRelUsingProbabilityInReducedGraph ";
  if (useWalkFeatures) str += "WalkFeatures ";
  if (useRegExPatterns) str += "RegExPatterns ";
  if (useDepPatterns) str += "DepPatterns ";
  if (useTriggers) str += "Triggers ";
  if (triggersFromWholeRGinsteadOfLCP) str += "TriggersFromWholeRGinsteadOfLCP ";
  if (useNegativeCues) str += "NegativeCues ";
  System.out.println(str);

  // Collect the dependency relation patterns from the training data only once.
  PatternsDepRelFromGraph clsWVG = new PatternsDepRelFromGraph();
  if (PatternsDepRelFromGraph.listOfAllPatterns.size() == 0) {
    clsWVG.collectAllDepRelPatternsFromTrainData(
        listSentence, discardDepRelUsingProbabilityInReducedGraph);
  }

  int[][] arrClauseBoundOfSen = new TKOutputPST().getClauseBoundOfAllSen(inClauseBoundFileName);

  // Truncate the output file before the instance vectors are appended.
  FileUtility.writeInFile(outputFile, "", false);

  // Read the trigger word and negative cue lists.
  Triggers.readTriggersAndNegativeWord();

  for (int s = 0; s < listSentence.size(); s++) {
    Sentence objCurSen = listSentence.get(s);
    int senIndex = TKOutputPST.listAllSenIDs.indexOf(objCurSen.senID);

    // Only sentences with more than one entity annotation can contain a relation.
    if (objCurSen.listOfEntities.size() > 1) {
      generateVectorForSen(
          objCurSen,
          medtType,
          entPairFileName,
          discardDepRelUsingProbabilityInReducedGraph,
          useWalkFeatures,
          useRegExPatterns,
          useDepPatterns,
          useTriggers,
          triggersFromWholeRGinsteadOfLCP,
          useNegativeCues,
          relToBeConsidered,
          // indexOf() returns -1 for an unknown id; ">= 0" (rather than the original
          // "> 0", a likely off-by-one) also passes the bounds of the first sentence.
          senIndex >= 0 ? arrClauseBoundOfSen[senIndex] : null);
    }
  }

  FileUtility.writeInFile(outputFile, GenericFeatVect.getInstanceVectors(), false);
}
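/*
 * A minimal usage sketch, called from within the enclosing class (hedged: the file
 * names, the kernel type value and the filter constant below are placeholders, not
 * values taken from this code base):
 *
 *   ArrayList<Sentence> listSentence = ...; // parsed training/testing sentences
 *   generateTPWFvectorOutput(
 *       listSentence,
 *       "out/featVectors.txt",                // instance vector output file
 *       1,                                    // MEDT kernel type (placeholder value)
 *       "out/entityPairs.txt",                // pair log; "" disables it
 *       ClauseAnalyser.eDataFilterOption.ALL, // hypothetical enum constant
 *       "in/clauseBounds.txt");               // clause boundaries per sentence
 */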
/**
 * Generates one feature vector per annotated relation (entity pair) of a sentence.
 *
 * @param objCurSen the current sentence
 * @param medtType MEDT kernel type used when extracting the minimal subtree
 * @param entPairFileName optional file logging the entity pairs that produced an instance
 * @param discardDepRelUsingProbabilityInReducedGraph whether low-probability dependency
 *     relations are discarded from the reduced graph
 * @param useWalkFeatures whether e-walk and v-walk features are generated
 * @param useRegExPatterns whether regular-expression PPI pattern features are generated
 * @param useDepPatterns whether dependency relation pattern features are generated
 * @param useTriggers whether trigger word features are generated
 * @param triggersFromWholeRGinsteadOfLCP whether triggers are collected from the whole
 *     reduced graph instead of only its least common parents
 * @param useNegativeCues whether negative cue features are generated
 * @param relToBeConsidered filter deciding which relation instances are considered
 * @param arrClauseBoundOfSen clause boundaries of the sentence, or null if unavailable
 * @throws IOException
 */
private void generateVectorForSen(
    Sentence objCurSen,
    int medtType,
    String entPairFileName,
    boolean discardDepRelUsingProbabilityInReducedGraph,
    boolean useWalkFeatures,
    boolean useRegExPatterns,
    boolean useDepPatterns,
    boolean useTriggers,
    boolean triggersFromWholeRGinsteadOfLCP,
    boolean useNegativeCues,
    ClauseAnalyser.eDataFilterOption relToBeConsidered,
    int[] arrClauseBoundOfSen)
    throws IOException {

  // For each pair of entities, find the minimal subtree and output it with polarity
  // 1 or -1, where 1 indicates that a relation holds between the entities.
  for (int r = 0; r < objCurSen.listRels.size(); r++) {
    Entity e1 = objCurSen.getEntityById(objCurSen.listRels.get(r).arg1);
    Entity e2 = objCurSen.getEntityById(objCurSen.listRels.get(r).arg2);

    // Skip instances whose relation type is filtered out.
    if (TKOutputPST.skipInstance(arrClauseBoundOfSen, relToBeConsidered, e1, e2, objCurSen, r))
      continue;

    if (!objCurSen.listRels.get(r).isPositive) TKOutputPST.totalRelNeg++;
    else TKOutputPST.totalRelPos++;

    boolean isSet =
        setInpVectFromDepGraphOfPairsAndTrigger(
            objCurSen.listRels.get(r),
            objCurSen,
            medtType,
            discardDepRelUsingProbabilityInReducedGraph,
            useWalkFeatures,
            useRegExPatterns,
            useDepPatterns,
            useTriggers,
            triggersFromWholeRGinsteadOfLCP,
            useNegativeCues,
            e1,
            e2);

    if (isSet)
      GenericFeatVect.listOfAllInstancePolarity.add(
          objCurSen.listRels.get(r).isPositive ? 1 : -1);

    // Log the pair if an instance was generated and a pair log file is configured.
    if (!TextUtility.isEmptyString(entPairFileName)) {
      if (isSet) FileUtility.writeInFile(entPairFileName, e1.id + "\t" + e2.id + "\n", true);
    }
  }
}
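/*
 * Bookkeeping illustration for the loop above (sentence and ids hypothetical): a
 * sentence with the relations (e1, e2, positive) and (e1, e3, negative) that both
 * yield a minimal subtree contributes the polarities [1, -1] to
 * GenericFeatVect.listOfAllInstancePolarity and, when entPairFileName is set, the
 * two tab-separated lines "e1\te2" and "e1\te3" to the pair log.
 */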
/**
 * Builds the input feature vector for an entity pair from the dependency graph, trigger
 * words and negative cues.
 *
 * @param objRel the relation (entity pair) the vector is generated for
 * @param objCurSen the current sentence
 * @param medtType MEDT kernel type used when extracting the minimal subtree
 * @param discardDepRelUsingProbabilityInReducedGraph whether low-probability dependency
 *     relations are discarded from the reduced graph
 * @param useWalkFeatures whether e-walk and v-walk features are generated
 * @param useRegExPatterns whether regular-expression PPI pattern features are generated
 * @param useDepPatterns whether dependency relation pattern features are generated
 * @param useTriggers whether trigger word features are generated
 * @param triggersFromWholeRGinsteadOfLCP whether triggers are collected from the whole
 *     reduced graph instead of only its least common parents
 * @param useNegativeCues whether negative cue features are generated
 * @param e1 first entity of the pair
 * @param e2 second entity of the pair
 * @return true if a feature vector was generated, false if there is no minimal
 *     subtree/path between the target entities
 * @throws IOException
 */
public boolean setInpVectFromDepGraphOfPairsAndTrigger(
    Relation objRel,
    Sentence objCurSen,
    int medtType,
    boolean discardDepRelUsingProbabilityInReducedGraph,
    boolean useWalkFeatures,
    boolean useRegExPatterns,
    boolean useDepPatterns,
    boolean useTriggers,
    boolean triggersFromWholeRGinsteadOfLCP,
    boolean useNegativeCues,
    Entity e1,
    Entity e2)
    throws IOException {

  ArrayList<Integer> listFeatIndsOfCurInp = new ArrayList<Integer>(),
      listFeatCountOfCurInp = new ArrayList<Integer>();

  DepTreeNode headOfEnt1 =
          objCurSen.depTree.getHeadWordFromWordBoundaries(
              e1.getAllWordIndexes(), true, objCurSen),
      headOfEnt2 =
          objCurSen.depTree.getHeadWordFromWordBoundaries(
              e2.getAllWordIndexes(), true, objCurSen);

  // All nodes on the shortest path connecting the target pair must be retained, as
  // well as all nodes satisfying the 3 rules of the MEDT kernel.
  DepTreeNode dn =
      objCurSen
          .depTree
          .clone()
          .findMinimalSubTreeWithEntities(false, headOfEnt1, medtType, headOfEnt2);

  /*
   * If there is no minimal subtree/path between the target entities, no
   * training/testing instance is generated for this pair.
   */
  if (dn == null) {
    FileUtility.writeInFile(
        GenericFeatVect.vectOutFile,
        e1.id + " " + e1.name + "\t" + e2.id + " " + e2.name + "\n\n",
        true);
    FileUtility.writeInFile(GenericFeatVect.vectOutFile, objCurSen.text + "\n\n", true);
    return false;
  }

  GenericFeatVect.listOfAllInstances.add(
      e1.id + " " + e1.name + "\t" + e2.id + " " + e2.name + "\n\n" + objCurSen.text + "\n\n");

  // Construct the feature set using e-walks and v-walks. (dn is non-null here, so the
  // original "dn != null" guard was redundant and has been dropped.)
  if (useWalkFeatures) {
    createNgramFeatures(objCurSen.depTree, 1, e1, listFeatIndsOfCurInp, listFeatCountOfCurInp);
    createNgramFeatures(objCurSen.depTree, 1, e2, listFeatIndsOfCurInp, listFeatCountOfCurInp);

    objCurSen.depTree.replaceEntitiesWithDummies(
        dn, e1.boundaries, e2.boundaries, new ArrayList<Integer>());
    createFeaturesFromInputGraph(
        objCurSen.depTree,
        dn,
        "",
        listFeatIndsOfCurInp,
        listFeatCountOfCurInp,
        new ArrayList<Integer>(),
        e1.boundaries,
        e2.boundaries);
  }

  // Match the regular-expression PPI patterns in both entity orders.
  if (useRegExPatterns) {
    matchPPIpatternOnSentence(
        objCurSen.text, e1.name, e2.name, listFeatIndsOfCurInp, listFeatCountOfCurInp);
    matchPPIpatternOnSentence(
        objCurSen.text, e2.name, e1.name, listFeatIndsOfCurInp, listFeatCountOfCurInp);
  }

  ArrayList<String> listOfDepRelsInReducedGraph = new ArrayList<String>();
  ArrayList<ArrayList<Integer>> listNodesAndLCP =
      new PatternsDepRelFromGraph()
          .extractDepRelsInReducedGraph(
              objRel,
              objCurSen,
              listOfDepRelsInReducedGraph,
              discardDepRelUsingProbabilityInReducedGraph,
              false);

  if (useDepPatterns)
    extractDepPatternFeatures(
        listOfDepRelsInReducedGraph, listFeatIndsOfCurInp, listFeatCountOfCurInp);

  // NOTE: listNodesAndLCP has two elements - (0) all the nodes in the reduced graph,
  // and (1) the least common parents (LCPs) in the reduced graph.
  if (listNodesAndLCP == null) listNodesAndLCP = new ArrayList<ArrayList<Integer>>();

  if (useTriggers && listNodesAndLCP.size() > 0) {
    // Collect trigger words either from the whole reduced graph or only from the LCPs.
    ArrayList<Integer> triggerNodes =
        triggersFromWholeRGinsteadOfLCP ? listNodesAndLCP.get(0) : listNodesAndLCP.get(1);
    for (int i = 0; i < triggerNodes.size(); i++)
      addTriggerWordFeatures(
          objCurSen.depGraph.allNodesByWordIndex[triggerNodes.get(i)].word,
          listFeatIndsOfCurInp,
          listFeatCountOfCurInp);
  }

  if (useNegativeCues && listNodesAndLCP.size() > 0) {
    for (int i = 0; i < listNodesAndLCP.get(0).size(); i++) {
      int nodeIndex = listNodesAndLCP.get(0).get(i);

      if (Triggers.listOfNegativeWords.contains(
          objCurSen.depGraph.allNodesByWordIndex[nodeIndex].lemma)) {
        // Add a feature indicating that there is a negative word in the reduced graph.
        String[] feature =
            new String[] {
              "HasNegWord@$" + objCurSen.depGraph.allNodesByWordIndex[nodeIndex].lemma,
            };
        GenericFeatVect.addNewFeatureInList(
            feature, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, 1);

        // Add the content word following the negative cue; skip one position when the
        // immediately following token is a copula ("be") or a preposition (POS "IN").
        if (objCurSen.depGraph.allNodesByWordIndex[nodeIndex + 1].lemma.equalsIgnoreCase("be")
            || objCurSen.depGraph.allNodesByWordIndex[nodeIndex + 1].pos.equalsIgnoreCase(
                "IN")) {
          feature =
              new String[] {
                "WordNextToNegCue@$"
                    + objCurSen.depGraph.allNodesByWordIndex[nodeIndex + 2].lemma,
              };
        } else {
          feature =
              new String[] {
                "WordNextToNegCue@$"
                    + objCurSen.depGraph.allNodesByWordIndex[nodeIndex + 1].lemma,
              };
        }
        GenericFeatVect.addNewFeatureInList(
            feature, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, 1);

        // Extract negation scope features for the cues "no" and "not".
        if (objCurSen.depGraph.allNodesByWordIndex[nodeIndex].lemma.matches("(no|not)"))
          extractNegationScopeFeatures(
              objCurSen,
              listFeatIndsOfCurInp,
              listFeatCountOfCurInp,
              e1,
              e2,
              nodeIndex,
              listNodesAndLCP.get(0));
      }
    }
  }

  // Add the Zhou et al. 2005 features.
  new ExtAceFeatVect()
      .getZhouEtAl2005FeatVal(objCurSen, e1, e2, listFeatIndsOfCurInp, listFeatCountOfCurInp);

  extractNonTargetEntityFeatures(objCurSen, listFeatIndsOfCurInp, listFeatCountOfCurInp, e1, e2);

  GenericFeatVect.sortFeatValByIndx(listFeatIndsOfCurInp, listFeatCountOfCurInp);
  GenericFeatVect.listOfAllInstancesWithFeat.add(listFeatIndsOfCurInp);
  GenericFeatVect.listOfAllInstancesWithFeatCount.add(listFeatCountOfCurInp);

  return true;
}
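/*
 * Feature-naming sketch for the negative cue block above (sentence hypothetical):
 * in "A does not interact with B", the cue "not" in the reduced graph produces
 *
 *   HasNegWord@$not
 *   WordNextToNegCue@$interact
 *
 * The word two positions after the cue would be used instead only if the token right
 * after the cue were a form of "be" or carried the POS tag "IN"; here the direct
 * neighbour "interact" is taken. Because "not" matches (no|not), negation scope
 * features are extracted for the pair as well.
 */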
/**
 * Removes the words that are in the same least NP as an entity. Note: this particular
 * pre-processing didn't improve results.
 *
 * @param bioRelExInpFile input file in the relation extraction column format
 * @param psgParsedFileName file containing the PSG parse of each sentence
 * @throws Exception
 */
public void removeOtherWordsInLeastNPofEnt(String bioRelExInpFile, String psgParsedFileName)
    throws Exception {
  ArrayList<ArrayList<String>> allSen = FileUtility.readAllMultiLineInputs(bioRelExInpFile);
  ArrayList<CFGParseOfSen> listCFGParseOfAllSen =
      CFGParseOfSen.readCFGParseForAllSen(psgParsedFileName);

  // Truncate the input file; it is re-written below.
  FileUtility.writeInFile(bioRelExInpFile, "", false);

  String psg = "", prevId = "";

  for (int s = 0; s < allSen.size(); s = s + 2) {
    StringBuilder sb = new StringBuilder();

    if (!prevId.equals(allSen.get(s).get(0)))
      psg = CFGParseOfSen.getBySenId(listCFGParseOfAllSen, allSen.get(s).get(0)).psgParse;
    prevId = allSen.get(s).get(0);

    // Find the least phrasal category for each token.
    String[] tmp = psg.replaceAll("\\)\\)", ") ()").replaceAll("\\s+", " ").trim().split("\\(");
    ArrayList<String[]> listTokPhraseCat = new ArrayList<String[]>();
    int phraseCatIndex = 0, rBrack = 0;

    // The 1st element in tmp is empty.
    for (int i = 1; i < tmp.length; i++) {
      String[] str = tmp[i].trim().split("\\s+");
      rBrack = 0;

      if (str.length > 1) {
        // A token with its POS tag: attach the closest open phrasal category.
        for (int k = listTokPhraseCat.size() - 1; k >= 0; k--) {
          if (listTokPhraseCat.get(k).length == 1
              && !listTokPhraseCat.get(k)[0].equals(")")
              && rBrack == 0) {
            str[1] = listTokPhraseCat.get(k)[0];
            listTokPhraseCat.add(str);
            break;
          } else if (listTokPhraseCat.get(k).length == 1
              && listTokPhraseCat.get(k)[0].equals(")")) rBrack++;
          else if (listTokPhraseCat.get(k).length == 1
              && !listTokPhraseCat.get(k)[0].equals(")")
              && rBrack != 0) rBrack--;
        }
      } else if (str.length == 1 && !str[0].equals(")")) {
        // A phrasal category: make it unique with a running index.
        str[0] = str[0] + "-" + phraseCatIndex;
        listTokPhraseCat.add(str);
        phraseCatIndex++;
      } else if (str.length == 1 && str[0].equals(")")) listTokPhraseCat.add(str);
    }

    // Remove all elements other than words.
    for (int i = listTokPhraseCat.size() - 1; i >= 0; i--) {
      if (listTokPhraseCat.get(i).length == 1) {
        listTokPhraseCat.remove(i);
      }
    }

    // Remove the words which are in the least NP of a protein. The 1st item in the
    // list is the sentence id, so the search starts from the 2nd token = 3rd item.
    for (int i = 2; i < allSen.get(s).size(); i++) {
      tmp = allSen.get(s).get(i).split("\\s+");

      if (!tmp[3].equals("O") && listTokPhraseCat.get(i - 1)[1].contains("NP-")) {
        // Checking the previous token: drop it if it is not itself part of an entity
        // and shares the same least NP as the current (entity) token.
        if (i > 1
            && !(allSen.get(s).get(i - 1).contains("\tB-e")
                || allSen.get(s).get(i - 1).contains("\tI-e"))
            && listTokPhraseCat.get(i - 1)[1].equals(listTokPhraseCat.get(i - 2)[1])) {
          listTokPhraseCat.remove(i - 2);
          allSen.get(s).remove(i - 1);
          i--;
        }
      }
    }

    // Re-writing the input.
    for (int i = 0; i < allSen.get(s).size(); i++) {
      sb.append(allSen.get(s).get(i) + "\n");
    }
    sb.append("\n");
    sb.append(allSen.get(s + 1).get(0) + "\n");
    sb.append("\n");

    FileUtility.writeInFile(bioRelExInpFile, sb.toString(), true);
  }
}
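/*
 * Sketch of the least-phrasal-category computation above on a toy parse (input
 * hypothetical):
 *
 *   (S (NP (DT the) (NN protein)) (VP (VBZ binds)))
 *
 * After replaceAll("\\)\\)", ") ()") and splitting on "(", every token's POS tag is
 * paired with its closest enclosing phrasal category, made unique by a running index:
 * DT -> NP-1, NN -> NP-1, VBZ -> VP-2 (S receives index 0). A non-entity token
 * directly before an entity token is then dropped when both share the same least NP.
 */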