コード例 #1
0
ファイル: TPWF.java プロジェクト: xuleiboy1234/HyREX
  /**
   * Builds the feature vector for one candidate entity pair of a sentence and appends it to the
   * shared instance lists held by {@code GenericFeatVect}.
   *
   * <p>The method first looks for a minimal dependency subtree connecting the head words of both
   * entities. If none exists, the pair and the sentence text are written to {@code
   * GenericFeatVect.vectOutFile} and no instance is created. Otherwise features are collected, in
   * this order: walk features (optional), regular-expression pattern features (optional),
   * dependency pattern features (optional), trigger word features (optional), negative cue
   * features (optional), the Zhou et al. 2005 features and the non-target entity features. The
   * collected features are then sorted by index and stored.
   *
   * @param objRel relation handed to {@code PatternsDepRelFromGraph.extractDepRelsInReducedGraph}
   * @param objCurSen sentence that contains both entities
   * @param medtType value passed through to {@code findMinimalSubTreeWithEntities}
   * @param discardDepRelUsingProbabilityInReducedGraph passed through to the reduced-graph
   *     extraction
   * @param useWalkFeatures whether n-gram and graph walk features are generated
   * @param useRegExPatterns whether the PPI patterns are matched against the sentence text
   * @param useDepPatterns whether dependency pattern features are generated
   * @param useTriggers whether trigger word features are generated
   * @param triggersFromWholeRGinsteadOfLCP if {@code true}, trigger candidates are all nodes of the
   *     reduced graph (element 0 of the extraction result); otherwise only the least common
   *     parents (element 1)
   * @param useNegativeCues whether negative cue features are generated
   * @param e1 first entity of the pair
   * @param e2 second entity of the pair
   * @return {@code false} if no minimal subtree was found for the pair (no instance is stored),
   *     {@code true} once the instance has been stored
   * @throws IOException propagated from the file and feature helpers that are called
   */
  public boolean setInpVectFromDepGraphOfPairsAndTrigger(
      Relation objRel,
      Sentence objCurSen,
      int medtType,
      boolean discardDepRelUsingProbabilityInReducedGraph,
      boolean useWalkFeatures,
      boolean useRegExPatterns,
      boolean useDepPatterns,
      boolean useTriggers,
      boolean triggersFromWholeRGinsteadOfLCP,
      boolean useNegativeCues,
      Entity e1,
      Entity e2)
      throws IOException {

    ArrayList<Integer> listFeatIndsOfCurInp = new ArrayList<Integer>();
    ArrayList<Integer> listFeatCountOfCurInp = new ArrayList<Integer>();

    DepTreeNode headOfEnt1 =
        objCurSen.depTree.getHeadWordFromWordBoundaries(e1.getAllWordIndexes(), true, objCurSen);
    DepTreeNode headOfEnt2 =
        objCurSen.depTree.getHeadWordFromWordBoundaries(e2.getAllWordIndexes(), true, objCurSen);

    // All nodes in the shortest path connecting the target pairs must be retained
    // All nodes satisfying the 3 rules of MEDT kernel must be retained
    // The search runs on a clone of the sentence tree, not on the tree itself.
    DepTreeNode dn =
        objCurSen
            .depTree
            .clone()
            .findMinimalSubTreeWithEntities(false, headOfEnt1, medtType, headOfEnt2);

    String pairHeader = e1.id + " " + e1.name + "\t" + e2.id + " " + e2.name + "\n\n";

    /*
     *  If there is no minimal subtree/path between target entities then we do not consider to generate
     *  instance for training/testing. The pair and the sentence are logged instead.
     */
    if (dn == null) {
      FileUtility.writeInFile(GenericFeatVect.vectOutFile, pairHeader, true);
      FileUtility.writeInFile(GenericFeatVect.vectOutFile, objCurSen.text + "\n\n", true);
      return false;
    }

    GenericFeatVect.listOfAllInstances.add(pairHeader + objCurSen.text + "\n\n");

    // Construct feature set using e-walks and v-walks
    if (useWalkFeatures) {
      createNgramFeatures(objCurSen.depTree, 1, e1, listFeatIndsOfCurInp, listFeatCountOfCurInp);
      createNgramFeatures(objCurSen.depTree, 1, e2, listFeatIndsOfCurInp, listFeatCountOfCurInp);

      objCurSen.depTree.replaceEntitiesWithDummies(
          dn, e1.boundaries, e2.boundaries, new ArrayList<Integer>());
      createFeaturesFromInputGraph(
          objCurSen.depTree,
          dn,
          "",
          listFeatIndsOfCurInp,
          listFeatCountOfCurInp,
          new ArrayList<Integer>(),
          e1.boundaries,
          e2.boundaries);
    }

    // The patterns are matched in both entity orders.
    if (useRegExPatterns) {
      matchPPIpatternOnSentence(
          objCurSen.text, e1.name, e2.name, listFeatIndsOfCurInp, listFeatCountOfCurInp);
      matchPPIpatternOnSentence(
          objCurSen.text, e2.name, e1.name, listFeatIndsOfCurInp, listFeatCountOfCurInp);
    }

    ArrayList<String> listOfDepRelsInReducedGraph = new ArrayList<String>();

    // NOTE: listNodesAndLCP has two elements - (0) all the nodes in Reduced graph, and (1) the
    // least common parents (LCPs) in Reduced graph.
    ArrayList<ArrayList<Integer>> listNodesAndLCP =
        new PatternsDepRelFromGraph()
            .extractDepRelsInReducedGraph(
                objRel,
                objCurSen,
                listOfDepRelsInReducedGraph,
                discardDepRelUsingProbabilityInReducedGraph,
                false);

    if (useDepPatterns) {
      extractDepPatternFeatures(
          listOfDepRelsInReducedGraph, listFeatIndsOfCurInp, listFeatCountOfCurInp);
    }

    if (listNodesAndLCP == null) {
      listNodesAndLCP = new ArrayList<ArrayList<Integer>>();
    }

    if (useTriggers) {
      int candidateListIdx = triggersFromWholeRGinsteadOfLCP ? 0 : 1;

      // Check that the selected list really exists; the LCP list is element 1, so a
      // "size() > 0" test is not sufficient for it.
      if (listNodesAndLCP.size() > candidateListIdx) {
        ArrayList<Integer> triggerCandidates = listNodesAndLCP.get(candidateListIdx);
        for (int i = 0; i < triggerCandidates.size(); i++) {
          addTriggerWordFeatures(
              objCurSen.depGraph.allNodesByWordIndex[triggerCandidates.get(i)].word,
              listFeatIndsOfCurInp,
              listFeatCountOfCurInp);
        }
      }
    }

    if (useNegativeCues && listNodesAndLCP.size() > 0) {
      ArrayList<Integer> reducedGraphNodes = listNodesAndLCP.get(0);

      for (int i = 0; i < reducedGraphNodes.size(); i++) {
        int cueIdx = reducedGraphNodes.get(i);
        String cueLemma = objCurSen.depGraph.allNodesByWordIndex[cueIdx].lemma;

        if (!Triggers.listOfNegativeWords.contains(cueLemma)) {
          continue;
        }

        // add a feature indicating that there is a negative word in the reduced graph
        String[] feature = new String[] {"HasNegWord@$" + cueLemma};
        GenericFeatVect.addNewFeatureInList(
            feature, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, 1);

        // The feature word is the token right after the cue, or the one after that when the
        // cue is directly followed by "be" or by a token tagged IN.
        int featWordIdx = cueIdx + 1;
        if (hasDepGraphNodeAt(objCurSen, featWordIdx)
            && ("be".equalsIgnoreCase(objCurSen.depGraph.allNodesByWordIndex[featWordIdx].lemma)
                || "IN".equalsIgnoreCase(objCurSen.depGraph.allNodesByWordIndex[featWordIdx].pos))) {
          featWordIdx = cueIdx + 2;
        }

        // A cue at (or next to) the end of the sentence has no following token; in that case
        // the feature is skipped instead of reading past the end of the node array.
        if (hasDepGraphNodeAt(objCurSen, featWordIdx)) {
          feature =
              new String[] {
                "WordNextToNegCue@$" + objCurSen.depGraph.allNodesByWordIndex[featWordIdx].lemma,
              };
          GenericFeatVect.addNewFeatureInList(
              feature, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, 1);
        }

        // extract negation scope features
        if (cueLemma.matches("(no|not)")) {
          extractNegationScopeFeatures(
              objCurSen,
              listFeatIndsOfCurInp,
              listFeatCountOfCurInp,
              e1,
              e2,
              reducedGraphNodes.get(i),
              reducedGraphNodes);
        }
      }
    }

    // add Zhou et al. 2005 features
    new ExtAceFeatVect()
        .getZhouEtAl2005FeatVal(objCurSen, e1, e2, listFeatIndsOfCurInp, listFeatCountOfCurInp);

    extractNonTargetEntityFeatures(objCurSen, listFeatIndsOfCurInp, listFeatCountOfCurInp, e1, e2);

    GenericFeatVect.sortFeatValByIndx(listFeatIndsOfCurInp, listFeatCountOfCurInp);

    GenericFeatVect.listOfAllInstancesWithFeat.add(listFeatIndsOfCurInp);
    GenericFeatVect.listOfAllInstancesWithFeatCount.add(listFeatCountOfCurInp);

    return true;
  }

  /** Tells whether the dependency graph of the sentence holds a node at the given word index. */
  private boolean hasDepGraphNodeAt(Sentence objCurSen, int wordIndex) {
    return wordIndex >= 0
        && wordIndex < objCurSen.depGraph.allNodesByWordIndex.length
        && objCurSen.depGraph.allNodesByWordIndex[wordIndex] != null;
  }
コード例 #2
0
ファイル: TPWF.java プロジェクト: xuleiboy1234/HyREX
  /**
   * Generates the TPWF feature vectors for all given sentences and writes them to a file.
   *
   * <p>The set of enabled feature groups is fixed inside this method and printed to standard
   * output; triggers are switched off when {@code TKOutputGenerator.triggerFileName} is empty.
   * Dependency relation patterns are collected from the given sentences only if {@code
   * PatternsDepRelFromGraph.listOfAllPatterns} is still empty. Only sentences with more than one
   * annotated entity produce instances.
   *
   * @param listSentence sentences to process
   * @param outputFile file that is emptied first and finally receives {@code
   *     GenericFeatVect.getInstanceVectors()}
   * @param medtType value passed through to the per-sentence vector generation
   * @param entPairFileName passed through to {@code generateVectorForSen}
   * @param relToBeConsidered filter option passed through to {@code generateVectorForSen}
   * @param inClauseBoundFileName file from which the clause boundaries of all sentences are read
   * @throws Exception propagated from the helpers that are called
   */
  public void generateTPWFvectorOutput(
      ArrayList<Sentence> listSentence,
      String outputFile,
      int medtType,
      String entPairFileName,
      ClauseAnalyser.eDataFilterOption relToBeConsidered,
      String inClauseBoundFileName)
      throws Exception {

    boolean useWalkFeatures = true;
    boolean useRegExPatterns = false;
    boolean useDepPatterns = true;
    boolean useTriggers = true;
    boolean useNegativeCues = true;
    boolean discardDepRelUsingProbabilityInReducedGraph = false;
    boolean triggersFromWholeRGinsteadOfLCP = true;

    // Without a trigger file there is nothing to match trigger words against.
    if (TextUtility.isEmptyString(TKOutputGenerator.triggerFileName)) {
      useTriggers = false;
    }

    // Report the active feature groups.
    StringBuilder enabledFeatures = new StringBuilder();
    if (discardDepRelUsingProbabilityInReducedGraph) {
      enabledFeatures.append("discardDepRelUsingProbabilityInReducedGraph ");
    }
    if (useWalkFeatures) {
      enabledFeatures.append("WalkFeatures ");
    }
    if (useRegExPatterns) {
      enabledFeatures.append("RegExPatterns ");
    }
    if (useDepPatterns) {
      enabledFeatures.append("DepPatterns ");
    }
    if (useTriggers) {
      enabledFeatures.append("Triggers ");
    }
    if (triggersFromWholeRGinsteadOfLCP) {
      enabledFeatures.append("TriggersFromWholeRGinsteadOfLCP ");
    }
    if (useNegativeCues) {
      enabledFeatures.append("NegativeCues ");
    }
    System.out.println(enabledFeatures.toString());

    PatternsDepRelFromGraph clsWVG = new PatternsDepRelFromGraph();
    if (PatternsDepRelFromGraph.listOfAllPatterns.size() == 0) {
      clsWVG.collectAllDepRelPatternsFromTrainData(
          listSentence, discardDepRelUsingProbabilityInReducedGraph);
    }

    int[][] arrClauseBoundOfSen = new TKOutputPST().getClauseBoundOfAllSen(inClauseBoundFileName);

    // Start with an empty output file.
    FileUtility.writeInFile(outputFile, "", false);

    // read trigger word list
    Triggers.readTriggersAndNegativeWord();

    for (int s = 0; s < listSentence.size(); s++) {

      Sentence objCurSen = listSentence.get(s);

      // only those sentences are taken into account which has more than one entity annotations
      if (objCurSen.listOfEntities.size() <= 1) {
        continue;
      }

      // indexOf returns -1 for an unknown sentence id, so 0 is a valid position (the first
      // sentence) and must not be treated as "not found".
      int senIndex = TKOutputPST.listAllSenIDs.indexOf(objCurSen.senID);
      int[] clauseBoundOfCurSen = null;
      if (arrClauseBoundOfSen != null && senIndex >= 0 && senIndex < arrClauseBoundOfSen.length) {
        clauseBoundOfCurSen = arrClauseBoundOfSen[senIndex];
      }

      generateVectorForSen(
          objCurSen,
          medtType,
          entPairFileName,
          discardDepRelUsingProbabilityInReducedGraph,
          useWalkFeatures,
          useRegExPatterns,
          useDepPatterns,
          useTriggers,
          triggersFromWholeRGinsteadOfLCP,
          useNegativeCues,
          relToBeConsidered,
          clauseBoundOfCurSen);
    }

    FileUtility.writeInFile(outputFile, GenericFeatVect.getInstanceVectors(), false);
  }
コード例 #3
0
ファイル: TPWF.java プロジェクト: xuleiboy1234/HyREX
  /**
   * Creates one feature vector per relation candidate of the given sentence.
   *
   * <p>For every entry of {@code objCurSen.listRels} the two argument entities are looked up.
   * Candidates rejected by {@code TKOutputPST.skipInstance} are ignored. For the remaining ones
   * the positive/negative relation counters of {@code TKOutputPST} are updated and the feature
   * vector is built. If the vector could be built, the polarity (1 or -1) is recorded and, when
   * an entity pair file name is given, the ids of the pair are written to that file.
   *
   * @param objCurSen sentence whose relation candidates are processed
   * @param medtType passed through to the vector construction
   * @param entPairFileName file receiving the ids of the entity pairs for which a vector was
   *     built; nothing is written if it is empty
   * @param discardDepRelUsingProbabilityInReducedGraph passed through to the vector construction
   * @param useWalkFeatures passed through to the vector construction
   * @param useRegExPatterns passed through to the vector construction
   * @param useDepPatterns passed through to the vector construction
   * @param useTriggers passed through to the vector construction
   * @param triggersFromWholeRGinsteadOfLCP passed through to the vector construction
   * @param useNegativeCues passed through to the vector construction
   * @param relToBeConsidered filter option handed to {@code TKOutputPST.skipInstance}
   * @param arrClauseBoundOfSen clause boundaries of the sentence handed to {@code
   *     TKOutputPST.skipInstance}; may be {@code null}
   * @throws IOException propagated from the vector construction and the file helper
   */
  private void generateVectorForSen(
      Sentence objCurSen,
      int medtType,
      String entPairFileName,
      boolean discardDepRelUsingProbabilityInReducedGraph,
      boolean useWalkFeatures,
      boolean useRegExPatterns,
      boolean useDepPatterns,
      boolean useTriggers,
      boolean triggersFromWholeRGinsteadOfLCP,
      boolean useNegativeCues,
      ClauseAnalyser.eDataFilterOption relToBeConsidered,
      int[] arrClauseBoundOfSen)
      throws IOException {

    int relIdx = 0;
    while (relIdx < objCurSen.listRels.size()) {
      // The index is advanced right away; curIdx is the position of the current candidate.
      int curIdx = relIdx++;
      Relation candidate = objCurSen.listRels.get(curIdx);

      Entity e1 = objCurSen.getEntityById(candidate.arg1);
      Entity e2 = objCurSen.getEntityById(candidate.arg2);

      // checking relation type
      boolean skipped =
          TKOutputPST.skipInstance(
              arrClauseBoundOfSen, relToBeConsidered, e1, e2, objCurSen, curIdx);
      if (skipped) {
        continue;
      }

      // Counters are updated for every non-skipped candidate, whether or not a vector is built.
      if (candidate.isPositive) {
        TKOutputPST.totalRelPos++;
      } else {
        TKOutputPST.totalRelNeg++;
      }

      boolean isSet =
          setInpVectFromDepGraphOfPairsAndTrigger(
              candidate,
              objCurSen,
              medtType,
              discardDepRelUsingProbabilityInReducedGraph,
              useWalkFeatures,
              useRegExPatterns,
              useDepPatterns,
              useTriggers,
              triggersFromWholeRGinsteadOfLCP,
              useNegativeCues,
              e1,
              e2);

      // 1 represents there exists a relation between those entities, -1 that there is none
      if (isSet) {
        int polarity;
        if (candidate.isPositive) {
          polarity = 1;
        } else {
          polarity = -1;
        }
        GenericFeatVect.listOfAllInstancePolarity.add(polarity);
      }

      if (!TextUtility.isEmptyString(entPairFileName) && isSet) {
        FileUtility.writeInFile(entPairFileName, e1.id + "\t" + e2.id + "\n", true);
      }
    }
  }