コード例 #1
0
ファイル: TPWF.java プロジェクト: xuleiboy1234/HyREX
  /**
   * Generates the feature vectors for all candidate relations of the given sentences and writes the
   * result to {@code outputFile}.
   *
   * <p>The set of feature families is fixed inside this method (walk features, dependency patterns,
   * triggers and negative cues are on; regular-expression patterns are off). Trigger features are
   * switched off when {@code TKOutputGenerator.triggerFileName} is empty. The enabled families are
   * printed to standard output.
   *
   * @param listSentence sentences to process; only sentences with more than one entity annotation
   *     contribute instances
   * @param outputFile file that is emptied first and finally overwritten with {@code
   *     GenericFeatVect.getInstanceVectors()}
   * @param medtType value handed through to the minimal-subtree extraction
   * @param entPairFileName handed through to the per-sentence vector generation
   * @param relToBeConsidered filter option handed through to the per-sentence vector generation
   * @param inClauseBoundFileName file from which the clause boundaries of all sentences are read
   * @throws Exception if any of the called helpers fails
   */
  public void generateTPWFvectorOutput(
      ArrayList<Sentence> listSentence,
      String outputFile,
      int medtType,
      String entPairFileName,
      ClauseAnalyser.eDataFilterOption relToBeConsidered,
      String inClauseBoundFileName)
      throws Exception {

    boolean useWalkFeatures = true,
        useRegExPatterns = false,
        useDepPatterns = true,
        useTriggers = true,
        useNegativeCues = true,
        discardDepRelUsingProbabilityInReducedGraph = false,
        triggersFromWholeRGinsteadOfLCP = true;

    // Without a trigger file there is nothing to match trigger words against.
    if (TextUtility.isEmptyString(TKOutputGenerator.triggerFileName)) useTriggers = false;

    // Report which feature families are active for this run.
    StringBuilder enabledFeatures = new StringBuilder();
    if (discardDepRelUsingProbabilityInReducedGraph)
      enabledFeatures.append("discardDepRelUsingProbabilityInReducedGraph ");
    if (useWalkFeatures) enabledFeatures.append("WalkFeatures ");
    if (useRegExPatterns) enabledFeatures.append("RegExPatterns ");
    if (useDepPatterns) enabledFeatures.append("DepPatterns ");
    if (useTriggers) enabledFeatures.append("Triggers ");
    if (triggersFromWholeRGinsteadOfLCP) enabledFeatures.append("TriggersFromWholeRGinsteadOfLCP ");
    if (useNegativeCues) enabledFeatures.append("NegativeCues ");

    System.out.println(enabledFeatures.toString());

    // Dependency patterns are collected only once; later calls reuse the static list.
    PatternsDepRelFromGraph clsWVG = new PatternsDepRelFromGraph();
    if (PatternsDepRelFromGraph.listOfAllPatterns.size() == 0) {
      clsWVG.collectAllDepRelPatternsFromTrainData(
          listSentence, discardDepRelUsingProbabilityInReducedGraph);
    }

    int[][] arrClauseBoundOfSen = new TKOutputPST().getClauseBoundOfAllSen(inClauseBoundFileName);

    // empty the output file before any instance is generated
    FileUtility.writeInFile(outputFile, "", false);

    // read trigger word list
    Triggers.readTriggersAndNegativeWord();

    for (int s = 0; s < listSentence.size(); s++) {

      Sentence objCurSen = listSentence.get(s);

      // only those sentences are taken into account which has more than one entity annotations
      if (objCurSen.listOfEntities.size() <= 1) continue;

      // indexOf returns -1 when the id is unknown; 0 is a valid position and must not be
      // treated as "not found". The extra guards keep an absent or shorter boundary table
      // from failing and fall back to "no clause boundaries" instead.
      int senIndex = TKOutputPST.listAllSenIDs.indexOf(objCurSen.senID);
      int[] clauseBoundOfCurSen = null;
      if (senIndex >= 0 && arrClauseBoundOfSen != null && senIndex < arrClauseBoundOfSen.length) {
        clauseBoundOfCurSen = arrClauseBoundOfSen[senIndex];
      }

      generateVectorForSen(
          objCurSen,
          medtType,
          entPairFileName,
          discardDepRelUsingProbabilityInReducedGraph,
          useWalkFeatures,
          useRegExPatterns,
          useDepPatterns,
          useTriggers,
          triggersFromWholeRGinsteadOfLCP,
          useNegativeCues,
          relToBeConsidered,
          clauseBoundOfCurSen);
    }

    // overwrite the output file with the vectors accumulated in GenericFeatVect
    FileUtility.writeInFile(outputFile, GenericFeatVect.getInstanceVectors(), false);
  }
コード例 #2
0
ファイル: TPWF.java プロジェクト: xuleiboy1234/HyREX
  /**
   * Creates one feature-vector instance for every relation candidate listed in the sentence.
   *
   * <p>Candidates rejected by {@code TKOutputPST.skipInstance} are ignored. Every remaining
   * candidate is counted in {@code TKOutputPST.totalRelPos} or {@code TKOutputPST.totalRelNeg}. If
   * a feature vector could be built for it, its polarity (1 or -1) is appended to {@code
   * GenericFeatVect.listOfAllInstancePolarity} and, when {@code entPairFileName} is not empty, the
   * ids of both entities are appended to that file as one tab-separated line.
   *
   * @param objCurSen sentence whose relation candidates are processed
   * @param medtType value handed through to the feature extraction
   * @param entPairFileName file receiving the entity-id pairs; nothing is written if empty
   * @param discardDepRelUsingProbabilityInReducedGraph handed through to the feature extraction
   * @param useWalkFeatures handed through to the feature extraction
   * @param useRegExPatterns handed through to the feature extraction
   * @param useDepPatterns handed through to the feature extraction
   * @param useTriggers handed through to the feature extraction
   * @param triggersFromWholeRGinsteadOfLCP handed through to the feature extraction
   * @param useNegativeCues handed through to the feature extraction
   * @param relToBeConsidered filter option given to {@code TKOutputPST.skipInstance}
   * @param arrClauseBoundOfSen clause boundaries given to {@code TKOutputPST.skipInstance}; may be
   *     {@code null}
   * @throws IOException if writing to a file fails
   */
  private void generateVectorForSen(
      Sentence objCurSen,
      int medtType,
      String entPairFileName,
      boolean discardDepRelUsingProbabilityInReducedGraph,
      boolean useWalkFeatures,
      boolean useRegExPatterns,
      boolean useDepPatterns,
      boolean useTriggers,
      boolean triggersFromWholeRGinsteadOfLCP,
      boolean useNegativeCues,
      ClauseAnalyser.eDataFilterOption relToBeConsidered,
      int[] arrClauseBoundOfSen)
      throws IOException {

    for (int relIdx = 0; relIdx < objCurSen.listRels.size(); ++relIdx) {

      // resolve both arguments of the candidate relation
      Entity firstArg = objCurSen.getEntityById(objCurSen.listRels.get(relIdx).arg1);
      Entity secondArg = objCurSen.getEntityById(objCurSen.listRels.get(relIdx).arg2);

      // drop candidates that the configured filter rejects
      boolean filteredOut =
          TKOutputPST.skipInstance(
              arrClauseBoundOfSen, relToBeConsidered, firstArg, secondArg, objCurSen, relIdx);
      if (filteredOut) {
        continue;
      }

      // keep the global positive/negative statistics up to date
      if (objCurSen.listRels.get(relIdx).isPositive) {
        TKOutputPST.totalRelPos++;
      } else {
        TKOutputPST.totalRelNeg++;
      }

      boolean instanceCreated =
          setInpVectFromDepGraphOfPairsAndTrigger(
              objCurSen.listRels.get(relIdx),
              objCurSen,
              medtType,
              discardDepRelUsingProbabilityInReducedGraph,
              useWalkFeatures,
              useRegExPatterns,
              useDepPatterns,
              useTriggers,
              triggersFromWholeRGinsteadOfLCP,
              useNegativeCues,
              firstArg,
              secondArg);

      // nothing else is recorded for a pair that produced no feature vector
      if (!instanceCreated) {
        continue;
      }

      int polarity;
      if (objCurSen.listRels.get(relIdx).isPositive) {
        polarity = 1;
      } else {
        polarity = -1;
      }
      GenericFeatVect.listOfAllInstancePolarity.add(polarity);

      // optionally log which entity pair the instance belongs to
      if (!TextUtility.isEmptyString(entPairFileName)) {
        FileUtility.writeInFile(entPairFileName, firstArg.id + "\t" + secondArg.id + "\n", true);
      }
    }
  }
コード例 #3
0
ファイル: TPWF.java プロジェクト: xuleiboy1234/HyREX
  /**
   * Builds the feature vector for one candidate entity pair and registers it in {@code
   * GenericFeatVect}.
   *
   * <p>If no minimal subtree connects the head words of the two entities, the pair and the sentence
   * text are appended to {@code GenericFeatVect.vectOutFile} and no instance is created. Otherwise
   * the enabled feature families are collected, the features from {@code
   * ExtAceFeatVect.getZhouEtAl2005FeatVal} and the non-target-entity features are always added, the
   * features are sorted by index, and both lists are appended to {@code
   * GenericFeatVect.listOfAllInstancesWithFeat} / {@code listOfAllInstancesWithFeatCount}.
   *
   * @param objRel relation candidate, used to extract the reduced dependency graph
   * @param objCurSen sentence containing both entities
   * @param medtType value handed through to the minimal-subtree extraction
   * @param discardDepRelUsingProbabilityInReducedGraph handed through to the reduced-graph
   *     extraction
   * @param useWalkFeatures whether n-gram and graph-walk features are added
   * @param useRegExPatterns whether the pattern matching on the sentence text is applied (in both
   *     entity orders)
   * @param useDepPatterns whether dependency-pattern features are added
   * @param useTriggers whether trigger-word features are added
   * @param triggersFromWholeRGinsteadOfLCP if {@code true}, triggers are looked up among all nodes
   *     of the reduced graph, otherwise only among its least common parents
   * @param useNegativeCues whether negative-cue features are added
   * @param e1 first entity of the pair
   * @param e2 second entity of the pair
   * @return {@code true} if an instance was created, {@code false} if no minimal subtree was found
   * @throws IOException if writing to {@code GenericFeatVect.vectOutFile} or a helper fails
   */
  public boolean setInpVectFromDepGraphOfPairsAndTrigger(
      Relation objRel,
      Sentence objCurSen,
      int medtType,
      boolean discardDepRelUsingProbabilityInReducedGraph,
      boolean useWalkFeatures,
      boolean useRegExPatterns,
      boolean useDepPatterns,
      boolean useTriggers,
      boolean triggersFromWholeRGinsteadOfLCP,
      boolean useNegativeCues,
      Entity e1,
      Entity e2)
      throws IOException {

    ArrayList<Integer> listFeatIndsOfCurInp = new ArrayList<Integer>(),
        listFeatCountOfCurInp = new ArrayList<Integer>();

    DepTreeNode
        headOfEnt1 =
            objCurSen.depTree.getHeadWordFromWordBoundaries(
                e1.getAllWordIndexes(), true, objCurSen),
        headOfEnt2 =
            objCurSen.depTree.getHeadWordFromWordBoundaries(
                e2.getAllWordIndexes(), true, objCurSen);

    // All nodes in the shortest path connecting the target pairs must be retained
    // All nodes satisfying the 3 rules of MEDT kernel must be retained
    // (the search runs on a clone of the sentence's dependency tree)
    DepTreeNode dn =
        objCurSen
            .depTree
            .clone()
            .findMinimalSubTreeWithEntities(false, headOfEnt1, medtType, headOfEnt2);

    /*
     *  If there is no minimal subtree/path between target entities then we do not consider to generate
     *  instance for training/testing.
     */
    if (dn == null) {
      FileUtility.writeInFile(
          GenericFeatVect.vectOutFile,
          e1.id + " " + e1.name + "\t" + e2.id + " " + e2.name + "\n\n",
          true);

      FileUtility.writeInFile(GenericFeatVect.vectOutFile, objCurSen.text + "\n\n", true);

      return false;
    }

    GenericFeatVect.listOfAllInstances.add(
        e1.id + " " + e1.name + "\t" + e2.id + " " + e2.name + "\n\n" + objCurSen.text + "\n\n");

    // Construct feature set using e-walks and v-walks (dn is known to be non-null here)
    if (useWalkFeatures) {

      createNgramFeatures(objCurSen.depTree, 1, e1, listFeatIndsOfCurInp, listFeatCountOfCurInp);
      createNgramFeatures(objCurSen.depTree, 1, e2, listFeatIndsOfCurInp, listFeatCountOfCurInp);

      objCurSen.depTree.replaceEntitiesWithDummies(
          dn, e1.boundaries, e2.boundaries, new ArrayList<Integer>());
      createFeaturesFromInputGraph(
          objCurSen.depTree,
          dn,
          "",
          listFeatIndsOfCurInp,
          listFeatCountOfCurInp,
          new ArrayList<Integer>(),
          e1.boundaries,
          e2.boundaries);
    }

    if (useRegExPatterns) {
      matchPPIpatternOnSentence(
          objCurSen.text, e1.name, e2.name, listFeatIndsOfCurInp, listFeatCountOfCurInp);
      matchPPIpatternOnSentence(
          objCurSen.text, e2.name, e1.name, listFeatIndsOfCurInp, listFeatCountOfCurInp);
    }

    ArrayList<String> listOfDepRelsInReducedGraph = new ArrayList<String>();

    // The call fills listOfDepRelsInReducedGraph and returns the node lists of the reduced graph.
    ArrayList<ArrayList<Integer>> listNodesAndLCP =
        new PatternsDepRelFromGraph()
            .extractDepRelsInReducedGraph(
                objRel,
                objCurSen,
                listOfDepRelsInReducedGraph,
                discardDepRelUsingProbabilityInReducedGraph,
                false);

    if (useDepPatterns)
      extractDepPatternFeatures(
          listOfDepRelsInReducedGraph, listFeatIndsOfCurInp, listFeatCountOfCurInp);

    //  NOTE: listNodesAndLCP has two elements - (0) all the nodes in Reduced graph, and (1) the
    // least common parents (LCPs) in Reduced graph.

    if (listNodesAndLCP == null) listNodesAndLCP = new ArrayList<ArrayList<Integer>>();

    if (useTriggers && listNodesAndLCP.size() > 0) {
      // pick the node set in which trigger words are searched
      ArrayList<Integer> triggerCandidates =
          triggersFromWholeRGinsteadOfLCP ? listNodesAndLCP.get(0) : listNodesAndLCP.get(1);

      for (int i = 0; i < triggerCandidates.size(); i++)
        addTriggerWordFeatures(
            objCurSen.depGraph.allNodesByWordIndex[triggerCandidates.get(i)].word,
            listFeatIndsOfCurInp,
            listFeatCountOfCurInp);
    }

    if (useNegativeCues && listNodesAndLCP.size() > 0) {

      ArrayList<Integer> reducedGraphNodes = listNodesAndLCP.get(0);
      int totalNodes = objCurSen.depGraph.allNodesByWordIndex.length;

      for (int i = 0; i < reducedGraphNodes.size(); i++) {

        int cueIndex = reducedGraphNodes.get(i);
        String cueLemma = objCurSen.depGraph.allNodesByWordIndex[cueIndex].lemma;

        if (!Triggers.listOfNegativeWords.contains(cueLemma)) continue;

        // add a feature indicating that there is a negative word in the reduced graph
        String[] feature = new String[] {"HasNegWord@$" + cueLemma};

        GenericFeatVect.addNewFeatureInList(
            feature, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, 1);

        // Feature for the word following the cue. A following "be" or a word tagged IN is
        // skipped in favour of the word after it. The cue may be the last entry of the node
        // array, so every look-ahead is bounds- and null-checked; when no such word exists the
        // feature is simply not added.
        int nextIndex = cueIndex + 1;
        boolean hasNext =
            nextIndex < totalNodes && objCurSen.depGraph.allNodesByWordIndex[nextIndex] != null;

        if (hasNext
            && ("be".equalsIgnoreCase(objCurSen.depGraph.allNodesByWordIndex[nextIndex].lemma)
                || "IN".equalsIgnoreCase(objCurSen.depGraph.allNodesByWordIndex[nextIndex].pos))) {
          nextIndex++;
          hasNext =
              nextIndex < totalNodes && objCurSen.depGraph.allNodesByWordIndex[nextIndex] != null;
        }

        if (hasNext) {
          feature =
              new String[] {
                "WordNextToNegCue@$" + objCurSen.depGraph.allNodesByWordIndex[nextIndex].lemma,
              };

          GenericFeatVect.addNewFeatureInList(
              feature, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, 1);
        }

        // extract negation scope features, only for the cues "no" and "not"
        if (cueLemma.equals("no") || cueLemma.equals("not"))
          extractNegationScopeFeatures(
              objCurSen,
              listFeatIndsOfCurInp,
              listFeatCountOfCurInp,
              e1,
              e2,
              reducedGraphNodes.get(i),
              reducedGraphNodes);
      }
    }

    // add Zhou et al. 2005 features
    new ExtAceFeatVect()
        .getZhouEtAl2005FeatVal(objCurSen, e1, e2, listFeatIndsOfCurInp, listFeatCountOfCurInp);

    extractNonTargetEntityFeatures(objCurSen, listFeatIndsOfCurInp, listFeatCountOfCurInp, e1, e2);

    GenericFeatVect.sortFeatValByIndx(listFeatIndsOfCurInp, listFeatCountOfCurInp);

    GenericFeatVect.listOfAllInstancesWithFeat.add(listFeatIndsOfCurInp);
    GenericFeatVect.listOfAllInstancesWithFeatCount.add(listFeatCountOfCurInp);

    return true;
  }
コード例 #4
0
ファイル: PreProcessor.java プロジェクト: xuleiboy1234/HyREX
  /**
   * Remove the words that are in the same NP of an entity. Note: This particular pre-processing
   * didn't improve results.
   *
   * <p>The input file is read completely, processed in memory and overwritten in a single write
   * after all sentences have been handled, so a failure while processing leaves the original file
   * untouched. The entries returned by {@code FileUtility.readAllMultiLineInputs} are consumed in
   * pairs: the first entry of a pair holds the sentence id followed by one line per token, and the
   * first line of the second entry is copied to the output unchanged.
   *
   * @param bioRelExInpFile file that is read and then rewritten in place
   * @param psgParsedFileName file containing the phrase-structure parses of all sentences
   * @throws Exception if reading, parsing or writing fails
   */
  public void removeOtherWordsInLeastNPofEnt(String bioRelExInpFile, String psgParsedFileName)
      throws Exception {

    ArrayList<ArrayList<String>> allSen = FileUtility.readAllMultiLineInputs(bioRelExInpFile);

    ArrayList<CFGParseOfSen> listCFGParseOfAllSen =
        CFGParseOfSen.readCFGParseForAllSen(psgParsedFileName);

    // Output for the whole file; written once at the end instead of truncating the input up front.
    StringBuilder rewrittenInput = new StringBuilder();
    String psg = "", prevId = "";

    for (int s = 0; s < allSen.size(); s = s + 2) {

      ArrayList<String> senLines = allSen.get(s);

      // look the parse up again only when the sentence id changes
      if (!prevId.equals(senLines.get(0)))
        psg = CFGParseOfSen.getBySenId(listCFGParseOfAllSen, senLines.get(0)).psgParse;

      prevId = senLines.get(0);

      // find the least phrasal category for each token
      String[] tmp = psg.replaceAll("\\)\\)", ") ()").replaceAll("\\s+", " ").trim().split("\\(");
      ArrayList<String[]> listTokPhraseCat = new ArrayList<String[]>();
      int phraseCatIndex = 0;

      // 1st element in tmp is empty
      for (int i = 1; i < tmp.length; i++) {

        String[] str = tmp[i].trim().split("\\s+");

        if (str.length > 1) {
          // A token: walk backwards to the nearest phrase category that is still open.
          // Each ")" marker closes one category, which is then skipped when it is reached.
          int rBrack = 0;
          for (int k = listTokPhraseCat.size() - 1; k >= 0; k--) {
            String[] earlier = listTokPhraseCat.get(k);

            // entries with more than one element are tokens, not category/bracket markers
            if (earlier.length != 1) continue;

            if (earlier[0].equals(")")) {
              rBrack++;
            } else if (rBrack != 0) {
              rBrack--;
            } else {
              // replace the second element with the enclosing category label
              str[1] = earlier[0];
              listTokPhraseCat.add(str);
              break;
            }
          }
        } else if (str.length == 1) {
          if (!str[0].equals(")")) {
            // a phrase category: make its label unique within the sentence
            str[0] = str[0] + "-" + phraseCatIndex;
            phraseCatIndex++;
          }
          listTokPhraseCat.add(str);
        }
      }

      // remove all elements other than words
      for (int i = listTokPhraseCat.size() - 1; i >= 0; i--) {
        if (listTokPhraseCat.get(i).length == 1) {
          listTokPhraseCat.remove(i);
        }
      }

      // remove the words which are in the least NP of a protein
      // the 1st item in the list is sentence id
      // search from the 2nd token = 3rd item in the list
      for (int i = 2; i < senLines.size(); i++) {
        tmp = senLines.get(i).split("\\s+");

        if (!tmp[3].equals("O") && listTokPhraseCat.get(i - 1)[1].contains("NP-")) {
          // checking previous token: drop it when it is not part of an entity and shares the
          // same least phrase category as the current token
          if (!(senLines.get(i - 1).contains("\tB-e") || senLines.get(i - 1).contains("\tI-e"))
              && listTokPhraseCat.get(i - 1)[1].equals(listTokPhraseCat.get(i - 1 - 1)[1])) {
            listTokPhraseCat.remove(i - 1 - 1);
            senLines.remove(i - 1);
            // the current token moved one position to the left
            i--;
          }
        }
      }

      // re-writing the input
      for (int i = 0; i < senLines.size(); i++) {
        rewrittenInput.append(senLines.get(i)).append("\n");
      }

      rewrittenInput.append("\n");
      rewrittenInput.append(allSen.get(s + 1).get(0)).append("\n");
      rewrittenInput.append("\n");
    }

    // replace the input file only after every sentence was processed successfully
    FileUtility.writeInFile(bioRelExInpFile, rewrittenInput.toString(), false);
  }