Esempio n. 1
0
  /**
   * Adds one feature for every dependency pattern that matches the given dependency relations.
   *
   * <p>The pattern stored at index {@code i} of {@code PatternsDepRelFromGraph.listOfAllPatterns}
   * is treated as matched when {@code DataStrucUtility.hasListOneAllElementsOfListTwo} accepts the
   * relation list together with that pattern. Every matched index is then registered through
   * {@code GenericFeatVect.addNewFeatureInList} under the name {@code "DepPattern-" + i + "@"}.
   *
   * @param listOfDepRelsInReducedGraph dependency relations that are tested against each pattern
   * @param listFeatIndsOfCurInp feature index list passed on to {@code addNewFeatureInList}
   * @param listFeatCountOfCurInp feature count list passed on to {@code addNewFeatureInList}
   */
  private void extractDepPatternFeatures(
      ArrayList<String> listOfDepRelsInReducedGraph,
      ArrayList<Integer> listFeatIndsOfCurInp,
      ArrayList<Integer> listFeatCountOfCurInp) {

    // Phase 1: remember the index of every pattern that matches.
    ArrayList<Integer> matchedIndexes = new ArrayList<Integer>();
    int patternIdx = 0;
    while (patternIdx < PatternsDepRelFromGraph.listOfAllPatterns.size()) {
      boolean matched =
          DataStrucUtility.hasListOneAllElementsOfListTwo(
              listOfDepRelsInReducedGraph,
              PatternsDepRelFromGraph.listOfAllPatterns.get(patternIdx));
      if (matched) {
        matchedIndexes.add(patternIdx);
      }
      patternIdx++;
    }

    // Phase 2: emit one feature per matched pattern, named after the pattern index.
    for (Integer matchedIdx : matchedIndexes) {
      String[] feature = {"DepPattern-" + matchedIdx + "@"};
      GenericFeatVect.addNewFeatureInList(
          feature, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, 1);
    }
  }
Esempio n. 2
0
  /**
   * Rewrites words that end with a dangling dash (for example {@code "liver-"} in
   * {@code "liver-, islet-type glucokinase"}) and notifies {@code updateBoundaries} about the
   * characters that were dropped.
   *
   * <p>The sentence is split on whitespace. For every word that is longer than one character,
   * contains a letter and ends with {@code '-'}, the first later word with a dash that is neither
   * its first nor its last character is looked up. If such a word exists:
   *
   * <ul>
   *   <li>the trailing dash is removed from the current word, and
   *   <li>the later word is split at its last dash into two separate words; the dash is dropped.
   * </ul>
   *
   * <p>Words are finally re-joined with single spaces.
   *
   * @param sentence the sentence to process; split on runs of whitespace
   * @param entBoundaryList entity boundaries, passed unchanged to {@code updateBoundaries}
   * @return the rewritten sentence, trimmed
   */
  public String preProcessWordWithDashEndingAndUpdateEntBoundaries(
      String sentence, ArrayList<int[]> entBoundaryList) {

    ArrayList<String> words = DataStrucUtility.arrayToList(sentence.split("\\s+"));
    StringBuilder rebuilt = new StringBuilder();
    // Running total of the lengths of the words already emitted (whitespace is not counted).
    int len = 0;

    for (int wi = 0; wi < words.size(); wi++) {
      String current = words.get(wi);

      // Single-character words are never touched.
      if (current.length() != 1 && current.matches(".*[a-zA-Z].*-")) {

        /*
         *  TODO: have to adjust entity boundaries for following entities and sentence
         *  [liver-, islet-type glucokinase, glucokinase, actin]
         *  When liver or islet type glucokinase was transiently expressed in COS-7 cells, the expressed glucokinase was also co-localized with actin filaments in the cytoplasm of these transfected cells.
         */

        // Find the next word that has a dash strictly inside it.
        for (int x = wi + 1; x < words.size(); x++) {
          String candidate = words.get(x);
          int lastDashInd = candidate.lastIndexOf("-");
          if (lastDashInd > 0 && lastDashInd != candidate.length() - 1) {
            String newPart = candidate.substring(lastDashInd + 1);

            // Remove the dash from the end of the current word.
            words.set(wi, current.substring(0, current.length() - 1));
            updateBoundaries(entBoundaryList, 1, len - 1, false);

            // Replace the dash of the later word with a word break.
            words.set(x, candidate.substring(0, lastDashInd));
            words.add(x + 1, newPart);
            // NOTE(review): this call repeats the arguments of the call above although the
            // dash removed here belongs to a later word - confirm against updateBoundaries
            // whether a different position is intended.
            updateBoundaries(entBoundaryList, 1, len - 1, false);
            break;
          }
        }
      }

      // Re-read the word: it may have been shortened above.
      len += words.get(wi).length();
      rebuilt.append(' ').append(words.get(wi));
    }

    return rebuilt.toString().trim();
  }
Esempio n. 3
0
  /**
   * Completes words that end with a dangling dash by appending the tail of a later hyphenated
   * word, and reports the inserted characters to {@code updateBoundaries}.
   *
   * <p>The sentence is split on whitespace. A word qualifies when it is longer than one character
   * and ends with a letter or digit followed by {@code '-'}. For such a word the first later word
   * with a dash that is neither its first nor its last character is located, and the text after
   * that word's last dash is appended to the qualifying word. The later word itself is left
   * unchanged. The original author assumes that this tail contains no entity name and is the part
   * meant to be related to the dash-ending word.
   *
   * @param sentence the sentence to process; split on runs of whitespace
   * @param entBoundaryList entity boundaries, passed unchanged to {@code updateBoundaries}
   * @return the words re-joined with single spaces, trimmed
   */
  public String preProcessWordWithDashEndingByAddingExtraWordAndUpdateEntBoundaries(
      String sentence, ArrayList<int[]> entBoundaryList) {

    ArrayList<String> tokens = DataStrucUtility.arrayToList(sentence.split("\\s+"));
    StringBuilder rebuilt = new StringBuilder();
    // Sum of the lengths of the tokens already emitted (whitespace is not counted).
    int charsSoFar = 0;

    int pos = 0;
    while (pos < tokens.size()) {
      String token = tokens.get(pos);
      boolean endsWithDash = token.length() != 1 && token.matches(".*[0-9a-zA-Z]-");

      if (endsWithDash) {
        // Look ahead for the first token that carries a dash strictly inside it.
        for (int next = pos + 1; next < tokens.size(); next++) {
          String candidate = tokens.get(next);
          int dashAt = candidate.lastIndexOf("-");
          boolean hasInnerDash = dashAt > 0 && dashAt != candidate.length() - 1;
          if (!hasInnerDash) {
            continue;
          }

          // Borrow everything after the candidate's last dash.
          String suffix = candidate.substring(dashAt + 1);
          token = token + suffix;
          tokens.set(pos, token);
          updateBoundaries(entBoundaryList, suffix.length(), charsSoFar - 1, true);
          break;
        }
      }

      charsSoFar += token.length();
      rebuilt.append(' ').append(token);
      pos++;
    }

    return rebuilt.toString().trim();
  }
Esempio n. 4
0
  /**
   * Removes parenthesised groups of words that do not contain any entity.
   *
   * <p>Note: This particular pre-processing didn't improve results in the LLL corpus.
   *
   * <p>The sentence is split on whitespace. A word that ends with {@code ')'} followed by one of
   * {@code ?,.:;!} is first split so that the punctuation mark becomes its own word. A group
   * starts at a word beginning with {@code '('} while no group is open and ends at the word ending
   * with {@code ')'} that brings the nesting depth back to zero. Offsets are counted as the sum of
   * word lengths, i.e. whitespace is not counted. A group is kept when at least one entry of
   * {@code entBoundaryList} lies completely inside the group's offset range; otherwise its words
   * are removed and {@code updateBoundaries} is called with the removed length.
   *
   * <p>Empty tokens (produced by {@code split} for an empty sentence or for leading whitespace)
   * are skipped instead of raising a {@code StringIndexOutOfBoundsException}.
   *
   * @param sentence the sentence to process; split on runs of whitespace
   * @param entBoundaryList entity boundaries; index 0 is read as the start offset and index 1 as
   *     the end offset of an entity, and the list is passed to {@code updateBoundaries} when a
   *     group is removed
   * @return the remaining words joined with single spaces, trimmed
   */
  public String removeCommentsWithNoEntInParentheses(
      String sentence, ArrayList<int[]> entBoundaryList) {

    ArrayList<String> words = DataStrucUtility.arrayToList(sentence.split("\\s+"));

    // bracS / bracE: word indexes of the outermost open / close parenthesis; tb: nesting depth.
    int bracS = -1, bracE = -1, tb = 0;
    boolean isFoundEnt = false;

    // NOTE(review): the original comment here said "the 1st item in the list is sentence id",
    // but the loop starts at index 0 and treats every word alike - confirm whether that applies.
    for (int i = 0; i < words.size(); i++) {
      // An empty token has no first/last character to inspect.
      if (words.get(i).isEmpty()) {
        continue;
      }

      // Detach punctuation that directly follows a closing parenthesis, e.g. "x)," -> "x)" ",".
      if (words.get(i).matches(".*\\)[?,.:;!]")) {
        words.add(i + 1, String.valueOf(words.get(i).charAt(words.get(i).length() - 1)));
        words.set(i, words.get(i).substring(0, words.get(i).length() - 1));
      }

      if (words.get(i).charAt(0) == '(') {
        tb++;
        if (tb == 1) {
          bracS = i;
          isFoundEnt = false;
        }
      }

      if (words.get(i).charAt(words.get(i).length() - 1) == ')') {
        tb--;
        if (tb == 0) {
          bracE = i;

          // from: offset where the group starts; offset: offset where the group ends.
          int offset = 0, from = 0;
          for (int w = 0; w <= bracE; w++) {
            offset += words.get(w).length();
            if (w < bracS) {
              from = offset;
            }
          }

          for (int e = 0; e < entBoundaryList.size(); e++) {
            if (entBoundaryList.get(e)[0] >= from && entBoundaryList.get(e)[1] <= offset) {
              isFoundEnt = true;
              break;
            }
          }

          // From here on offset holds the length of the group.
          offset = offset - from;

          if (!isFoundEnt) {

            updateBoundaries(entBoundaryList, offset, from - 1, false);

            for (int k = bracE; k >= bracS; k--) {
              words.remove(k);
            }

            // Continue with the word that slid into the position of the removed group.
            i = bracS - 1;
          }
        }
      }
    }

    StringBuilder rebuilt = new StringBuilder();
    for (int i = 0; i < words.size(); i++) {
      rebuilt.append(' ').append(words.get(i));
    }

    return rebuilt.toString().trim();
  }