/** * @param listOfDepRelsInReducedGraph * @param listFeatIndsOfCurInp * @param listFeatCountOfCurInp */ private void extractDepPatternFeatures( ArrayList<String> listOfDepRelsInReducedGraph, ArrayList<Integer> listFeatIndsOfCurInp, ArrayList<Integer> listFeatCountOfCurInp) { ArrayList<Integer> listOfMatchedPatternIndexes = new ArrayList<Integer>(); for (int i = 0; i < PatternsDepRelFromGraph.listOfAllPatterns.size(); i++) { if (DataStrucUtility.hasListOneAllElementsOfListTwo( listOfDepRelsInReducedGraph, PatternsDepRelFromGraph.listOfAllPatterns.get(i))) listOfMatchedPatternIndexes.add(i); } for (int i = 0; i < listOfMatchedPatternIndexes.size(); i++) { // add a feature indicating that what dep pattern it is String[] feature = new String[] { "DepPattern-" + listOfMatchedPatternIndexes.get(i) + "@", }; GenericFeatVect.addNewFeatureInList( feature, 1, listFeatIndsOfCurInp, listFeatCountOfCurInp, 1); } }
public String preProcessWordWithDashEndingAndUpdateEntBoundaries( String sentence, ArrayList<int[]> entBoundaryList) { ArrayList<String> words = DataStrucUtility.arrayToList(sentence.split("\\s+")); String old = sentence; sentence = ""; int len = 0; if (old.contains(" of p42 MAPK were completely blocked by")) old.trim(); for (int wi = 0; wi < words.size(); wi++) { if (words.get(wi).length() == 1) ; // do nothing else if (words.get(wi).matches(".*[a-zA-Z].*-")) { /* * TODO: have to adjust entity boundaries for following entities and sentence * [liver-, islet-type glucokinase, glucokinase, actin] * When liver or islet type glucokinase was transiently expressed in COS-7 cells, the expressed glucokinase was also co-localized with actin filaments in the cytoplasm of these transfected cells. */ int lastDashInd = -1; // remove dash from the ending // identify the next word having dash // replace the dash of that word with space for (int x = wi + 1; x < words.size(); x++) { if ((lastDashInd = words.get(x).lastIndexOf("-")) > 0 && lastDashInd != words.get(x).length() - 1) { // System.out.println(old); // System.out.println(words.get(wi)); String newPart = words.get(x).substring(lastDashInd + 1); words.set(wi, words.get(wi).substring(0, words.get(wi).length() - 1)); updateBoundaries(entBoundaryList, 1, len - 1, false); words.set(x, words.get(x).substring(0, lastDashInd)); words.add(x + 1, newPart); updateBoundaries(entBoundaryList, 1, len - 1, false); break; } } } len += words.get(wi).length(); sentence += " " + words.get(wi); } return sentence.trim(); }
public String preProcessWordWithDashEndingByAddingExtraWordAndUpdateEntBoundaries( String sentence, ArrayList<int[]> entBoundaryList) { ArrayList<String> words = DataStrucUtility.arrayToList(sentence.split("\\s+")); sentence = ""; int len = 0; for (int wi = 0; wi < words.size(); wi++) { if (words.get(wi).length() == 1) ; // do nothing else if (words.get(wi).matches(".*[0-9a-zA-Z]-")) { int lastDashInd = -1; // identify the next word having dash // add the end token after the dash with the current word // we assume, the end token would not contain any entity name // we further assume, the end token is correct part that is meant to be related with the // current word for (int x = wi + 1; x < words.size(); x++) { if ((lastDashInd = words.get(x).lastIndexOf("-")) > 0 && lastDashInd != words.get(x).length() - 1) { // System.out.println(old); // System.out.println(words.get(wi)); String newPart = words.get(x).substring(lastDashInd + 1); words.set(wi, words.get(wi) + newPart); updateBoundaries(entBoundaryList, newPart.length(), len - 1, true); break; } } } len += words.get(wi).length(); sentence += " " + words.get(wi); } return sentence.trim(); }
/** * Note: This particular pre-processing didn't improve results in the LLL corpus. * * @param bioRelExInpFile * @param psgParsedFileName * @throws Exception */ public String removeCommentsWithNoEntInParentheses( String sentence, ArrayList<int[]> entBoundaryList) { ArrayList<String> words = DataStrucUtility.arrayToList(sentence.split("\\s+")); int bracS = -1, bracE = -1, tb = 0; boolean isFoundEnt = false; // the 1st item in the list is sentence id for (int i = 0; i < words.size(); i++) { if (words.get(i).matches(".*\\)[?,.:;!]")) { words.add(i + 1, String.valueOf(words.get(i).charAt(words.get(i).length() - 1))); words.set(i, words.get(i).substring(0, words.get(i).length() - 1)); } if (words.get(i).charAt(0) == '(') { tb++; if (tb == 1) { bracS = i; isFoundEnt = false; } } if (words.get(i).charAt(words.get(i).length() - 1) == ')') { tb--; if (tb == 0) { bracE = i; int offset = 0, from = 0; for (int w = 0; w <= bracE; w++) { offset += words.get(w).length(); if (w < bracS) from = offset; } for (int e = 0; e < entBoundaryList.size(); e++) { if (entBoundaryList.get(e)[0] >= from && entBoundaryList.get(e)[1] <= offset) { isFoundEnt = true; break; } } offset = offset - from; if (!isFoundEnt) { updateBoundaries(entBoundaryList, offset, from - 1, false); for (int k = bracE; k >= bracS; k--) words.remove(k); i = bracS - 1; } } } } sentence = ""; for (int i = 0; i < words.size(); i++) sentence += " " + words.get(i); return sentence.trim(); }