示例#1
0
 public void setFromString(String labelStr, String divider) {
   int first = labelStr.indexOf(divider);
   int second = labelStr.lastIndexOf(divider);
   if (first == second) {
     setWord(labelStr.substring(0, first));
     setTag(labelStr.substring(first + 1));
     setLemma(Morphology.lemmaStatic(labelStr.substring(0, first), labelStr.substring(first + 1)));
   } else if (first >= 0) {
     setWord(labelStr.substring(0, first));
     setLemma(labelStr.substring(first + 1, second));
     setTag(labelStr.substring(second + 1));
   } else {
     setWord(labelStr);
     setLemma(null);
     setTag(null);
   }
 }
  public static ArrayList<TaggedWord> Preprocess(ArrayList<TaggedWord> taggedWords) {
    ArrayList<TaggedWord> newList = new ArrayList<TaggedWord>();

    String[] punctuationsAndSpecialCharacters = {
      ",", ".", "?", "!", ":", ";", "\"", "-", "--", "'", "-LRB-", "-RRB-", "''", "``", "&"
    }; // , "/", "\\", "<", ">", "#", "&", "*", "(", ")", "{", "}", "[", "]", "~", "|"};
    HashMap<String, String> punctuationMarks = new HashMap<String, String>();
    for (int i = 0; i < punctuationsAndSpecialCharacters.length; i++) {
      punctuationMarks.put(
          punctuationsAndSpecialCharacters[i], punctuationsAndSpecialCharacters[i]);
    }

    for (int i = 0; i < taggedWords.size(); i++) {
      String word = taggedWords.get(i).word();
      String posTag = taggedWords.get(i).tag();

      if (!punctuationMarks.containsKey(word)) {

        if (!(posTag.length() > 2 && posTag.substring(0, 3).equals("NNP"))) {
          word = Morphology.lemmaStatic(word, posTag, true);
          word = word.replace('-', ' ');
        }

        String newWord, newPosTag;
        if (word.equals("n't")) newWord = "not";
        else if (word.equals("'s")) newWord = "is";
        else if (word.equals("'ll")) newWord = "will";
        else if (word.equals("'m") || word.equals("m")) newWord = "am";
        else if (word.equals("im")) newWord = "am";
        else newWord = word;
        newPosTag = posTag;
        newList.add(new TaggedWord(newWord, newPosTag));
      }
    }
    newList = StopWordRemoval(newList);
    return newList;
  }