예제 #1
0
  /**
   * Guesses the {@link ItemType} of a lexical item by inspecting its first non-blank,
   * space-separated token.
   *
   * <p>The checks run in this order and the first one that succeeds decides the result:
   *
   * <ol>
   *   <li>token matches {@code [0-9]*\-[anvr]} (digits, a hyphen, one of a/n/v/r):
   *       {@code SENSE_OFFSETS}
   *   <li>token matches {@code [^ ]*%[0-9]*:[^ ]*}: {@code SENSE_KEYS}
   *   <li>{@code WordNetUtils.mapWordSenseToIWord} returns non-null for the token:
   *       {@code WORD_SENSE}
   *   <li>token matches {@code [^ ]*#[nvra]} (ends in a {@code #} plus one of n/v/r/a):
   *       {@code SURFACE_TAGGED}
   *   <li>otherwise: {@code SURFACE}
   * </ol>
   *
   * <p>If the input contains no non-blank token, the empty string is run through the same
   * checks.
   *
   * @param input the lexical item; must not be {@code null}
   * @return the guessed item type
   * @throws NullPointerException if {@code input} is {@code null}
   */
  public ItemType guessLexicalItemType(String input) {
    // String.split drops trailing empty strings, so an input made only of spaces yields an
    // empty array, and a leading space yields an empty first element. Pick the first token
    // that is not blank rather than indexing [0] blindly.
    String firstWord = "";
    for (String token : input.split(" ")) {
      if (token.trim().length() != 0) {
        firstWord = token;
        break;
      }
    }

    if (firstWord.matches("[0-9]*\\-[anvr]")) {
      return ItemType.SENSE_OFFSETS;
    }

    if (firstWord.matches("[^ ]*%[0-9]*:[^ ]*")) {
      return ItemType.SENSE_KEYS;
    }

    if (WordNetUtils.getInstance().mapWordSenseToIWord(firstWord) != null) {
      return ItemType.WORD_SENSE;
    }

    if (firstWord.matches("[^ ]*#[nvra]")) {
      return ItemType.SURFACE_TAGGED;
    }

    return ItemType.SURFACE;
  }
예제 #2
0
  /**
   * Normalises a lexical item into a list of entries that are either synset offsets
   * (strings matching {@code [0-9]*\-[anvr]}) or {@code word#pos} pairs.
   *
   * <p>How the entries are produced depends on {@code textType}:
   *
   * <ul>
   *   <li>{@code SURFACE}: the whole text is passed to {@code TextualSimilarity.cookSentence};
   *       its {@code first} list provides the entries and its {@code second} list is handed
   *       back unchanged as the second element of the result.
   *   <li>{@code SENSE_OFFSETS}, {@code SURFACE_TAGGED}: each non-blank space-separated token
   *       is taken as is.
   *   <li>{@code SENSE_KEYS}, {@code WORD_SENSE}: each token is resolved to an {@code IWord}
   *       and replaced by {@code GeneralUtils.fixOffset(offset, pos)} of its synset. Tokens
   *       that do not resolve (lookup returns {@code null}) are skipped.
   * </ul>
   *
   * <p>Afterwards every entry that is not a synset offset is split on {@code #}; it is kept
   * only if it carries a tag and {@code TextualSimilarity.isOOV(word, tag)} is false. Entries
   * without a {@code #} tag are dropped instead of aborting the whole call.
   *
   * @param text the lexical item, tokens separated by single spaces
   * @param textType the type of {@code text}
   * @param discardStopwords not read by this method
   * @return a pair of (filtered entries, second list from {@code cookSentence} or {@code null}
   *     for non-SURFACE types); {@code null} if {@code text} or {@code textType} is
   *     {@code null} or an unexpected exception occurs
   */
  public Pair<List<String>, List<String>> cookLexicalItem(
      String text, ItemType textType, boolean discardStopwords) {
    // Null arguments previously surfaced as a swallowed NullPointerException; the visible
    // outcome (a null result) is kept, just without relying on the catch-all.
    if (text == null || textType == null) {
      return null;
    }

    try {
      List<String> cookedSentence = new ArrayList<String>();
      List<String> secondary = null;

      if (textType.equals(ItemType.SURFACE)) {
        // Plain text is cooked as a whole; per-token handling does not apply to it.
        Pair<List<String>, List<String>> cooked =
            TextualSimilarity.getInstance().cookSentence(text);
        if (cooked.first != null) {
          cookedSentence = cooked.first;
        }
        secondary = cooked.second;
      } else {
        for (String item : text.split(" ")) {
          if (item.trim().length() == 0) {
            continue;
          }

          switch (textType) {
            case SENSE_OFFSETS:
            case SURFACE_TAGGED:
              cookedSentence.add(item);
              break;

            case SENSE_KEYS:
              IWord keySense = WordNetUtils.getInstance().getSenseFromSenseKey(item);
              // An unresolved sense key cannot be mapped to a synset; skip it.
              if (keySense != null) {
                cookedSentence.add(
                    GeneralUtils.fixOffset(keySense.getSynset().getOffset(), keySense.getPOS()));
              }
              break;

            case WORD_SENSE:
              IWord wordSense = WordNetUtils.getInstance().mapWordSenseToIWord(item);
              // An unresolved word sense cannot be mapped to a synset; skip it.
              if (wordSense != null) {
                cookedSentence.add(
                    GeneralUtils.fixOffset(wordSense.getSynset().getOffset(), wordSense.getPOS()));
              }
              break;
          }
        }
      }

      List<String> filtered = new ArrayList<String>();

      for (String entry : cookedSentence) {
        // Synset offsets are kept untouched.
        if (entry.matches("[0-9]*\\-[anvr]")) {
          filtered.add(entry);
          continue;
        }

        String[] comps = entry.split("#");
        if (comps.length < 2) {
          // No "#tag" part, so there is nothing to look the word up with; drop the entry
          // rather than failing on comps[1].
          continue;
        }

        String word = comps[0];
        String ps = comps[1];

        // Keep only words the similarity component does not report as out-of-vocabulary.
        if (!TextualSimilarity.getInstance().isOOV(word, ps)) {
          filtered.add(word + "#" + ps);
        }
      }

      return new Pair<List<String>, List<String>>(filtered, secondary);
    } catch (Exception e) {
      // Best-effort contract kept from the original: report the failure and return null.
      e.printStackTrace();
    }

    return null;
  }