public static ArrayList<TaggedWord> StopWordRemoval(ArrayList<TaggedWord> taggedWords) {
  ArrayList<TaggedWord> newList = new ArrayList<TaggedWord>();
  try {
    // The stoplist is expected to be a single comma-separated line of words.
    String path = "data/nltk_stoplist.txt";
    File textFile = new File(path);
    BufferedReader br = new BufferedReader(new FileReader(textFile));
    String stopwordsLine = br.readLine();
    br.close();
    String[] stopwords = stopwordsLine.split(",");
    HashSet<String> stopwordSet = new HashSet<String>();
    for (int i = 0; i < stopwords.length; i++) {
      stopwordSet.add(stopwords[i]);
    }
    // Keep only the words that are not in the stoplist (lookup is case-insensitive).
    for (int i = 0; i < taggedWords.size(); i++) {
      String word = taggedWords.get(i).word();
      String posTag = taggedWords.get(i).tag();
      if (!stopwordSet.contains(word.toLowerCase())) {
        newList.add(new TaggedWord(word, posTag));
      }
    }
  } catch (Exception ex) {
    ex.printStackTrace();
  }
  return newList;
}
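/*
 * A minimal sketch (not part of the original code) of the stoplist format
 * StopWordRemoval expects: the entire list on a single comma-separated line.
 * The writer below fabricates such a file purely for illustration, assuming
 * the data/ directory exists; the real list is whatever ships in
 * data/nltk_stoplist.txt.
 */
public class StoplistFormatSketch {
  public static void main(String[] args) throws java.io.IOException {
    java.io.FileWriter fw = new java.io.FileWriter("data/nltk_stoplist.txt");
    // One line, comma-separated, lowercase (tokens are lowercased before lookup).
    fw.write("a,an,the,is,are,of,to,and");
    fw.close();
  }
}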
public static ArrayList<ArrayList<TaggedWord>> getPhrasesNaive(
    String sentence, LexicalizedParser lp, AbstractSequenceClassifier<CoreLabel> classifier) {
  ArrayList<ArrayList<TaggedWord>> newList = new ArrayList<ArrayList<TaggedWord>>();
  ArrayList<TaggedWord> taggedWords = StanfordNER.parse(sentence, lp, classifier);

  // Tokens that act as naive phrase boundaries: commas, quotes, and dashes.
  HashSet<String> phraseBoundaries = new HashSet<String>();
  phraseBoundaries.add(",");
  phraseBoundaries.add("\"");
  phraseBoundaries.add("''");
  phraseBoundaries.add("``");
  phraseBoundaries.add("--");

  // Split the tagged sentence at boundary tokens; preprocess and collect each chunk.
  ArrayList<TaggedWord> temp = new ArrayList<TaggedWord>();
  int index = 0;
  while (index < taggedWords.size()) {
    if (phraseBoundaries.contains(taggedWords.get(index).word())) {
      if (temp.size() > 0) {
        ArrayList<TaggedWord> tempCopy = new ArrayList<TaggedWord>(temp);
        newList.add(Preprocess(tempCopy));
      }
      temp.clear();
    } else {
      temp.add(taggedWords.get(index));
    }
    index += 1;
  }
  // Flush the final phrase, if any.
  if (temp.size() > 0) {
    ArrayList<TaggedWord> tempCopy = new ArrayList<TaggedWord>(temp);
    newList.add(Preprocess(tempCopy));
  }
  return newList;
}
public ArrayList<String> getNounsFromSentence(String sentence) {
  ArrayList<TaggedWord> tw = parseSentenceTD(sentence);
  ArrayList<String> nouns = new ArrayList<String>();
  for (TaggedWord t : tw) {
    if (t.tag().startsWith("N")) {
      nouns.add(t.value());
    }
  }
  return nouns;
}
public static double LexicalSimilarityScoreMin(
    ArrayList<TaggedWord> taggedWords1,
    ArrayList<TaggedWord> taggedWords2,
    DISCOSimilarity discoRAM,
    LexicalizedParser lp) {
  // Square matrix of word-to-word similarities (edge weights), zero-padded;
  // Java initializes double arrays to 0 by default.
  int length1 = taggedWords1.size();
  int length2 = taggedWords2.size();
  int arrSize = Math.max(length1, length2);
  double[][] array = new double[arrSize][arrSize];

  for (int i = 0; i < length1; i++) {
    for (int j = 0; j < length2; j++) {
      String word1 = taggedWords1.get(i).word();
      String word2 = taggedWords2.get(j).word();
      double edgeWeight = 0;
      try {
        // Identical words get full weight; otherwise fall back to DISCO similarity.
        // (An LSA alternative would be LSASimilarity.LSAWordSimilarity(word1, word2).)
        if (word1.compareToIgnoreCase(word2) == 0) edgeWeight = 1;
        else edgeWeight = discoRAM.similarity2(word1, word2);
      } catch (Exception ex) {
        ex.printStackTrace();
      }
      array[i][j] = edgeWeight;
    }
  }

  // Maximum-weight assignment, normalized by the shorter sentence length and
  // scaled to [0, 5]. Guard against empty input to avoid division by zero.
  int minLength = Math.min(length1, length2);
  if (minLength == 0) return 0;
  String sumType = "max";
  return HungarianAlgorithm.hgAlgorithm(array, sumType) / minLength * 5;
}
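/*
 * A hand-worked sketch (hypothetical values, no DISCO involved) of the scoring
 * convention used above: the Hungarian algorithm's assignment total is divided
 * by the shorter sentence length and scaled to a 0-5 range.
 */
public class HungarianScoreSketch {
  public static void main(String[] args) {
    // Suppose two 2-word phrases produced these pairwise similarities:
    double[][] weights = {
      {1.0, 0.2}, // word1a vs word2a (exact match), word1a vs word2b
      {0.3, 0.8}  // word1b vs word2a, word1b vs word2b
    };
    // The best assignment pairs (1a,2a) and (1b,2b) for a total of 1.8;
    // normalized: 1.8 / minLength * 5 = 1.8 / 2 * 5 = 4.5 on the 0-5 scale.
    double total = weights[0][0] + weights[1][1];
    int minLength = 2;
    System.out.println(total / minLength * 5); // prints 4.5
  }
}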
public ArrayList<String> getPairsFromSentence(String sentence) {
  Collection<TypedDependency> tdl = parseSentenceTDL(sentence);
  ArrayList<String> pairs = new ArrayList<String>();
  for (TypedDependency td : tdl) {
    StringBuilder sb = new StringBuilder();
    sb.append(td.gov().originalText());
    sb.append(" ");
    sb.append(td.dep().originalText());
    pairs.add(sb.toString());
  }
  return pairs;
}
public static ArrayList<TaggedWord> PreprocessPhrase(ArrayList<TaggedWord> taggedWords) {
  // After general preprocessing, keep only cardinal numbers (CD), nouns (NN*),
  // and verbs (VB*).
  ArrayList<TaggedWord> newList = new ArrayList<TaggedWord>();
  taggedWords = Preprocess(taggedWords);
  for (int i = 0; i < taggedWords.size(); i++) {
    String posTag = taggedWords.get(i).tag();
    if (posTag.length() >= 2) {
      String reducedTag = posTag.substring(0, 2);
      if (reducedTag.equals("CD") || reducedTag.equals("NN") || reducedTag.equals("VB")) {
        newList.add(taggedWords.get(i));
      }
    }
  }
  return newList;
}
public static ArrayList<ArrayList<TaggedWord>> getPhrases(Tree parse, int phraseSizeLimit) {
  // Recursively break the parse tree into phrases of at most phraseSizeLimit
  // leaves: a small-enough subtree becomes one preprocessed phrase; otherwise
  // recurse into its children.
  ArrayList<ArrayList<TaggedWord>> newList = new ArrayList<ArrayList<TaggedWord>>();
  List<Tree> leaves = parse.getLeaves();
  if (leaves.size() <= phraseSizeLimit) {
    // Alternative: PreprocessPhrase(parse.taggedYield()) to keep only CD/NN*/VB* tokens.
    ArrayList<TaggedWord> phraseElements = Preprocess(parse.taggedYield());
    if (phraseElements.size() > 0) newList.add(phraseElements);
  } else {
    Tree[] childrenNodes = parse.children();
    for (int i = 0; i < childrenNodes.length; i++) {
      newList.addAll(getPhrases(childrenNodes[i], phraseSizeLimit));
    }
  }
  return newList;
}
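/*
 * A minimal usage sketch for getPhrases, assuming the method lives in a class
 * here called PhraseUtils and that the parser model "englishPCFG.ser.gz" is on
 * the working path (both are assumptions for illustration, mirroring the usage
 * in the main method below).
 */
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.ling.TaggedWord;
import java.util.ArrayList;

public class PhraseDemo {
  public static void main(String[] args) {
    LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz");
    Tree parse = lp.apply("The quick brown fox jumps over the lazy dog.");
    // Phrases of at most 4 leaves each, already preprocessed.
    ArrayList<ArrayList<TaggedWord>> phrases = PhraseUtils.getPhrases(parse, 4);
    for (ArrayList<TaggedWord> phrase : phrases) {
      System.out.println(phrase);
    }
  }
}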
public static void main(String args[]) {
  // String sentence1 = "A large bird standing on a table picks up a plastic glass containing liquid and places it in a bowl of something.";
  // String sentence2 = "A bird picks up a plastic cup containing a liquid with it's beak and puts the cup into a bowl.";
  // LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz");
  // LeskWSD tm = new LeskWSD(lp);
  // WordNetSimilarity ws = new WordNetSimilarity();
  // System.out.println(LexicalSimilarityScoreWordNet(sentence1, sentence2, tm, lp, ws));

  String sentence =
      "The broader Standard & Poor's 500 Index <.SPX> shed 2.38 points, or 0.24 percent, at 995.10.";
  LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz");
  Tree parse = lp.apply(sentence);
  ArrayList<TaggedWord> taggedWords = parse.taggedYield();
  taggedWords = Preprocess(taggedWords);
  for (int i = 0; i < taggedWords.size(); i++) System.out.println(taggedWords.get(i).word());
}
public static double BestWordMatchEdgeWeight(
    ArrayList<TaggedWord> taggedWords1,
    ArrayList<TaggedWord> taggedWords2,
    DISCOSimilarity discoRAM) {
  // Return the highest pairwise similarity between any word of the first list
  // and any word of the second (1 for an exact match, DISCO otherwise).
  double bestMatchScore = 0;
  for (int i = 0; i < taggedWords1.size(); i++) {
    String word1 = taggedWords1.get(i).word();
    for (int j = 0; j < taggedWords2.size(); j++) {
      String word2 = taggedWords2.get(j).word();
      double currentScore;
      if (word1.equals(word2)) currentScore = 1;
      else currentScore = discoRAM.similarity2(word1, word2);
      if (currentScore > bestMatchScore) bestMatchScore = currentScore;
    }
  }
  return bestMatchScore;
}
private Distribution<Integer> getSegmentedWordLengthDistribution(Treebank tb) {
  // Build the empirical distribution of word lengths produced by our segmenter
  // when it re-segments the raw characters of the gold treebank.
  // CharacterLevelTagExtender ext = new CharacterLevelTagExtender();
  ClassicCounter<Integer> c = new ClassicCounter<Integer>();
  for (Iterator iterator = tb.iterator(); iterator.hasNext(); ) {
    Tree gold = (Tree) iterator.next();
    // Strip the gold segmentation by concatenating the words back into characters.
    StringBuilder goldChars = new StringBuilder();
    ArrayList goldYield = gold.yield();
    for (Iterator wordIter = goldYield.iterator(); wordIter.hasNext(); ) {
      Word word = (Word) wordIter.next();
      goldChars.append(word);
    }
    // Re-segment and count the length of every predicted word.
    List<HasWord> ourWords = segment(goldChars.toString());
    for (int i = 0; i < ourWords.size(); i++) {
      c.incrementCount(Integer.valueOf(ourWords.get(i).word().length()));
    }
  }
  return Distribution.getDistribution(c);
}
@SuppressWarnings("static-access") @Override public EventSet createEventSet(Document doc) { EventSet es = new EventSet(doc.getAuthor()); char[] text = doc.getProcessedText(); String stringText = new String(text); // use MaxentPOSTagsEventDriver's tagger // initialize tagger and return empty event set if encountered a problem if (tagger == null) { tagger = MaxentPOSTagsEventDriver.initTagger(); if (tagger == null) return es; } List<List<HasWord>> sentences = tagger.tokenizeText(new BufferedReader(new StringReader(stringText))); ArrayList<TaggedWord> tagged = new ArrayList<TaggedWord>(); for (List<HasWord> sentence : sentences) tagged.addAll(tagger.tagSentence(sentence)); int i, j, n; try { n = Integer.parseInt(getParameter("N")); } catch (NumberFormatException e) { n = 2; } String curr; for (i = 0; i < tagged.size() - n + 1; i++) { curr = "(" + tagged.get(i).tag() + ")"; for (j = 1; j < n; j++) { curr += "-(" + tagged.get(i + j).tag() + ")"; } es.addEvent(new Event(curr)); } sentences.clear(); sentences = null; return es; }
public static ArrayList<TaggedWord> Preprocess(ArrayList<TaggedWord> taggedWords) {
  ArrayList<TaggedWord> newList = new ArrayList<TaggedWord>();

  // Tokens dropped outright before any further normalization.
  String[] punctuationsAndSpecialCharacters = {
    ",", ".", "?", "!", ":", ";", "\"", "-", "--", "'", "-LRB-", "-RRB-", "''", "``", "&"
  }; // also candidates: "/", "\\", "<", ">", "#", "*", "(", ")", "{", "}", "[", "]", "~", "|"
  HashSet<String> punctuationMarks = new HashSet<String>();
  for (int i = 0; i < punctuationsAndSpecialCharacters.length; i++) {
    punctuationMarks.add(punctuationsAndSpecialCharacters[i]);
  }

  for (int i = 0; i < taggedWords.size(); i++) {
    String word = taggedWords.get(i).word();
    String posTag = taggedWords.get(i).tag();
    if (!punctuationMarks.contains(word)) {
      // Lemmatize everything except proper nouns (NNP*), and split hyphens.
      if (!(posTag.length() > 2 && posTag.substring(0, 3).equals("NNP"))) {
        word = Morphology.lemmaStatic(word, posTag, true);
        word = word.replace('-', ' ');
      }
      // Expand common contractions (naively; "'s" is always treated as "is").
      String newWord;
      if (word.equals("n't")) newWord = "not";
      else if (word.equals("'s")) newWord = "is";
      else if (word.equals("'ll")) newWord = "will";
      else if (word.equals("'m") || word.equals("m")) newWord = "am";
      else if (word.equals("im")) newWord = "am";
      else newWord = word;
      newList.add(new TaggedWord(newWord, posTag));
    }
  }

  // Finally, drop stop words.
  newList = StopWordRemoval(newList);
  return newList;
}
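/*
 * A minimal sketch of what the Preprocess/StopWordRemoval pipeline does to a
 * tagged sentence. The enclosing class name Preprocessor is assumed for
 * illustration, as is the presence of data/nltk_stoplist.txt on the working
 * path; the exact output therefore depends on the stoplist contents.
 */
import edu.stanford.nlp.ling.TaggedWord;
import java.util.ArrayList;

public class PreprocessDemo {
  public static void main(String[] args) {
    ArrayList<TaggedWord> tagged = new ArrayList<TaggedWord>();
    tagged.add(new TaggedWord("He", "PRP"));
    tagged.add(new TaggedWord("'s", "VBZ"));
    tagged.add(new TaggedWord("running", "VBG"));
    tagged.add(new TaggedWord(",", ","));
    tagged.add(new TaggedWord("quickly", "RB"));
    // Expected (modulo the stoplist): the comma is removed, "'s" is expanded
    // to "is", and "running" is lemmatized to "run".
    System.out.println(Preprocessor.Preprocess(tagged));
  }
}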
public static double LexicalSimilarityScoreWordNet(
    String sentence1, String sentence2, LeskWSD tm, LexicalizedParser lp, WordNetSimilarity ws) {
  // Parse and preprocess both sentences, then disambiguate word senses with Lesk.
  ArrayList<TaggedWord> taggedWordsPrev1 = Preprocess(StanfordParse(sentence1, lp));
  ArrayList<TaggedWord> taggedWordsPrev2 = Preprocess(StanfordParse(sentence2, lp));
  ArrayList<TaggedWord> taggedWords1 = new ArrayList<TaggedWord>();
  ArrayList<TaggedWord> taggedWords2 = new ArrayList<TaggedWord>();
  WordNetSense[] sensesPrev1 = tm.LeskJWI(sentence1);
  WordNetSense[] sensesPrev2 = tm.LeskJWI(sentence2);
  ArrayList<WordNetSense> senses1 = new ArrayList<WordNetSense>();
  ArrayList<WordNetSense> senses2 = new ArrayList<WordNetSense>();

  // Keep only nouns and verbs (collapsed to NN/VB) together with their senses.
  for (int i = 0; i < taggedWordsPrev1.size(); i++) {
    String word = taggedWordsPrev1.get(i).word();
    String posTag = taggedWordsPrev1.get(i).tag();
    if (posTag.length() >= 2 && posTag.substring(0, 2).equals("NN")) {
      taggedWords1.add(new TaggedWord(word, "NN"));
      senses1.add(sensesPrev1[i]);
    } else if (posTag.length() >= 2 && posTag.substring(0, 2).equals("VB")) {
      taggedWords1.add(new TaggedWord(word, "VB"));
      senses1.add(sensesPrev1[i]);
    }
  }
  for (int i = 0; i < taggedWordsPrev2.size(); i++) {
    String word = taggedWordsPrev2.get(i).word();
    String posTag = taggedWordsPrev2.get(i).tag();
    if (posTag.length() >= 2 && posTag.substring(0, 2).equals("NN")) {
      taggedWords2.add(new TaggedWord(word, "NN"));
      senses2.add(sensesPrev2[i]);
    } else if (posTag.length() >= 2 && posTag.substring(0, 2).equals("VB")) {
      taggedWords2.add(new TaggedWord(word, "VB"));
      senses2.add(sensesPrev2[i]);
    }
  }

  // Square matrix of sense-to-sense similarities (zero-padded by default).
  int length1 = taggedWords1.size();
  int length2 = taggedWords2.size();
  int arrSize = Math.max(length1, length2);
  double[][] array = new double[arrSize][arrSize];
  for (int i = 0; i < length1; i++) {
    for (int j = 0; j < length2; j++) {
      String word1 = taggedWords1.get(i).word();
      String word2 = taggedWords2.get(j).word();
      double edgeWeight = 0;
      try {
        if (word1.compareToIgnoreCase(word2) == 0) edgeWeight = 1;
        else {
          // Alternative: ws.wuPalmerSimilarity(senses1.get(i), senses2.get(j));
          edgeWeight = ws.linSimilarity(senses1.get(i), senses2.get(j));
        }
      } catch (Exception ex) {
        ex.printStackTrace();
      }
      array[i][j] = edgeWeight;
    }
  }

  // Maximum-weight assignment, normalized by the matrix size and scaled to [0, 5].
  if (arrSize == 0) return 0;
  String sumType = "max";
  return HungarianAlgorithm.hgAlgorithm(array, sumType) / arrSize * 5;
}
public static double LexicalSimilarityScoreMax(
    ArrayList<TaggedWord> taggedWords1,
    ArrayList<TaggedWord> taggedWords2,
    DISCOSimilarity discoRAM,
    LexicalizedParser lp) {
  int length1 = taggedWords1.size();
  int length2 = taggedWords2.size();
  int arrSize = Math.max(length1, length2);
  if (arrSize == 0) return 0; // guard against empty input, as in the WordNet variant
  double[][] array = new double[arrSize][arrSize]; // zero-padded by default

  for (int i = 0; i < length1; i++) {
    for (int j = 0; j < length2; j++) {
      String word1 = taggedWords1.get(i).word();
      String posTag1 = taggedWords1.get(i).tag();
      String word2 = taggedWords2.get(j).word();
      String posTag2 = taggedWords2.get(j).tag();

      // Proper nouns (NNP*) are kept whole; everything else is split on spaces
      // (hyphens were turned into spaces during preprocessing).
      ArrayList<TaggedWord> newList1 = new ArrayList<TaggedWord>();
      if (posTag1.length() >= 3 && posTag1.substring(0, 3).equals("NNP")) {
        newList1.add(taggedWords1.get(i));
      } else {
        String[] words = word1.split(" ");
        for (int k = 0; k < words.length; k++) newList1.add(new TaggedWord(words[k], posTag1));
      }
      ArrayList<TaggedWord> newList2 = new ArrayList<TaggedWord>();
      if (posTag2.length() >= 3 && posTag2.substring(0, 3).equals("NNP")) {
        newList2.add(taggedWords2.get(j));
      } else {
        String[] words = word2.split(" ");
        for (int k = 0; k < words.length; k++) newList2.add(new TaggedWord(words[k], posTag2));
      }

      // The edge weight between two tokens is itself an assignment score over
      // their sub-tokens.
      array[i][j] = LexicalSimilarityScoreMin(newList1, newList2, discoRAM, lp);
    }
  }

  // Maximum-weight assignment, normalized by the matrix size and scaled to [0, 5].
  String sumType = "max";
  return HungarianAlgorithm.hgAlgorithm(array, sumType) / arrSize * 5;
}
/**
 * Do max language model markov segmentation. Note that this algorithm inherently tags words as
 * it goes, but that we throw away the tags in the final result so that the segmented words are
 * untagged. (Note: for a couple of years till Aug 2007, a tagged result was returned, but this
 * messed up the parser, because it could use no tagging but the given tagging, which often
 * wasn't very good. Or in particular it was a subcategorized tagging which never worked with
 * the current forceTags option, which assumes that gold taggings are inherently basic
 * taggings.)
 *
 * @param s A String to segment
 * @return The list of segmented words.
 */
private ArrayList<HasWord> segmentWordsWithMarkov(String s) {
  int length = s.length();
  // Set<String> POSes = (Set<String>) POSDistribution.keySet(); // 1.5
  int numTags = POSes.size();
  // scores[start][end][tag]: score of span with initial word of this tag
  double[][][] scores = new double[length][length + 1][numTags];
  // splitBacktrace[start][end][tag]: best (length of) first word for this span with this tag
  int[][][] splitBacktrace = new int[length][length + 1][numTags];
  // POSbacktrace[start][end][tag]: best tag for second word over this span, if first is this tag
  int[][][] POSbacktrace = new int[length][length + 1][numTags];
  for (int i = 0; i < length; i++) {
    for (int j = 0; j < length + 1; j++) {
      Arrays.fill(scores[i][j], Double.NEGATIVE_INFINITY);
    }
  }
  // first fill in word probabilities (for words of up to 10 characters)
  for (int diff = 1; diff <= 10; diff++) {
    for (int start = 0; start + diff <= length; start++) {
      int end = start + diff;
      StringBuilder wordBuf = new StringBuilder();
      for (int pos = start; pos < end; pos++) {
        wordBuf.append(s.charAt(pos));
      }
      String word = wordBuf.toString();
      for (String tag : POSes) {
        IntTaggedWord itw = new IntTaggedWord(word, tag, wordIndex, tagIndex);
        double score = lex.score(itw, 0, word, null);
        if (start == 0) {
          score += Math.log(initialPOSDist.probabilityOf(tag));
        }
        scores[start][end][itw.tag()] = score;
        splitBacktrace[start][end][itw.tag()] = end;
      }
    }
  }
  // now fill in word combination probabilities
  for (int diff = 2; diff <= length; diff++) {
    for (int start = 0; start + diff <= length; start++) {
      int end = start + diff;
      for (int split = start + 1; split < end && split - start <= 10; split++) {
        for (String tag : POSes) {
          int tagNum = tagIndex.indexOf(tag, true);
          if (splitBacktrace[start][split][tagNum] != split) {
            continue;
          }
          Distribution<String> rTagDist = markovPOSDists.get(tag);
          if (rTagDist == null) {
            continue; // this happens with "*" POS
          }
          for (String rTag : POSes) {
            int rTagNum = tagIndex.indexOf(rTag, true);
            double newScore =
                scores[start][split][tagNum]
                    + scores[split][end][rTagNum]
                    + Math.log(rTagDist.probabilityOf(rTag));
            if (newScore > scores[start][end][tagNum]) {
              scores[start][end][tagNum] = newScore;
              splitBacktrace[start][end][tagNum] = split;
              POSbacktrace[start][end][tagNum] = rTagNum;
            }
          }
        }
      }
    }
  }
  // reconstruct the best segmentation by walking the backtraces from the left
  int nextPOS = ArrayMath.argmax(scores[0][length]);
  ArrayList<HasWord> words = new ArrayList<HasWord>();
  int start = 0;
  while (start < length) {
    int split = splitBacktrace[start][length][nextPOS];
    StringBuilder wordBuf = new StringBuilder();
    for (int i = start; i < split; i++) {
      wordBuf.append(s.charAt(i));
    }
    String word = wordBuf.toString();
    // String tag = tagIndex.get(nextPOS);
    // words.add(new TaggedWord(word, tag));
    words.add(new Word(word));
    if (split < length) {
      nextPOS = POSbacktrace[start][length][nextPOS];
    }
    start = split;
  }
  return words;
}
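/*
 * A toy, self-contained sketch of the span-based DP behind
 * segmentWordsWithMarkov, with the tag dimension removed: best[end] holds the
 * best log-probability of segmenting s[0:end), built from a per-word scorer
 * and a backpointer array. The LEXICON and wordLogProb below are stand-in
 * assumptions for illustration, not the real lexicon model.
 */
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class MarkovSegmentSketch {
  // Hypothetical word scorer: known words are cheap, unknown ones heavily penalized.
  static final Set<String> LEXICON = new HashSet<String>(Arrays.asList("th", "the", "cat", "sat"));

  static double wordLogProb(String w) {
    return LEXICON.contains(w) ? -1.0 : -100.0 * w.length();
  }

  static List<String> segment(String s) {
    int n = s.length();
    double[] best = new double[n + 1];
    int[] back = new int[n + 1];
    Arrays.fill(best, Double.NEGATIVE_INFINITY);
    best[0] = 0;
    for (int end = 1; end <= n; end++) {
      // Like the real code, only consider words of up to 10 characters.
      for (int start = Math.max(0, end - 10); start < end; start++) {
        double cand = best[start] + wordLogProb(s.substring(start, end));
        if (cand > best[end]) {
          best[end] = cand;
          back[end] = start;
        }
      }
    }
    // Recover the segmentation by walking the backpointers from the right.
    ArrayList<String> words = new ArrayList<String>();
    for (int end = n; end > 0; end = back[end]) {
      words.add(0, s.substring(back[end], end));
    }
    return words;
  }

  public static void main(String[] args) {
    System.out.println(segment("thecatsat")); // prints [the, cat, sat]
  }
}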