Ejemplo n.º 1
0
 private String[] toSentence(ArrayList<Word> words) {
   String[] sent = new String[words.size()];
   for (int k = 0; k < words.size(); k++) {
     sent[k] = words.get(k).getValue();
   }
   return sent;
 }
  public static ArrayList<ArrayList<TaggedWord>> getPhrasesNaive(
      String sentence, LexicalizedParser lp, AbstractSequenceClassifier<CoreLabel> classifier) {
    ArrayList<ArrayList<TaggedWord>> newList = new ArrayList<ArrayList<TaggedWord>>();
    ArrayList<TaggedWord> taggedWords = StanfordNER.parse(sentence, lp, classifier);
    HashMap<String, String> phraseBoundaries = new HashMap<String, String>();
    phraseBoundaries.put(",", ",");
    phraseBoundaries.put("\"", "\"");
    phraseBoundaries.put("''", "''");
    phraseBoundaries.put("``", "``");
    phraseBoundaries.put("--", "--");
    // List<Tree> leaves = parse.getLeaves();
    ArrayList<TaggedWord> temp = new ArrayList<TaggedWord>();
    int index = 0;
    while (index < taggedWords.size()) {
      if ((phraseBoundaries.containsKey(taggedWords.get(index).word()))) {
        if (temp.size() > 0) {
          // System.out.println(temp);
          ArrayList<TaggedWord> tempCopy = new ArrayList<TaggedWord>(temp);
          newList.add(Preprocess(tempCopy));
        }
        temp.clear();
      } else {
        // System.out.println(taggedWords.get(index).toString());
        temp.add(taggedWords.get(index));
      }
      index += 1;
    }
    if (temp.size() > 0) {
      ArrayList<TaggedWord> tempCopy = new ArrayList<TaggedWord>(temp);
      newList.add(Preprocess(tempCopy));
    }

    // System.out.println(newList);
    return newList;
  }
Ejemplo n.º 3
0
  private String findSynonym(ArrayList<Word> sentence, int indexToReplace /*<Word wordIn>*/) {
    /**
     * Takes in one String of a word, hopefully only one word(will truncate after space), and
     * searches for a suitable synonym
     *
     * <p>*
     */
    String output = "";
    // String word;
    // word=wordIn.getString();
    String word = sentence.get(indexToReplace).getValue();
    Synset[] synsets = database.getSynsets(word);

    synonyms.clear();

    if (synsets.length > 0) {

      for (int i = 0; i < synsets.length; i++) {

        String[] wordForms = synsets[i].getWordForms();
        for (int j = 0; j < wordForms.length; j++) {
          if (wordForms[j].contains(" ")) {
            System.out.println(wordForms[j]);
          } else if (!wordForms[j].equals(word)) synonyms.add(wordForms[j]);
        }
      }

      // FIND IF PLURAL OR NO HERE

      /////////////////////////// ###########################////////////////////

      Collections.sort(synonyms, myLengthComparator);
      // synonyms is now sorted as longest first

      // now we have a list of all the terms that can replace the word.
      // We need to check them in the lexical analyzer
      //			System.out.println("kkkk");
      if (synonyms.size() == 0) return word;
      while (!LexicalAnalyzer(sentence, indexToReplace, synonyms.get(0))) {
        //			System.out.println("1");
        synonyms.remove(0);
        if (synonyms.size() == 0) return word; // did not find synonym

        if (synonyms.get(0).length() <= word.length()) {
          return word;
          // do not replace word, synonyms are shorter
        }
      }

      ////////////// ############# PLACEHOLDER CODE
      if (synonyms.size() > 0) output = synonyms.get(0);
      else output = word;
      ///////////// ################# PLACEHOLDER CODE
    } else { // There are no other known synonyms in our wordnet database.
      output = word;
    }

    return output;
  }
  public static double LexicalSimilarityScoreMax(
      ArrayList<TaggedWord> taggedWords1,
      ArrayList<TaggedWord> taggedWords2,
      DISCOSimilarity discoRAM,
      LexicalizedParser lp) {

    // System.out.println(taggedWords1.size() + "," + taggedWords2.size());

    // array of edge weights with default weight 0
    int length1 = taggedWords1.size();
    int length2 = taggedWords2.size();
    int arrSize = Math.max(length1, length2);
    double[][] array = new double[arrSize][arrSize];
    for (int i = 0; i < arrSize; i++) {
      for (int j = 0; j < arrSize; j++) {
        array[i][j] = 0;
      }
    }
    for (int i = 0; i < length1; i++) {
      for (int j = 0; j < length2; j++) {
        String word1 = taggedWords1.get(i).word();
        String posTag1 = taggedWords1.get(i).tag();
        String word2 = taggedWords2.get(j).word();
        String posTag2 = taggedWords2.get(j).tag();

        ArrayList<TaggedWord> newList1 = new ArrayList<TaggedWord>();
        if (posTag1.length() >= 3 && posTag1.substring(0, 3).equals("NNP")) {
          newList1.add(taggedWords1.get(i));
        } else {
          String[] words = word1.split(" ");
          for (int k = 0; k < words.length; k++) newList1.add(new TaggedWord(words[k], posTag1));
        }

        ArrayList<TaggedWord> newList2 = new ArrayList<TaggedWord>();
        if (posTag2.length() >= 3 && posTag2.substring(0, 3).equals("NNP")) {
          newList2.add(taggedWords2.get(j));
        } else {
          String[] words = word2.split(" ");
          for (int k = 0; k < words.length; k++) newList2.add(new TaggedWord(words[k], posTag2));
        }

        double edgeWeight = LexicalSimilarityScoreMin(newList1, newList2, discoRAM, lp);

        array[i][j] = edgeWeight;
      }
    }

    // System.out.println("Hungarian starts " + arrSize);

    double finalScore;
    String sumType = "max";
    // int minLength = Math.min(length1, length2);
    // finalScore = HungarianAlgorithm.hgAlgorithm(array, sumType)/minLength * 5;
    finalScore = HungarianAlgorithm.hgAlgorithm(array, sumType) / arrSize * 5;

    return finalScore;
  }
  public static double LexicalSimilarityScoreMin(
      ArrayList<TaggedWord> taggedWords1,
      ArrayList<TaggedWord> taggedWords2,
      DISCOSimilarity discoRAM,
      LexicalizedParser lp) {

    // System.out.println(taggedWords1.size() + "," + taggedWords2.size());

    // array of edge weights with default weight 0
    int length1 = taggedWords1.size();
    int length2 = taggedWords2.size();
    int arrSize = Math.max(length1, length2);
    double[][] array = new double[arrSize][arrSize];
    for (int i = 0; i < arrSize; i++) {
      for (int j = 0; j < arrSize; j++) {
        array[i][j] = 0;
      }
    }
    for (int i = 0; i < length1; i++) {
      for (int j = 0; j < length2; j++) {
        String word1 = taggedWords1.get(i).word();
        String word2 = taggedWords2.get(j).word();
        double edgeWeight = 0;

        // LSA Similarity
        // edgeWeight = LSASimilarity.LSAWordSimilarity(word1, word2);

        // DISCO Similarity
        // DISCOSimilarity discoObj = new DISCOSimilarity();
        try {
          if (word1.compareToIgnoreCase(word2) == 0) edgeWeight = 1;
          else {
            edgeWeight = discoRAM.similarity2(word1, word2);
            // edgeWeight = LSASimilarity.LSAWordSimilarity(word1, word2);
          }
        } catch (Exception ex) {
          ex.printStackTrace();
        }

        array[i][j] = edgeWeight;
      }
    }

    // System.out.println("Hungarian starts " + arrSize);

    double finalScore;
    String sumType = "max";
    int minLength = Math.min(length1, length2);
    finalScore = HungarianAlgorithm.hgAlgorithm(array, sumType) / minLength * 5;
    // finalScore = HungarianAlgorithm.hgAlgorithm(array, sumType)/arrSize * 5;

    return finalScore;
  }
  public static ArrayList<TaggedWord> StopWordRemoval(ArrayList<TaggedWord> taggedWords) {
    ArrayList<TaggedWord> newList = new ArrayList<TaggedWord>();

    try {
      String path = "data/nltk_stoplist.txt";
      File textFile = new File(path);
      BufferedReader br = new BufferedReader(new FileReader(textFile));
      String stopwordsLine = br.readLine();
      br.close();

      String[] stopwords = stopwordsLine.split(",");
      HashMap<String, String> stopwordsDict = new HashMap<String, String>();
      for (int i = 0; i < stopwords.length; i++) {
        stopwordsDict.put(stopwords[i], stopwords[i]);
      }

      for (int i = 0; i < taggedWords.size(); i++) {
        String word = taggedWords.get(i).word();
        String posTag = taggedWords.get(i).tag();

        if (!stopwordsDict.containsKey(word.toLowerCase())) {
          String newWord, newPosTag;
          newWord = word;
          newPosTag = posTag;
          newList.add(new TaggedWord(newWord, newPosTag));
        }
      }
    } catch (Exception ex) {
      ex.printStackTrace();
    }

    return newList;
  }
  public static double BestWordMatchEdgeWeight(
      ArrayList<TaggedWord> taggedWords1,
      ArrayList<TaggedWord> taggedWords2,
      DISCOSimilarity discoRAM) {
    double bestMatchScore = 0;
    for (int i = 0; i < taggedWords1.size(); i++) {
      String word1 = taggedWords1.get(i).word();
      for (int j = 0; j < taggedWords2.size(); j++) {
        String word2 = taggedWords2.get(j).word();
        double currentScore;
        if (word1.equals(word2)) currentScore = 1;
        else currentScore = discoRAM.similarity2(word1, word2);

        if (currentScore > bestMatchScore) bestMatchScore = currentScore;
      }
    }
    return bestMatchScore;
  }
  public static ArrayList<TaggedWord> PreprocessPhrase(ArrayList<TaggedWord> taggedWords) {
    ArrayList<TaggedWord> newList = new ArrayList<TaggedWord>();
    taggedWords = Preprocess(taggedWords);

    for (int i = 0; i < taggedWords.size(); i++) {
      String posTag = taggedWords.get(i).tag();
      if (posTag.length() >= 2) {
        String reducedTag = posTag.substring(0, 2);
        if (reducedTag.equals("CD") || reducedTag.equals("NN") || reducedTag.equals("VB")) {
          newList.add(taggedWords.get(i));
        }
      }
    }

    return newList;
  }
  public static ArrayList<ArrayList<TaggedWord>> getPhrases(Tree parse, int phraseSizeLimit) {
    ArrayList<ArrayList<TaggedWord>> newList = new ArrayList<ArrayList<TaggedWord>>();
    List<Tree> leaves = parse.getLeaves();

    if (leaves.size() <= phraseSizeLimit) {
      // ArrayList<TaggedWord> phraseElements = PreprocessPhrase(parse.taggedYield());
      ArrayList<TaggedWord> phraseElements = Preprocess(parse.taggedYield());
      if (phraseElements.size() > 0) newList.add(phraseElements);
    } else {
      Tree[] childrenNodes = parse.children();
      for (int i = 0; i < childrenNodes.length; i++) {
        Tree currentParse = childrenNodes[i];
        newList.addAll(getPhrases(currentParse, phraseSizeLimit));
      }
    }
    return newList;
  }
 public static void main(String args[]) {
   //        String sentence1 = "A large bird standing on a table picks up a plastic glass
   // containing liquid and places it in a bowl of something.";
   //        String sentence2 = "A bird picks up a plastic cup containing a liquid with it's beak
   // and puts the cup into a bowl.";
   //        LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz");
   //        LeskWSD tm = new LeskWSD(lp);
   //        WordNetSimilarity ws = new WordNetSimilarity();
   //
   //        System.out.println(LexicalSimilarityScoreWordNet(sentence1, sentence2, tm, lp, ws));
   String sentence =
       "The broader Standard & Poor's 500 Index <.SPX> shed 2.38 points, or 0.24 percent, at 995.10.";
   LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz");
   Tree parse = lp.apply(sentence);
   ArrayList<TaggedWord> taggedWords = parse.taggedYield();
   taggedWords = Preprocess(taggedWords);
   for (int i = 0; i < taggedWords.size(); i++) System.out.println(taggedWords.get(i).word());
 }
  public static ArrayList<TaggedWord> Preprocess(ArrayList<TaggedWord> taggedWords) {
    ArrayList<TaggedWord> newList = new ArrayList<TaggedWord>();

    String[] punctuationsAndSpecialCharacters = {
      ",", ".", "?", "!", ":", ";", "\"", "-", "--", "'", "-LRB-", "-RRB-", "''", "``", "&"
    }; // , "/", "\\", "<", ">", "#", "&", "*", "(", ")", "{", "}", "[", "]", "~", "|"};
    HashMap<String, String> punctuationMarks = new HashMap<String, String>();
    for (int i = 0; i < punctuationsAndSpecialCharacters.length; i++) {
      punctuationMarks.put(
          punctuationsAndSpecialCharacters[i], punctuationsAndSpecialCharacters[i]);
    }

    for (int i = 0; i < taggedWords.size(); i++) {
      String word = taggedWords.get(i).word();
      String posTag = taggedWords.get(i).tag();

      if (!punctuationMarks.containsKey(word)) {

        if (!(posTag.length() > 2 && posTag.substring(0, 3).equals("NNP"))) {
          word = Morphology.lemmaStatic(word, posTag, true);
          word = word.replace('-', ' ');
        }

        String newWord, newPosTag;
        if (word.equals("n't")) newWord = "not";
        else if (word.equals("'s")) newWord = "is";
        else if (word.equals("'ll")) newWord = "will";
        else if (word.equals("'m") || word.equals("m")) newWord = "am";
        else if (word.equals("im")) newWord = "am";
        else newWord = word;
        newPosTag = posTag;
        newList.add(new TaggedWord(newWord, newPosTag));
      }
    }
    newList = StopWordRemoval(newList);
    return newList;
  }
  public static double LexicalSimilarityScoreWordNet(
      String sentence1, String sentence2, LeskWSD tm, LexicalizedParser lp, WordNetSimilarity ws) {

    ArrayList<TaggedWord> taggedWordsPrev1 = Preprocess(StanfordParse(sentence1, lp));
    ArrayList<TaggedWord> taggedWordsPrev2 = Preprocess(StanfordParse(sentence2, lp));
    ArrayList<TaggedWord> taggedWords1 = new ArrayList<TaggedWord>();
    ArrayList<TaggedWord> taggedWords2 = new ArrayList<TaggedWord>();

    WordNetSense[] sensesPrev1 = tm.LeskJWI(sentence1);
    WordNetSense[] sensesPrev2 = tm.LeskJWI(sentence2);

    // System.out.println("Senses found!");

    ArrayList<WordNetSense> senses1 = new ArrayList<WordNetSense>();
    ArrayList<WordNetSense> senses2 = new ArrayList<WordNetSense>();

    for (int i = 0; i < taggedWordsPrev1.size(); i++) {
      String word = taggedWordsPrev1.get(i).word();
      String posTag = taggedWordsPrev1.get(i).tag();
      if (posTag.length() >= 2 && posTag.substring(0, 2).equals("NN")) {
        taggedWords1.add(new TaggedWord(word, "NN"));
        senses1.add(sensesPrev1[i]);
      } else if (posTag.length() >= 2 && posTag.substring(0, 2).equals("VB")) {
        taggedWords1.add(new TaggedWord(word, "VB"));
        senses1.add(sensesPrev1[i]);
      }
    }
    for (int i = 0; i < taggedWordsPrev2.size(); i++) {
      String word = taggedWordsPrev2.get(i).word();
      String posTag = taggedWordsPrev2.get(i).tag();
      if (posTag.length() >= 2 && posTag.substring(0, 2).equals("NN")) {
        taggedWords2.add(new TaggedWord(word, "NN"));
        senses2.add(sensesPrev2[i]);
      } else if (posTag.length() >= 2 && posTag.substring(0, 2).equals("VB")) {
        taggedWords2.add(new TaggedWord(word, "VB"));
        senses2.add(sensesPrev2[i]);
      }
    }

    // System.out.println(taggedWords1.size() + "," + taggedWords2.size());

    // array of edge weights with default weight 0
    int length1 = taggedWords1.size();
    int length2 = taggedWords2.size();
    int arrSize = Math.max(length1, length2);
    double[][] array = new double[arrSize][arrSize];
    for (int i = 0; i < arrSize; i++) {
      for (int j = 0; j < arrSize; j++) {
        array[i][j] = 0;
      }
    }
    for (int i = 0; i < length1; i++) {
      for (int j = 0; j < length2; j++) {
        String word1 = taggedWords1.get(i).word();
        String posTag1 = taggedWords1.get(i).tag();
        String word2 = taggedWords2.get(j).word();
        String posTag2 = taggedWords2.get(j).tag();
        double edgeWeight = 0;

        // LSA Similarity
        // edgeWeight = LSASimilarity.LSAWordSimilarity(word1, word2);

        // DISCO Similarity
        // DISCOSimilarity discoObj = new DISCOSimilarity();
        try {
          if (word1.compareToIgnoreCase(word2) == 0) edgeWeight = 1;
          else {
            // edgeWeight = ws.wuPalmerSimilarity(senses1.get(i), senses2.get(j));
            edgeWeight = ws.linSimilarity(senses1.get(i), senses2.get(j));
          }
        } catch (Exception ex) {
          ex.printStackTrace();
        }

        array[i][j] = edgeWeight;
      }
    }

    // System.out.println("Hungarian starts " + arrSize);

    double finalScore;
    String sumType = "max";
    int minLength = Math.min(length1, length2);
    // finalScore = HungarianAlgorithm.hgAlgorithm(array, sumType)/minLength * 5;
    if (arrSize == 0) finalScore = 0;
    else finalScore = HungarianAlgorithm.hgAlgorithm(array, sumType) / arrSize * 5;

    return finalScore;
  }
Ejemplo n.º 13
0
		public HashMap<String,ArrayList<TreeData>> parseAllDocs() throws IOException{ 
			String grammar =  "./jsan_resources/englishPCFG.ser.gz";
			String[] options = { "-maxLength", "120", "-retainTmpSubcategories" };
//			LexicalizedParser lp = new LexicalizedParser(grammar, options);
			
			LexicalizedParser lp = new LexicalizedParser()
			TreebankLanguagePack tlp = new PennTreebankLanguagePack();
			
			GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
			Iterable<List<? extends HasWord>> sentences;
			ArrayList<HashMap<String,ArrayList<String>>> everything = new ArrayList<HashMap<String,ArrayList<String>>>(3); 
			everything.add(0,otherSampleStrings);
			everything.add(1,authorSampleStrings);
			everything.add(2,toModifyStrings);
			Iterator<HashMap<String,ArrayList<String>>> everythingIter = everything.iterator();
			int docTypeNumber = -1; // 0 for otherSampleStrings, 1 for authorSampleStrings, 2 for toModifyStrings
			int numLoaded = 0;
			while(everythingIter.hasNext()){
				docTypeNumber++;
				HashMap<String,ArrayList<String>> currentSampleStrings = docPathFinder();
				Set<String> currentDocStrings = currentSampleStrings.keySet();
				Iterator<String> docStrIter = currentDocStrings.iterator();
				String docID;
				ArrayList<String> sentenceTokens;
				allTreeProcessors[docTypeNumber]  = new TreeProcessor();
				allTreeProcessors[docTypeNumber].clearLoadedTreeDataMaps();
				numLoaded=0;
				while(docStrIter.hasNext()){
					docID = docStrIter.next();
					sentenceTokens = currentSampleStrings.get(docID);
					if(sentenceTokens == null){
						allTreeProcessors[docTypeNumber].loadTreeDataMap(docID, GRAMMAR_DIR, false);
						numLoaded++;
						continue;
					}
					//System.out.println(sentenceTokens.size()+", strIter.hasNext? -> "+strIter.hasNext());

					numSentences = sentenceTokens.size();
					//initialize(numSentences);
					Iterator<String> sentIter = sentenceTokens.iterator();
					List<List<? extends HasWord>> tmp = new ArrayList<List<? extends HasWord>>();
					String tempSent;
					while(sentIter.hasNext()){
						tempSent = sentIter.next();
						Tokenizer<? extends HasWord> toke = tlp.getTokenizerFactory().getTokenizer(new StringReader(tempSent));
						List<? extends HasWord> sentenceTokenized = toke.tokenize();
						tmp.add(sentenceTokenized);
					}
					
					sentences = tmp;
					//int numDone = 0;
					TreeProcessor.singleDocMap.clear();
					boolean willSaveResults = true;
					for (List<? extends HasWord> sentence : sentences) {
						Tree parse = lp.apply(sentence);
						//parse.pennPrint();
						//System.out.println(parse.treeSkeletonCopy().toString());
						//System.out.println(parse.taggedYield());
						//System.out.println();
						//printSubTrees(parse);
						//TreeContainer.recurseTree(parse,"breadth");
						allTreeProcessors[docTypeNumber].processTree(parse, 0, willSaveResults); 
						//System.out.println(tc.processedTrees.toString().replaceAll("\\]\\], \\(","\\]\\]\n\\("));
						//numDone++;
						//System.out.println("sent "+numDone+" of "+numSentences+" done ");
						//System.out.println(tc.processedTrees.toString());
						//in.nextLine();
						//TreeContainer.recurseTree(parse, "depth");
						//in.nextLine();
						//addTree(parse);
						//GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);//TODO: LOOK AT THIS
						//Collection tdl = gs.typedDependenciesCCprocessed(true);
						//System.out.println(tdl);
						//System.out.println();
					}
					if(willSaveResults == true)
						ObjectIO.writeObject(TreeProcessor.singleDocMap,docID, GRAMMAR_DIR);

					//System.out.println("After all sents: ");
					//System.out.println(tc.processedTrees.toString().replaceAll("\\]\\], \\(","\\]\\]\n\\("));
					//String sent3 = "This is one last test!";
					//Tree parse3 = lp.apply(sent3);
					//parse3.pennPrint();
					//System.out.println("After sorting and writing:");
					//System.out.println(tc.processedTrees.toString().replaceAll("\\]\\], \\(","\\]\\]\n\\("));
					//Scanner in = new Scanner(System.in);
					//System.out.println("First one done.");
					//in.nextLine();
					//viewTrees();
				}
				
				//TreeProcessor.writeTreeDataToCSV(sortedTD,docID);
				allTreeProcessors[docTypeNumber].unmergedMaps = new ArrayList<HashMap<String,TreeData>>(numLoaded+1);
				
			}	
			
			
			int i= 0;
			allParsedAndOrdered.clear();
			String[] docTypes = new String[]{"otherSample","authorSample","toModify"};
			for(i=0; i < 3; i++){
				allTreeProcessors[i].unmergedMaps.add(allTreeProcessors[i].processedTrees);
				allTreeProcessors[i].unmergedMaps.addAll(allTreeProcessors[i].loadedTreeDataMaps);
				allTreeProcessors[i].mergeTreeDataLists(allTreeProcessors[i].unmergedMaps);
				allParsedAndOrdered.put(docTypes[i],allTreeProcessors[i].sortTreeData(allTreeProcessors[i].mergedMap));
				
			}
			
			//ArrayList<TreeData> sortedTD = TreeContainer.sortTreeData(TreeContainer.allProcessedTrees);
			//TreeContainer.writeTreeDataToCSV(sortedTD,"ALL_AUTHORS");
			
			return allParsedAndOrdered;
		}