/** * Get the type of the target term representation, query with the suitable input formatted file * and the corresponding index * * @param type * @return HashMap<String,ArrayList<ScoreDoc>> A set of target terms with their extracted * documents * @throws IOException * @throws ParseException */ @Override public HashMap<String, ArrayList<ScoreDoc>> extractDocsByRepresentation() throws IOException, ParseException { String indexName = null, inputFileName = null; indexName = "modernJewishOnly"; m_qg.setType(InputType.Query); inputFileName = "hozOrigQueryAll.txt"; // read the suitable input file LinkedList<Pair<String, String>> queries = new LinkedList<Pair<String, String>>(); BufferedReader reader = new BufferedReader(new FileReader(m_inputDir + inputFileName)); String line = reader.readLine(); while (line != null) { int index = line.indexOf("\t"); queries.add(new Pair<String, String>(line.substring(0, index), line.substring(index + 1))); line = reader.readLine(); } reader.close(); // search for the queries in the index IndexSearcher searcher = new IndexSearcher(IndexReader.open(FSDirectory.open(new File(m_indexDir + indexName)))); HashMap<String, ArrayList<ScoreDoc>> termDocs = new HashMap<String, ArrayList<ScoreDoc>>(); for (Pair<String, String> term : queries) { Query q = m_qg.generate(term.value()); termDocs.put( TargetTerm2Id.getStrDesc(Integer.parseInt(term.key())), new ArrayList<ScoreDoc>(Arrays.asList(searcher.search(q, 1000).scoreDocs))); } return termDocs; }
/** * Write top lin and balanced inclusion (cover) scores to a file * * @param writer * @param entailedElementId * @param entailingElements * @param entailedElement2NormMap * @param entailingElement2NormMap */ private void writeEntailedElementScores( PrintWriter writer, Integer entailedElementId, TIntObjectMap<Pair<DoubleContainer, DoubleContainer>> entailingElements, TIntDoubleHashMap entailedElement2NormMap, TIntDoubleHashMap entailingElement2NormMap) { BoundedPriorityQueue<DistSimRule> linScores = new BoundedPriorityQueue<DistSimRule>(new DistSimRuleComparator(), m_maxRulesPerElement); BoundedPriorityQueue<DistSimRule> coverScores = new BoundedPriorityQueue<DistSimRule>(new DistSimRuleComparator(), m_maxRulesPerElement); double entailedElementNorm = entailedElement2NormMap.get(entailedElementId); TIntIterator iter = entailingElements.keySet().iterator(); while (iter.hasNext()) { Integer entailingElementId = iter.next(); // if there are no features for this element - then it is not similar to any other and we can // move on if (entailingElement2NormMap.get(entailingElementId) == 0.0) continue; double entailingElementNorm = entailingElement2NormMap.get(entailingElementId); Pair<DoubleContainer, DoubleContainer> scores = entailingElements.get(entailingElementId); Double linNominator = scores.key().value(); Double coverNominator = scores.value().value(); double linScore = linNominator / (entailedElementNorm + entailingElementNorm); double coverScore = coverNominator / entailingElementNorm; coverScore = Math.sqrt(linScore * coverScore); if (linScore > 0.0) { linScores.offer(new DistSimRule(entailedElementId, entailingElementId, linScore)); } if (coverScore > 0.0) { coverScores.offer(new DistSimRule(entailedElementId, entailingElementId, coverScore)); } } while (!linScores.isEmpty()) { DistSimRule linRule = linScores.poll(); writer.println("LIN\t" + linRule); } while (!coverScores.isEmpty()) { DistSimRule coverRule = coverScores.poll(); writer.println("COVER\t" + coverRule); } // we are done with the left element so we clear rightElements entailingElements.clear(); }
public void mergeRules(File vectorsDir, int maxVectorLen) throws NumberFormatException, IOException { TIntObjectHashMap<BoundedPriorityQueue<Pair<Integer, Double>>> linScores = new TIntObjectHashMap<BoundedPriorityQueue<Pair<Integer, Double>>>(); TIntObjectHashMap<BoundedPriorityQueue<Pair<Integer, Double>>> balScores = new TIntObjectHashMap<BoundedPriorityQueue<Pair<Integer, Double>>>(); String line; BufferedReader reader = null; for (String fileName : vectorsDir.list()) { if (fileName.endsWith(".rules") && fileName.contains("Trunc" + maxVectorLen)) { System.out.println("Reading: " + fileName); reader = new BufferedReader(new FileReader(vectorsDir.getAbsolutePath() + "/" + fileName)); while ((line = reader.readLine()) != null) { String[] tokens = line.split("\t"); String scoreType = tokens[0]; Integer entailedElement = Integer.parseInt(tokens[1]); Integer entailingElement = Integer.parseInt(tokens[2]); Double score = Double.parseDouble(tokens[3]); if (scoreType.equals("LIN")) { if (linScores.contains(entailedElement)) linScores .get(entailedElement) .offer(new Pair<Integer, Double>(entailingElement, score)); else { linScores.put( entailedElement, new BoundedPriorityQueue<Pair<Integer, Double>>( new obj.PairComparator(), m_maxRulesPerElement)); linScores .get(entailedElement) .offer(new Pair<Integer, Double>(entailingElement, score)); } } else if (scoreType.equals("COVER")) if (balScores.contains(entailedElement)) balScores .get(entailedElement) .offer(new Pair<Integer, Double>(entailingElement, score)); else { balScores.put( entailedElement, new BoundedPriorityQueue<Pair<Integer, Double>>( new obj.PairComparator(), m_maxRulesPerElement)); balScores .get(entailedElement) .offer(new Pair<Integer, Double>(entailingElement, score)); } } reader.close(); } } System.out.println("Uploading elements"); TIntObjectMap<String> id2elementDesc = new TIntObjectHashMap<String>(); TIntObjectMap<String> id2targetElementDesc = new TIntObjectHashMap<String>(); reader = new BufferedReader(new FileReader(vectorsDir.getAbsolutePath() + "/elements.txt")); while ((line = reader.readLine()) != null) { id2elementDesc.put(Integer.parseInt(line.split("\t")[1]), line.split("\t")[0]); } reader.close(); reader = new BufferedReader(new FileReader(vectorsDir.getAbsolutePath() + "/targetElements.txt")); while ((line = reader.readLine()) != null) { id2targetElementDesc.put(Integer.parseInt(line.split("\t")[1]), line.split("\t")[0]); } reader.close(); PrintWriter writer = new PrintWriter(new FileOutputStream(vectorsDir.getAbsolutePath() + "/linRules.txt")); TIntIterator iter = linScores.keySet().iterator(); while (iter.hasNext()) { Integer entailedElementId = iter.next(); String entailedStr = id2targetElementDesc.get(entailedElementId); // if (entailedStr.contains("_")) { // String confDir = entailedStr.substring(entailedStr.indexOf("_")+1); // File resultsDir = new File(f.getAbsolutePath()+"/" + confDir); // if(!resultsDir.exists()) // resultsDir.mkdir(); // writer = new PrintWriter(new // FileOutputStream(resultsDir.getAbsolutePath()+"/"+entailedStr.substring(0,entailedStr.indexOf("_")))); // writer = new PrintWriter(new // FileOutputStream(resultsDir.getAbsolutePath()+"/"+entailedElementId)); // } // else // writer = new PrintWriter(new FileOutputStream(f.getAbsoluteFile()+"/"+entailedStr)); // writer = new PrintWriter(new // FileOutputStream(f.getAbsoluteFile()+"/"+entailedElementId)); while (!linScores.get(entailedElementId).isEmpty()) { Pair<Integer, Double> linRule = linScores.get(entailedElementId).poll(); writer.println( entailedStr + "\t" + id2elementDesc.get(linRule.key()) + "\t" + linRule.value()); } } if (writer != null) writer.close(); writer = new PrintWriter(new FileOutputStream(vectorsDir.getAbsolutePath() + "/balRules.txt")); iter = balScores.keySet().iterator(); while (iter.hasNext()) { Integer entailedElementId = iter.next(); String entailedStr = id2targetElementDesc.get(entailedElementId); // if (entailedStr.contains("_")) { // String confDir = entailedStr.substring(entailedStr.indexOf("_")+1); // File resultsDir = new File(f.getAbsolutePath()+"/" + confDir); // if(!resultsDir.exists()) // resultsDir.mkdir(); // writer = new PrintWriter(new // FileOutputStream(resultsDir.getAbsolutePath()+"/"+entailedStr.substring(0,entailedStr.indexOf("_")))); // writer = new PrintWriter(new // FileOutputStream(resultsDir.getAbsolutePath()+"/"+entailedElementId)); // } // else // writer = new PrintWriter(new FileOutputStream(f.getAbsoluteFile()+"/"+entailedStr)); // writer = new PrintWriter(new // FileOutputStream(f.getAbsoluteFile()+"/"+entailedElementId)); while (!balScores.get(entailedElementId).isEmpty()) { Pair<Integer, Double> balRule = balScores.get(entailedElementId).poll(); writer.println( entailedStr + "\t" + id2elementDesc.get(balRule.key()) + "\t" + balRule.value()); } } writer.close(); }
/** * Compute rule scores after generating an inverted index an loading normalization data * * @param invertedIndex * @param entailedElement2NormMap * @param entailingElement2NormMap * @param targetVectorsFile * @param vectorsFile * @throws NumberFormatException * @throws IOException */ private void computeRuleScores( TIntObjectMap<List<Pair<Integer, Float>>> invertedIndex, TIntDoubleHashMap entailedElement2NormMap, TIntDoubleHashMap entailingElement2NormMap, File targetVectorsFile, File vectorsFile) throws NumberFormatException, IOException { BufferedReader reader = new BufferedReader(new FileReader(targetVectorsFile)); String vectorFileString = vectorsFile.getAbsolutePath(); File rulesFile = new File(vectorFileString.substring(0, vectorFileString.lastIndexOf(".")) + ".rules"); PrintWriter writer = new PrintWriter(new FileOutputStream(rulesFile)); String line; Integer currEntailedElementId = null; TIntObjectMap<Pair<DoubleContainer, DoubleContainer>> entailingElements = new TIntObjectHashMap<Pair<DoubleContainer, DoubleContainer>>(); int i = 0; int missingFeaturesCount = 0; while ((line = reader.readLine()) != null) { String[] tokens = line.split("\t"); Integer entailedElement = Integer.parseInt(tokens[0]); Integer featureId = Integer.parseInt(tokens[2]); Double entailedPmi = Double.parseDouble(tokens[3]); // if there are no features for this element - then it is not similar to any other and we can // move on if (entailedElement2NormMap.get(entailedElement) == 0.0) continue; if (currEntailedElementId != null && !currEntailedElementId.equals(entailedElement)) { i++; if (i % 1000 == 0) { System.out.println( "Entailed element Id: " + entailedElement + ". Number of entailing elements: " + entailingElements.size()); System.out.println("Number of elements gone through: " + i); } writeEntailedElementScores( writer, currEntailedElementId, entailingElements, entailedElement2NormMap, entailingElement2NormMap); } List<Pair<Integer, Float>> elementPmiList = invertedIndex.get(featureId); if (elementPmiList == null) { missingFeaturesCount++; System.out.println("line: " + i + " " + tokens[2]); System.out.println("feature id: " + featureId); continue; } for (Pair<Integer, Float> elementPmiPair : elementPmiList) { // if(!entailedElement.equals(elementPmiPair.key())) { int entailingId = elementPmiPair.key(); double entailingPmi = elementPmiPair.value(); double linScore = entailedPmi + entailingPmi; double coverScore = entailingPmi; Pair<DoubleContainer, DoubleContainer> scoresMap = entailingElements.get(entailingId); if (scoresMap == null) { scoresMap = new Pair<DoubleContainer, DoubleContainer>( new DoubleContainer(linScore), new DoubleContainer(coverScore)); entailingElements.put(entailingId, scoresMap); } else { scoresMap.key().add(linScore); scoresMap.value().add(coverScore); } // } } currEntailedElementId = entailedElement; } writeEntailedElementScores( writer, currEntailedElementId, entailingElements, entailedElement2NormMap, entailingElement2NormMap); reader.close(); writer.close(); System.out.println("Num of missing features: " + missingFeaturesCount); }