/**
 * Builds one normalized TF-IDF vector per input sentence.
 *
 * <p>Despite its name, this method does not compute pairwise similarities; it
 * returns the per-sentence vectors from which a caller can compute them.
 *
 * <p>Steps:
 * <ol>
 *   <li>Index every sentence as its own document (field {@code "contents"}, term
 *       vectors enabled) in an in-memory directory, using a {@code StopAnalyzer}
 *       whose stop words are read from {@code lib/stoplists/en.txt} (resolved
 *       against the current working directory).</li>
 *   <li>Assign each distinct term of the {@code "contents"} field a position, in
 *       the order the index enumerates them.</li>
 *   <li>For every sentence, store {@code termFrequency * getIDF(reader, term)} for
 *       each of its terms in a {@code DocVector} and normalize it.</li>
 * </ol>
 *
 * <p>A sentence for which the index holds no term vector (the reader returns
 * {@code null}) gets a {@code DocVector} with no entries set, and
 * {@code normalize()} is not called on it.
 *
 * @param fileSentences the sentences to vectorize; element {@code i} produces
 *                      entry {@code i} of the result
 * @return an array with one {@code DocVector} per input sentence, in input order
 * @throws IOException if the stop word list cannot be read or an index operation fails
 */
public static DocVector[] getCosineSimilarityMatrix(List<String> fileSentences) throws IOException {
    RAMDirectory ramDir = new RAMDirectory();
    try {
        indexSentences(ramDir, fileSentences);

        DocVector[] docs = new DocVector[fileSentences.size()];
        IndexReader reader = IndexReader.open(ramDir);
        try {
            // The same term -> position map is shared by every DocVector.
            Map<String, Integer> terms = collectContentTerms(reader);

            for (int i = 0; i < docs.length; i++) {
                docs[i] = new DocVector(terms);

                // Documents were added in input order to a fresh index, so
                // document number i is sentence i.
                TermFreqVector[] tfvs = reader.getTermFreqVectors(i);
                if (tfvs == null) {
                    // No term vector stored for this sentence: leave the vector
                    // untouched and skip normalization.
                    continue;
                }

                for (TermFreqVector tfv : tfvs) {
                    String[] termTexts = tfv.getTerms();
                    int[] termFreqs = tfv.getTermFrequencies();
                    for (int j = 0; j < termTexts.length; j++) {
                        double idfValue = getIDF(reader, termTexts[j]);
                        docs[i].setEntry(termTexts[j], termFreqs[j] * idfValue);
                    }
                }
                docs[i].normalize();
            }
        } finally {
            reader.close();
        }
        return docs;
    } finally {
        ramDir.close();
    }
}

/**
 * Adds each sentence to {@code ramDir} as a separate document whose
 * {@code "contents"} field stores term vectors.
 */
private static void indexSentences(RAMDirectory ramDir, List<String> fileSentences) throws IOException {
    // NOTE(review): FileReader decodes with the platform default charset — confirm
    // that matches the encoding of the stop word file.
    FileReader stopWordReader = new FileReader(new File("lib/stoplists/en.txt"));
    Analyzer analyzer;
    try {
        analyzer = new StopAnalyzer(Version.LUCENE_36, stopWordReader);
    } finally {
        // Closing again is harmless if the analyzer already closed the reader.
        stopWordReader.close();
    }

    IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_36, analyzer));
    try {
        for (String sentence : fileSentences) {
            Document doc = new Document();
            doc.add(new Field("contents", new StringReader(sentence), TermVector.YES));
            writer.addDocument(doc);
        }
    } finally {
        // close() also commits, which makes the documents visible to readers.
        writer.close();
    }
}

/**
 * Maps every distinct term of the {@code "contents"} field to a consecutive
 * position, starting at 0, in index enumeration order.
 */
private static Map<String, Integer> collectContentTerms(IndexReader reader) throws IOException {
    Map<String, Integer> terms = new HashMap<String, Integer>();
    TermEnum termEnum = reader.terms(new Term("contents"));
    try {
        int pos = 0;
        // terms(Term) hands back an enumeration that is ALREADY positioned on the
        // first matching term, so the current term must be read before next() is
        // called; advancing first would silently drop that term.
        do {
            Term term = termEnum.term();
            // term is null when the index has no terms at or after the start
            // term; a different field means the "contents" terms are exhausted.
            if (term == null || !"contents".equals(term.field())) {
                break;
            }
            terms.put(term.text(), pos++);
        } while (termEnum.next());
    } finally {
        termEnum.close();
    }
    return terms;
}
/**
 * Looks up the term frequency vectors of one document by forwarding the
 * request to {@code indexReader}.
 *
 * @param docNumber number of the document whose term vectors are requested
 * @return the result of {@code indexReader.getTermFreqVectors(docNumber)}, unchanged
 * @throws IOException if the underlying call fails
 * @see LuceneIndexReader#getTermFreqVectors(int)
 */
public TermFreqVector[] getTermFreqVectors(int docNumber) throws IOException {
    final TermFreqVector[] vectors = indexReader.getTermFreqVectors(docNumber);
    return vectors;
}