private static SimpleOrderedMap<Object> getDocumentFieldsInfo(
    Document doc, int docId, IndexReader reader, IndexSchema schema) throws IOException {
  SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<Object>();
  for (Object o : doc.getFields()) {
    Fieldable fieldable = (Fieldable) o;
    SimpleOrderedMap<Object> f = new SimpleOrderedMap<Object>();

    SchemaField sfield = schema.getFieldOrNull(fieldable.name());
    FieldType ftype = (sfield == null) ? null : sfield.getType();

    f.add("type", (ftype == null) ? null : ftype.getTypeName());
    f.add("schema", getFieldFlags(sfield));
    f.add("flags", getFieldFlags(fieldable));

    Term t = new Term(
        fieldable.name(),
        ftype != null ? ftype.storedToIndexed(fieldable) : fieldable.stringValue());

    f.add("value", (ftype == null) ? null : ftype.toExternal(fieldable));

    // TODO: this really should be "stored"
    f.add("internal", fieldable.stringValue()); // may be a binary number

    byte[] arr = fieldable.getBinaryValue();
    if (arr != null) {
      f.add("binary", Base64.byteArrayToBase64(arr, 0, arr.length));
    }
    f.add("boost", fieldable.getBoost());
    f.add("docFreq", t.text() == null ? 0 : reader.docFreq(t)); // this can be 0 for non-indexed fields

    // If we have a term vector, return that
    if (fieldable.isTermVectorStored()) {
      try {
        TermFreqVector v = reader.getTermFreqVector(docId, fieldable.name());
        if (v != null) {
          SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<Integer>();
          for (int i = 0; i < v.size(); i++) {
            tfv.add(v.getTerms()[i], v.getTermFrequencies()[i]);
          }
          f.add("termVector", tfv);
        }
      } catch (Exception ex) {
        log.warn("error writing term vector", ex);
      }
    }

    finfo.add(fieldable.name(), f);
  }
  return finfo;
}
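// The getFieldFlags() helpers referenced above are not shown in this section. Below is a
// hypothetical sketch of the Fieldable variant, assuming a simple one-character-per-flag
// string; the flag letters and their order are an assumption, not the exact Solr encoding.
private static String getFieldFlags(Fieldable f) {
  if (f == null) {
    return null;
  }
  StringBuilder flags = new StringBuilder();
  flags.append(f.isIndexed()                     ? 'I' : '-'); // indexed
  flags.append(f.isTokenized()                   ? 'T' : '-'); // tokenized
  flags.append(f.isStored()                      ? 'S' : '-'); // stored
  flags.append(f.isTermVectorStored()            ? 'V' : '-'); // term vector stored
  flags.append(f.isStoreOffsetWithTermVector()   ? 'o' : '-'); // offsets in term vector
  flags.append(f.isStorePositionWithTermVector() ? 'p' : '-'); // positions in term vector
  return flags.toString();
}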
/**
 * Calculates the similarity between two TermFreqVectors.
 *
 * @param vec1 the first TermFreqVector
 * @param vec2 the second TermFreqVector
 * @return the cosine similarity of the TermFreqVectors
 */
public double cosineSimilarity(TermFreqVector vec1, TermFreqVector vec2) throws IOException {
  HashMap<String, Integer> terms = new HashMap<String, Integer>();

  // Get all of the terms and term frequencies in the two vectors
  String[] termTexts1 = vec1.getTerms();
  String[] termTexts2 = vec2.getTerms();
  int[] termFreqs1 = vec1.getTermFrequencies();
  int[] termFreqs2 = vec2.getTermFrequencies();

  // Store the terms and their positions in a hashmap - this represents the vocabulary
  int pos = 0;
  for (String term : termTexts1) {
    terms.put(term, pos++);
  }
  for (String term : termTexts2) {
    if (!terms.containsKey(term)) {
      terms.put(term, pos++);
    }
  }

  // Create vectors representing the two documents
  DocVector dv1 = new DocVector(terms);
  DocVector dv2 = new DocVector(terms);

  // Set the entries in the two documents, i.e., the term weights in the document vectors
  for (int i = 0; i < termTexts1.length; i++) {
    dv1.setEntry(termTexts1[i], termFreqs1[i]);
  }
  for (int i = 0; i < termTexts2.length; i++) {
    dv2.setEntry(termTexts2[i], termFreqs2[i]);
  }

  // Normalize
  dv1.normalize();
  dv2.normalize();

  // Return the cosine similarity of the two document vectors
  return (dv1.vector.dotProduct(dv2.vector)) / (dv1.vector.getNorm() * dv2.vector.getNorm());
}
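// The DocVector helper used by cosineSimilarity() above (and by the snippets further down)
// is not shown in this section. This is a minimal sketch, assuming it wraps an Apache
// Commons Math (2.x) sparse RealVector, which would match the vector.dotProduct()/getNorm()
// calls seen in the surrounding code; the original implementation may differ.
import java.util.Map;

import org.apache.commons.math.linear.OpenMapRealVector;
import org.apache.commons.math.linear.RealVector;

public class DocVector {
  public final Map<String, Integer> terms; // term -> position in the vector (the vocabulary)
  public final RealVector vector;          // one entry per vocabulary term

  public DocVector(Map<String, Integer> terms) {
    this.terms = terms;
    this.vector = new OpenMapRealVector(terms.size());
  }

  /** Sets the weight of a term; terms outside the vocabulary are ignored. */
  public void setEntry(String term, double weight) {
    Integer pos = terms.get(term);
    if (pos != null) {
      vector.setEntry(pos, weight);
    }
  }

  /** Scales the vector to unit length (no-op for an all-zero vector). */
  public void normalize() {
    double norm = vector.getNorm();
    if (norm > 0) {
      vector.mapDivideToSelf(norm);
    }
  }
}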
public final String[] get(
    final ReaderLocal reader, final int docId, final String field, final Timer timer)
    throws IOException, ParseException, SyntaxError {
  FieldContentCacheKey key = new FieldContentCacheKey(field, docId);
  String[] terms = getAndPromote(key);
  if (terms != null)
    return terms;
  TermFreqVector termFreqVector = reader.getTermFreqVector(docId, field);
  if (termFreqVector == null)
    return null;
  terms = termFreqVector.getTerms();
  if (terms == null)
    return null;
  put(key, terms);
  return terms;
}
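// Hypothetical usage of the cached accessor above: fetch the indexed terms of one document's
// field, getting null back when no term vector is stored. The fieldCache, readerLocal, docId,
// and timer objects are assumptions standing in for whatever the caller already has.
String[] contentTerms = fieldCache.get(readerLocal, docId, "content", timer);
if (contentTerms != null) {
  for (String term : contentTerms) {
    // consume the document's terms, e.g. for "more like this" or snippet generation
  }
}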
private static final Map<String, FacetItem> computeMultivaluedTFV(
    ReaderAbstract reader, String fieldName, DocIdInterface docIdInterface)
    throws IOException, SearchLibException {
  Map<String, FacetItem> termMap = new HashMap<String, FacetItem>();
  if (docIdInterface.getSize() == 0)
    return termMap;
  for (int docId : docIdInterface.getIds()) {
    TermFreqVector tfv = reader.getTermFreqVector(docId, fieldName);
    if (tfv == null)
      continue;
    String[] terms = tfv.getTerms();
    int[] freqs = tfv.getTermFrequencies();
    if (terms == null || freqs == null)
      continue;
    int i = 0;
    for (String term : terms) {
      if (freqs[i++] > 0) {
        FacetItem facetItem = termMap.get(term);
        if (facetItem == null)
          termMap.put(term, new FacetItem(term, 1));
        else
          facetItem.count++;
      }
    }
  }
  return termMap;
}
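// Minimal sketch of the FacetItem value object assumed by computeMultivaluedTFV() above:
// a term label plus a mutable document count, matching the new FacetItem(term, 1) and
// facetItem.count++ calls. The real class is not shown here and may carry more state.
public class FacetItem {
  public final String term;
  public int count;

  public FacetItem(String term, int count) {
    this.term = term;
    this.count = count;
  }
}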
public static DocVector[] getCosineSimilarityMatrix(List<String> fileSentences)
    throws IOException {

  RAMDirectory ramDir = new RAMDirectory();
  FileReader fr = new FileReader(new File("lib/stoplists/en.txt"));
  Analyzer analyzer = new StopAnalyzer(Version.LUCENE_36, fr);

  // Index the full text of every sentence, storing a term vector for each document
  IndexWriter writer =
      new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_36, analyzer));
  for (String s : fileSentences) {
    Document doc = new Document();
    doc.add(new Field("contents", new StringReader(s), TermVector.YES));
    writer.addDocument(doc);
  }
  writer.close();

  DocVector[] docs = new DocVector[fileSentences.size()];

  // Build the vocabulary: every term of the "contents" field, mapped to a vector position
  IndexReader RAMreader = IndexReader.open(ramDir);
  Map<String, Integer> terms = new HashMap<String, Integer>();
  TermEnum termEnum = RAMreader.terms(new Term("contents"));
  int pos = 0;
  if (termEnum.term() != null) {
    // terms(Term) already positions the enumeration on the first matching term,
    // so consume the current term before calling next()
    do {
      Term term = termEnum.term();
      if (!"contents".equals(term.field()))
        break;
      terms.put(term.text(), pos++);
    } while (termEnum.next());
  }
  termEnum.close();

  // Build a TF-IDF weighted term vector for each document
  for (int i = 0; i < fileSentences.size(); i++) {
    TermFreqVector[] tfvs = RAMreader.getTermFreqVectors(i);
    docs[i] = new DocVector(terms);
    if (tfvs == null)
      continue;
    for (TermFreqVector tfv : tfvs) {
      String[] termTexts = tfv.getTerms();
      int[] termFreqs = tfv.getTermFrequencies();
      for (int j = 0; j < termTexts.length; j++) {
        double idfValue = getIDF(RAMreader, termTexts[j]);
        double tfIdfValue = termFreqs[j] * idfValue;
        docs[i].setEntry(termTexts[j], tfIdfValue);
      }
    }
    docs[i].normalize();
  }

  RAMreader.close();
  ramDir.close();
  return docs;
}
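// The getIDF() helper used above and below is not shown. Hypothetical sketch, assuming a
// smoothed inverse document frequency over the "contents" field; the exact weighting used
// by the original code is an assumption.
static double getIDF(IndexReader reader, String termText) throws IOException {
  int docFreq = reader.docFreq(new Term("contents", termText));
  // +1 in the denominator avoids division by zero for terms absent from the index
  return Math.log((double) reader.numDocs() / (double) (docFreq + 1)) + 1.0;
}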
/**
 * Calculates the cosine similarity between two documents.
 *
 * @param d1 the first document
 * @param d2 the second document
 * @return the cosine similarity
 * @throws IOException
 */
public double getCosineSimilarity(String d1, String d2) throws IOException {

  RAMDirectory ramDir = new RAMDirectory();
  FileReader fr = new FileReader(new File(WikiHelper.getSpecificProperty("stopwordFile")));
  Analyzer analyzer = new StopAnalyzer(Version.LUCENE_36, fr);

  // Index the full text of both documents, storing a term vector for each
  IndexWriter writer =
      new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_36, analyzer));

  Document doc1 = new Document();
  doc1.add(new Field("contents", new StringReader(d1), TermVector.YES));
  writer.addDocument(doc1);

  Document doc2 = new Document();
  doc2.add(new Field("contents", new StringReader(d2), TermVector.YES));
  writer.addDocument(doc2);

  writer.close();

  // Read back the term vectors of the two documents
  IndexReader RAMreader = IndexReader.open(ramDir);
  TermFreqVector tfvs1 = RAMreader.getTermFreqVector(0, "contents");
  TermFreqVector tfvs2 = RAMreader.getTermFreqVector(1, "contents");
  if (tfvs1 == null || tfvs2 == null) {
    RAMreader.close();
    ramDir.close();
    return 0.0;
  }

  String[] termTexts1 = tfvs1.getTerms();
  String[] termTexts2 = tfvs2.getTerms();

  // Store the terms and their positions in a hashmap - this represents the vocabulary
  Map<String, Integer> terms = new HashMap<String, Integer>();
  int pos = 0;
  for (String term : termTexts1) {
    terms.put(term, pos++);
  }
  for (String term : termTexts2) {
    if (!terms.containsKey(term)) {
      terms.put(term, pos++);
    }
  }

  // Build a TF-IDF weighted vector for each document
  DocVector[] docs = new DocVector[2];
  docs[0] = new DocVector(terms);
  docs[1] = new DocVector(terms);

  int[] termFreqs1 = tfvs1.getTermFrequencies();
  for (int j = 0; j < termTexts1.length; j++) {
    double idfValue = getIDF(RAMreader, termTexts1[j]);
    double tfIdfValue = termFreqs1[j] * idfValue;
    docs[0].setEntry(termTexts1[j], tfIdfValue);
  }

  int[] termFreqs2 = tfvs2.getTermFrequencies();
  for (int j = 0; j < termTexts2.length; j++) {
    double idfValue = getIDF(RAMreader, termTexts2[j]);
    double tfIdfValue = termFreqs2[j] * idfValue;
    docs[1].setEntry(termTexts2[j], tfIdfValue);
  }

  RAMreader.close();
  ramDir.close();

  // Return the cosine similarity of the term vectors
  return calcCosineSimilarity(docs[0], docs[1]);
}
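// Minimal sketch of the calcCosineSimilarity() helper called above, assuming DocVector
// exposes a Commons Math RealVector as in the earlier snippets: dot product divided by the
// product of the vector norms, guarding against all-zero vectors.
static double calcCosineSimilarity(DocVector d1, DocVector d2) {
  double norm1 = d1.vector.getNorm();
  double norm2 = d2.vector.getNorm();
  if (norm1 == 0 || norm2 == 0) {
    return 0.0; // at least one document has no weighted terms
  }
  return d1.vector.dotProduct(d2.vector) / (norm1 * norm2);
}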