public TermFreqVector searchIndexReturnFreqTerms(String searchString, String termString) {
  System.out.println("Searching for '" + searchString + "'");
  // Directory directory = FSDirectory.getDirectory();
  IndexReader indexReader;
  TermFreqVector termFreqDoc = null;
  try {
    indexReader = IndexReader.open(indexDirectory);
    IndexSearcher indexSearcher = new IndexSearcher(indexReader);
    Term term = new Term(termString, searchString);
    TermQuery query = new TermQuery(term);
    TopDocs topDocs = indexSearcher.search(query, 10);
    if (topDocs.scoreDocs.length > 0) {
      // while (it.hasNext()) {
      int docId = topDocs.scoreDocs[0].doc;
      Document doc = indexSearcher.doc(docId);
      // textOfURL = doc.get("text");
      // sourceCodeOfURL = doc.get("html");
      // this.docId = docID;
      termFreqDoc = indexReader.getTermFreqVector(docId, "text");
    }
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
  return termFreqDoc;
}
@Test(groups = "ch12")
public void vectorTest() throws Exception {
  FullTextSession session = Search.getFullTextSession(openSession());
  Transaction tx = session.beginTransaction();
  buildIndex(session, tx);
  try {
    tx = session.beginTransaction();
    Query query = new TermQuery(new Term("content", "properties"));
    System.out.println(query.toString());
    FullTextQuery hibQuery =
        session.createFullTextQuery(query, ElectricalProperties.class);
    hibQuery.setProjection(
        FullTextQuery.DOCUMENT, FullTextQuery.DOCUMENT_ID, FullTextQuery.SCORE);
    reader = getReader(session);
    List<Object[]> results = hibQuery.list();
    assert results.size() > 0 : "no results returned";
    for (int x = 0; x < results.size(); x++) {
      Integer docId = (Integer) results.get(x)[1];
      TermPositionVector vector =
          (TermPositionVector) reader.getTermFreqVector(docId, "content");
      String[] terms = vector.getTerms();
      int[] f = vector.getTermFrequencies();
      System.out.println(results.get(x)[2]);
      for (int y = 0; y < vector.size(); y++) {
        System.out.print("docID# =>" + docId);
        System.out.print(" term => " + terms[y]);
        System.out.print(" freq => " + f[y]);
        int[] positions = vector.getTermPositions(y);
        TermVectorOffsetInfo[] offsets = vector.getOffsets(y);
        for (int z = 0; z < positions.length; z++) {
          System.out.print(" position => " + positions[z]);
          System.out.print(" starting offset => " + offsets[z].getStartOffset());
          System.out.println(" ending offset => " + offsets[z].getEndOffset());
        }
        System.out.println("---------------");
      }
    }
    for (Object element :
        session.createQuery("from " + ElectricalProperties.class.getName()).list()) {
      session.delete(element);
    }
    tx.commit();
  } finally {
    session.close();
    if (provider != null) {
      provider.closeReader(reader);
    }
  }
}
private static SimpleOrderedMap<Object> getDocumentFieldsInfo(
    Document doc, int docId, IndexReader reader, IndexSchema schema) throws IOException {
  SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<Object>();
  for (Object o : doc.getFields()) {
    Fieldable fieldable = (Fieldable) o;
    SimpleOrderedMap<Object> f = new SimpleOrderedMap<Object>();
    SchemaField sfield = schema.getFieldOrNull(fieldable.name());
    FieldType ftype = (sfield == null) ? null : sfield.getType();
    f.add("type", (ftype == null) ? null : ftype.getTypeName());
    f.add("schema", getFieldFlags(sfield));
    f.add("flags", getFieldFlags(fieldable));
    Term t = new Term(
        fieldable.name(),
        ftype != null ? ftype.storedToIndexed(fieldable) : fieldable.stringValue());
    f.add("value", (ftype == null) ? null : ftype.toExternal(fieldable)); // TODO: this really should be "stored"
    f.add("internal", fieldable.stringValue());
    // may be a binary number
    byte[] arr = fieldable.getBinaryValue();
    if (arr != null) {
      f.add("binary", Base64.byteArrayToBase64(arr, 0, arr.length));
    }
    f.add("boost", fieldable.getBoost());
    f.add("docFreq", t.text() == null ? 0 : reader.docFreq(t)); // this can be 0 for non-indexed fields
    // If we have a term vector, return that
    if (fieldable.isTermVectorStored()) {
      try {
        TermFreqVector v = reader.getTermFreqVector(docId, fieldable.name());
        if (v != null) {
          SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<Integer>();
          for (int i = 0; i < v.size(); i++) {
            tfv.add(v.getTerms()[i], v.getTermFrequencies()[i]);
          }
          f.add("termVector", tfv);
        }
      } catch (Exception ex) {
        log.warn("error writing term vector", ex);
      }
    }
    finfo.add(fieldable.name(), f);
  }
  return finfo;
}
public static void main(String[] args) {
  TestIndexing t = new TestIndexing();
  t.createIndex();
  try {
    IndexReader reader = t.getReader();
    TweetVocabulary vocabulary = new TweetVocabulary(reader, "tweet");
    System.out.println(vocabulary);
    //
    for (int i = 0; i < reader.maxDoc(); i++) {
      if (reader.isDeleted(i)) {
        continue;
      }
      TermFreqVector tfv = reader.getTermFreqVector(i, "tweet");
      System.out.println(tfv);
      //
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
}
/**
 * Reconstruct document fields.
 *
 * @param docNum document number. If this document is deleted, but the index is not optimized
 *     yet, the reconstruction process may still yield the reconstructed field content even
 *     from deleted documents.
 * @return reconstructed document
 * @throws Exception
 */
public Reconstructed reconstruct(int docNum) throws Exception {
  if (docNum < 0 || docNum > reader.maxDoc()) {
    throw new Exception("Document number outside of valid range.");
  }
  Reconstructed res = new Reconstructed();
  if (deleted != null && deleted.get(docNum)) {
    throw new Exception("Document is deleted.");
  } else {
    Document doc = reader.document(docNum);
    for (int i = 0; i < fieldNames.length; i++) {
      Field[] fs = doc.getFields(fieldNames[i]);
      if (fs != null && fs.length > 0) {
        res.getStoredFields().put(fieldNames[i], fs);
      }
    }
  }
  // collect values from unstored fields
  HashSet<String> fields = new HashSet<String>(Arrays.asList(fieldNames));
  // try to use term vectors if available
  progress.maxValue = fieldNames.length;
  progress.curValue = 0;
  progress.minValue = 0;
  for (int i = 0; i < fieldNames.length; i++) {
    TermFreqVector tvf = reader.getTermFreqVector(docNum, fieldNames[i]);
    if (tvf != null && tvf.size() > 0 && (tvf instanceof TermPositionVector)) {
      TermPositionVector tpv = (TermPositionVector) tvf;
      progress.message = "Reading term vectors ...";
      progress.curValue = i;
      setChanged();
      notifyObservers(progress);
      BytesRef[] tv = tpv.getTerms();
      for (int k = 0; k < tv.length; k++) {
        // do we have positions?
        int[] posArr = tpv.getTermPositions(k);
        if (posArr == null) {
          // only offsets
          TermVectorOffsetInfo[] offsets = tpv.getOffsets(k);
          if (offsets.length == 0) {
            continue;
          }
          // convert offsets into positions
          posArr = convertOffsets(offsets);
        }
        GrowableStringArray gsa = res.getReconstructedFields().get(fieldNames[i]);
        if (gsa == null) {
          gsa = new GrowableStringArray();
          res.getReconstructedFields().put(fieldNames[i], gsa);
        }
        for (int m = 0; m < posArr.length; m++) {
          gsa.append(posArr[m], "|", tv[k].utf8ToString());
        }
      }
      fields.remove(fieldNames[i]); // got what we wanted
    }
  }
  // this loop collects data only from left-over fields
  // not yet collected through term vectors
  progress.maxValue = fields.size();
  progress.curValue = 0;
  progress.minValue = 0;
  for (String fld : fields) {
    progress.message = "Collecting terms in " + fld + " ...";
    progress.curValue++;
    setChanged();
    notifyObservers(progress);
    Terms terms = MultiFields.getTerms(reader, fld);
    if (terms == null) { // no terms in this field
      continue;
    }
    TermsEnum te = terms.iterator();
    while (te.next() != null) {
      DocsAndPositionsEnum dpe = te.docsAndPositions(deleted, null);
      if (dpe == null) { // no position info for this field
        break;
      }
      int num = dpe.advance(docNum);
      if (num != docNum) { // either greater than docNum or NO_MORE_DOCS
        continue; // no data for this term in this doc
      }
      String term = te.term().utf8ToString();
      GrowableStringArray gsa = (GrowableStringArray) res.getReconstructedFields().get(fld);
      if (gsa == null) {
        gsa = new GrowableStringArray();
        res.getReconstructedFields().put(fld, gsa);
      }
      for (int k = 0; k < dpe.freq(); k++) {
        int pos = dpe.nextPosition();
        gsa.append(pos, "|", term);
      }
    }
  }
  progress.message = "Done.";
  progress.curValue = 100;
  setChanged();
  notifyObservers(progress);
  return res;
}
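// Note: the convertOffsets(...) helper called above is not shown in this snippet.
// The following is a minimal, hypothetical sketch of what such a helper might do,
// assuming positions can only be approximated when the index stored offsets but no
// positions: it simply assigns increasing position numbers in offset order. The real
// implementation may estimate positions differently.
private int[] convertOffsets(TermVectorOffsetInfo[] offsets) {
  int[] posArr = new int[offsets.length];
  for (int i = 0; i < offsets.length; i++) {
    // use the rank of the offset as a stand-in position
    posArr[i] = i;
  }
  return posArr;
}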
/**
 * Calculates the cosine similarity between two documents.
 *
 * @param d1 the first document
 * @param d2 the second document
 * @return the cosine similarity
 * @throws IOException
 */
public double getCosineSimilarity(String d1, String d2) throws IOException {
  RAMDirectory ramDir = new RAMDirectory();
  FileReader fr = new FileReader(new File(WikiHelper.getSpecificProperty("stopwordFile")));
  // Set<String> stopWords = new HashSet<String>(FileUtils.readLines(new File("stop-words.txt")));
  Analyzer analyzer = new StopAnalyzer(Version.LUCENE_36, fr);

  // Index the full text of both documents
  @SuppressWarnings("deprecation")
  // IndexWriter writer = new IndexWriter(ramDir, new StandardAnalyzer(Version.LUCENE_36), true,
  //     IndexWriter.MaxFieldLength.UNLIMITED);
  IndexWriter writer =
      new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_36, analyzer));
  Document doc1 = new Document();
  StringReader d1reader = new StringReader(d1);
  doc1.add(new Field("contents", d1reader, TermVector.YES));
  writer.addDocument(doc1);
  Document doc2 = new Document();
  StringReader d2reader = new StringReader(d2);
  doc2.add(new Field("contents", d2reader, TermVector.YES));
  writer.addDocument(doc2);
  // writer.commit();
  writer.close();

  DocVector[] docs = new DocVector[2];

  // Build a term vector for each document
  IndexReader RAMreader = IndexReader.open(ramDir);
  Map<String, Integer> terms = new HashMap<String, Integer>();
  TermEnum termEnum = RAMreader.terms(new Term("contents"));
  // System.out.println(RAMreader.numDocs());
  TermFreqVector tfvs1 = RAMreader.getTermFreqVector(0, "contents");
  TermFreqVector tfvs2 = RAMreader.getTermFreqVector(1, "contents");
  // System.out.println(tfvs1.toString());
  if (tfvs1 == null || tfvs2 == null) {
    return 0.0;
  }
  String[] termTexts1 = tfvs1.getTerms();
  String[] termTexts2 = tfvs2.getTerms();

  // Store the terms and their positions in a hashmap - this represents the vocabulary
  int pos = 0;
  for (String term : termTexts1) {
    terms.put(term, pos++);
  }
  for (String term : termTexts2) {
    if (!terms.containsKey(term)) {
      terms.put(term, pos++);
    }
  }
  docs[0] = new DocVector(terms);
  docs[1] = new DocVector(terms);

  int[] termFreqs1 = tfvs1.getTermFrequencies();
  for (int j = 0; j < termTexts1.length; j++) {
    // System.out.println("termtext:" + termTexts1[j]);
    double idfValue = getIDF(RAMreader, termTexts1[j]);
    // System.out.println("idf:" + idfValue);
    double tfIdfValue = termFreqs1[j] * idfValue;
    // docs[i].setEntry(termTexts[j], termFreqs[j]);
    // System.out.println("TF IDF value " + termFreqs[j] + " " + termTexts[j] + " " + idfValue + "\t" + tfIdfValue);
    docs[0].setEntry(termTexts1[j], tfIdfValue);
  }
  int[] termFreqs2 = tfvs2.getTermFrequencies();
  for (int j = 0; j < termTexts2.length; j++) {
    double idfValue = getIDF(RAMreader, termTexts2[j]);
    double tfIdfValue = termFreqs2[j] * idfValue;
    // docs[i].setEntry(termTexts[j], termFreqs[j]);
    // System.out.println("TF IDF value " + termFreqs[j] + " " + termTexts[j] + " " + idfValue + "\t" + tfIdfValue);
    docs[1].setEntry(termTexts2[j], tfIdfValue);
  }

  // System.out.println(terms.toString());
  // System.out.println(docs[0]);
  // System.out.println(docs[1]);
  RAMreader.close();
  ramDir.close();
  // docs[0].normalize();
  // docs[1].normalize();

  // Return the cosine similarity of the term vectors
  return calcCosineSimilarity(docs[0], docs[1]);
}
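// The DocVector, calcCosineSimilarity, and getIDF helpers used above are assumed and not
// shown in this snippet. Below is a minimal, hypothetical sketch (not the original
// implementation): DocVector holds one tf-idf weight per vocabulary slot, and the cosine
// similarity is the dot product of the two weight vectors divided by the product of their norms.
class DocVector {
  final Map<String, Integer> terms; // shared vocabulary: term -> slot index
  final double[] values;

  DocVector(Map<String, Integer> terms) {
    this.terms = terms;
    this.values = new double[terms.size()];
  }

  void setEntry(String term, double value) {
    values[terms.get(term)] = value;
  }
}

static double calcCosineSimilarity(DocVector v1, DocVector v2) {
  double dot = 0.0, norm1 = 0.0, norm2 = 0.0;
  for (int i = 0; i < v1.values.length; i++) {
    dot += v1.values[i] * v2.values[i];
    norm1 += v1.values[i] * v1.values[i];
    norm2 += v2.values[i] * v2.values[i];
  }
  if (norm1 == 0.0 || norm2 == 0.0) {
    return 0.0;
  }
  return dot / (Math.sqrt(norm1) * Math.sqrt(norm2));
}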
@Override
public void process(ResponseBuilder rb) throws IOException {
  SolrParams params = rb.req.getParams();
  if (!params.getBool(COMPONENT_NAME, false)) {
    return;
  }
  NamedList termVectors = new NamedList();
  rb.rsp.add(TERM_VECTORS, termVectors);
  FieldOptions allFields = new FieldOptions();
  // figure out what options we have, and try to get the appropriate vector
  allFields.termFreq = params.getBool(TermVectorParams.TF, false);
  allFields.positions = params.getBool(TermVectorParams.POSITIONS, false);
  allFields.offsets = params.getBool(TermVectorParams.OFFSETS, false);
  allFields.docFreq = params.getBool(TermVectorParams.DF, false);
  allFields.tfIdf = params.getBool(TermVectorParams.TF_IDF, false);
  // boolean cacheIdf = params.getBool(TermVectorParams.IDF, false);
  // short cut to all values
  boolean all = params.getBool(TermVectorParams.ALL, false);
  if (all == true) {
    allFields.termFreq = true;
    allFields.positions = true;
    allFields.offsets = true;
    allFields.docFreq = true;
    allFields.tfIdf = true;
  }

  String fldLst = params.get(TermVectorParams.FIELDS);
  if (fldLst == null) {
    fldLst = params.get(CommonParams.FL);
  }

  // use this to validate our fields
  IndexSchema schema = rb.req.getSchema();
  // build up our per-field mapping
  Map<String, FieldOptions> fieldOptions = new HashMap<String, FieldOptions>();
  NamedList warnings = new NamedList();
  List<String> noTV = new ArrayList<String>();
  List<String> noPos = new ArrayList<String>();
  List<String> noOff = new ArrayList<String>();

  // we have specific fields to retrieve
  if (fldLst != null) {
    String[] fields = SolrPluginUtils.split(fldLst);
    for (String field : fields) {
      SchemaField sf = schema.getFieldOrNull(field);
      if (sf != null) {
        if (sf.storeTermVector()) {
          FieldOptions option = fieldOptions.get(field);
          if (option == null) {
            option = new FieldOptions();
            option.fieldName = field;
            fieldOptions.put(field, option);
          }
          // get the per-field mappings
          option.termFreq = params.getFieldBool(field, TermVectorParams.TF, allFields.termFreq);
          option.docFreq = params.getFieldBool(field, TermVectorParams.DF, allFields.docFreq);
          option.tfIdf = params.getFieldBool(field, TermVectorParams.TF_IDF, allFields.tfIdf);
          // validate that these are even an option
          option.positions =
              params.getFieldBool(field, TermVectorParams.POSITIONS, allFields.positions);
          if (option.positions == true && sf.storeTermPositions() == false) {
            noPos.add(field);
          }
          option.offsets =
              params.getFieldBool(field, TermVectorParams.OFFSETS, allFields.offsets);
          if (option.offsets == true && sf.storeTermOffsets() == false) {
            noOff.add(field);
          }
        } else {
          // field doesn't have term vectors
          noTV.add(field);
        }
      } else {
        // field doesn't exist
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "undefined field: " + field);
      }
    }
  } // else, deal with all fields

  boolean hasWarnings = false;
  if (noTV.isEmpty() == false) {
    warnings.add("noTermVectors", noTV);
    hasWarnings = true;
  }
  if (noPos.isEmpty() == false) {
    warnings.add("noPositions", noPos);
    hasWarnings = true;
  }
  if (noOff.isEmpty() == false) {
    warnings.add("noOffsets", noOff);
    hasWarnings = true;
  }
  if (hasWarnings == true) {
    termVectors.add("warnings", warnings);
  }

  DocListAndSet listAndSet = rb.getResults();
  List<Integer> docIds = getInts(params.getParams(TermVectorParams.DOC_IDS));
  Iterator<Integer> iter;
  if (docIds != null && docIds.isEmpty() == false) {
    iter = docIds.iterator();
  } else {
    DocList list = listAndSet.docList;
    iter = list.iterator();
  }
  SolrIndexSearcher searcher = rb.req.getSearcher();
  IndexReader reader = searcher.getReader();
  // the TVMapper is a TermVectorMapper which can be used to optimize loading of Term Vectors
  SchemaField keyField = schema.getUniqueKeyField();
  String uniqFieldName = null;
  if (keyField != null) {
    uniqFieldName = keyField.getName();
  }
  // only load the id field to get the uniqueKey of that field
  SetBasedFieldSelector fieldSelector =
      new SetBasedFieldSelector(
          Collections.singleton(uniqFieldName), Collections.<String>emptySet());
  TVMapper mapper = new TVMapper(reader);
  mapper.fieldOptions = allFields; // this will only stay set if fieldOptions.isEmpty()
                                   // (in other words, only if the user didn't set any fields)
  while (iter.hasNext()) {
    Integer docId = iter.next();
    NamedList docNL = new NamedList();
    mapper.docNL = docNL;
    termVectors.add("doc-" + docId, docNL);
    if (keyField != null) {
      Document document = reader.document(docId, fieldSelector);
      Fieldable uniqId = document.getFieldable(uniqFieldName);
      String uniqVal = null;
      if (uniqId != null) {
        uniqVal = keyField.getType().storedToReadable(uniqId);
      }
      if (uniqVal != null) {
        docNL.add("uniqueKey", uniqVal);
        termVectors.add("uniqueKeyFieldName", uniqFieldName);
      }
    }
    if (fieldOptions.isEmpty() == false) {
      for (Map.Entry<String, FieldOptions> entry : fieldOptions.entrySet()) {
        mapper.fieldOptions = entry.getValue();
        reader.getTermFreqVector(docId, entry.getKey(), mapper);
      }
    } else {
      // deal with all fields by using the allFieldMapper
      reader.getTermFreqVector(docId, mapper);
    }
  }
}
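// The TVMapper used above is an inner class of the component and is not shown in this
// snippet. Below is a minimal, hypothetical sketch of such a mapper (the real class also
// handles positions, offsets, docFreq, and tf-idf): Lucene's TermVectorMapper receives a
// callback per field and per term instead of materializing a whole TermFreqVector first.
static class SimpleTVMapper extends TermVectorMapper {
  NamedList docNL;              // output list for the current document
  FieldOptions fieldOptions;    // which statistics to emit
  private NamedList fieldNL;    // output list for the current field

  @Override
  public void setExpectations(String field, int numTerms,
                              boolean storeOffsets, boolean storePositions) {
    // called once per field before map() is invoked for its terms
    fieldNL = new NamedList();
    docNL.add(field, fieldNL);
  }

  @Override
  public void map(String term, int frequency,
                  TermVectorOffsetInfo[] offsets, int[] positions) {
    // called once per term in the field's term vector
    NamedList termInfo = new NamedList();
    fieldNL.add(term, termInfo);
    if (fieldOptions.termFreq) {
      termInfo.add("tf", frequency);
    }
  }
}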
/** @see LuceneIndexReader#getTermFreqVector(int, String) */
public TermFreqVector getTermFreqVector(int docNumber, String field) throws IOException {
  return indexReader.getTermFreqVector(docNumber, field);
}