/**
 * Utility function to display a term vector.
 *
 * <p>Prints the vocabulary size, then one line per term holding the term's
 * ordinal, its text, its total term frequency, and the positions at which
 * the term occurs.
 *
 * @param terms the term vector to display; when it is {@code null} or
 *     reports a size of -1, a "not stored" message is printed instead
 * @throws IOException if reading the terms or postings fails
 */
static void termVectorDisplay(Terms terms) throws IOException {
  if ((terms == null) || (terms.size() == -1)) {
    System.out.println(" The field is not stored.");
    return;
  }

  /*
   * The terms for this field are stored.
   */
  System.out.println(" Vocabulary size: " + terms.size() + " terms");

  TermsEnum ithTerm = terms.iterator(null);

  /*
   * Iterate over the terms in this term vector. Information about a
   * term's occurrences (tf and positions) is read through the postings
   * enumeration obtained for each term.
   */
  while (ithTerm.next() != null) {
    System.out.format(
        " %10d %-20s %d ",
        ithTerm.ord(),
        ithTerm.term().utf8ToString(),
        ithTerm.totalTermFreq());

    DocsAndPositionsEnum currDoc = ithTerm.docsAndPositions(null, null);

    /*
     * Per the Lucene TermsEnum documentation, docsAndPositions returns
     * null when positions were not indexed; in that case (or when the
     * enumeration has no document) only the term summary is printed.
     */
    if ((currDoc != null)
        && (currDoc.nextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS)) {
      /*
       * nextPosition may be called at most freq() times for the
       * current document, so freq() is the loop bound.
       */
      int occurrences = currDoc.freq();
      for (int jthPosition = 0; jthPosition < occurrences; jthPosition++) {
        System.out.print(currDoc.nextPosition() + " ");
      }
    }

    System.out.println();
  }
}
/**
 * listTermDictionary displays the term dictionary for a field.
 *
 * <p>Prints a header naming the field, the vocabulary size, and then one
 * line per term with the term text, its document frequency, and its total
 * term frequency.
 *
 * @param reader the index reader whose term dictionary is listed
 * @param fieldName the name of the field to list
 * @throws IOException if reading the term dictionary fails
 */
static void listTermDictionary(IndexReader reader, String fieldName) throws IOException {
  System.out.println("\nTerm Dictionary: field " + fieldName);

  /*
   * Grant says: MultiFields.getTerms(IndexReader, fieldName)
   */
  Terms terms = MultiFields.getTerms(reader, fieldName);

  if (terms == null) {
    System.out.println(" The term dictionary is empty.");
    return;
  }

  /*
   * Terms.size() returns -1 when the size is not available (which the
   * Lucene documentation says can happen, e.g. for merged multi-segment
   * views). That does not mean the dictionary is empty, so count the
   * terms with a separate pass in that case.
   */
  long vocabularySize = terms.size();
  if (vocabularySize == -1) {
    vocabularySize = 0;
    TermsEnum counter = terms.iterator(null);
    while (counter.next() != null) {
      vocabularySize++;
    }
  }

  System.out.println(" Vocabulary size: " + vocabularySize + " terms");

  /*
   * Iterate over every term in the field's dictionary and print its
   * document frequency and total term frequency.
   */
  TermsEnum ithTerm = terms.iterator(null);
  while (ithTerm.next() != null) {
    System.out.format(
        " %-30s %d %d\n",
        ithTerm.term().utf8ToString(),
        ithTerm.docFreq(),
        ithTerm.totalTermFreq());
  }
}
/**
 * Reads the index at {@code indexPath} and builds a sparse data set from the
 * terms of the {@code reviewKey} field.
 *
 * <p>First, every term of the field whose total term frequency is at least
 * {@code threshold} is registered as an attribute. Then, for each document,
 * the document's term vector is turned into one sparse instance whose values
 * are the frequencies of the registered terms.
 *
 * @param indexPath file-system path of the index directory
 * @param destFile destination file name; only checked for {@code null}, it
 *     is not otherwise used by this method
 * @param threshold minimum total term frequency for a term to become an
 *     attribute
 * @return the populated data set, or {@code null} when an argument is
 *     {@code null} or the field has no terms in the index
 * @throws Exception if the index cannot be opened or read
 */
public SparseInstances readIndex(String indexPath, String destFile, int threshold) throws Exception {
  if (indexPath == null || destFile == null) {
    System.out.println("error: indexPath or destFile is null\n");
    return null;
  }

  // Both the directory and the reader hold resources; close them on every exit path.
  try (FSDirectory directory = FSDirectory.open(Paths.get(indexPath));
      DirectoryReader reader = DirectoryReader.open(directory)) {

    Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(reviewKey);
    if (terms == null) {
      // The field does not exist in the index (or has no indexed terms).
      System.out.println("error: no terms found for field " + reviewKey + "\n");
      return null;
    }

    // Terms.size() may be -1 when the size is unavailable; clamp the sizing
    // hint to [0, 65535] so it is always a valid initial capacity.
    int capacity = (int) Math.max(0L, Math.min(terms.size(), 65535L));
    HashMap<String, Integer> wordDict = new HashMap<>(capacity);
    SparseInstances instData = new SparseInstances(capacity, reader.numDocs());

    // Pass 1: register every sufficiently frequent term as an attribute and
    // remember the index it was registered under.
    TermsEnum termsEnum = terms.iterator();
    int index = 0;
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
      if (termsEnum.totalTermFreq() < threshold) {
        continue;
      }
      // utf8ToString() yields the term text; BytesRef.toString() would
      // yield a hex dump of the bytes instead.
      String strTerm = term.utf8ToString();
      if (strTerm.isEmpty() || wordDict.containsKey(strTerm)) {
        continue;
      }
      instData.addAttribute(strTerm);
      wordDict.put(strTerm, index);
      index++;
    }

    int numAtt = instData.numAttributes();
    // NOTE(review): the loop bound relies on numInstances() reflecting the
    // document count passed to the constructor above -- confirm against
    // SparseInstances.
    int numInst = instData.numInstances();

    // Pass 2: convert each document's term vector into a sparse instance.
    for (int docIndex = 0; docIndex < numInst; docIndex++) {
      String id = reader.document(docIndex).getField(idKey).stringValue();
      Terms docTerms = reader.getTermVector(docIndex, reviewKey);
      if (docTerms == null) {
        // No term vector stored for this document.
        continue;
      }

      // Sized for the worst case (every term of the document registered).
      int[] indices = new int[(int) docTerms.size()];
      double[] attValues = new double[(int) docTerms.size()];

      TermsEnum docTermsEnum = docTerms.iterator();
      int termIndex = 0;
      while ((term = docTermsEnum.next()) != null) {
        Integer attIndex = wordDict.get(term.utf8ToString());
        if (attIndex == null) {
          // Term was filtered out in pass 1.
          continue;
        }
        indices[termIndex] = attIndex.intValue();
        attValues[termIndex] = docTermsEnum.totalTermFreq();
        termIndex++;
      }

      // Drop the unused tail so only the entries actually filled in are
      // handed to the instance.
      if (termIndex < indices.length) {
        indices = java.util.Arrays.copyOf(indices, termIndex);
        attValues = java.util.Arrays.copyOf(attValues, termIndex);
      }

      ESparseInstance instance = new ESparseInstance(id, 1.0, attValues, indices, numAtt);
      instData.addInstance(instance);
    }

    return instData;
  }
}