@Test
public void testProviderVersion2() throws IOException {
    // Write suggestion data with the current (v2) provider, reload it, and verify that the
    // loaded holder uses the current separator and end-byte constants.
    AnalyzingCompletionLookupProvider currentProvider =
        new AnalyzingCompletionLookupProvider(true, false, true, true);
    RAMDirectory dir = new RAMDirectory();
    writeData(dir, currentProvider);

    IndexInput input = dir.openInput("foo.txt", IOContext.DEFAULT);
    LookupFactory load = currentProvider.load(input);
    PostingsFormat format = new Elasticsearch090PostingsFormat();
    NamedAnalyzer analyzer = new NamedAnalyzer("foo", new StandardAnalyzer());
    AnalyzingCompletionLookupProvider.AnalyzingSuggestHolder analyzingSuggestHolder =
        load.getAnalyzingSuggestHolder(
            new CompletionFieldMapper(
                new Names("foo"),
                analyzer,
                analyzer,
                format,
                null,
                true,
                true,
                true,
                Integer.MAX_VALUE,
                indexSettings,
                AbstractFieldMapper.MultiFields.empty(),
                null,
                ContextMapping.EMPTY_MAPPING));
    assertThat(analyzingSuggestHolder.sepLabel, is(XAnalyzingSuggester.SEP_LABEL));
    assertThat(analyzingSuggestHolder.payloadSep, is(XAnalyzingSuggester.PAYLOAD_SEP));
    assertThat(analyzingSuggestHolder.endByte, is(XAnalyzingSuggester.END_BYTE));
    dir.close();
}
/** {@inheritDoc} */
public LuceneIndexDataManager merge(final Collection<LuceneIndexDataManager> chains) throws IndexException {
    final List<TransactionLog> transactionsLogs = new ArrayList<TransactionLog>();
    final List<Directory> mergeDirectorys = new ArrayList<Directory>();
    final Map<String, Document> documentsBuffer = new HashMap<String, Document>();
    final Map<String, Document> pendingBuffer = new HashMap<String, Document>();

    for (final IndexDataKeeper<Document> indexDataKeeper : chains) {
        final ReducibleInMemoryIndexDataKeeper reducibleInMemoryIndexDataKeeper =
            (ReducibleInMemoryIndexDataKeeper) indexDataKeeper;
        if (reducibleInMemoryIndexDataKeeper.getDocumentCount() > 0) {
            final RAMDirectory directory =
                (RAMDirectory) reducibleInMemoryIndexDataKeeper.getDirectory();
            if (directory.sizeInBytes() > 0) {
                mergeDirectorys.add(directory);
            }
            pendingBuffer.putAll(reducibleInMemoryIndexDataKeeper.getPendingDocumentsBuffer());
            documentsBuffer.putAll(reducibleInMemoryIndexDataKeeper.getDocumentsBuffer());
            transactionsLogs.add(reducibleInMemoryIndexDataKeeper.getTransactionLog());
        }
    }

    LuceneIndexDataManager reducibleInMemoryIndexDataKeeper = null;
    try {
        RAMDirectory newDirectory = null;
        if (mergeDirectorys.size() > 0) {
            newDirectory = new RAMDirectory();
            final IndexWriter newWriter =
                new IndexWriter(newDirectory, new StandardAnalyzer(), MaxFieldLength.UNLIMITED);
            final Directory[] dirsToMerge = new Directory[mergeDirectorys.size()];
            newWriter.addIndexesNoOptimize(mergeDirectorys.toArray(dirsToMerge));
            newWriter.optimize();
            newWriter.close();
        } else {
            newDirectory = new RAMDirectory();
        }
        reducibleInMemoryIndexDataKeeper =
            new ReducibleInMemoryIndexDataKeeper(
                newDirectory,
                documentsBuffer,
                pendingBuffer,
                new CompositeTransactionLog(transactionsLogs));
    } catch (final IOException e) {
        throw new IndexException(e.getLocalizedMessage(), e);
    } catch (final TransactionLogException e) {
        throw new IndexException(e.getLocalizedMessage(), e);
    }
    return reducibleInMemoryIndexDataKeeper;
}
private void resetForm() throws IOException {
    deleteList.clear();
    if (dir.sizeInBytes() > 0) {
        // it's ok if we don't close a ram directory
        dir.close();
        // an alternative is to delete all the files and reuse the ram directory
        dir = new RAMDirectory();
    }
    assert (writer == null);
    numDocs = 0;
}
// LUCENE-1196
public void testIllegalEOF() throws Exception {
    RAMDirectory dir = new RAMDirectory();
    IndexOutput o = dir.createOutput("out");
    byte[] b = new byte[1024];
    o.writeBytes(b, 0, 1024);
    o.close();
    IndexInput i = dir.openInput("out");
    i.seek(1024);
    i.close();
    dir.close();
}
@Test
public void testNoDocs() throws IOException {
    AnalyzingCompletionLookupProvider provider =
        new AnalyzingCompletionLookupProvider(true, false, true, true);
    RAMDirectory dir = new RAMDirectory();
    IndexOutput output = dir.createOutput("foo.txt", IOContext.DEFAULT);
    FieldsConsumer consumer = provider.consumer(output);
    consumer.write(
        new Fields() {
            @Override
            public Iterator<String> iterator() {
                return Arrays.asList("foo").iterator();
            }

            @Override
            public Terms terms(String field) throws IOException {
                return null;
            }

            @Override
            public int size() {
                return 1;
            }
        });
    consumer.close();
    output.close();

    IndexInput input = dir.openInput("foo.txt", IOContext.DEFAULT);
    LookupFactory load = provider.load(input);
    PostingsFormat format = new Elasticsearch090PostingsFormat();
    NamedAnalyzer analyzer = new NamedAnalyzer("foo", new StandardAnalyzer());
    assertNull(
        load.getLookup(
            new CompletionFieldMapper(
                new Names("foo"),
                analyzer,
                analyzer,
                format,
                null,
                true,
                true,
                true,
                Integer.MAX_VALUE,
                indexSettings,
                AbstractFieldMapper.MultiFields.empty(),
                null,
                ContextMapping.EMPTY_MAPPING),
            new CompletionSuggestionContext(null)));
    dir.close();
}
// LUCENE-2852
public void testSeekToEOFThenBack() throws Exception {
    RAMDirectory dir = new RAMDirectory();

    IndexOutput o = dir.createOutput("out");
    byte[] bytes = new byte[3 * RAMInputStream.BUFFER_SIZE];
    o.writeBytes(bytes, 0, bytes.length);
    o.close();

    IndexInput i = dir.openInput("out");
    i.seek(2 * RAMInputStream.BUFFER_SIZE - 1);
    i.seek(3 * RAMInputStream.BUFFER_SIZE);
    i.seek(RAMInputStream.BUFFER_SIZE);
    i.readBytes(bytes, 0, 2 * RAMInputStream.BUFFER_SIZE);
    i.close();
    dir.close();
}
/* (non-Javadoc)
 * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
 */
public void write(DataOutput out) throws IOException {
    out.writeInt(deleteList.size());
    for (Term term : deleteList) {
        Text.writeString(out, term.field());
        Text.writeString(out, term.text());
    }
    String[] files = dir.list();
    RAMDirectoryUtil.writeRAMFiles(out, dir, files);
}
public Lookup buildAnalyzingLookup(
    final CompletionFieldMapper mapper, String[] terms, String[] surfaces, long[] weights)
    throws IOException {
    RAMDirectory dir = new RAMDirectory();
    FilterCodec filterCodec =
        new FilterCodec("filtered", Codec.getDefault()) {
            @Override
            public PostingsFormat postingsFormat() {
                final PostingsFormat in = super.postingsFormat();
                return mapper.postingsFormat(in);
            }
        };
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(mapper.indexAnalyzer());
    indexWriterConfig.setCodec(filterCodec);
    IndexWriter writer = new IndexWriter(dir, indexWriterConfig);
    for (int i = 0; i < weights.length; i++) {
        Document doc = new Document();
        BytesRef payload =
            mapper.buildPayload(
                new BytesRef(surfaces[i]), weights[i], new BytesRef(Long.toString(weights[i])));
        doc.add(mapper.getCompletionField(ContextMapping.EMPTY_CONTEXT, terms[i], payload));
        if (randomBoolean()) {
            writer.commit();
        }
        writer.addDocument(doc);
    }
    writer.commit();
    writer.forceMerge(1, true);
    writer.commit();

    DirectoryReader reader = DirectoryReader.open(writer, true);
    assertThat(reader.leaves().size(), equalTo(1));
    assertThat(reader.leaves().get(0).reader().numDocs(), equalTo(weights.length));
    LeafReaderContext atomicReaderContext = reader.leaves().get(0);
    Terms luceneTerms = atomicReaderContext.reader().terms(mapper.name());
    Lookup lookup =
        ((Completion090PostingsFormat.CompletionTerms) luceneTerms)
            .getLookup(mapper, new CompletionSuggestionContext(null));
    reader.close();
    writer.close();
    dir.close();
    return lookup;
}
@Test
public void testCompletionPostingsFormat() throws IOException {
    // Data written by either the legacy (v1) or the current provider must be readable
    // by the current provider.
    AnalyzingCompletionLookupProviderV1 providerV1 =
        new AnalyzingCompletionLookupProviderV1(true, false, true, true);
    AnalyzingCompletionLookupProvider currentProvider =
        new AnalyzingCompletionLookupProvider(true, false, true, true);
    List<Completion090PostingsFormat.CompletionLookupProvider> providers =
        Lists.newArrayList(providerV1, currentProvider);
    Completion090PostingsFormat.CompletionLookupProvider randomProvider =
        providers.get(getRandom().nextInt(providers.size()));

    RAMDirectory dir = new RAMDirectory();
    writeData(dir, randomProvider);

    IndexInput input = dir.openInput("foo.txt", IOContext.DEFAULT);
    LookupFactory load = currentProvider.load(input);
    PostingsFormat format = PostingsFormat.forName(Lucene.LATEST_POSTINGS_FORMAT);
    NamedAnalyzer analyzer = new NamedAnalyzer("foo", new StandardAnalyzer());
    Lookup lookup =
        load.getLookup(
            new CompletionFieldMapper(
                new Names("foo"),
                analyzer,
                analyzer,
                format,
                null,
                true,
                true,
                true,
                Integer.MAX_VALUE,
                indexSettings,
                AbstractFieldMapper.MultiFields.empty(),
                null,
                ContextMapping.EMPTY_MAPPING),
            new CompletionSuggestionContext(null));
    List<LookupResult> result = lookup.lookup("ge", false, 10);
    assertThat(result.get(0).key.toString(), equalTo("Generator - Foo Fighters"));
    assertThat(result.get(0).payload.utf8ToString(), equalTo("id:10"));
    dir.close();
}
final synchronized byte[] addBuffer(int size) {
    byte[] buffer = newBuffer(size);
    if (directory != null) {
        // Ensure addition of buffer and adjustment to directory size are atomic wrt directory
        synchronized (directory) {
            buffers.add(buffer);
            directory.sizeInBytes += size;
            sizeInBytes += size;
        }
    } else {
        buffers.add(buffer);
    }
    return buffer;
}
public void seek(TermEnum terms) throws IOException {
    original.seek(terms);
    docFreq = terms.docFreq();
    pointer = -1;

    if (docFreq > postingMaps.length) { // grow postingsMap
        PostingMap[] newMap = new PostingMap[docFreq];
        System.arraycopy(postingMaps, 0, newMap, 0, postingMaps.length);
        for (int i = postingMaps.length; i < docFreq; i++) {
            newMap[i] = new PostingMap();
        }
        postingMaps = newMap;
    }

    out.reset();
    int i = 0;
    while (original.next()) {
        PostingMap map = postingMaps[i++];
        map.newDoc = oldToNew[original.doc()]; // remap the newDoc id
        map.offset = out.getFilePointer();     // save pointer to buffer

        final int tf = original.freq();        // buffer tf & positions
        out.writeVInt(tf);
        int prevPosition = 0;
        for (int j = tf; j > 0; j--) {         // delta encode positions
            int p = original.nextPosition();
            out.writeVInt(p - prevPosition);
            prevPosition = p;
        }
    }
    out.flush();
    docFreq = i;                               // allow for deletions
    Arrays.sort(postingMaps, 0, docFreq);      // resort by mapped doc ids
    // HeapSorter.sort(postingMaps, docFreq);  // TODO MC - due to the lack of space

    // NOTE: this might be substantially faster if RAMInputStream were public
    // and supported a reset() operation.
    in = tempDir.openInput(TEMP_FILE);
}
public static DocVector[] getCosineSimilarityMatrix(List<String> fileSentences) throws IOException {
    RAMDirectory ramDir = new RAMDirectory();
    FileReader fr = new FileReader(new File("lib/stoplists/en.txt"));
    // Set<String> stopWords = new HashSet<String>(FileUtils.readLines(new File("stop-words.txt")));
    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_36, fr);

    // Index the full text of all sentences
    // IndexWriter writer = new IndexWriter(ramDir, new StandardAnalyzer(Version.LUCENE_36), true,
    //     IndexWriter.MaxFieldLength.UNLIMITED);
    IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_36, analyzer));
    for (String s : fileSentences) {
        Document doc1 = new Document();
        StringReader d1reader = new StringReader(s);
        doc1.add(new Field("contents", d1reader, TermVector.YES));
        writer.addDocument(doc1);
    }
    // writer.commit();
    writer.close();

    DocVector[] docs = new DocVector[fileSentences.size()];

    // Build the vocabulary: map each term of the "contents" field to a position.
    IndexReader RAMreader = IndexReader.open(ramDir);
    Map<String, Integer> terms = new HashMap<String, Integer>();
    TermEnum termEnum = RAMreader.terms(new Term("contents"));
    int pos = 0;
    // terms(Term) returns an enum that is already positioned on the first matching term,
    // so read the current term before advancing to avoid skipping it.
    do {
        Term term = termEnum.term();
        if (term == null || !"contents".equals(term.field())) break;
        terms.put(term.text(), pos++);
    } while (termEnum.next());

    // Build a tf-idf term vector for each document.
    for (int i = 0; i < fileSentences.size(); i++) {
        TermFreqVector[] tfvs = RAMreader.getTermFreqVectors(i);
        docs[i] = new DocVector(terms);
        if (tfvs == null) continue;
        for (TermFreqVector tfv : tfvs) {
            String[] termTexts = tfv.getTerms();
            int[] termFreqs = tfv.getTermFrequencies();
            for (int j = 0; j < termTexts.length; j++) {
                double idfValue = getIDF(RAMreader, termTexts[j]);
                double tfIdfValue = termFreqs[j] * idfValue;
                docs[i].setEntry(termTexts[j], tfIdfValue);
            }
        }
        docs[i].normalize();
    }

    RAMreader.close();
    ramDir.close();
    return docs;
}
/**
 * Calculates the cosine similarity between two documents.
 *
 * @param d1 the first document
 * @param d2 the second document
 * @return the cosine similarity
 * @throws IOException
 */
public double getCosineSimilarity(String d1, String d2) throws IOException {
    RAMDirectory ramDir = new RAMDirectory();
    FileReader fr = new FileReader(new File(WikiHelper.getSpecificProperty("stopwordFile")));
    // Set<String> stopWords = new HashSet<String>(FileUtils.readLines(new File("stop-words.txt")));
    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_36, fr);

    // Index the full text of both documents
    // IndexWriter writer = new IndexWriter(ramDir, new StandardAnalyzer(Version.LUCENE_36), true,
    //     IndexWriter.MaxFieldLength.UNLIMITED);
    @SuppressWarnings("deprecation")
    IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_36, analyzer));
    Document doc1 = new Document();
    StringReader d1reader = new StringReader(d1);
    doc1.add(new Field("contents", d1reader, TermVector.YES));
    writer.addDocument(doc1);

    Document doc2 = new Document();
    StringReader d2reader = new StringReader(d2);
    doc2.add(new Field("contents", d2reader, TermVector.YES));
    writer.addDocument(doc2);
    // writer.commit();
    writer.close();

    DocVector[] docs = new DocVector[2];

    // Build a term vector for each document
    IndexReader RAMreader = IndexReader.open(ramDir);
    Map<String, Integer> terms = new HashMap<String, Integer>();
    TermEnum termEnum = RAMreader.terms(new Term("contents")); // note: not used below

    TermFreqVector tfvs1 = RAMreader.getTermFreqVector(0, "contents");
    TermFreqVector tfvs2 = RAMreader.getTermFreqVector(1, "contents");
    if (tfvs1 == null || tfvs2 == null) {
        return 0.0;
    }
    String[] termTexts1 = tfvs1.getTerms();
    String[] termTexts2 = tfvs2.getTerms();

    // Store the terms and their positions in a hashmap - this represents the vocabulary
    int pos = 0;
    for (String term : termTexts1) {
        terms.put(term, pos++);
    }
    for (String term : termTexts2) {
        if (!terms.containsKey(term)) {
            terms.put(term, pos++);
        }
    }
    docs[0] = new DocVector(terms);
    docs[1] = new DocVector(terms);

    // Fill each vector with tf-idf weights
    int[] termFreqs1 = tfvs1.getTermFrequencies();
    for (int j = 0; j < termTexts1.length; j++) {
        double idfValue = getIDF(RAMreader, termTexts1[j]);
        double tfIdfValue = termFreqs1[j] * idfValue;
        docs[0].setEntry(termTexts1[j], tfIdfValue);
    }
    int[] termFreqs2 = tfvs2.getTermFrequencies();
    for (int j = 0; j < termTexts2.length; j++) {
        double idfValue = getIDF(RAMreader, termTexts2[j]);
        double tfIdfValue = termFreqs2[j] * idfValue;
        docs[1].setEntry(termTexts2[j], tfIdfValue);
    }

    RAMreader.close();
    ramDir.close();
    // docs[0].normalize();
    // docs[1].normalize();

    // Return the cosine similarity of the term vectors
    return calcCosineSimilarity(docs[0], docs[1]);
}
public void write(DataOutput out) throws IOException {
    String[] files = dir.listAll();
    RAMDirectoryUtil.writeRAMFiles(out, dir, files);
}
private static class SortedTermPositions implements TermPositions {
    private TermPositions original;
    private int[] oldToNew;

    private int docFreq;
    private PostingMap[] postingMaps = new PostingMap[0];
    private int pointer;

    private int freq;
    private int position;

    private static final String TEMP_FILE = "temp";
    private final RAMDirectory tempDir = new RAMDirectory();
    private final RAMOutputStream out = (RAMOutputStream) tempDir.createOutput(TEMP_FILE);
    private IndexInput in;

    public SortedTermPositions(TermPositions original, int[] oldToNew) {
        this.original = original;
        this.oldToNew = oldToNew;
    }

    public void seek(Term term) throws IOException {
        throw new UnsupportedOperationException();
    }

    public void seek(TermEnum terms) throws IOException {
        original.seek(terms);
        docFreq = terms.docFreq();
        pointer = -1;

        if (docFreq > postingMaps.length) { // grow postingsMap
            PostingMap[] newMap = new PostingMap[docFreq];
            System.arraycopy(postingMaps, 0, newMap, 0, postingMaps.length);
            for (int i = postingMaps.length; i < docFreq; i++) {
                newMap[i] = new PostingMap();
            }
            postingMaps = newMap;
        }

        out.reset();
        int i = 0;
        while (original.next()) {
            PostingMap map = postingMaps[i++];
            map.newDoc = oldToNew[original.doc()]; // remap the newDoc id
            map.offset = out.getFilePointer();     // save pointer to buffer

            final int tf = original.freq();        // buffer tf & positions
            out.writeVInt(tf);
            int prevPosition = 0;
            for (int j = tf; j > 0; j--) {         // delta encode positions
                int p = original.nextPosition();
                out.writeVInt(p - prevPosition);
                prevPosition = p;
            }
        }
        out.flush();
        docFreq = i;                               // allow for deletions
        Arrays.sort(postingMaps, 0, docFreq);      // resort by mapped doc ids
        // HeapSorter.sort(postingMaps, docFreq);  // TODO MC - due to the lack of space

        // NOTE: this might be substantially faster if RAMInputStream were public
        // and supported a reset() operation.
        in = tempDir.openInput(TEMP_FILE);
    }

    public boolean next() throws IOException {
        pointer++;
        if (pointer < docFreq) {
            in.seek(postingMaps[pointer].offset);
            freq = in.readVInt();
            position = 0;
            return true;
        }
        return false;
    }

    public int doc() {
        return postingMaps[pointer].newDoc;
    }

    public int freq() {
        return freq;
    }

    public int nextPosition() throws IOException {
        int positionIncrement = in.readVInt();
        position += positionIncrement;
        return position;
    }

    public int read(int[] docs, int[] freqs) {
        throw new UnsupportedOperationException();
    }

    public boolean skipTo(int target) {
        throw new UnsupportedOperationException();
    }

    public void close() throws IOException {
        original.close();
    }
}