@Test
  public void testProviderVersion2() throws IOException {
    AnalyzingCompletionLookupProvider currentProvider =
        new AnalyzingCompletionLookupProvider(true, false, true, true);

    RAMDirectory dir = new RAMDirectory();
    writeData(dir, currentProvider);

    IndexInput input = dir.openInput("foo.txt", IOContext.DEFAULT);
    LookupFactory load = currentProvider.load(input);
    PostingsFormat format = new Elasticsearch090PostingsFormat();
    NamedAnalyzer analyzer = new NamedAnalyzer("foo", new StandardAnalyzer());
    AnalyzingCompletionLookupProvider.AnalyzingSuggestHolder analyzingSuggestHolder =
        load.getAnalyzingSuggestHolder(
            new CompletionFieldMapper(
                new Names("foo"),
                analyzer,
                analyzer,
                format,
                null,
                true,
                true,
                true,
                Integer.MAX_VALUE,
                indexSettings,
                AbstractFieldMapper.MultiFields.empty(),
                null,
                ContextMapping.EMPTY_MAPPING));
    assertThat(analyzingSuggestHolder.sepLabel, is(XAnalyzingSuggester.SEP_LABEL));
    assertThat(analyzingSuggestHolder.payloadSep, is(XAnalyzingSuggester.PAYLOAD_SEP));
    assertThat(analyzingSuggestHolder.endByte, is(XAnalyzingSuggester.END_BYTE));
    dir.close();
  }
  /**
   * {@inheritDoc}
   *
   * <p>Merges the given in-memory index chains into a single {@link
   * ReducibleInMemoryIndexDataKeeper}: the non-empty RAM directories are combined via {@code
   * addIndexesNoOptimize}, and the document buffers, pending buffers, and transaction logs are
   * merged alongside them.
   */
  public LuceneIndexDataManager merge(final Collection<LuceneIndexDataManager> chains)
      throws IndexException {
    final List<TransactionLog> transactionsLogs = new ArrayList<TransactionLog>();
    final List<Directory> mergeDirectorys = new ArrayList<Directory>();
    final Map<String, Document> documentsBuffer = new HashMap<String, Document>();
    final Map<String, Document> pendingBuffer = new HashMap<String, Document>();

    for (final IndexDataKeeper<Document> indexDataKeeper : chains) {
      final ReducibleInMemoryIndexDataKeeper reducibleInMemoryIndexDataKeeper =
          (ReducibleInMemoryIndexDataKeeper) indexDataKeeper;

      if (reducibleInMemoryIndexDataKeeper.getDocumentCount() > 0) {

        final RAMDirectory directory =
            (RAMDirectory) reducibleInMemoryIndexDataKeeper.getDirectory();
        if (directory.sizeInBytes() > 0) {
          mergeDirectorys.add(directory);
        }
        pendingBuffer.putAll(reducibleInMemoryIndexDataKeeper.getPendingDocumentsBuffer());
        documentsBuffer.putAll(reducibleInMemoryIndexDataKeeper.getDocumentsBuffer());
        transactionsLogs.add(reducibleInMemoryIndexDataKeeper.getTransactionLog());
      }
    }
    LuceneIndexDataManager reducibleInMemoryIndexDataKeeper = null;
    try {
      RAMDirectory newDirectory = null;

      if (mergeDirectorys.size() > 0) {

        newDirectory = new RAMDirectory();
        final IndexWriter newWriter =
            new IndexWriter(newDirectory, new StandardAnalyzer(), MaxFieldLength.UNLIMITED);
        final Directory[] dirsToMerge = new Directory[mergeDirectorys.size()];
        newWriter.addIndexesNoOptimize(mergeDirectorys.toArray(dirsToMerge));
        newWriter.optimize();
        newWriter.close();

      } else {
        newDirectory = new RAMDirectory();
      }
      reducibleInMemoryIndexDataKeeper =
          new ReducibleInMemoryIndexDataKeeper(
              newDirectory,
              documentsBuffer,
              pendingBuffer,
              new CompositeTransactionLog(transactionsLogs));

    } catch (final IOException e) {
      throw new IndexException(e.getLocalizedMessage(), e);
    } catch (final TransactionLogException e) {
      throw new IndexException(e.getLocalizedMessage(), e);
    }

    return reducibleInMemoryIndexDataKeeper;
  }
 private void resetForm() throws IOException {
   deleteList.clear();
   if (dir.sizeInBytes() > 0) {
     // it's OK if we don't close a RAMDirectory
     dir.close();
     // an alternative is to delete all the files and reuse the RAMDirectory (see the sketch below)
     dir = new RAMDirectory();
   }
   assert (writer == null);
   numDocs = 0;
 }
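 // The alternative mentioned in the comment above (deleting the files and reusing the
 // RAMDirectory instead of closing it) might look like the sketch below. This is illustrative,
 // not part of the original class, and assumes a Lucene version whose Directory exposes
 // listAll() and deleteFile().
 private void resetFormReusingDirectory() throws IOException {
   deleteList.clear();
   // Drop every file but keep the RAMDirectory instance alive for reuse.
   for (String name : dir.listAll()) {
     dir.deleteFile(name);
   }
   assert (writer == null);
   numDocs = 0;
 }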
 // LUCENE-1196
 public void testIllegalEOF() throws Exception {
   RAMDirectory dir = new RAMDirectory();
   IndexOutput o = dir.createOutput("out");
   byte[] b = new byte[1024];
   o.writeBytes(b, 0, 1024);
   o.close();
   IndexInput i = dir.openInput("out");
   i.seek(1024);
   i.close();
   dir.close();
 }
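  // A companion check, not in the original snippet: seeking to the exact end of the file is
  // legal (LUCENE-1196), but reading past it should fail. The exact exception type thrown by
  // the RAMDirectory input, and the availability of JUnit's fail() in this test class, are
  // assumptions about the version in use.
  public void testReadPastEOFFails() throws Exception {
    RAMDirectory dir = new RAMDirectory();
    IndexOutput o = dir.createOutput("out");
    o.writeBytes(new byte[1024], 0, 1024);
    o.close();
    IndexInput i = dir.openInput("out");
    i.seek(1024); // at EOF, allowed
    try {
      i.readByte(); // one byte past EOF, should throw
      fail("reading past EOF should have thrown an exception");
    } catch (IOException expected) {
      // expected
    }
    i.close();
    dir.close();
  }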
  @Test
  public void testNoDocs() throws IOException {
    AnalyzingCompletionLookupProvider provider =
        new AnalyzingCompletionLookupProvider(true, false, true, true);
    RAMDirectory dir = new RAMDirectory();
    IndexOutput output = dir.createOutput("foo.txt", IOContext.DEFAULT);
    FieldsConsumer consumer = provider.consumer(output);
    consumer.write(
        new Fields() {
          @Override
          public Iterator<String> iterator() {
            return Arrays.asList("foo").iterator();
          }

          @Override
          public Terms terms(String field) throws IOException {
            return null;
          }

          @Override
          public int size() {
            return 1;
          }
        });
    consumer.close();
    output.close();

    IndexInput input = dir.openInput("foo.txt", IOContext.DEFAULT);
    LookupFactory load = provider.load(input);
    PostingsFormat format = new Elasticsearch090PostingsFormat();
    NamedAnalyzer analyzer = new NamedAnalyzer("foo", new StandardAnalyzer());
    assertNull(
        load.getLookup(
            new CompletionFieldMapper(
                new Names("foo"),
                analyzer,
                analyzer,
                format,
                null,
                true,
                true,
                true,
                Integer.MAX_VALUE,
                indexSettings,
                AbstractFieldMapper.MultiFields.empty(),
                null,
                ContextMapping.EMPTY_MAPPING),
            new CompletionSuggestionContext(null)));
    dir.close();
  }
  // LUCENE-2852
  public void testSeekToEOFThenBack() throws Exception {
    RAMDirectory dir = new RAMDirectory();

    IndexOutput o = dir.createOutput("out");
    byte[] bytes = new byte[3 * RAMInputStream.BUFFER_SIZE];
    o.writeBytes(bytes, 0, bytes.length);
    o.close();

    IndexInput i = dir.openInput("out");
    i.seek(2 * RAMInputStream.BUFFER_SIZE - 1);
    i.seek(3 * RAMInputStream.BUFFER_SIZE);
    i.seek(RAMInputStream.BUFFER_SIZE);
    i.readBytes(bytes, 0, 2 * RAMInputStream.BUFFER_SIZE);
    i.close();
    dir.close();
  }
  /* (non-Javadoc)
   * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
   */
  public void write(DataOutput out) throws IOException {
    out.writeInt(deleteList.size());
    for (Term term : deleteList) {
      Text.writeString(out, term.field());
      Text.writeString(out, term.text());
    }

    String[] files = dir.list();
    RAMDirectoryUtil.writeRAMFiles(out, dir, files);
  }
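  // A minimal sketch of the matching deserialization side for the write(DataOutput) method
  // above. It assumes RAMDirectoryUtil also provides a readRAMFiles(DataInput, RAMDirectory)
  // counterpart to writeRAMFiles; that helper and its signature are an assumption, not confirmed
  // by this snippet.
  public void readFields(DataInput in) throws IOException {
    // Rebuild the delete list written by write(DataOutput).
    int numDeletes = in.readInt();
    deleteList.clear();
    for (int i = 0; i < numDeletes; i++) {
      String field = Text.readString(in);
      String text = Text.readString(in);
      deleteList.add(new Term(field, text));
    }
    // Restore the serialized RAM files into the directory (assumed helper).
    RAMDirectoryUtil.readRAMFiles(in, dir);
  }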
  public Lookup buildAnalyzingLookup(
      final CompletionFieldMapper mapper, String[] terms, String[] surfaces, long[] weights)
      throws IOException {
    RAMDirectory dir = new RAMDirectory();
    FilterCodec filterCodec =
        new FilterCodec("filtered", Codec.getDefault()) {
          @Override
          public PostingsFormat postingsFormat() {
            final PostingsFormat in = super.postingsFormat();
            return mapper.postingsFormat(in);
          }
        };
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(mapper.indexAnalyzer());

    indexWriterConfig.setCodec(filterCodec);
    IndexWriter writer = new IndexWriter(dir, indexWriterConfig);
    for (int i = 0; i < weights.length; i++) {
      Document doc = new Document();
      BytesRef payload =
          mapper.buildPayload(
              new BytesRef(surfaces[i]), weights[i], new BytesRef(Long.toString(weights[i])));
      doc.add(mapper.getCompletionField(ContextMapping.EMPTY_CONTEXT, terms[i], payload));
      if (randomBoolean()) {
        writer.commit();
      }
      writer.addDocument(doc);
    }
    writer.commit();
    writer.forceMerge(1, true);
    writer.commit();
    DirectoryReader reader = DirectoryReader.open(writer, true);
    assertThat(reader.leaves().size(), equalTo(1));
    assertThat(reader.leaves().get(0).reader().numDocs(), equalTo(weights.length));
    LeafReaderContext atomicReaderContext = reader.leaves().get(0);
    Terms luceneTerms = atomicReaderContext.reader().terms(mapper.name());
    Lookup lookup =
        ((Completion090PostingsFormat.CompletionTerms) luceneTerms)
            .getLookup(mapper, new CompletionSuggestionContext(null));
    reader.close();
    writer.close();
    dir.close();
    return lookup;
  }
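  // A usage sketch for buildAnalyzingLookup above. The sample terms, surface forms, and weights
  // are illustrative only; the mapper is constructed the same way as in the other tests in this
  // listing, and the expected result count is only checked loosely.
  @Test
  public void testBuildAnalyzingLookup() throws IOException {
    PostingsFormat format = new Elasticsearch090PostingsFormat();
    NamedAnalyzer analyzer = new NamedAnalyzer("foo", new StandardAnalyzer());
    CompletionFieldMapper mapper =
        new CompletionFieldMapper(
            new Names("foo"),
            analyzer,
            analyzer,
            format,
            null,
            true,
            true,
            true,
            Integer.MAX_VALUE,
            indexSettings,
            AbstractFieldMapper.MultiFields.empty(),
            null,
            ContextMapping.EMPTY_MAPPING);
    String[] terms = {"generator", "generation"};
    String[] surfaces = {"Generator - Foo Fighters", "Generation X"};
    long[] weights = {10, 9};
    Lookup lookup = buildAnalyzingLookup(mapper, terms, surfaces, weights);
    List<LookupResult> results = lookup.lookup("gen", false, 10);
    assertThat(results.size(), greaterThan(0));
  }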
  @Test
  public void testCompletionPostingsFormat() throws IOException {
    AnalyzingCompletionLookupProviderV1 providerV1 =
        new AnalyzingCompletionLookupProviderV1(true, false, true, true);
    AnalyzingCompletionLookupProvider currentProvider =
        new AnalyzingCompletionLookupProvider(true, false, true, true);
    List<Completion090PostingsFormat.CompletionLookupProvider> providers =
        Lists.newArrayList(providerV1, currentProvider);

    Completion090PostingsFormat.CompletionLookupProvider randomProvider =
        providers.get(getRandom().nextInt(providers.size()));
    RAMDirectory dir = new RAMDirectory();
    writeData(dir, randomProvider);

    IndexInput input = dir.openInput("foo.txt", IOContext.DEFAULT);
    LookupFactory load = currentProvider.load(input);
    PostingsFormat format = PostingsFormat.forName(Lucene.LATEST_POSTINGS_FORMAT);
    NamedAnalyzer analyzer = new NamedAnalyzer("foo", new StandardAnalyzer());
    Lookup lookup =
        load.getLookup(
            new CompletionFieldMapper(
                new Names("foo"),
                analyzer,
                analyzer,
                format,
                null,
                true,
                true,
                true,
                Integer.MAX_VALUE,
                indexSettings,
                AbstractFieldMapper.MultiFields.empty(),
                null,
                ContextMapping.EMPTY_MAPPING),
            new CompletionSuggestionContext(null));
    List<LookupResult> result = lookup.lookup("ge", false, 10);
    assertThat(result.get(0).key.toString(), equalTo("Generator - Foo Fighters"));
    assertThat(result.get(0).payload.utf8ToString(), equalTo("id:10"));
    dir.close();
  }
 final synchronized byte[] addBuffer(int size) {
   byte[] buffer = newBuffer(size);
   if (directory != null) {
     // Ensure that adding the buffer and adjusting the directory size are atomic
     // with respect to the directory.
     synchronized (directory) {
       buffers.add(buffer);
       directory.sizeInBytes += size;
       sizeInBytes += size;
     }
   } else {
     buffers.add(buffer);
   }
   return buffer;
 }
  public static DocVector[] getCosineSimilarityMatrix(List<String> fileSentences)
      throws IOException {

    RAMDirectory ramDir = new RAMDirectory();
    FileReader fr = new FileReader(new File("lib/stoplists/en.txt"));

    //	Set<String> stopWords = new HashSet<String>(FileUtils.readLines(new File("stop-words.txt")));
    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_36, fr);
    // Index the full text of both documents
    // IndexWriter writer = new IndexWriter(ramDir, new StandardAnalyzer(Version.LUCENE_36), true,
    // IndexWriter.MaxFieldLength.UNLIMITED);
    IndexWriter writer =
        new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_36, analyzer));
    for (String s : fileSentences) {
      Document doc1 = new Document();
      StringReader d1reader = new StringReader(s);
      doc1.add(new Field("contents", d1reader, TermVector.YES));
      writer.addDocument(doc1);
    }

    //  writer.commit();
    writer.close();

    DocVector[] docs = new DocVector[fileSentences.size()];
    // Build a term vector for each document
    IndexReader RAMreader = IndexReader.open(ramDir);
    Map<String, Integer> terms = new HashMap<String, Integer>();
    TermEnum termEnum = RAMreader.terms(new Term("contents"));

    // System.out.println(RAMreader.numDocs());
    int pos = 0;
    while (termEnum.next()) {
      Term term = termEnum.term();
      if (!"contents".equals(term.field())) break;
      terms.put(term.text(), pos++);
    }

    // System.out.println("Num terms:"+terms.size());

    for (int i = 0; i < fileSentences.size(); i++) {
      TermFreqVector[] tfvs = RAMreader.getTermFreqVectors(i);
      docs[i] = new DocVector(terms);
      if (tfvs == null) continue;
      for (TermFreqVector tfv : tfvs) {
        String[] termTexts = tfv.getTerms();
        int[] termFreqs = tfv.getTermFrequencies();
        for (int j = 0; j < termTexts.length; j++) {
          double idfValue = getIDF(RAMreader, termTexts[j]);
          double tfIdfValue = termFreqs[j] * idfValue;
          docs[i].setEntry(termTexts[j], tfIdfValue);
        }
      }
      docs[i].normalize();
    }

    RAMreader.close();
    ramDir.close();
    // System.out.println(RAMreader.numDocs());
    // System.out.println("Similarity:" + calcCosineSimilarity(docs[5], docs[19]));
    return docs;
  }
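  // getIDF is referenced above but not shown. A minimal sketch under the assumption that it
  // computes a standard smoothed inverse document frequency from the reader's docFreq
  // statistics for the "contents" field; the exact formula used by the original code is unknown.
  private static double getIDF(IndexReader reader, String termText) throws IOException {
    // Document frequency of the term in the "contents" field.
    int docFreq = reader.docFreq(new Term("contents", termText));
    int numDocs = reader.numDocs();
    // Smoothed IDF; the original implementation may use a different variant.
    return Math.log((double) numDocs / (docFreq + 1)) + 1.0;
  }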
  /**
   * Calculates the cosine similarity between two documents.
   *
   * @param d1 the text of the first document
   * @param d2 the text of the second document
   * @return the cosine similarity of the two term vectors
   * @throws IOException if the in-memory index cannot be written or read
   */
  public double getCosineSimilarity(String d1, String d2) throws IOException {

    RAMDirectory ramDir = new RAMDirectory();
    FileReader fr = new FileReader(new File(WikiHelper.getSpecificProperty("stopwordFile")));

    //	Set<String> stopWords = new HashSet<String>(FileUtils.readLines(new File("stop-words.txt")));
    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_36, fr);
    // Index the full text of both documents
    @SuppressWarnings("deprecation")
    // IndexWriter writer = new IndexWriter(ramDir, new StandardAnalyzer(Version.LUCENE_36), true,
    // IndexWriter.MaxFieldLength.UNLIMITED);
    IndexWriter writer =
        new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_36, analyzer));
    Document doc1 = new Document();
    StringReader d1reader = new StringReader(d1);
    doc1.add(new Field("contents", d1reader, TermVector.YES));

    writer.addDocument(doc1);
    Document doc2 = new Document();
    StringReader d2reader = new StringReader(d2);

    doc2.add(new Field("contents", d2reader, TermVector.YES));
    writer.addDocument(doc2);
    //  writer.commit();
    writer.close();

    DocVector[] docs = new DocVector[2];
    // Build a term vector for each document
    IndexReader RAMreader = IndexReader.open(ramDir);
    Map<String, Integer> terms = new HashMap<String, Integer>();
    TermEnum termEnum = RAMreader.terms(new Term("contents"));

    // System.out.println(RAMreader.numDocs());
    TermFreqVector tfvs1 = RAMreader.getTermFreqVector(0, "contents");
    TermFreqVector tfvs2 = RAMreader.getTermFreqVector(1, "contents");
    // System.out.println(tfvs1.toString());
    if (tfvs1 == null || tfvs2 == null) {
      return 0.0;
    }

    String[] termTexts1 = tfvs1.getTerms();

    String[] termTexts2 = tfvs2.getTerms();

    // Store the terms and their positions in a hashmap - this represents the vocabulary
    int pos = 0;
    for (String term : termTexts1) {
      terms.put(term, pos++);
    }
    for (String term : termTexts2) {
      if (!terms.containsKey(term)) {
        terms.put(term, pos++);
      }
    }

    docs[0] = new DocVector(terms);
    docs[1] = new DocVector(terms);
    int[] termFreqs1 = tfvs1.getTermFrequencies();
    for (int j = 0; j < termTexts1.length; j++) {
      // System.out.println("termtext:"+termTexts1[j]);
      double idfValue = getIDF(RAMreader, termTexts1[j]);
      // System.out.println("idf:"+idfValue);
      double tfIdfValue = termFreqs1[j] * idfValue;
      // docs[i].setEntry(termTexts[j], termFreqs[j]);
      // System.out.println("TF IDF value "+termFreqs[j]+" "+termTexts[j]+"
      // "+idfValue+"\t"+tfIdfValue);
      docs[0].setEntry(termTexts1[j], tfIdfValue);
    }

    int[] termFreqs2 = tfvs2.getTermFrequencies();
    for (int j = 0; j < termTexts2.length; j++) {
      double idfValue = getIDF(RAMreader, termTexts2[j]);
      double tfIdfValue = termFreqs2[j] * idfValue;
      // docs[i].setEntry(termTexts[j], termFreqs[j]);
      // System.out.println("TF IDF value "+termFreqs[j]+" "+termTexts[j]+"
      // "+idfValue+"\t"+tfIdfValue);
      docs[1].setEntry(termTexts2[j], tfIdfValue);
    }

    //		System.out.println(terms.toString());
    //		System.out.println(docs[0]);
    //		System.out.println(docs[1]);
    RAMreader.close();
    ramDir.close();
    //        docs[0].normalize();
    //        docs[1].normalize();

    // Return the cosine similarity of the term vectors

    return calcCosineSimilarity(docs[0], docs[1]);
  }
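  // calcCosineSimilarity is referenced above but not shown. A sketch assuming DocVector exposes
  // its tf-idf entries as a Map<String, Double> via a hypothetical getEntries() accessor; the
  // real class may store a dense vector instead, in which case the dot product and norms would
  // be computed over arrays.
  private static double calcCosineSimilarity(DocVector v1, DocVector v2) {
    Map<String, Double> a = v1.getEntries(); // hypothetical accessor
    Map<String, Double> b = v2.getEntries(); // hypothetical accessor
    double dot = 0.0;
    double normA = 0.0;
    double normB = 0.0;
    for (Map.Entry<String, Double> e : a.entrySet()) {
      Double other = b.get(e.getKey());
      if (other != null) {
        dot += e.getValue() * other; // accumulate only where both vectors have the term
      }
      normA += e.getValue() * e.getValue();
    }
    for (double value : b.values()) {
      normB += value * value;
    }
    if (normA == 0.0 || normB == 0.0) {
      return 0.0;
    }
    return dot / (Math.sqrt(normA) * Math.sqrt(normB));
  }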
  public void write(DataOutput out) throws IOException {

    String[] files = dir.listAll();
    RAMDirectoryUtil.writeRAMFiles(out, dir, files);
  }
  private static class SortedTermPositions implements TermPositions {
    private TermPositions original;
    private int[] oldToNew;

    private int docFreq;

    private PostingMap[] postingMaps = new PostingMap[0];
    private int pointer;

    private int freq;
    private int position;

    private static final String TEMP_FILE = "temp";
    private final RAMDirectory tempDir = new RAMDirectory();
    private final RAMOutputStream out = (RAMOutputStream) tempDir.createOutput(TEMP_FILE);
    private IndexInput in;

    public SortedTermPositions(TermPositions original, int[] oldToNew) {
      this.original = original;
      this.oldToNew = oldToNew;
    }

    public void seek(Term term) throws IOException {
      throw new UnsupportedOperationException();
    }

    public void seek(TermEnum terms) throws IOException {
      original.seek(terms);

      docFreq = terms.docFreq();
      pointer = -1;

      if (docFreq > postingMaps.length) { // grow postingsMap
        PostingMap[] newMap = new PostingMap[docFreq];
        System.arraycopy(postingMaps, 0, newMap, 0, postingMaps.length);
        for (int i = postingMaps.length; i < docFreq; i++) {
          newMap[i] = new PostingMap();
        }
        postingMaps = newMap;
      }

      out.reset();

      int i = 0;
      while (original.next()) {
        PostingMap map = postingMaps[i++];
        map.newDoc = oldToNew[original.doc()]; // remap the newDoc id
        map.offset = out.getFilePointer(); // save pointer to buffer

        final int tf = original.freq(); // buffer tf & positions
        out.writeVInt(tf);
        int prevPosition = 0;
        for (int j = tf; j > 0; j--) { // delta encode positions
          int p = original.nextPosition();
          out.writeVInt(p - prevPosition);
          prevPosition = p;
        }
      }
      out.flush();
      docFreq = i; // allow for deletions

      Arrays.sort(postingMaps, 0, docFreq); // resort by mapped doc ids
      // HeapSorter.sort(postingMaps,docFreq); // TODO MC - due to the lack of space

      // NOTE: this might be substantially faster if RAMInputStream were public
      // and supported a reset() operation.
      in = tempDir.openInput(TEMP_FILE);
    }

    public boolean next() throws IOException {
      pointer++;
      if (pointer < docFreq) {
        in.seek(postingMaps[pointer].offset);
        freq = in.readVInt();
        position = 0;
        return true;
      }
      return false;
    }

    public int doc() {
      return postingMaps[pointer].newDoc;
    }

    public int freq() {
      return freq;
    }

    public int nextPosition() throws IOException {
      int positionIncrement = in.readVInt();
      position += positionIncrement;
      return position;
    }

    public int read(int[] docs, int[] freqs) {
      throw new UnsupportedOperationException();
    }

    public boolean skipTo(int target) {
      throw new UnsupportedOperationException();
    }

    public void close() throws IOException {
      original.close();
    }
  }