Esempio n. 1
0
  public void tttestGetDistribution() throws IOException {
    BufferedWriter bw = new BufferedWriter(new FileWriter("data.csv"));
    IndexReader reader = IndexReader.open(FSDirectory.open(new File(indexPath)));
    // get the first document:
    //        if (!IndexReader.indexExists(reader.directory()))
    //            throw new FileNotFoundException("No index found at this specific location.");

    CEDD cedd1 = new CEDD();
    FCTH fcth1 = new FCTH();

    CEDD cedd2 = new CEDD();
    FCTH fcth2 = new FCTH();

    JCD jcd1 = new JCD();
    JCD jcd2 = new JCD();
    String[] cls;

    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    int docs = reader.numDocs();
    for (int i = 0; i < docs; i++) {
      if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.

      Document doc = reader.document(i);
      cls = doc.getValues(DocumentBuilder.FIELD_NAME_CEDD);
      if (cls != null && cls.length > 0) cedd1.setStringRepresentation(cls[0]);
      cls = doc.getValues(DocumentBuilder.FIELD_NAME_FCTH);
      if (cls != null && cls.length > 0) fcth1.setStringRepresentation(cls[0]);

      for (int j = i + 1; j < docs; j++) {
        if (reader.hasDeletions() && !liveDocs.get(i))
          continue; // if it is deleted, just ignore it.
        Document doc2 = reader.document(j);
        cls = doc2.getValues(DocumentBuilder.FIELD_NAME_CEDD);
        if (cls != null && cls.length > 0) cedd2.setStringRepresentation(cls[0]);
        cls = doc2.getValues(DocumentBuilder.FIELD_NAME_FCTH);
        if (cls != null && cls.length > 0) fcth2.setStringRepresentation(cls[0]);
        jcd1.init(cedd1, fcth1);
        jcd2.init(cedd2, fcth2);
        bw.write(
            cedd1.getDistance(cedd2)
                + ";"
                + fcth1.getDistance(fcth2)
                + ";"
                + jcd1.getDistance(jcd2)
                + "\n");
      }
      if (i % 100 == 0) System.out.println(i + " entries processed ... ");
    }
    bw.close();
  }
  /**
   * We assume that the initial indexing has been done and a set of reference objects has been found
   * and indexed in the separate directory. However further documents were added and they now need
   * to get a ranked list of reference objects. So we (i) get all these new documents missing the
   * field "ro-order" and (ii) add this field.
   *
   * @param indexPath the index to update
   * @throws IOException
   */
  public void updateIndex(String indexPath) throws IOException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
    int numDocs = reader.numDocs();
    boolean hasDeletions = reader.hasDeletions();
    int countUpdated = 0;

    IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro")));
    ImageSearcher searcher =
        new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName);
    Map<String, Analyzer> perField = new HashMap<String, Analyzer>(1);
    perField.put("ro-order", new WhitespaceAnalyzer(LuceneUtils.LUCENE_VERSION));
    PerFieldAnalyzerWrapper aWrapper =
        new PerFieldAnalyzerWrapper(new SimpleAnalyzer(LuceneUtils.LUCENE_VERSION), perField);

    IndexWriter iw =
        new IndexWriter(
            FSDirectory.open(new File(indexPath)),
            new IndexWriterConfig(LuceneUtils.LUCENE_VERSION, aWrapper)
                .setOpenMode(IndexWriterConfig.OpenMode.CREATE));
    StringBuilder sb = new StringBuilder(256);
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    for (int i = 0; i < numDocs; i++) {
      if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.
      Document document = reader.document(i);
      if (document.getField("ro-order") == null) { // if the field is not here we create it.
        ImageSearchHits hits = searcher.search(document, readerRo);
        sb.delete(0, sb.length());
        for (int j = 0; j < numReferenceObjectsUsed; j++) {
          sb.append(hits.doc(j).getValues("ro-id")[0]);
          sb.append(' ');
        }
        // System.out.println(sb.toString());
        document.add(new TextField("ro-order", sb.toString(), Field.Store.YES));
        iw.updateDocument(
            new Term(
                DocumentBuilder.FIELD_NAME_IDENTIFIER,
                document.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]),
            document);
        countUpdated++;
      }

      // progress report
      progress.setNumDocsProcessed(progress.getNumDocsProcessed() + 1);

      // debug:
      System.out.println("countUpdated = " + countUpdated);
    }
    iw.commit();
    iw.close();
  }
  public static SimpleOrderedMap<Object> getIndexInfo(IndexReader reader, boolean countTerms)
      throws IOException {
    Directory dir = reader.directory();
    SimpleOrderedMap<Object> indexInfo = new SimpleOrderedMap<Object>();

    indexInfo.add("numDocs", reader.numDocs());
    indexInfo.add("maxDoc", reader.maxDoc());

    if (countTerms) {
      TermEnum te = null;
      try {
        te = reader.terms();
        int numTerms = 0;
        while (te.next()) {
          numTerms++;
        }
        indexInfo.add("numTerms", numTerms);
      } finally {
        if (te != null) te.close();
      }
    }

    indexInfo.add(
        "version",
        reader.getVersion()); // TODO? Is this different then: IndexReader.getCurrentVersion( dir )?
    indexInfo.add("optimized", reader.isOptimized());
    indexInfo.add("current", reader.isCurrent());
    indexInfo.add("hasDeletions", reader.hasDeletions());
    indexInfo.add("directory", dir);
    indexInfo.add("lastModified", new Date(IndexReader.lastModified(dir)));
    return indexInfo;
  }
  public ImageDuplicates findDuplicates(IndexReader reader) throws IOException {
    // get the first document:
    if (!IndexReader.indexExists(reader.directory()))
      throw new FileNotFoundException("No index found at this specific location.");
    Document doc = reader.document(0);
    ScalableColor sc = null;
    ColorLayout cl = null;
    EdgeHistogram eh = null;

    String[] cls = doc.getValues(DocumentBuilder.FIELD_NAME_COLORLAYOUT);
    if (cls != null && cls.length > 0) {
      cl = new ColorLayout();
      cl.setStringRepresentation(cls[0]);
    }
    String[] scs = doc.getValues(DocumentBuilder.FIELD_NAME_SCALABLECOLOR);
    if (scs != null && scs.length > 0) {
      sc = new ScalableColor();
      sc.setStringRepresentation(scs[0]);
    }
    String[] ehs = doc.getValues(DocumentBuilder.FIELD_NAME_EDGEHISTOGRAM);
    if (ehs != null && ehs.length > 0) {
      eh = new EdgeHistogram();
      eh.setStringRepresentation(ehs[0]);
    }

    HashMap<Float, List<String>> duplicates = new HashMap<Float, List<String>>();

    // find duplicates ...
    boolean hasDeletions = reader.hasDeletions();

    int docs = reader.numDocs();
    int numDuplicates = 0;
    for (int i = 0; i < docs; i++) {
      if (hasDeletions && reader.isDeleted(i)) {
        continue;
      }
      Document d = reader.document(i);
      float distance = getDistance(d, cl, sc, eh);

      if (!duplicates.containsKey(distance)) {
        duplicates.put(distance, new LinkedList<String>());
      } else {
        numDuplicates++;
      }
      duplicates.get(distance).add(d.getField(DocumentBuilder.FIELD_NAME_IDENTIFIER).stringValue());
    }

    if (numDuplicates == 0) return null;

    LinkedList<List<String>> results = new LinkedList<List<String>>();
    for (float f : duplicates.keySet()) {
      if (duplicates.get(f).size() > 1) {
        results.add(duplicates.get(f));
      }
    }
    return new SimpleImageDuplicates(results);
  }
 public AllScorer(Similarity similarity, IndexReader reader, FunctionWeight w)
     throws IOException {
   super(similarity, w);
   this.weight = w;
   this.qWeight = w.getValue();
   this.reader = reader;
   this.maxDoc = reader.maxDoc();
   this.hasDeletions = reader.hasDeletions();
   vals = func.getValues(weight.context, reader);
 }
  /**
   * @param reader
   * @param lireFeature
   * @return the maximum distance found for normalizing.
   * @throws java.io.IOException
   */
  @SuppressWarnings("unchecked")
  private float[] findSimilar(IndexReader reader, LireFeature[] lireFeature) throws IOException {
    float[] maxDistance = new float[lireFeature.length];
    float[] overallMaxDistance = new float[lireFeature.length];

    for (int i = 0; i < overallMaxDistance.length; i++) {
      overallMaxDistance[i] = -1f;
      maxDistance[i] = -1f;
    }

    parDocs = new TreeSet[lireFeature.length];
    for (int i = 0; i < parDocs.length; i++) {
      parDocs[i] = new TreeSet<SimpleResult>();
    }

    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    // clear result set ...

    int docs = reader.numDocs();
    for (int i = 0; i < docs; i++) {
      if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.

      Document d = reader.document(i);
      float[] distance = getDistance(d, lireFeature);
      // calculate the overall max distance to normalize score afterwards
      for (int j = 0; j < distance.length; j++) {
        float f = distance[j];
        if (overallMaxDistance[j] < f) {
          overallMaxDistance[j] = f;
        }
        // if it is the first document:
        if (maxDistance[j] < 0) {
          maxDistance[j] = f;
        }
        // if the array is not full yet:
        if (this.parDocs[j].size() < maxHits) {
          this.parDocs[j].add(new SimpleResult(f, d));
          if (f > maxDistance[j]) {
            maxDistance[j] = f;
          }
        } else if (f < maxDistance[j]) {
          // if it is nearer to the sample than at least on of the current set:
          // remove the last one ...
          this.parDocs[j].remove(this.parDocs[j].last());
          // add the new one ...
          this.parDocs[j].add(new SimpleResult(f, d));
          // and set our new distance border ...
          maxDistance[j] = this.parDocs[j].last().getDistance();
        }
      }
    }
    return maxDistance;
  }
  // test using a sparse index (with deleted docs).
  @Test
  public void testSparseIndex() throws IOException {
    Directory dir = newDirectory();
    IndexWriter writer =
        new IndexWriter(
            dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));

    for (int d = -20; d <= 20; d++) {
      Document doc = new Document();
      doc.add(new IntField("id_int", d, Field.Store.NO));
      doc.add(newStringField("body", "body", Field.Store.NO));
      writer.addDocument(doc);
    }

    writer.forceMerge(1);
    BytesRef term0 = new BytesRef();
    NumericUtils.intToPrefixCoded(0, 0, term0);
    writer.deleteDocuments(new Term("id_int", term0));
    writer.close();

    IndexReader reader = DirectoryReader.open(dir);
    IndexSearcher search = newSearcher(reader);
    assertTrue(reader.hasDeletions());

    ScoreDoc[] result;
    Query q = new TermQuery(new Term("body", "body"));

    result =
        search.search(q, FieldCacheRangeFilter.newIntRange("id_int", -20, 20, T, T), 100).scoreDocs;
    assertEquals("find all", 40, result.length);

    result =
        search.search(q, FieldCacheRangeFilter.newIntRange("id_int", 0, 20, T, T), 100).scoreDocs;
    assertEquals("find all", 20, result.length);

    result =
        search.search(q, FieldCacheRangeFilter.newIntRange("id_int", -20, 0, T, T), 100).scoreDocs;
    assertEquals("find all", 20, result.length);

    result =
        search.search(q, FieldCacheRangeFilter.newIntRange("id_int", 10, 20, T, T), 100).scoreDocs;
    assertEquals("find all", 11, result.length);

    result =
        search.search(q, FieldCacheRangeFilter.newIntRange("id_int", -20, -10, T, T), 100)
            .scoreDocs;
    assertEquals("find all", 11, result.length);
    reader.close();
    dir.close();
  }
  /**
   * @param reader
   * @param cl
   * @param sc
   * @param eh
   * @return the maximum distance found for normalizing.
   * @throws IOException
   */
  private float findSimilar(IndexReader reader, ColorLayout cl, ScalableColor sc, EdgeHistogram eh)
      throws IOException {
    float maxDistance = -1f, overallMaxDistance = -1f;
    boolean hasDeletions = reader.hasDeletions();

    // clear result set ...
    docs.clear();

    int docs = reader.numDocs();
    for (int i = 0; i < docs; i++) {
      // bugfix by Roman Kern
      if (hasDeletions && reader.isDeleted(i)) {
        continue;
      }

      Document d = reader.document(i);
      float distance = getDistance(d, cl, sc, eh);
      // calculate the overall max distance to normalize score afterwards
      if (overallMaxDistance < distance) {
        overallMaxDistance = distance;
      }
      // if it is the first document:
      if (maxDistance < 0) {
        maxDistance = distance;
      }
      // if the array is not full yet:
      if (this.docs.size() < maxHits) {
        this.docs.add(new SimpleResult(distance, d));
        if (distance > maxDistance) maxDistance = distance;
      } else if (distance < maxDistance) {
        // if it is nearer to the sample than at least on of the current set:
        // remove the last one ...
        this.docs.remove(this.docs.last());
        // add the new one ...
        this.docs.add(new SimpleResult(distance, d));
        // and set our new distance border ...
        maxDistance = this.docs.last().getDistance();
      }
    }
    return maxDistance;
  }
  @Override
  public boolean reload(String collectionName, String topRankingField) {
    if (collectionName == null) {
      return false;
    }

    CrescentCollectionHandler collectionHandler =
        SpringApplicationContext.getBean(
            "crescentCollectionHandler", CrescentCollectionHandler.class);

    CrescentCollection collection =
        collectionHandler.getCrescentCollections().getCrescentCollection(collectionName);

    if (collection == null) {
      logger.debug("doesn't Collection Info => {}", collectionName);
      init(View.Overview);
      return false;
    }

    if (topRankingField == null) {
      if (collection.getDefaultSearchFields().get(0) != null) {
        topRankingField = collection.getDefaultSearchFields().get(0).getName();
      } else {
        logger.debug("doesn't defaultSearchField => {}", collectionName);
        init(View.Overview);
        return false;
      }
    }

    List<String> fieldName = new ArrayList<String>();
    for (CrescentCollectionField field : collection.getFields()) fieldName.add(field.getName());
    TopRankingQueue topRankingQueue =
        new TopRankingQueue(DEFAULT_TOPRANKING_TERM, new RankingTermComparator());

    try {
      Directory directory = FSDirectory.open(new File(collection.getIndexingDirectory()));
      IndexReader reader = IndexReader.open(directory);

      TermEnum terms = reader.terms();

      int termFreq = 0;
      int termCount = 0;
      Term beforeTerm = null;
      // init term count
      fieldTermCount.clear();
      for (CrescentCollectionField field : collection.getFields())
        fieldTermCount.put(field.getName(), 0);
      topRankingQueue.clear();

      while (terms.next()) {
        Term currTerm = terms.term();
        if (beforeTerm == null) {
          beforeTerm = currTerm;
        }

        if (beforeTerm.field() == currTerm.field()) {
          termCount++;
        } else {
          fieldTermCount.put(beforeTerm.field(), termCount);
          termCount = 1;
          beforeTerm = currTerm;
        }

        TermDocs termDocs = reader.termDocs(currTerm);

        while (termDocs.next()) {
          if (currTerm.field().equals(topRankingField)) {
            RankingTerm e = new RankingTerm(currTerm.text(), currTerm.field(), termDocs.freq());
            topRankingQueue.add(e);
          }
        }
        termFreq++;
      }
      if (beforeTerm != null) fieldTermCount.put(beforeTerm.field(), termCount);

      terms.close();
      result.put("numOfTerm", termFreq);
      result.put("numOfDoc", reader.numDocs());
      result.put("hasDel", reader.hasDeletions());
      result.put("isOptimize", reader.isOptimized());
      result.put("indexVersion", reader.getVersion());
      result.put("lastModify", new Date(IndexReader.lastModified(directory)));
    } catch (IOException e) {
      e.printStackTrace();
      return false;
    }
    if (topRankingQueue.size() != 0) {
      topRankingTerms = topRankingQueue.toArray();
      Arrays.sort(topRankingTerms);
    }
    result.put("collectionName", collectionName);
    result.put("indexName", collection.getIndexingDirectory());
    result.put("numOfField", collection.getFields().size());
    result.put("termCount", fieldTermCount);
    result.put("topRanking", topRankingTerms);
    result.put("fieldName", fieldName);

    return true;
  }
  // test using a sparse index (with deleted docs). The DocIdSet should be not cacheable, as it uses
  // TermDocs if the range contains 0
  public void testSparseIndex() throws IOException {
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer =
        new IndexWriter(dir, new SimpleAnalyzer(), T, IndexWriter.MaxFieldLength.LIMITED);

    for (int d = -20; d <= 20; d++) {
      Document doc = new Document();
      doc.add(new Field("id", Integer.toString(d), Field.Store.NO, Field.Index.NOT_ANALYZED));
      doc.add(new Field("body", "body", Field.Store.NO, Field.Index.NOT_ANALYZED));
      writer.addDocument(doc);
    }

    writer.optimize();
    writer.deleteDocuments(new Term("id", "0"));
    writer.close();

    IndexReader reader = IndexReader.open(dir, true);
    IndexSearcher search = new IndexSearcher(reader);
    assertTrue(reader.hasDeletions());

    ScoreDoc[] result;
    FieldCacheRangeFilter fcrf;
    Query q = new TermQuery(new Term("body", "body"));

    result =
        search.search(
                q,
                fcrf =
                    FieldCacheRangeFilter.newByteRange(
                        "id", Byte.valueOf((byte) -20), Byte.valueOf((byte) 20), T, T),
                100)
            .scoreDocs;
    assertFalse(
        "DocIdSet must be not cacheable",
        fcrf.getDocIdSet(reader.getSequentialSubReaders()[0]).isCacheable());
    assertEquals("find all", 40, result.length);

    result =
        search.search(
                q,
                fcrf =
                    FieldCacheRangeFilter.newByteRange(
                        "id", Byte.valueOf((byte) 0), Byte.valueOf((byte) 20), T, T),
                100)
            .scoreDocs;
    assertFalse(
        "DocIdSet must be not cacheable",
        fcrf.getDocIdSet(reader.getSequentialSubReaders()[0]).isCacheable());
    assertEquals("find all", 20, result.length);

    result =
        search.search(
                q,
                fcrf =
                    FieldCacheRangeFilter.newByteRange(
                        "id", Byte.valueOf((byte) -20), Byte.valueOf((byte) 0), T, T),
                100)
            .scoreDocs;
    assertFalse(
        "DocIdSet must be not cacheable",
        fcrf.getDocIdSet(reader.getSequentialSubReaders()[0]).isCacheable());
    assertEquals("find all", 20, result.length);

    result =
        search.search(
                q,
                fcrf =
                    FieldCacheRangeFilter.newByteRange(
                        "id", Byte.valueOf((byte) 10), Byte.valueOf((byte) 20), T, T),
                100)
            .scoreDocs;
    assertTrue(
        "DocIdSet must be cacheable",
        fcrf.getDocIdSet(reader.getSequentialSubReaders()[0]).isCacheable());
    assertEquals("find all", 11, result.length);

    result =
        search.search(
                q,
                fcrf =
                    FieldCacheRangeFilter.newByteRange(
                        "id", Byte.valueOf((byte) -20), Byte.valueOf((byte) -10), T, T),
                100)
            .scoreDocs;
    assertTrue(
        "DocIdSet must be cacheable",
        fcrf.getDocIdSet(reader.getSequentialSubReaders()[0]).isCacheable());
    assertEquals("find all", 11, result.length);
  }
  /**
   * Creates a set of reference objects and stores it in a new index (name "<indexPath>-ro"). Then
   * creates ordered lists of reference object positions for each data item in the index with given
   * feature. Finally a new index (name "<indexPath>-ms") is created where all the original
   * documents as well as the new data are stored.
   *
   * @param indexPath the path to the original index
   * @throws IOException
   */
  public void createIndex(String indexPath) throws IOException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
    int numDocs = reader.numDocs();

    if (numDocs < numReferenceObjects) {
      throw new UnsupportedOperationException("Too few documents in index.");
    }

    // progress report
    progress.setNumDocsAll(numDocs);
    progress.setCurrentState(State.RoSelection);

    boolean hasDeletions = reader.hasDeletions();

    // init reference objects:
    IndexWriter iw = LuceneUtils.createIndexWriter(indexPath + "-ro", true);
    HashSet<Integer> referenceObjsIds = new HashSet<Integer>(numReferenceObjects);

    double numDocsDouble = (double) numDocs;
    while (referenceObjsIds.size() < numReferenceObjects) {
      referenceObjsIds.add((int) (numDocsDouble * Math.random()));
    }
    int count = 0;

    if (hasDeletions) {
      System.err.println(
          "WARNING: There are deleted docs in your index. You should "
              + "optimize your index before using this method.");
    }

    // progress report
    progress.setCurrentState(State.RoIndexing);

    // find them in the index and put them into a separate index:
    for (int i : referenceObjsIds) {
      count++;
      Document document = reader.document(i);
      document.add(new Field("ro-id", count + "", StringField.TYPE_STORED));
      iw.addDocument(document);
    }
    iw.commit();
    iw.close();

    // progress report
    progress.setCurrentState(State.Indexing);

    // now find the reference objects for each entry ;)
    IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro")));
    ImageSearcher searcher =
        new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName);
    Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
    analyzerPerField.put("ro-order", new WhitespaceAnalyzer(LuceneUtils.LUCENE_VERSION));
    PerFieldAnalyzerWrapper aWrapper =
        new PerFieldAnalyzerWrapper(
            new SimpleAnalyzer(LuceneUtils.LUCENE_VERSION), analyzerPerField);

    iw =
        new IndexWriter(
            FSDirectory.open(new File(indexPath)),
            new IndexWriterConfig(LuceneUtils.LUCENE_VERSION, aWrapper)
                .setOpenMode(IndexWriterConfig.OpenMode.CREATE));
    StringBuilder sb = new StringBuilder(256);
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    for (int i = 0; i < numDocs; i++) {
      if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.
      Document document = reader.document(i);
      ImageSearchHits hits = searcher.search(document, readerRo);
      sb.delete(0, sb.length());
      for (int j = 0; j < numReferenceObjectsUsed; j++) {
        sb.append(hits.doc(j).getValues("ro-id")[0]);
        sb.append(' ');
      }
      // System.out.println(sb.toString());
      document.add(new TextField("ro-order", sb.toString(), Field.Store.YES));
      iw.updateDocument(
          new Term(
              DocumentBuilder.FIELD_NAME_IDENTIFIER,
              document.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]),
          document);

      // progress report
      progress.setNumDocsProcessed(progress.getNumDocsProcessed() + 1);
    }
    iw.commit();
    iw.close();

    // progress report
    progress.setCurrentState(State.Idle);
  }
Esempio n. 12
0
 public void setCheckDeletes(boolean checkDeletes) {
   this.checkDeletes = checkDeletes && reader.hasDeletions();
 }
 /** @see LuceneIndexReader#hasDeletions() */
 public boolean hasDeletions() {
   return indexReader.hasDeletions();
 }