Example #1
0
 /**
  * Prepare a document reconstructor.
  *
  * @param reader IndexReader to read from.
  * @param fieldNames if non-null or not empty, data will be collected only from these fields,
  *     otherwise data will be collected from all fields
  * @param numTerms total number of terms in the index, or -1 if unknown (will be calculated)
  * @throws Exception
  */
 public DocReconstructor(IndexReader reader, String[] fieldNames, int numTerms) throws Exception {
   if (reader == null) {
     throw new Exception("IndexReader cannot be null.");
   }
   this.reader = reader;
   if (fieldNames == null || fieldNames.length == 0) {
     // collect fieldNames
     this.fieldNames = (String[]) reader.getFieldNames(FieldOption.ALL).toArray(new String[0]);
   } else {
     this.fieldNames = fieldNames;
   }
   if (numTerms == -1) {
     Fields fields = MultiFields.getFields(reader);
     numTerms = 0;
     FieldsEnum fe = fields.iterator();
     String fld = null;
     while ((fld = fe.next()) != null) {
       TermsEnum te = fe.terms();
       while (te.next() != null) {
         numTerms++;
       }
     }
     this.numTerms = numTerms;
   }
   deleted = MultiFields.getDeletedDocs(reader);
 }
  /*
   *  listPostings displays the first n postings for a term in a
   *  field in an index (specified by reader).  Set n to MAX_VALUE
   *  to display all postings.
   */
  static void listPostings(IndexReader reader, String termString, String field, Integer n)
      throws IOException {

    System.out.println("\nPostings:  " + termString + " " + field);

    /*
     *  Prepare to access the index.
     */
    BytesRef termBytes = new BytesRef(termString);
    Term term = new Term(field, termBytes);
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    /*
     *  Lookup the collection term frequency (ctf).
     */
    long df = reader.docFreq(term);
    System.out.println("\tdf:  " + df);

    long ctf = reader.totalTermFreq(term);
    System.out.println("\tctf:  " + ctf);

    if (df < 1) return;

    /*
     *  Lookup the inverted list.
     */
    DocsAndPositionsEnum postings =
        MultiFields.getTermPositionsEnum(reader, liveDocs, field, termBytes);

    /*
     *  Iterate through the first n postings.
     */
    long count = 0;

    while ((count < n) && (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS)) {

      System.out.println("\tdocid: " + postings.docID());
      int tf = postings.freq();
      System.out.println("\ttf: " + tf);
      System.out.print("\tPositions: ");

      for (int j = 0; j < tf; j++) {
        int pos = postings.nextPosition();
        System.out.print(pos + " ");
      }

      System.out.println("");

      count++;
    }
    ;

    return;
  }
  public void testKeepsLastFilter() throws Throwable {
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
    ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
    assertTrue("Filtered searching should have found some matches", hits.length > 0);
    for (ScoreDoc hit : hits) {
      StoredDocument d = searcher.doc(hit.doc);
      String url = d.get(KEY_FIELD);
      DocsEnum td =
          _TestUtil.docs(
              random(),
              reader,
              KEY_FIELD,
              new BytesRef(url),
              MultiFields.getLiveDocs(reader),
              null,
              0);

      int lastDoc = 0;
      while (td.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        lastDoc = td.docID();
      }
      assertEquals("Duplicate urls should return last doc", lastDoc, hit.doc);
    }
  }
  public void testSeekCeilNotFound() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    // Get empty string in there!
    doc.add(newStringField("field", "", Field.Store.NO));
    w.addDocument(doc);

    for (int i = 0; i < 36; i++) {
      doc = new Document();
      String term = "" + (char) (97 + i);
      String term2 = "a" + (char) (97 + i);
      doc.add(newTextField("field", term + " " + term2, Field.Store.NO));
      w.addDocument(doc);
    }

    w.forceMerge(1);
    IndexReader r = w.getReader();
    TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);
    assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seekCeil(new BytesRef(new byte[] {0x22})));
    assertEquals("a", te.term().utf8ToString());
    assertEquals(1L, te.ord());
    r.close();
    w.close();
    dir.close();
  }
Example #5
0
  public void listTokens(int freq) throws IOException {
    IndexReader ireader = null;
    TermsEnum iter = null;
    Terms terms = null;

    try {
      ireader = DirectoryReader.open(indexDirectory);
      int numDocs = ireader.numDocs();
      if (numDocs > 0) {
        Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0);
        terms = uFields.terms(QueryBuilder.DEFS);
      }
      iter = terms.iterator(null); // init uid iterator
      while (iter.term() != null) {
        // if (iter.term().field().startsWith("f")) {
        if (iter.docFreq() > 16 && iter.term().utf8ToString().length() > freq) {
          log.warning(iter.term().utf8ToString());
        }
        iter.next();
        /*} else {
        break;
        }*/
      }
    } finally {

      if (ireader != null) {
        try {
          ireader.close();
        } catch (IOException e) {
          log.log(Level.WARNING, "An error occured while closing index reader", e);
        }
      }
    }
  }
Example #6
0
  public TermInfo collect(String term) throws IOException {
    TermInfo info = new TermInfo();
    BytesRef luceneTerm = new BytesRef(term.getBytes());
    // this gives documents in which the term is found, but no offset information can be retrieved
    PostingsEnum postings =
        MultiFields.getTermDocsEnum(indexReader, ngramInfoFieldname, luceneTerm);
    // now go through each document
    int docId = postings.nextDoc();
    while (docId != PostingsEnum.NO_MORE_DOCS) {
      // get the term vector for that document.
      TermsEnum it = indexReader.getTermVector(docId, ngramInfoFieldname).iterator();
      // find the term of interest
      it.seekExact(luceneTerm);
      // get its posting info. this will contain offset info
      PostingsEnum postingsInDoc = it.postings(null, PostingsEnum.OFFSETS);
      postingsInDoc.nextDoc();

      Document doc = indexReader.document(docId);
      String id = doc.get(idFieldname);
      JATEDocument jd = new JATEDocument(id);
      Set<int[]> offsets = new HashSet<>();
      int totalFreq = postingsInDoc.freq();
      for (int i = 0; i < totalFreq; i++) {
        postingsInDoc.nextPosition();
        offsets.add(new int[] {postingsInDoc.startOffset(), postingsInDoc.endOffset()});
      }
      info.getOffsets().put(jd, offsets);

      docId = postings.nextDoc();
    }

    return info;
  }
Example #7
0
  /**
   * List all of the files in this index database
   *
   * @throws IOException If an IO error occurs while reading from the database
   */
  public void listFiles() throws IOException {
    IndexReader ireader = null;
    TermsEnum iter;
    Terms terms = null;

    try {
      ireader = DirectoryReader.open(indexDirectory); // open existing index
      int numDocs = ireader.numDocs();
      if (numDocs > 0) {
        Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0);
        terms = uFields.terms(QueryBuilder.U);
      }
      iter = terms.iterator(null); // init uid iterator
      while (iter.term() != null) {
        log.fine(Util.uid2url(iter.term().utf8ToString()));
        iter.next();
      }
    } finally {

      if (ireader != null) {
        try {
          ireader.close();
        } catch (IOException e) {
          log.log(Level.WARNING, "An error occured while closing index reader", e);
        }
      }
    }
  }
  /*
   *  listTermDictionary displays the term dictionary for a field.
   */
  static void listTermDictionary(IndexReader reader, String fieldName) throws IOException {

    System.out.println("\nTerm Dictionary:  field " + fieldName);

    /*
      Grant says:
      MultiFields.getTerms(IndexReader, fieldName)
    */

    Terms terms = MultiFields.getTerms(reader, fieldName);

    if ((terms == null) || (terms.size() == -1))
      System.out.println("    The term dictionary is empty.");
    else {
      System.out.println("    Vocabulary size: " + terms.size() + " terms");

      TermsEnum ithTerm = terms.iterator(null);

      /*
       *  Iterate over the terms in this document.
       *  Information about a term's occurrences (tf and
       *  positions) is accessed via the indexing API, which
       *  returns inverted lists that describe (only) the
       *  current document.
       */
      while (ithTerm.next() != null) {
        System.out.format(
            "      %-30s %d %d\n",
            ithTerm.term().utf8ToString(), ithTerm.docFreq(), ithTerm.totalTermFreq());
      }
      ;
    }
    ;
  }
Example #9
0
  /**
   * @param reader
   * @param numTerms
   * @param field
   * @return TermStats[] ordered by terms with highest docFreq first.
   * @throws Exception
   */
  public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String[] fieldNames)
      throws Exception {
    TermStatsQueue tiq = null;
    TermsEnum te = null;

    if (fieldNames != null) {
      Fields fields = MultiFields.getFields(reader);
      if (fields == null) {
        LOG.info("Index with no fields - probably empty or corrupted");
        return EMPTY_STATS;
      }
      tiq = new TermStatsQueue(numTerms);
      for (String field : fieldNames) {
        Terms terms = fields.terms(field);
        if (terms != null) {
          te = terms.iterator(te);
          fillQueue(te, tiq, field);
        }
      }
    } else {
      Fields fields = MultiFields.getFields(reader);
      if (fields == null) {
        LOG.info("Index with no fields - probably empty or corrupted");
        return EMPTY_STATS;
      }
      tiq = new TermStatsQueue(numTerms);
      Iterator<String> fieldIterator = fields.iterator();
      while (fieldIterator.hasNext()) {
        String field = fieldIterator.next();
        Terms terms = fields.terms(field);
        if (terms != null) {
          te = terms.iterator(te);
          fillQueue(te, tiq, field);
        }
      }
    }

    TermStats[] result = new TermStats[tiq.size()];
    // we want highest first so we read the queue and populate the array
    // starting at the end and work backwards
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
      result[count] = tiq.pop();
      count--;
    }
    return result;
  }
Example #10
0
 /**
  * @param filteredDocument Document with field values extracted for selected fields.
  * @return More Like This query for the passed document.
  */
 public Query like(Map<String, Collection<Object>> filteredDocument) throws IOException {
   if (fieldNames == null) {
     // gather list of valid fields from lucene
     Collection<String> fields = MultiFields.getIndexedFields(ir);
     fieldNames = fields.toArray(new String[fields.size()]);
   }
   return createQuery(retrieveTerms(filteredDocument));
 }
Example #11
0
 protected ValueSourceScorer(IndexReader reader, DocValues values) {
   super(null);
   this.reader = reader;
   this.maxDoc = reader.maxDoc();
   this.values = values;
   setCheckDeletes(true);
   this.delDocs = MultiFields.getDeletedDocs(reader);
 }
Example #12
0
  /**
   * Return a query that will return docs like the passed lucene document ID.
   *
   * @param docNum the documentID of the lucene doc to generate the 'More Like This" query for.
   * @return a query that will return docs like the passed lucene document ID.
   */
  public Query like(int docNum) throws IOException {
    if (fieldNames == null) {
      // gather list of valid fields from lucene
      Collection<String> fields = MultiFields.getIndexedFields(ir);
      fieldNames = fields.toArray(new String[fields.size()]);
    }

    return createQuery(retrieveTerms(docNum));
  }
  public void testThreeBlocks() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    List<String> terms = new ArrayList<>();
    for (int i = 0; i < 36; i++) {
      Document doc = new Document();
      String term = "" + (char) (97 + i);
      terms.add(term);
      if (VERBOSE) {
        System.out.println("i=" + i + " term=" + term);
      }
      doc.add(newTextField("field", term, Field.Store.NO));
      w.addDocument(doc);
    }
    for (int i = 0; i < 36; i++) {
      Document doc = new Document();
      String term = "m" + (char) (97 + i);
      terms.add(term);
      if (VERBOSE) {
        System.out.println("i=" + i + " term=" + term);
      }
      doc.add(newTextField("field", term, Field.Store.NO));
      w.addDocument(doc);
    }
    for (int i = 0; i < 36; i++) {
      Document doc = new Document();
      String term = "mo" + (char) (97 + i);
      terms.add(term);
      if (VERBOSE) {
        System.out.println("i=" + i + " term=" + term);
      }
      doc.add(newTextField("field", term, Field.Store.NO));
      w.addDocument(doc);
    }
    w.forceMerge(1);
    IndexReader r = w.getReader();
    TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);

    if (VERBOSE) {
      while (te.next() != null) {
        System.out.println("TERM: " + te.ord() + " " + te.term().utf8ToString());
      }
    }

    assertTrue(te.seekExact(new BytesRef("mo")));
    assertEquals(27, te.ord());

    te.seekExact(90);
    assertEquals(new BytesRef("s"), te.term());

    testEnum(te, terms);

    r.close();
    w.close();
    dir.close();
  }
  public void test10kPulsed() throws Exception {
    // we always run this test with pulsing codec.
    Codec cp = _TestUtil.alwaysPostingsFormat(new Pulsing41PostingsFormat(1));

    File f = _TestUtil.getTempDir("10kpulsed");
    BaseDirectoryWrapper dir = newFSDirectory(f);
    dir.setCheckIndexOnClose(false); // we do this ourselves explicitly
    RandomIndexWriter iw =
        new RandomIndexWriter(
            random(),
            dir,
            newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())).setCodec(cp));

    Document document = new Document();
    FieldType ft = new FieldType(TextField.TYPE_STORED);

    switch (_TestUtil.nextInt(random(), 0, 2)) {
      case 0:
        ft.setIndexOptions(IndexOptions.DOCS_ONLY);
        break;
      case 1:
        ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        break;
      default:
        ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
        break;
    }

    Field field = newField("field", "", ft);
    document.add(field);

    NumberFormat df = new DecimalFormat("00000", new DecimalFormatSymbols(Locale.ROOT));

    for (int i = 0; i < 10050; i++) {
      field.setStringValue(df.format(i));
      iw.addDocument(document);
    }

    IndexReader ir = iw.getReader();
    iw.close();

    TermsEnum te = MultiFields.getTerms(ir, "field").iterator(null);
    DocsEnum de = null;

    for (int i = 0; i < 10050; i++) {
      String expected = df.format(i);
      assertEquals(expected, te.next().utf8ToString());
      de = _TestUtil.docs(random(), te, null, de, 0);
      assertTrue(de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, de.nextDoc());
    }
    ir.close();

    _TestUtil.checkIndex(dir);
    dir.close();
  }
Example #15
0
 /* Copied from lucene 4.2.x core */
 private static long totalTermFreq(IndexReader r, String field, BytesRef text) throws IOException {
   final Terms terms = MultiFields.getTerms(r, field);
   if (terms != null) {
     final TermsEnum termsEnum = terms.iterator(null);
     if (termsEnum.seekExact(text, true)) {
       return termsEnum.totalTermFreq();
     }
   }
   return 0;
 }
  /**
   * @param reader
   * @param lireFeature
   * @return the maximum distance found for normalizing.
   * @throws java.io.IOException
   */
  @SuppressWarnings("unchecked")
  private float[] findSimilar(IndexReader reader, LireFeature[] lireFeature) throws IOException {
    float[] maxDistance = new float[lireFeature.length];
    float[] overallMaxDistance = new float[lireFeature.length];

    for (int i = 0; i < overallMaxDistance.length; i++) {
      overallMaxDistance[i] = -1f;
      maxDistance[i] = -1f;
    }

    parDocs = new TreeSet[lireFeature.length];
    for (int i = 0; i < parDocs.length; i++) {
      parDocs[i] = new TreeSet<SimpleResult>();
    }

    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    // clear result set ...

    int docs = reader.numDocs();
    for (int i = 0; i < docs; i++) {
      if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.

      Document d = reader.document(i);
      float[] distance = getDistance(d, lireFeature);
      // calculate the overall max distance to normalize score afterwards
      for (int j = 0; j < distance.length; j++) {
        float f = distance[j];
        if (overallMaxDistance[j] < f) {
          overallMaxDistance[j] = f;
        }
        // if it is the first document:
        if (maxDistance[j] < 0) {
          maxDistance[j] = f;
        }
        // if the array is not full yet:
        if (this.parDocs[j].size() < maxHits) {
          this.parDocs[j].add(new SimpleResult(f, d));
          if (f > maxDistance[j]) {
            maxDistance[j] = f;
          }
        } else if (f < maxDistance[j]) {
          // if it is nearer to the sample than at least on of the current set:
          // remove the last one ...
          this.parDocs[j].remove(this.parDocs[j].last());
          // add the new one ...
          this.parDocs[j].add(new SimpleResult(f, d));
          // and set our new distance border ...
          maxDistance[j] = this.parDocs[j].last().getDistance();
        }
      }
    }
    return maxDistance;
  }
Example #17
0
  private FixedBitSet createExpectedResult(
      String queryValue, boolean from, IndexReader topLevelReader, IndexIterationContext context)
      throws IOException {
    final Map<String, List<RandomDoc>> randomValueDocs;
    final Map<String, List<RandomDoc>> linkValueDocuments;
    if (from) {
      randomValueDocs = context.randomValueFromDocs;
      linkValueDocuments = context.toDocuments;
    } else {
      randomValueDocs = context.randomValueToDocs;
      linkValueDocuments = context.fromDocuments;
    }

    FixedBitSet expectedResult = new FixedBitSet(topLevelReader.maxDoc());
    List<RandomDoc> matchingDocs = randomValueDocs.get(queryValue);
    if (matchingDocs == null) {
      return new FixedBitSet(topLevelReader.maxDoc());
    }

    for (RandomDoc matchingDoc : matchingDocs) {
      for (String linkValue : matchingDoc.linkValues) {
        List<RandomDoc> otherMatchingDocs = linkValueDocuments.get(linkValue);
        if (otherMatchingDocs == null) {
          continue;
        }

        for (RandomDoc otherSideDoc : otherMatchingDocs) {
          DocsEnum docsEnum =
              MultiFields.getTermDocsEnum(
                  topLevelReader,
                  MultiFields.getLiveDocs(topLevelReader),
                  "id",
                  new BytesRef(otherSideDoc.id),
                  0);
          assert docsEnum != null;
          int doc = docsEnum.nextDoc();
          expectedResult.set(doc);
        }
      }
    }
    return expectedResult;
  }
 private long getSumTermFrequency(IndexReader reader, String fieldName) {
   Terms collectionTermVector = null;
   try {
     collectionTermVector = MultiFields.getTerms(reader, fieldName);
     long totalTermFreq = collectionTermVector.getSumTotalTermFreq();
     return totalTermFreq;
   } catch (IOException e) {
     LOG.warn("Unable to get total term frequency, it might not be indexed");
   }
   return 0;
 }
Example #19
0
 public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termtext)
     throws Exception {
   // BytesRef br = termtext;
   long totalTF = 0;
   try {
     Bits liveDocs = MultiFields.getLiveDocs(reader);
     totalTF = reader.getSumTotalTermFreq(field);
     return totalTF;
   } catch (Exception e) {
     return 0;
   }
 }
Example #20
0
  public void tttestGetDistribution() throws IOException {
    BufferedWriter bw = new BufferedWriter(new FileWriter("data.csv"));
    IndexReader reader = IndexReader.open(FSDirectory.open(new File(indexPath)));
    // get the first document:
    //        if (!IndexReader.indexExists(reader.directory()))
    //            throw new FileNotFoundException("No index found at this specific location.");

    CEDD cedd1 = new CEDD();
    FCTH fcth1 = new FCTH();

    CEDD cedd2 = new CEDD();
    FCTH fcth2 = new FCTH();

    JCD jcd1 = new JCD();
    JCD jcd2 = new JCD();
    String[] cls;

    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    int docs = reader.numDocs();
    for (int i = 0; i < docs; i++) {
      if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.

      Document doc = reader.document(i);
      cls = doc.getValues(DocumentBuilder.FIELD_NAME_CEDD);
      if (cls != null && cls.length > 0) cedd1.setStringRepresentation(cls[0]);
      cls = doc.getValues(DocumentBuilder.FIELD_NAME_FCTH);
      if (cls != null && cls.length > 0) fcth1.setStringRepresentation(cls[0]);

      for (int j = i + 1; j < docs; j++) {
        if (reader.hasDeletions() && !liveDocs.get(i))
          continue; // if it is deleted, just ignore it.
        Document doc2 = reader.document(j);
        cls = doc2.getValues(DocumentBuilder.FIELD_NAME_CEDD);
        if (cls != null && cls.length > 0) cedd2.setStringRepresentation(cls[0]);
        cls = doc2.getValues(DocumentBuilder.FIELD_NAME_FCTH);
        if (cls != null && cls.length > 0) fcth2.setStringRepresentation(cls[0]);
        jcd1.init(cedd1, fcth1);
        jcd2.init(cedd2, fcth2);
        bw.write(
            cedd1.getDistance(cedd2)
                + ";"
                + fcth1.getDistance(fcth2)
                + ";"
                + jcd1.getDistance(jcd2)
                + "\n");
      }
      if (i % 100 == 0) System.out.println(i + " entries processed ... ");
    }
    bw.close();
  }
  /**
   * We assume that the initial indexing has been done and a set of reference objects has been found
   * and indexed in the separate directory. However further documents were added and they now need
   * to get a ranked list of reference objects. So we (i) get all these new documents missing the
   * field "ro-order" and (ii) add this field.
   *
   * @param indexPath the index to update
   * @throws IOException
   */
  public void updateIndex(String indexPath) throws IOException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
    int numDocs = reader.numDocs();
    boolean hasDeletions = reader.hasDeletions();
    int countUpdated = 0;

    IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro")));
    ImageSearcher searcher =
        new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName);
    Map<String, Analyzer> perField = new HashMap<String, Analyzer>(1);
    perField.put("ro-order", new WhitespaceAnalyzer(LuceneUtils.LUCENE_VERSION));
    PerFieldAnalyzerWrapper aWrapper =
        new PerFieldAnalyzerWrapper(new SimpleAnalyzer(LuceneUtils.LUCENE_VERSION), perField);

    IndexWriter iw =
        new IndexWriter(
            FSDirectory.open(new File(indexPath)),
            new IndexWriterConfig(LuceneUtils.LUCENE_VERSION, aWrapper)
                .setOpenMode(IndexWriterConfig.OpenMode.CREATE));
    StringBuilder sb = new StringBuilder(256);
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    for (int i = 0; i < numDocs; i++) {
      if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.
      Document document = reader.document(i);
      if (document.getField("ro-order") == null) { // if the field is not here we create it.
        ImageSearchHits hits = searcher.search(document, readerRo);
        sb.delete(0, sb.length());
        for (int j = 0; j < numReferenceObjectsUsed; j++) {
          sb.append(hits.doc(j).getValues("ro-id")[0]);
          sb.append(' ');
        }
        // System.out.println(sb.toString());
        document.add(new TextField("ro-order", sb.toString(), Field.Store.YES));
        iw.updateDocument(
            new Term(
                DocumentBuilder.FIELD_NAME_IDENTIFIER,
                document.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]),
            document);
        countUpdated++;
      }

      // progress report
      progress.setNumDocsProcessed(progress.getNumDocsProcessed() + 1);

      // debug:
      System.out.println("countUpdated = " + countUpdated);
    }
    iw.commit();
    iw.close();
  }
Example #22
0
  /** Returns TermStats[] ordered by terms with highest docFreq first. */
  public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String field)
      throws Exception {
    TermStatsQueue tiq = null;

    if (field != null) {
      Fields fields = MultiFields.getFields(reader);
      if (fields == null) {
        throw new RuntimeException("field " + field + " not found");
      }
      Terms terms = fields.terms(field);
      if (terms != null) {
        TermsEnum termsEnum = terms.iterator(null);
        tiq = new TermStatsQueue(numTerms);
        tiq.fill(field, termsEnum);
      }
    } else {
      Fields fields = MultiFields.getFields(reader);
      if (fields == null) {
        throw new RuntimeException("no fields found for this index");
      }
      tiq = new TermStatsQueue(numTerms);
      for (String fieldName : fields) {
        Terms terms = fields.terms(fieldName);
        if (terms != null) {
          tiq.fill(fieldName, terms.iterator(null));
        }
      }
    }

    TermStats[] result = new TermStats[tiq.size()];
    // we want highest first so we read the queue and populate the array
    // starting at the end and work backwards
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
      result[count] = tiq.pop();
      count--;
    }
    return result;
  }
Example #23
0
  public static Terms getTermVector(String fieldname, SolrIndexSearcher solrIndexSearcher)
      throws JATEException {
    try {
      Fields fields = MultiFields.getFields(solrIndexSearcher.getLeafReader());

      Terms vector = fields.terms(fieldname);
      if (vector == null)
        throw new JATEException(String.format("Cannot find expected field: %s", fieldname));
      return vector;
    } catch (IOException ioe) {
      StringBuilder sb =
          new StringBuilder(
              String.format("Cannot find expected field: %s. Error stacktrack: \n", fieldname));
      sb.append(org.apache.commons.lang.exception.ExceptionUtils.getFullStackTrace(ioe));
      throw new JATEException(sb.toString());
    }
  }
 private Fields generateTermVectors(
     Collection<GetField> getFields,
     boolean withOffsets,
     @Nullable Map<String, String> perFieldAnalyzer)
     throws IOException {
   /* store document in memory index */
   MemoryIndex index = new MemoryIndex(withOffsets);
   for (GetField getField : getFields) {
     String field = getField.getName();
     Analyzer analyzer = getAnalyzerAtField(field, perFieldAnalyzer);
     for (Object text : getField.getValues()) {
       index.addField(field, text.toString(), analyzer);
     }
   }
   /* and read vectors from it */
   return MultiFields.getFields(index.createSearcher().getIndexReader());
 }
  public void testNonRootFloorBlocks() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
    IndexWriter w = new IndexWriter(dir, iwc);
    List<String> terms = new ArrayList<>();
    for (int i = 0; i < 36; i++) {
      Document doc = new Document();
      String term = "" + (char) (97 + i);
      terms.add(term);
      if (VERBOSE) {
        System.out.println("i=" + i + " term=" + term);
      }
      doc.add(newTextField("field", term, Field.Store.NO));
      w.addDocument(doc);
    }
    for (int i = 0; i < 128; i++) {
      Document doc = new Document();
      String term = "m" + (char) i;
      terms.add(term);
      if (VERBOSE) {
        System.out.println("i=" + i + " term=" + term + " bytes=" + new BytesRef(term));
      }
      doc.add(newStringField("field", term, Field.Store.NO));
      w.addDocument(doc);
    }
    w.forceMerge(1);
    IndexReader r = DirectoryReader.open(w, true);
    TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);

    BytesRef term;
    int ord = 0;
    while ((term = te.next()) != null) {
      if (VERBOSE) {
        System.out.println("TEST: " + te.ord() + ": " + term.utf8ToString());
      }
      assertEquals(ord, te.ord());
      ord++;
    }

    testEnum(te, terms);

    r.close();
    w.close();
    dir.close();
  }
 private int countTerms(MultiTermQuery q) throws Exception {
   final Terms terms = MultiFields.getTerms(reader, q.getField());
   if (terms == null) return 0;
   final TermsEnum termEnum = q.getTermsEnum(terms);
   assertNotNull(termEnum);
   int count = 0;
   BytesRef cur, last = null;
   while ((cur = termEnum.next()) != null) {
     count++;
     if (last != null) {
       assertTrue(last.compareTo(cur) < 0);
     }
     last = BytesRef.deepCopyOf(cur);
   }
   // LUCENE-3314: the results after next() already returned null are undefined,
   // assertNull(termEnum.next());
   return count;
 }
 @Override
 public FieldStats.Long stats(IndexReader reader) throws IOException {
   int maxDoc = reader.maxDoc();
   Terms terms = org.apache.lucene.index.MultiFields.getTerms(reader, name());
   if (terms == null) {
     return null;
   }
   long minValue = LegacyNumericUtils.getMinInt(terms);
   long maxValue = LegacyNumericUtils.getMaxInt(terms);
   return new FieldStats.Long(
       maxDoc,
       terms.getDocCount(),
       terms.getSumDocFreq(),
       terms.getSumTotalTermFreq(),
       isSearchable(),
       isAggregatable(),
       minValue,
       maxValue);
 }
  /** Test the WordScorer emitted by the smoothing model */
  public void testBuildWordScorer() throws IOException {
    SmoothingModel testModel = createTestModel();

    Map<String, Analyzer> mapping = new HashMap<>();
    mapping.put("field", new WhitespaceAnalyzer());
    PerFieldAnalyzerWrapper wrapper =
        new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), mapping);
    IndexWriter writer = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(wrapper));
    Document doc = new Document();
    doc.add(new Field("field", "someText", TextField.TYPE_NOT_STORED));
    writer.addDocument(doc);
    DirectoryReader ir = DirectoryReader.open(writer);

    WordScorer wordScorer =
        testModel
            .buildWordScorerFactory()
            .newScorer(
                ir, MultiFields.getTerms(ir, "field"), "field", 0.9d, BytesRefs.toBytesRef(" "));
    assertWordScorer(wordScorer, testModel);
  }
  public void testSeveralNonRootBlocks() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
    IndexWriter w = new IndexWriter(dir, iwc);
    List<String> terms = new ArrayList<>();
    for (int i = 0; i < 30; i++) {
      for (int j = 0; j < 30; j++) {
        Document doc = new Document();
        String term = "" + (char) (97 + i) + (char) (97 + j);
        terms.add(term);
        if (VERBOSE) {
          System.out.println("term=" + term);
        }
        doc.add(newTextField("body", term, Field.Store.NO));
        w.addDocument(doc);
      }
    }
    w.forceMerge(1);
    IndexReader r = DirectoryReader.open(w, true);
    TermsEnum te = MultiFields.getTerms(r, "body").iterator(null);

    for (int i = 0; i < 30; i++) {
      for (int j = 0; j < 30; j++) {
        String term = "" + (char) (97 + i) + (char) (97 + j);
        if (VERBOSE) {
          System.out.println("TEST: check term=" + term);
        }
        assertEquals(term, te.next().utf8ToString());
        assertEquals(30 * i + j, te.ord());
      }
    }

    testEnum(te, terms);

    te.seekExact(0);
    assertEquals("aa", te.term().utf8ToString());

    r.close();
    w.close();
    dir.close();
  }
  @Override
  protected ShardTermlistResponse shardOperation(ShardTermlistRequest request)
      throws ElasticSearchException {
    synchronized (termlistMutex) {
      InternalIndexShard indexShard =
          (InternalIndexShard)
              indicesService.indexServiceSafe(request.index()).shardSafe(request.shardId());
      indexShard.store().directory();
      Engine.Searcher searcher = indexShard.searcher();
      try {
        Set<String> set = new CompactHashSet();

        Fields fields = MultiFields.getFields(searcher.reader());
        if (fields != null) {
          for (Iterator<String> it = fields.iterator(); it.hasNext(); ) {
            String field = it.next();
            if (field.charAt(0) == '_') {
              continue;
            }
            if (request.getField() == null || field.equals(request.getField())) {
              Terms terms = fields.terms(field);
              if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                  set.add(text.utf8ToString());
                  System.out.println("field=" + field + "; text=" + text.utf8ToString());
                }
              }
            }
          }
        }
        return new ShardTermlistResponse(request.index(), request.shardId(), set);
      } catch (IOException ex) {
        throw new ElasticSearchException(ex.getMessage(), ex);
      }
    }
  }