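  /**
   * Sets the boost of terms. boost = weight = factor(tf*idf)
   *
   * @param vecsTerms term vectors, each mapped to a field name
   * @param currentField field on which the generated term queries are created
   * @param factor - adjustment factor ( ex. alpha or beta )
   * @param decayFactor decay multiplier applied per term vector
   * @return term queries keyed by term text
   * @throws java.io.IOException
   */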
  public Map<String, TermQuery> setBoost(
      Map<TermFreqVector, String> vecsTerms, String currentField, float factor, float decayFactor)
      throws IOException {
    Map<String, TermQuery> terms = new HashMap<>();
    // setBoost for each of the terms of each of the docs
    int i = 0;
    float norm = (float) 1 / vecsTerms.size();
    for (Map.Entry<TermFreqVector, String> e : vecsTerms.entrySet()) {
      // Increase decay
      String field = e.getValue();
      TermFreqVector docTerms = e.getKey();
      float decay = decayFactor * i;
      // Populate terms: with TermQuries and set boost
      for (String termTxt : docTerms.getTerms()) {
        // Create Term
        Term term = new Term(currentField, termTxt);
        // Calculate weight
        float tf = docTerms.getFreq(termTxt);
        int docs;
        float idf;
        if (sourceField.equals(PatentQuery.all)) {
          docs = ir.getDocCount(field);
          idf = (float) Math.log10((double) docs / (ir.docFreq(new Term(field, termTxt)) + 1));
        } else {
          docs = ir.getDocCount(sourceField);
          idf =
              (float) Math.log10((double) docs / (ir.docFreq(new Term(sourceField, termTxt)) + 1));
        float weight = tf * idf;

        //                System.out.println(term.text() + " -> tf= " + tf + " idf= " + idf + "
        // tfidf= " + weight);
        // Adjust weight by decay factor
        weight = weight - (weight * decay);
        // Create TermQuery and add it to the collection
        TermQuery termQuery = new TermQuery(term);
        // Calculate and set boost
        float boost;
        if (vecsTerms.size() == 1) {
          boost = factor * tf;
        } else {
          boost = factor;

        if (boost != 0) {
          termQuery.setBoost(boost * norm);
          if (terms.containsKey(termTxt)) {
            TermQuery tq = terms.get(termTxt);
            tq.setBoost(tq.getBoost() + termQuery.getBoost());
          } else {
            terms.put(termTxt, termQuery);
    return terms;
 /**
  * Merges <code>termQueries</code> into a single query. In the future this method should
  * probably be in the <code>Query</code> class. This is an awkward way of doing it, but the only
  * merge method that is available is mergeBooleanQueries, so one actually has to make a string
  * term1^boost1, term2^boost2 and then parse it into a query.
  *
  * @param termQueries - to merge
  * @param maxTerms maximum number of term queries to include
  * @return query created from termQueries including boost parameters
  * @throws org.apache.lucene.queryparser.classic.ParseException
  */
 public Query mergeQueries(List<TermQuery> termQueries, int maxTerms) throws ParseException {
   BooleanQuery query = new BooleanQuery();
   // Select only the maxTerms number of terms
   int termCount = Math.min(termQueries.size(), maxTerms);
   for (int i = 0; i < termCount; i++) {
     TermQuery termQuery = termQueries.get(i);
     query.add(termQuery, BooleanClause.Occur.SHOULD);
   return query;
 private void readIndexInputFullyWithRandomSeeks(IndexInput indexInput) throws IOException {
   BytesRef ref = new BytesRef(scaledRandomIntBetween(1, 1024));
   long pos = 0;
   while (pos < indexInput.length()) {
     assertEquals(pos, indexInput.getFilePointer());
     int op = random().nextInt(5);
     if (op == 0) {
       int shift = 100 - randomIntBetween(0, 200);
       pos = Math.min(indexInput.length() - 1, Math.max(0, pos + shift));
     } else if (op == 1) {
     } else {
       int min = (int) Math.min(indexInput.length() - pos, ref.bytes.length);
       indexInput.readBytes(ref.bytes, ref.offset, min);
       pos += min;
  /**
   * Adjusts term features of the docs with alpha * query; and beta; and assigns weights/boost to
   * terms (tf*idf).
   *
   * @param query the query to expand
   * @param currentField field on which the expanded term queries are created
   * @param alpha factor applied to the terms of the query
   * @param beta - factor of the equation
   * @param gamma factor applied to the terms of the irrelevant documents
   * @param decay decay factor passed on when boosting document terms
   * @param maxExpandedQueryTerms - maximum number of terms in expanded query
   * @return expandedQuery with boost factors adjusted using Rocchio's algorithm
   * @throws IOException
   * @throws ParseException
   */
  public Query adjust(
      Query query,
      String currentField,
      float alpha,
      float beta,
      float gamma,
      float decay,
      int maxExpandedQueryTerms)
      throws IOException, ParseException {
    Query expandedQuery;
    // setBoost of docs terms
    Map<String, TermQuery> relevantDocsTerms =
        setBoost(docsTermVectorReldocs, currentField, beta, decay);
    Map<String, TermQuery> irrrelevantDocsTerms =
        setBoost(docsTermVectorIrreldocs, currentField, gamma, decay);
    // setBoost of query terms
    // Get queryTerms from the query

    // combine weights according to expansion formula
    List<TermQuery> expandedQueryTerms =
        combine(new HashMap<String, TermQuery>(), relevantDocsTerms, irrrelevantDocsTerms);
    // Sort by boost=weight
    Comparator comparator = new QueryBoostComparator();
    Collections.sort(expandedQueryTerms, comparator);
    int termCount = Math.min(expandedQueryTerms.size(), maxExpandedQueryTerms);
    for (int i = 0; i < termCount; i++) {
      TermQuery tq = expandedQueryTerms.get(i);
      relevantDocsTerms.put(tq.getTerm().text(), tq);
      System.out.print(tq.getTerm().text() + ", ");
    TermFreqVector queryTermsVector = new TermFreqVector(query);
    Map<String, TermQuery> queryTerms;

    queryTerms = setBoost(queryTermsVector, currentField, alpha);

    expandedQueryTerms = combine(queryTerms, relevantDocsTerms, new HashMap<String, TermQuery>());
    Collections.sort(expandedQueryTerms, comparator);
    // Create Expanded Query
    expandedQuery = mergeQueries(expandedQueryTerms, Integer.MAX_VALUE);

    return expandedQuery;
  /**
   * Create a PriorityQueue from a {@code word->tf} map.
   *
   * @param words a map of words keyed on the word(String) with Int objects as the values.
   * @param fieldNames an array of field names to override defaults.
   */
  private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words, String... fieldNames)
      throws IOException {
    // have collected all words in doc and their freqs
    int numDocs = ir.numDocs();
    final int limit = Math.min(maxQueryTerms, words.size());
    FreqQ queue = new FreqQ(limit); // will order words by score

    for (String word : words.keySet()) { // for every word
      int tf = words.get(word).x; // term freq in the source doc
      if (minTermFreq > 0 && tf < minTermFreq) {
        continue; // filter out words that don't occur enough times in the source

      // go through all the fields and find the largest document frequency
      String topField = fieldNames[0];
      int docFreq = 0;
      for (String fieldName : fieldNames) {
        int freq = ir.docFreq(new Term(fieldName, word));
        topField = (freq > docFreq) ? fieldName : topField;
        docFreq = (freq > docFreq) ? freq : docFreq;

      if (minDocFreq > 0 && docFreq < minDocFreq) {
        continue; // filter out words that don't occur in enough docs

      if (docFreq > maxDocFreq) {
        continue; // filter out words that occur in too many docs

      if (docFreq == 0) {
        continue; // index update problem?

      float idf = similarity.idf(docFreq, numDocs);
      float score = tf * idf;

      if (queue.size() < limit) {
        // there is still space in the queue
        queue.add(new ScoreTerm(word, topField, score, idf, docFreq, tf));
      } else {
        ScoreTerm term = queue.top();
        if (term.score < score) { // update the smallest in the queue in place and update the queue.
          term.update(word, topField, score, idf, docFreq, tf);
    return queue;
 private void corruptFile(Directory dir, String fileIn, String fileOut) throws IOException {
   IndexInput input = dir.openInput(fileIn, IOContext.READONCE);
   IndexOutput output = dir.createOutput(fileOut, IOContext.DEFAULT);
   long len = input.length();
   byte[] b = new byte[1024];
   long broken = randomInt((int) len);
   long pos = 0;
   while (pos < len) {
     int min = (int) Math.min(input.length() - pos, b.length);
     input.readBytes(b, 0, min);
     if (broken >= pos && broken < pos + min) {
       // Flip one byte
       int flipPos = (int) (broken - pos);
       b[flipPos] = (byte) (b[flipPos] ^ 42);
     output.writeBytes(b, min);
     pos += min;
   IOUtils.close(input, output);
 /**
  * Writes random unicode data to {@code foo.bar}, copies it through a
  * {@code Store.LuceneVerifyingIndexOutput} built from that file's length and checksum, then
  * appends one extra zero byte and expects one of the caught index-corruption exceptions.
  *
  * <p>NOTE(review): as it stands this method is not well-formed. Closing braces are absent and
  * several steps appear to be missing; see the inline notes.
  */
 public void testVerifyingIndexOutput() throws IOException {
   Directory dir = newDirectory();
   IndexOutput output = dir.createOutput("foo.bar", IOContext.DEFAULT);
   int iters = scaledRandomIntBetween(10, 100);
   // Fill the file with a random number of random-length unicode strings.
   for (int i = 0; i < iters; i++) {
     BytesRef bytesRef = new BytesRef(TestUtil.randomRealisticUnicodeString(random(), 10, 1024));
     output.writeBytes(bytesRef.bytes, bytesRef.offset, bytesRef.length);
   // NOTE(review): `output` is never closed before the file is reopened for reading below, and
   // nothing visible here writes the checksum that is retrieved - confirm.
   IndexInput indexInput = dir.openInput("foo.bar", IOContext.DEFAULT);
   String checksum = Store.digestToString(CodecUtil.retrieveChecksum(indexInput));
   // Scratch buffer of random size used for the bulk copies below.
   BytesRef ref = new BytesRef(scaledRandomIntBetween(1, 1024));
   long length = indexInput.length();
   IndexOutput verifyingOutput =
       new Store.LuceneVerifyingIndexOutput(
           new StoreFileMetaData("foo1.bar", length, checksum),
           dir.createOutput("foo1.bar", IOContext.DEFAULT));
   // `length` counts the bytes still to be copied.
   while (length > 0) {
     if (random().nextInt(10) == 0) {
       // NOTE(review): empty branch - nothing is copied and `length` is unchanged on this draw.
     } else {
       int min = (int) Math.min(length, ref.bytes.length);
       indexInput.readBytes(ref.bytes, ref.offset, min);
       verifyingOutput.writeBytes(ref.bytes, ref.offset, min);
       length -= min;
   // One byte more than the metadata length declared above.
   verifyingOutput.writeByte((byte) 0x0);
   try {
     // NOTE(review): nothing before fail() in this try block can throw the exceptions caught
     // below, so fail() is always reached; a verification call appears to be missing - confirm.
     fail("should be a corrupted index");
   } catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException ex) {
     // ok
   IOUtils.close(indexInput, verifyingOutput, dir);
  /**
   * Randomised test: builds parent and child documents, records for every child value which
   * parents have non-deleted children carrying it, then repeatedly runs a has-child query for a
   * random child value and compares the collected hits and top docs against expected results.
   *
   * <p>NOTE(review): as it stands this method is not well-formed. Closing braces are absent and
   * a number of statements are truncated; the inline notes mark the ones that are visible.
   */
  public void testRandom() throws Exception {
    Directory directory = newDirectory();
    final Random r = random();
    final IndexWriterConfig iwc =
        LuceneTestCase.newIndexWriterConfig(r, new MockAnalyzer(r))
                // NOTE(review): this argument has no visible call target; a chained
                // configuration call appears to be missing.
                scaledRandomIntBetween(16, 64)); // we might index a lot - don't go crazy here
    RandomIndexWriter indexWriter = new RandomIndexWriter(r, directory, iwc);
    // Child values are simply the decimal strings "0" .. numUniqueChildValues - 1.
    int numUniqueChildValues = scaledRandomIntBetween(100, 2000);
    String[] childValues = new String[numUniqueChildValues];
    for (int i = 0; i < numUniqueChildValues; i++) {
      childValues[i] = Integer.toString(i);

    // NOTE(review): nothing visible in this method ever adds to this set.
    IntOpenHashSet filteredOrDeletedDocs = new IntOpenHashSet();

    int childDocId = 0;
    int numParentDocs = scaledRandomIntBetween(1, numUniqueChildValues);
    // child value -> (parent id -> scores of the children with that value)
    ObjectObjectOpenHashMap<String, NavigableMap<String, FloatArrayList>> childValueToParentIds =
        new ObjectObjectOpenHashMap<>();
    for (int parentDocId = 0; parentDocId < numParentDocs; parentDocId++) {
      boolean markParentAsDeleted = rarely();
      boolean filterMe = rarely();
      String parent = Integer.toString(parentDocId);
      Document document = new Document();
          // NOTE(review): dangling expression - the enclosing call is not present.
          new StringField(UidFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.YES));
      document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO));
      // Marker fields: "delete" is used by the delete-by-term below, "filter" by the not-filter
      // wrapped around the query.
      if (markParentAsDeleted) {
        document.add(new StringField("delete", "me", Field.Store.NO));
      if (filterMe) {
        document.add(new StringField("filter", "me", Field.Store.NO));

      int numChildDocs = scaledRandomIntBetween(0, 100);
      for (int i = 0; i < numChildDocs; i++) {
        boolean markChildAsDeleted = rarely();
        String childValue = childValues[random().nextInt(childValues.length)];

        document = new Document();
            // NOTE(review): truncated - this constructor call is missing arguments and its
            // enclosing call.
            new StringField(
                Uid.createUid("child", Integer.toString(childDocId++)),
        document.add(new StringField(TypeFieldMapper.NAME, "child", Field.Store.NO));
            // NOTE(review): dangling expression - the enclosing call is not present.
            new StringField(
                ParentFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.NO));
        document.add(new StringField("field1", childValue, Field.Store.NO));
        if (markChildAsDeleted) {
          document.add(new StringField("delete", "me", Field.Store.NO));

        // Only children that survive deletion contribute to the expected results.
        if (!markChildAsDeleted) {
          NavigableMap<String, FloatArrayList> parentIdToChildScores;
          if (childValueToParentIds.containsKey(childValue)) {
            // lget() returns the value for the key of the preceding containsKey call.
            parentIdToChildScores = childValueToParentIds.lget();
          } else {
            childValueToParentIds.put(childValue, parentIdToChildScores = new TreeMap<>());
          if (!markParentAsDeleted && !filterMe) {
            FloatArrayList childScores = parentIdToChildScores.get(parent);
            if (childScores == null) {
              parentIdToChildScores.put(parent, childScores = new FloatArrayList());
              // NOTE(review): no score is ever appended to childScores in the visible code.

    // Delete docs that are marked to be deleted.
    indexWriter.deleteDocuments(new Term("delete", "me"));

    IndexReader indexReader = DirectoryReader.open(directory);
    IndexSearcher searcher = new IndexSearcher(indexReader);
    Engine.Searcher engineSearcher =
        new Engine.Searcher(ChildrenQueryTests.class.getSimpleName(), searcher);
    ((TestSearchContext) SearchContext.current())
        .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher));

    int max = numUniqueChildValues / 4;
    for (int i = 0; i < max; i++) {
      // Simulate a parent update
      if (random().nextBoolean()) {
        final int numberOfUpdatableParents = numParentDocs - filteredOrDeletedDocs.size();
        int numberOfUpdates =
                // NOTE(review): truncated - the call these arguments belong to is missing.
                random(), 0, Math.min(numberOfUpdatableParents, TEST_NIGHTLY ? 25 : 5));
        for (int j = 0; j < numberOfUpdates; j++) {
          // Draw parent ids until one is found that is not in filteredOrDeletedDocs.
          int parentId;
          do {
            parentId = random().nextInt(numParentDocs);
          } while (filteredOrDeletedDocs.contains(parentId));

          String parentUid = Uid.createUid("parent", Integer.toString(parentId));
          indexWriter.deleteDocuments(new Term(UidFieldMapper.NAME, parentUid));

          Document document = new Document();
          document.add(new StringField(UidFieldMapper.NAME, parentUid, Field.Store.YES));
          document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO));
          // NOTE(review): the rebuilt document is never handed to indexWriter in the visible
          // code.

        // NOTE(review): the previous indexReader is replaced here without a visible close.
        indexReader = DirectoryReader.open(indexWriter.w, true);
        searcher = new IndexSearcher(indexReader);
        // NOTE(review): this names a different test class than the Engine.Searcher created
        // during the initial setup above - confirm which one is intended.
        engineSearcher =
            new Engine.Searcher(ChildrenConstantScoreQueryTests.class.getSimpleName(), searcher);
        ((TestSearchContext) SearchContext.current())
            .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher));

      String childValue = childValues[random().nextInt(numUniqueChildValues)];
      int shortCircuitParentDocSet = random().nextInt(numParentDocs);
      ScoreType scoreType = ScoreType.values()[random().nextInt(ScoreType.values().length)];
      // leave min/max set to 0 half the time
      int minChildren = random().nextInt(2) * scaledRandomIntBetween(0, 110);
      int maxChildren = random().nextInt(2) * scaledRandomIntBetween(minChildren, 110);

      // NOTE(review): the statement below has no terminator; further chained builder calls
      // appear to be missing. scoreType, minChildren, maxChildren and shortCircuitParentDocSet
      // are not passed to the builder in the visible code.
      QueryBuilder queryBuilder =
          hasChildQuery("child", constantScoreQuery(termQuery("field1", childValue)))
      // Using a FQ, will invoke / test the Scorer#advance(..) and also let the Weight#scorer not
      // get live docs as acceptedDocs
      queryBuilder = filteredQuery(queryBuilder, notFilter(termFilter("filter", "me")));
      Query query = parseQuery(queryBuilder);
      // Actual results: matching doc ids as a bit set plus the top numHits scored docs.
      BitSetCollector collector = new BitSetCollector(indexReader.maxDoc());
      int numHits = 1 + random().nextInt(25);
      TopScoreDocCollector actualTopDocsCollector = TopScoreDocCollector.create(numHits);
      searcher.search(query, MultiCollector.wrap(collector, actualTopDocsCollector));
      FixedBitSet actualResult = collector.getResult();

      // Expected results, derived from the bookkeeping map instead of the query.
      FixedBitSet expectedResult = new FixedBitSet(indexReader.maxDoc());
      TopScoreDocCollector expectedTopDocsCollector = TopScoreDocCollector.create(numHits);
      if (childValueToParentIds.containsKey(childValue)) {
        LeafReader slowLeafReader = SlowCompositeReaderWrapper.wrap(indexReader);
        final FloatArrayList[] scores = new FloatArrayList[slowLeafReader.maxDoc()];
        Terms terms = slowLeafReader.terms(UidFieldMapper.NAME);
        if (terms != null) {
          NavigableMap<String, FloatArrayList> parentIdToChildScores = childValueToParentIds.lget();
          TermsEnum termsEnum = terms.iterator(null);
          DocsEnum docsEnum = null;
          for (Map.Entry<String, FloatArrayList> entry : parentIdToChildScores.entrySet()) {
            int count = entry.getValue().elementsCount;
            // maxChildren == 0 means "no upper bound".
            if (count >= minChildren && (maxChildren == 0 || count <= maxChildren)) {
              TermsEnum.SeekStatus seekStatus =
                  termsEnum.seekCeil(Uid.createUidAsBytes("parent", entry.getKey()));
              if (seekStatus == TermsEnum.SeekStatus.FOUND) {
                docsEnum =
                    termsEnum.docs(slowLeafReader.getLiveDocs(), docsEnum, DocsEnum.FLAG_NONE);
                // NOTE(review): docsEnum is not advanced before docID() is read, and
                // expectedResult is never set in the visible code.
                scores[docsEnum.docID()] = new FloatArrayList(entry.getValue());
              } else if (seekStatus == TermsEnum.SeekStatus.END) {
        MockScorer mockScorer = new MockScorer(scoreType);
        // NOTE(review): declaration without an initializer - the right-hand side is missing.
        final LeafCollector leafCollector =
        // Walk every set bit of expectedResult.
        for (int doc = expectedResult.nextSetBit(0);
            doc < slowLeafReader.maxDoc();
            doc =
                doc + 1 >= expectedResult.length()
                    ? DocIdSetIterator.NO_MORE_DOCS
                    : expectedResult.nextSetBit(doc + 1)) {
          mockScorer.scores = scores[doc];

      assertBitSet(actualResult, expectedResult, searcher);
      assertTopDocs(actualTopDocsCollector.topDocs(), expectedTopDocsCollector.topDocs());

 public int compare(ScoreDoc o1, ScoreDoc o2) {
   return (int) Math.signum(o2.score - o1.score);
  /**
   * Creates a set of reference objects and stores it in a new index (name
   * {@code <indexPath>-ro}). Then creates ordered lists of reference object positions for each
   * data item in the index with the given feature. Finally a new index (name
   * {@code <indexPath>-ms}) is created where all the original documents as well as the new data
   * are stored.
   *
   * @param indexPath the path to the original index
   * @throws IOException
   */
  // NOTE(review): as it stands this method is not well-formed. Closing braces are absent and
  // several statements are truncated; the inline notes mark the ones that are visible.
  public void createIndex(String indexPath) throws IOException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
    int numDocs = reader.numDocs();

    // There must be at least as many documents as reference objects to draw.
    if (numDocs < numReferenceObjects) {
      throw new UnsupportedOperationException("Too few documents in index.");

    // progress report
    // NOTE(review): no progress set-up follows this comment, yet `progress` is updated in the
    // final loop.

    boolean hasDeletions = reader.hasDeletions();

    // init reference objects:
    IndexWriter iw = LuceneUtils.createIndexWriter(indexPath + "-ro", true);
    HashSet<Integer> referenceObjsIds = new HashSet<Integer>(numReferenceObjects);

    // Draw random ids in [0, numDocs) until numReferenceObjects distinct ones are collected.
    double numDocsDouble = (double) numDocs;
    while (referenceObjsIds.size() < numReferenceObjects) {
      referenceObjsIds.add((int) (numDocsDouble * Math.random()));
    int count = 0;

    if (hasDeletions) {
          // NOTE(review): dangling string expression - the call that emits this warning is
          // missing.
          "WARNING: There are deleted docs in your index. You should "
              + "optimize your index before using this method.");

    // find them in the index and put them into a separate index:
    for (int i : referenceObjsIds) {
      Document document = reader.document(i);
      // Tag each reference object with a sequential id.
      // NOTE(review): `count` is never incremented and the document is never written to `iw`
      // in the visible code.
      document.add(new Field("ro-id", count + "", StringField.TYPE_STORED));

    // now find the reference objects for each entry ;)
    // NOTE(review): the "-ro" index is opened for reading with no visible commit or close of
    // the writer created above.
    IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro")));
    ImageSearcher searcher =
        new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName);
    // "ro-order" is analysed on whitespace; every other field uses SimpleAnalyzer.
    Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
    analyzerPerField.put("ro-order", new WhitespaceAnalyzer(LuceneUtils.LUCENE_VERSION));
    PerFieldAnalyzerWrapper aWrapper =
        new PerFieldAnalyzerWrapper(
            new SimpleAnalyzer(LuceneUtils.LUCENE_VERSION), analyzerPerField);

    // NOTE(review): this writer opens indexPath itself, the index `reader` reads from, and the
    // statement is unterminated - further arguments or chained calls appear to be missing.
    iw =
        new IndexWriter(
            FSDirectory.open(new File(indexPath)),
            new IndexWriterConfig(LuceneUtils.LUCENE_VERSION, aWrapper)
    StringBuilder sb = new StringBuilder(256);
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    // NOTE(review): the loop bound is numDocs() although the body skips deleted ids; if
    // numDocs() excludes deleted documents, ids near the end of the index are never visited
    // when deletions exist - confirm.
    for (int i = 0; i < numDocs; i++) {
      if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.
      Document document = reader.document(i);
      ImageSearchHits hits = searcher.search(document, readerRo);
      // Reuse the builder for each document.
      sb.delete(0, sb.length());
      for (int j = 0; j < numReferenceObjectsUsed; j++) {
        // NOTE(review): only separators are appended; `hits` is not read in the visible code.
        sb.append(' ');
      document.add(new TextField("ro-order", sb.toString(), Field.Store.YES));
          // NOTE(review): truncated expression - the arguments and enclosing call are missing.
          new Term(

      progress.setNumDocsProcessed(progress.getNumDocsProcessed() + 1);

 /** @return number of bytes per term, based on the NumericValue.requiredBits() */
 public long bytesPerValue(BytesRef term) {
   // Estimate about  about 0.8 (8 / 10) compression ratio for
   // numbers, but at least 4 bytes
   return Math.max(type.requiredBits() / 10, 4);
  /**
   * Chooses a memory storage format by comparing three size estimates: a single packed array
   * ({@code singleValuesSize}), ordinals ({@code ordinalsSize}) and paged packing
   * ({@code pagedSingleValuesSize}).
   *
   * <p>ORDINALS is chosen only when its estimate is strictly smaller than both others; PACKED
   * only when neither alternative is strictly smaller than the single packed array; PAGED in
   * the remaining cases.
   *
   * <p>NOTE(review): as it stands this method is not well-formed. Closing braces are absent and
   * three size expressions are truncated; see the inline notes.
   *
   * @param reader source of {@code maxDoc()}, the number of slots to size for
   * @param values contributes {@code ramBytesUsed()} to the ordinals estimate
   * @param build contributes {@code ramBytesUsed()} to the ordinals estimate
   * @param ordinals queried for the ordinal that feeds each page's min/max
   * @param minValue smallest value to be stored
   * @param maxValue largest value to be stored
   * @param acceptableOverheadRatio passed to {@code PackedInts.fastestFormatAndBits}
   * @param pageSize number of documents per page in the paged estimate
   * @return the format with the smallest estimate, subject to the tie rules above
   */
  protected CommonSettings.MemoryStorageFormat chooseStorageFormat(
      LeafReader reader,
      PackedLongValues values,
      Ordinals build,
      RandomAccessOrds ordinals,
      long minValue,
      long maxValue,
      float acceptableOverheadRatio,
      int pageSize) {

    CommonSettings.MemoryStorageFormat format;

    // estimate memory usage for a single packed array
    long packedDelta = maxValue - minValue + 1; // allow for a missing value
    // valuesDelta can be negative if the difference between max and min values overflows the
    // positive side of longs.
    int bitsRequired = packedDelta < 0 ? 64 : PackedInts.bitsRequired(packedDelta);
    PackedInts.FormatAndBits formatAndBits =
        PackedInts.fastestFormatAndBits(reader.maxDoc(), bitsRequired, acceptableOverheadRatio);
    final long singleValuesSize =
                // NOTE(review): truncated - the call these arguments belong to is missing. The
                // trailing "* 8L" suggests a count of longs converted to bytes - confirm.
                PackedInts.VERSION_CURRENT, reader.maxDoc(), formatAndBits.bitsPerValue)
            * 8L;

    // ordinal memory usage
    final long ordinalsSize = build.ramBytesUsed() + values.ramBytesUsed();

    // estimate the memory signature of paged packing
    long pagedSingleValuesSize =
        (reader.maxDoc() / pageSize + 1) * RamUsageEstimator.NUM_BYTES_OBJECT_REF; // array of pages
    int pageIndex = 0;
    long pageMinOrdinal = Long.MAX_VALUE;
    long pageMaxOrdinal = Long.MIN_VALUE;
    // pageIndex cycles through 0 .. pageSize - 1 alongside i.
    // NOTE(review): the loop starts at i = 1, and `ordinals` is never positioned on document i
    // in the visible code, so every iteration reads the same state - confirm.
    for (int i = 1; i < reader.maxDoc(); ++i, pageIndex = (pageIndex + 1) % pageSize) {
      if (ordinals.cardinality() > 0) {
        // Only the first ordinal is considered.
        long ordinal = ordinals.ordAt(0);
        pageMaxOrdinal = Math.max(ordinal, pageMaxOrdinal);
        pageMinOrdinal = Math.min(ordinal, pageMinOrdinal);
      if (pageIndex == pageSize - 1) {
        // end of page, we now know enough to estimate memory usage
        pagedSingleValuesSize +=
                // NOTE(review): truncated - the per-page estimation call is missing.
                values, acceptableOverheadRatio, pageSize, pageMinOrdinal, pageMaxOrdinal);

        // Reset the running min/max for the next page.
        pageMinOrdinal = Long.MAX_VALUE;
        pageMaxOrdinal = Long.MIN_VALUE;

    if (pageIndex > 0) {
      // last page estimation
      pagedSingleValuesSize +=
              // NOTE(review): truncated - the per-page estimation call is missing.
              values, acceptableOverheadRatio, pageSize, pageMinOrdinal, pageMaxOrdinal);

    // Pick the smallest estimate; on ties the later alternative in each comparison wins.
    if (ordinalsSize < singleValuesSize) {
      if (ordinalsSize < pagedSingleValuesSize) {
        format = CommonSettings.MemoryStorageFormat.ORDINALS;
      } else {
        format = CommonSettings.MemoryStorageFormat.PAGED;
    } else {
      if (pagedSingleValuesSize < singleValuesSize) {
        format = CommonSettings.MemoryStorageFormat.PAGED;
      } else {
        format = CommonSettings.MemoryStorageFormat.PACKED;
    return format;