/**
 * Lazily initialises the cached spatial extremes used by the query strategies: the biggest
 * diagonal (plus its double and its half) and the biggest internal circle radius.
 *
 * <p>Each value is first looked up in {@code IndexReaderPersistentCache}; on a miss the terms of
 * the corresponding index field are enumerated, the last one is decoded with
 * {@code NumberUtils.SortableStr2double}, and the result is stored back in the cache. When the
 * field has no terms the value stays {@code 0.0}.
 *
 * @param reader index reader whose terms and persistent cache entries are consulted
 * @throws IOException if enumerating the terms fails
 */
private void initBiggerDiagonal(IndexReader reader) throws IOException {
    logger.info("Initializing Spatial Indexes for Queries Strategies");
    if (biggerDiagonal == null) {
      biggerDiagonal = (Double) IndexReaderPersistentCache.get(reader, biggerDiagonalCacheKey);
      twiceBiggerDiagonal =
          (Double) IndexReaderPersistentCache.get(reader, twiceBiggerDiagonalCacheKey);
      if (biggerDiagonal == null || twiceBiggerDiagonal == null) {
        biggerDiagonal = 0.0;
        Term last = findLastTermOfField(reader, Globals.LUCENE_DIAGONAL_INDEX);
        if (last != null) {
          biggerDiagonal = NumberUtils.SortableStr2double(last.text());
          logger.info("Found bigger spatial width:" + biggerDiagonal);
        }
        twiceBiggerDiagonal = 2 * biggerDiagonal;
        halfBiggerDiagonal = biggerDiagonal / ((double) 2);
        logger.info("defining twice bigger spatial width:" + twiceBiggerDiagonal);
        IndexReaderPersistentCache.put(biggerDiagonalCacheKey, biggerDiagonal, reader);
        IndexReaderPersistentCache.put(twiceBiggerDiagonalCacheKey, twiceBiggerDiagonal, reader);
      } else {
        // Cache hit: the half value is not cached, so derive it here; previously it was only
        // assigned on the cache-miss path.
        halfBiggerDiagonal = biggerDiagonal / ((double) 2);
      }
    }

    if (biggerInternalCircleRadium == null) {
      biggerInternalCircleRadium =
          (Double) IndexReaderPersistentCache.get(reader, biggerRadiumCacheKey);
      if (biggerInternalCircleRadium == null) {
        biggerInternalCircleRadium = 0.0;
        Term last = findLastTermOfField(reader, Globals.LUCENE_RADIUM_INDEX);
        if (last != null) {
          biggerInternalCircleRadium = NumberUtils.SortableStr2double(last.text());
          logger.info("Found bigger spatial width:" + biggerInternalCircleRadium);
        }
        IndexReaderPersistentCache.put(biggerRadiumCacheKey, biggerInternalCircleRadium, reader);
      }
    }
  }

  /**
   * Returns the last term of {@code field} in enumeration order, or {@code null} when the field
   * has no terms. The enumeration is always closed, even when reading fails.
   */
  private static Term findLastTermOfField(IndexReader reader, String field) throws IOException {
    TermEnum termEnum = reader.terms(new Term(field, ""));
    try {
      Term last = null;
      Term current = termEnum.term();
      // Terms are enumerated field by field, so the scan can stop at the first foreign field.
      while (current != null && current.field().equals(field)) {
        last = current;
        current = termEnum.next() ? termEnum.term() : null;
      }
      return last;
    } finally {
      termEnum.close();
    }
  }
Example #2
0
 /**
  * Serialises this object to {@code out}: a format version marker, then the uid term's field and
  * text, then the {@code version} value.
  *
  * @param out stream to write to
  * @throws IOException if any write fails
  */
 @Override
 public void writeTo(StreamOutput out) throws IOException {
   final int formatVersion = 1;
   out.writeVInt(formatVersion); // version
   // the uid term is written as its two string components
   out.writeUTF(uid.field());
   out.writeUTF(uid.text());
   out.writeLong(version);
 }
Example #3
0
 /**
  * Enumerates the terms of {@code fieldName} starting at {@code prefix} and hands every term whose
  * text after the prefix matches {@code pattern} to {@code mtv}. Enumeration stops at the first
  * term that no longer starts with the prefix or belongs to another field. A message is printed to
  * standard output when nothing matched.
  *
  * @param reader index reader supplying the term enumeration
  * @param fieldName field whose terms are inspected
  * @param mtv visitor notified of each matching term
  * @throws IOException if the term enumeration fails
  */
 @Override
 public void visitMatchingTerms(IndexReader reader, String fieldName, MatchingTermVisitor mtv)
     throws IOException {
   final int skip = prefix.length();
   boolean anyVisited = false;
   TermEnum termEnum = reader.terms(new Term(fieldName, prefix));
   Matcher suffixMatcher = pattern.matcher("");
   try {
     boolean advance = true;
     while (advance) {
       Term current = termEnum.term();
       if (current != null) {
         String termText = current.text();
         if (!(termText.startsWith(prefix) && current.field().equals(fieldName))) {
           // left the prefix range (or the field): nothing further can match
           break;
         }
         suffixMatcher.reset(termText.substring(skip));
         if (suffixMatcher.matches()) {
           mtv.visitMatchingTerm(current);
           anyVisited = true;
         }
       }
       advance = termEnum.next();
     }
   } finally {
     termEnum.close();
     suffixMatcher.reset();
   }
   if (!anyVisited) {
     System.out.println("No terms in " + fieldName + " field for: " + toString());
   }
 }
  /**
   * Indexes 5000 single-term documents into one merged segment and then, twice, checks via
   * {@code checkSkipTo} the byte budget consumed when skipping to several targets in the term's
   * postings.
   */
  public void testSimpleSkip() throws IOException {
    Directory dir = new CountingRAMDirectory(new RAMDirectory());
    IndexWriter writer =
        new IndexWriter(
            dir,
            newIndexWriterConfig(TEST_VERSION_CURRENT, new PayloadAnalyzer())
                .setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()))
                .setMergePolicy(newLogMergePolicy()));
    Term term = new Term("test", "a");
    int added = 0;
    while (added < 5000) {
      Document doc = new Document();
      doc.add(newTextField(term.field(), term.text(), Field.Store.NO));
      writer.addDocument(doc);
      added++;
    }
    writer.commit();
    writer.forceMerge(1);
    writer.close();

    AtomicReader reader = getOnlySegmentReader(DirectoryReader.open(dir));

    int pass = 0;
    while (pass < 2) {
      counter = 0;
      DocsAndPositionsEnum positions = reader.termPositionsEnum(term);
      checkSkipTo(positions, 14, 185); // no skips
      checkSkipTo(positions, 17, 190); // one skip on level 0
      checkSkipTo(positions, 287, 200); // one skip on level 1, two on level 0

      // this test would fail if we had only one skip level,
      // because then more bytes would be read from the freqStream
      checkSkipTo(positions, 4800, 250); // one skip on level 2
      pass++;
    }
  }
Example #5
0
 /**
  * Computes the query score: log(1 + term text length), multiplied twice by {@code dictidf}, then
  * by {@code fromfreq + boost(fromfield)}, then by {@code idf}.
  *
  * @param idf inverse document frequency factor supplied by the caller
  * @return the product described above
  */
 public float queryScore(float idf) {
   // factors are multiplied in the same left-to-right order as a single chained product
   float lengthFactor = (float) Math.log(1 + term.text().length());
   float dictWeighted = lengthFactor * dictidf * dictidf;
   float sourceWeight = fromfreq + boost(fromfield);
   return dictWeighted * sourceWeight * idf;
 }
  /**
   * Translates a {@code PhraseQuery} into a {@code match_phrase} JSON query carrying the joined
   * term texts, the slop and the boost.
   *
   * <p>Best effort only: a PhraseQuery may hold several terms at the same position (think
   * synonyms) or gaps (think stopwords), and in those cases it cannot be translated into a
   * correct Elasticsearch query.
   *
   * @param query phrase query to convert; must contain at least one term
   * @return the JSON query, passed through {@code wrapQueryForNestedIfRequired}
   */
  private static JsonObject convertPhraseQuery(PhraseQuery query) {
    final Term[] phraseTerms = query.getTerms();
    if (phraseTerms.length == 0) {
      throw LOG.cannotQueryOnEmptyPhraseQuery();
    }

    // phrase queries are only supporting one field, so the first term's field is used
    final String fieldName = phraseTerms[0].field();

    final StringBuilder joined = new StringBuilder();
    for (int i = 0; i < phraseTerms.length; i++) {
      joined.append(" ");
      joined.append(phraseTerms[i].text());
    }
    final String phraseText = joined.toString().trim();

    JsonObject matchPhrase =
        JsonBuilder.object()
            .add(
                "match_phrase",
                JsonBuilder.object()
                    .add(
                        fieldName,
                        JsonBuilder.object()
                            .addProperty("query", phraseText)
                            .addProperty("slop", query.getSlop())
                            .addProperty("boost", query.getBoost())))
            .build();

    return wrapQueryForNestedIfRequired(fieldName, matchPhrase);
  }
  /**
   * Parses a field name and a value from the function parser and resolves the value to its
   * indexed byte form. Unknown fields fall back to {@code StrField}. For {@code TextField} types
   * the value is run through the field query; when that yields a single {@code TermQuery}, its
   * field and text replace the indexed field and value.
   *
   * @param fp function query parser positioned at the field argument
   * @return the populated term info
   * @throws SyntaxError if an argument cannot be parsed
   */
  private static TInfo parseTerm(FunctionQParser fp) throws SyntaxError {
    TInfo tinfo = new TInfo();

    tinfo.field = fp.parseArg();
    tinfo.indexedField = tinfo.field;
    tinfo.val = fp.parseArg();
    tinfo.indexedBytes = new BytesRef();

    FieldType declared = fp.getReq().getSchema().getFieldTypeNoEx(tinfo.field);
    FieldType ft = (declared != null) ? declared : new StrField();

    if (!(ft instanceof TextField)) {
      ft.readableToIndexed(tinfo.val, tinfo.indexedBytes);
      return tinfo;
    }

    // text fields: need to do analysis on the term
    String indexedVal = tinfo.val;
    Query analyzed =
        ft.getFieldQuery(fp, fp.getReq().getSchema().getFieldOrNull(tinfo.field), tinfo.val);
    if (analyzed instanceof TermQuery) {
      Term analyzedTerm = ((TermQuery) analyzed).getTerm();
      tinfo.indexedField = analyzedTerm.field();
      indexedVal = analyzedTerm.text();
    }
    UnicodeUtil.UTF16toUTF8(indexedVal, 0, indexedVal.length(), tinfo.indexedBytes);
    return tinfo;
  }
  /**
   * Extracts every term of {@code clauseQuery} and accumulates its field/text pair into
   * {@code ors} when the clause is optional ({@code Occur.SHOULD}), otherwise into {@code ands}.
   *
   * @param clause the boolean clause the query came from; may be {@code null}
   * @param clauseQuery the query whose terms are extracted
   * @param ands accumulator for terms of non-SHOULD clauses
   * @param ors accumulator for terms of SHOULD clauses
   */
  private void extractTerms(
      BooleanClause clause,
      org.apache.lucene.search.Query clauseQuery,
      Map<String, Object> ands,
      Map<String, Object> ors) {
    Set<Term> extracted = Sets.newHashSet();
    clauseQuery.extractTerms(extracted);

    for (Term current : extracted) {
      boolean optional = clause != null && clause.getOccur() == Occur.SHOULD;
      Map<String, Object> target = optional ? ors : ands;
      accumulateValue(target, current.field(), current.text());
    }
  }
Example #9
0
 /**
  * Chooses the terms enumeration for this query: a {@code FuzzyTermsEnum} when fuzzy matching is
  * possible, otherwise a {@code SingleTermsEnum} on the exact term bytes.
  *
  * @param terms terms of the field being searched
  * @param atts attribute source passed through to the fuzzy enumeration
  * @throws IOException if creating the enumeration fails
  */
 @Override
 protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
   boolean fuzzyPossible = maxEdits != 0 && prefixLength < term.text().length();
   if (fuzzyPossible) {
     return new FuzzyTermsEnum(terms, atts, getTerm(), maxEdits, prefixLength, transpositions);
   }
   // no edits allowed, or the required prefix covers the whole term: can only match if it's exact
   return new SingleTermsEnum(terms.iterator(), term.bytes());
 }
  /**
   * Positions this enumeration on the first term whose text is not smaller than the target's
   * text, using the term index to seek to the right block before scanning forward.
   *
   * @param target term to skip to
   * @return {@code true} when the enumeration is positioned on a valid term afterwards
   * @throws IOException if seeking or advancing the underlying enumeration fails
   */
  public boolean skipTo(Term target) throws IOException {
    // already here
    if (t != null && t.equals(target)) {
      return true;
    }

    final String wanted = target.text();
    int blockIdx = tindex.index.search(wanted);
    final boolean exactHit = blockIdx >= 0;
    if (!exactHit) {
      // a negative result encodes the insertion point
      blockIdx = -blockIdx - 1;
    }

    if (exactHit || blockIdx == 0) {
      // either the index holds the term exactly, or the target sorts *before* the first indexed
      // term; in both cases seek straight to the target (for the latter, blockIdx is 0 so pos is 0)
      if (tenum != null) {
        tenum.close();
      }
      tenum = reader.terms(target);
      pos = blockIdx << tindex.intervalBits;
      return setTerm();
    }

    // back up to the start of the block
    blockIdx--;

    final boolean alreadyInBlock =
        (pos >> tindex.intervalBits) == blockIdx
            && t != null
            && t.text().compareTo(wanted) <= 0;
    if (!alreadyInBlock) {
      // seek to the right block
      if (tenum != null) {
        tenum.close();
      }
      tenum = reader.terms(target.createTerm(tindex.index.get(blockIdx)));
      pos = blockIdx << tindex.intervalBits;
      setTerm(); // should be true since it's in the index
    }
    // otherwise: we are in the right block and the current term is not past the one we want,
    // so no seek is needed

    while (t != null && t.text().compareTo(wanted) < 0) {
      next();
    }

    return t != null;
  }
Example #11
0
 /**
  * Builds the query for a single analyzed term. With fuzziness configured, a fuzzy query is
  * produced (through the field type when one is available); otherwise a term query is produced,
  * again preferring the field type. A plain {@code TermQuery} is the fallback in every case.
  *
  * @param term the analyzed term
  * @param fieldType mapped field type for the term's field, or {@code null} if unmapped
  * @return the query to use for this term
  */
 protected Query blendTermQuery(Term term, MappedFieldType fieldType) {
   if (fuzziness == null) {
     if (fieldType != null) {
       /*
        * Defaulting to lenient here looks odd, but it is the backwards compatible
        * choice. At this point the user has forced an analyzer and passed some string
        * to the match query; the analyzer cut it up and we now try to fit the pieces
        * into the field. lenient=true means the terms are tried and, should they
        * happen to be valid for the field, they are used. lenient=false would blow up
        * the query for invalid terms, where "valid" means "parses properly to
        * something of the type being queried" ("1" is a valid number, etc.).
        */
       Query typedQuery = termQuery(fieldType, term.bytes(), true);
       if (typedQuery != null) {
         return typedQuery;
       }
     }
     return new TermQuery(term);
   }

   if (fieldType == null) {
     int edits = fuzziness.asDistance(term.text());
     FuzzyQuery fuzzy =
         new FuzzyQuery(term, edits, fuzzyPrefixLength, maxExpansions, transpositions);
     QueryParsers.setRewriteMethod(fuzzy, fuzzyRewriteMethod);
     return fuzzy;
   }

   try {
     Query typedFuzzy =
         fieldType.fuzzyQuery(
             term.text(), fuzziness, fuzzyPrefixLength, maxExpansions, transpositions);
     if (typedFuzzy instanceof FuzzyQuery) {
       QueryParsers.setRewriteMethod((FuzzyQuery) typedFuzzy, fuzzyRewriteMethod);
     }
     return typedFuzzy;
   } catch (RuntimeException e) {
     // lenient for the same reason as the non-fuzzy path: fall back to a plain term query
     return new TermQuery(term);
   }
 }
 /**
  * Gets the global term frequency of a term, i.e. how many times it occurs in the whole corpus,
  * by summing the per-document frequencies of its postings.
  *
  * @param term whose frequency you want
  * @return Global term frequency of term, or 1 if unavailable.
  */
 private int getGlobalTermFreq(Term term) {
   int tf = 0;
   TermDocs tDocs = null;
   try {
     tDocs = this.indexReader.termDocs(term);
     if (tDocs == null) {
       logger.info("Couldn't get term frequency for term " + term.text());
       return 1;
     }
     while (tDocs.next()) {
       tf += tDocs.freq();
     }
   } catch (IOException e) {
     logger.info("Couldn't get term frequency for term " + term.text());
     return 1;
   } finally {
     // release the postings enumeration; it was previously never closed
     if (tDocs != null) {
       try {
         tDocs.close();
       } catch (IOException ignored) {
         // best effort: the frequency has already been computed (or the fallback chosen)
       }
     }
   }
   return tf;
 }
 /**
  * Loads the enumeration's current term into {@code t} and validates it: it must exist, belong
  * to the indexed field, and start with the index prefix when one is set. On failure {@code t}
  * is cleared.
  *
  * @return {@code true} when {@code t} now holds a valid term
  */
 protected boolean setTerm() {
   t = tenum.term();
   final boolean valid =
       t != null
           && t.field() == tindex.fterm.field() // intern'd compare
           && (tindex.prefix == null || t.text().startsWith(tindex.prefix, 0));
   if (!valid) {
     t = null;
   }
   return valid;
 }
  /**
   * Writes the delete list (its size, then each term's field and text) followed by the files of
   * {@code dir} to {@code out}.
   *
   * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
   */
  public void write(DataOutput out) throws IOException {
    final int deleteCount = deleteList.size();
    out.writeInt(deleteCount);
    for (Term deleted : deleteList) {
      Text.writeString(out, deleted.field());
      Text.writeString(out, deleted.text());
    }

    // the directory contents follow the delete list
    RAMDirectoryUtil.writeRAMFiles(out, dir, dir.list());
  }
 /**
  * Renders this query as {@code field:text} followed by the boost suffix; the {@code field:}
  * part is omitted when the term's field equals the given default field.
  *
  * @param field default field name that need not be printed
  */
 @Override
 public String toString(String field) {
   StringBuilder rendered = new StringBuilder();
   String termField = term.field();
   if (!termField.equals(field)) {
     rendered.append(termField).append(":");
   }
   return rendered
       .append(term.text())
       .append(ToStringUtils.boost(getBoost()))
       .toString();
 }
 /**
  * Collects the documents containing {@code term} into an array.
  *
  * @param term term to look up
  * @param bits bits passed to {@code TestUtil.docs} (presumably the live/accepted docs)
  * @param reader reader to search
  * @return the matching documents as produced by {@code toArray}, or {@code null} when the
  *     reader has no fields, the field has no terms, or the term does not exist
  * @throws IOException if reading the index fails
  */
 public int[] toDocsArray(Term term, Bits bits, IndexReader reader) throws IOException {
   Fields fields = MultiFields.getFields(reader);
   if (fields == null) {
     return null;
   }
   // Fields.terms returns null for a field without terms; treat that like a missing term
   Terms cterms = fields.terms(term.field());
   if (cterms == null) {
     return null;
   }
   TermsEnum ctermsEnum = cterms.iterator();
   if (ctermsEnum.seekExact(new BytesRef(term.text()))) {
     PostingsEnum postingsEnum =
         TestUtil.docs(random(), ctermsEnum, bits, null, PostingsEnum.NONE);
     return toArray(postingsEnum);
   }
   return null;
 }
  /**
   * Builds, for every field of {@code doc}, a map describing it: schema type and flags, external
   * and stored value, optional binary value, boost, document frequency of its indexed term, and
   * the term vector when one is stored.
   *
   * @param doc document whose fields are described
   * @param docId id of the document, used to fetch term vectors
   * @param reader reader used for document frequencies and term vectors
   * @param schema schema used to resolve field types
   * @return field name to field description, in document field order
   * @throws IOException if reading the index fails
   */
  private static SimpleOrderedMap<Object> getDocumentFieldsInfo(
      Document doc, int docId, IndexReader reader, IndexSchema schema) throws IOException {
    final SimpleOrderedMap<Object> result = new SimpleOrderedMap<Object>();
    for (Object rawField : doc.getFields()) {
      final Fieldable fieldable = (Fieldable) rawField;
      final String name = fieldable.name();
      final SimpleOrderedMap<Object> info = new SimpleOrderedMap<Object>();

      final SchemaField sfield = schema.getFieldOrNull(name);
      FieldType ftype = null;
      if (sfield != null) {
        ftype = sfield.getType();
      }
      final boolean typed = ftype != null;

      info.add("type", typed ? ftype.getTypeName() : null);
      info.add("schema", getFieldFlags(sfield));
      info.add("flags", getFieldFlags(fieldable));

      // the indexed form of the value is what docFreq has to be asked about
      final String indexedText =
          typed ? ftype.storedToIndexed(fieldable) : fieldable.stringValue();
      final Term indexedTerm = new Term(name, indexedText);

      info.add("value", typed ? ftype.toExternal(fieldable) : null);

      // TODO: this really should be "stored"
      info.add("internal", fieldable.stringValue()); // may be a binary number

      final byte[] binary = fieldable.getBinaryValue();
      if (binary != null) {
        info.add("binary", Base64.byteArrayToBase64(binary, 0, binary.length));
      }
      info.add("boost", fieldable.getBoost());

      // this can be 0 for non-indexed fields
      int docFreq = 0;
      if (indexedTerm.text() != null) {
        docFreq = reader.docFreq(indexedTerm);
      }
      info.add("docFreq", docFreq);

      // If we have a term vector, return that
      if (fieldable.isTermVectorStored()) {
        try {
          TermFreqVector vector = reader.getTermFreqVector(docId, name);
          if (vector != null) {
            SimpleOrderedMap<Integer> frequencies = new SimpleOrderedMap<Integer>();
            int idx = 0;
            while (idx < vector.size()) {
              frequencies.add(vector.getTerms()[idx], vector.getTermFrequencies()[idx]);
              idx++;
            }
            info.add("termVector", frequencies);
          }
        } catch (Exception ex) {
          log.warn("error writing term vector", ex);
        }
      }

      result.add(name, info);
    }
    return result;
  }
 /**
  * Runs the text of the term derived from {@code termString} through the analyzer and returns a
  * term on the same field whose text is the concatenation of all produced tokens.
  *
  * @param tokenType token type used to build the initial term
  * @param termString raw term string
  * @return the analyzed term
  * @throws IOException if analysis fails
  */
 protected Term getAnalyzedTerm(TokenType tokenType, String termString) throws IOException {
   Term term = getTerm(termString, tokenType); // first ensure that we've stripped any prefixes
   StringBuilder sb = new StringBuilder();
   TokenStream tokenStream = analyzer.tokenStream(term.field(), new StringReader(term.text()));
   try {
     // attributes are registered before reset(), following the TokenStream consumer workflow
     CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
     tokenStream.reset();
     while (tokenStream.incrementToken()) {
       sb.append(termAtt.toString());
     }
     tokenStream.end();
   } finally {
     // always close, otherwise a failure here leaves the stream unreleased
     tokenStream.close();
   }
   return new Term(term.field(), sb.toString());
 }
  /**
   * Computes a term frequency map for the index in {@code ramdir} and returns the most frequent
   * terms. "Most frequent" is defined as the terms whose document frequency is greater than or
   * equal to {@code topTermCutoff} times the frequency of the top term, where
   * {@code topTermCutoff} is a number between 0 and 1.
   *
   * <p>NOTE(review): frequencies are keyed by term text only, so a text that occurs in several
   * fields is listed once per field but carries the frequency of the last field seen.
   *
   * @return the top terms, most frequent first; empty when the index has no terms
   * @throws CorruptIndexException if the index is corrupt
   * @throws IOException if reading the index fails
   */
  protected ArrayList<String> retrieveTopTerms() throws CorruptIndexException, IOException {
    final Map<String, Integer> frequencyMap = new HashMap<String, Integer>();
    List<String> termlist = new ArrayList<String>();
    IndexReader reader = IndexReader.open(ramdir);
    try {
      TermEnum terms = reader.terms();
      try {
        while (terms.next()) {
          Term term = terms.term();
          String termText = term.text();
          frequencyMap.put(termText, reader.docFreq(term));
          termlist.add(termText);
        }
      } finally {
        terms.close();
      }
    } finally {
      // close the reader even when enumeration fails
      reader.close();
    }

    // sort the term list by frequency descending
    Collections.sort(
        termlist,
        new Comparator<String>() {
          @Override
          public int compare(String term1, String term2) {
            return frequencyMap.get(term2).compareTo(frequencyMap.get(term1));
          }
        });

    // retrieve the top terms based on topTermCutoff
    ArrayList<String> topTerms = new ArrayList<String>();
    double topFreq = -1.0;
    for (String term : termlist) {
      if (topFreq < 0.0) {
        // first term, capture the value
        topFreq = frequencyMap.get(term);
        topTerms.add(term);
      } else {
        // not the first term, compute the ratio and stop once it drops below topTermCutoff
        double ratio = frequencyMap.get(term) / topFreq;
        if (ratio >= topTermCutoff) {
          topTerms.add(term);
        } else {
          break;
        }
      }
    }

    return topTerms;
  }
    /**
     * Builds a bloom filter over the terms of {@code field}, adding each term that has at least
     * one non-deleted document, and stores it in the per-reader field cache if an entry for the
     * field is still present there. Failures are logged, never propagated.
     */
    @SuppressWarnings({"StringEquality"})
    @Override
    public void run() {
      TermDocs termDocs = null;
      TermEnum termEnum = null;
      try {
        BloomFilter filter = BloomFilterFactory.getFilter(reader.numDocs(), 15);
        termDocs = reader.termDocs();
        termEnum = reader.terms(new Term(field));
        while (true) {
          Term term = termEnum.term();
          // identity comparison is deliberate here (see the StringEquality suppression)
          if (term == null || term.field() != field) {
            break;
          }

          // LUCENE MONITOR: 4.0, move to use bytes!
          UnicodeUtil.UTF8Result utf8Result = Unicode.fromStringAsUtf8(term.text());
          termDocs.seek(termEnum);
          while (termDocs.next()) {
            // when traversing, make sure to ignore deleted docs, so the key->docId will be correct
            if (reader.isDeleted(termDocs.doc())) {
              continue;
            }
            filter.add(utf8Result.result, 0, utf8Result.length);
          }

          if (!termEnum.next()) {
            break;
          }
        }
        ConcurrentMap<String, BloomFilterEntry> fieldCache = cache.get(reader.getFieldCacheKey());
        if (fieldCache != null && fieldCache.containsKey(field)) {
          BloomFilterEntry filterEntry = new BloomFilterEntry(reader.numDocs(), filter);
          filterEntry.loading.set(false);
          fieldCache.put(field, filterEntry);
        }
      } catch (Exception e) {
        logger.warn("failed to load bloom filter for [{}]", e, field);
      } finally {
        if (termDocs != null) {
          try {
            termDocs.close();
          } catch (IOException e) {
            // ignore
          }
        }
        if (termEnum != null) {
          try {
            termEnum.close();
          } catch (IOException e) {
            // ignore
          }
        }
      }
    }
  /**
   * Gets the global term frequencies and writes them, one {@code "<text> <frequency>"} line per
   * term, to {@code globalTermFreq.txt} in the index directory.
   *
   * <p>The index reader is opened into the {@code indexReader} field and closed again before
   * returning, also when an error occurs.
   *
   * @throws Exception if the index cannot be read or the output file cannot be written
   */
  public void getGlobalTermFrequencies() throws Exception {
    String parentDir =
        Flags.rootDir + (Flags.positional ? "/positional-" : "/") + "lucene/" + Flags.suffix;
    File file = new File(parentDir);
    indexReader = IndexReader.open(FSDirectory.open(file));
    try {
      TermEnum terms = indexReader.terms();
      try {
        BufferedWriter out =
            new BufferedWriter(new FileWriter(new File(parentDir + "/globalTermFreq.txt")));
        try {
          while (terms.next()) {
            org.apache.lucene.index.Term term = terms.term();
            out.write(term.text() + " " + getGlobalTermFreq(term) + "\n");
          }
        } finally {
          // flushes and releases the file even when a write or lookup fails
          out.close();
        }
      } finally {
        terms.close();
      }
    } finally {
      indexReader.close();
    }
  }
 /**
  * Looks up the doc id and version for {@code uid} across the searcher's sub readers, consulting
  * each reader's bloom filter first so readers that cannot contain the uid are skipped.
  *
  * @param bloomCache source of per-reader bloom filters
  * @param searcher searcher whose sub readers are scanned
  * @param uid uid term to look up
  * @return the first result whose doc id is not -1, or {@code null} when no reader has the uid
  */
 private UidField.DocIdAndVersion loadCurrentVersionFromIndex(
     BloomCache bloomCache, Engine.Searcher searcher, Term uid) {
   final UnicodeUtil.UTF8Result uidBytes = Unicode.fromStringAsUtf8(uid.text());
   for (IndexReader segment : searcher.searcher().subReaders()) {
     BloomFilter bloom = bloomCache.filter(segment, UidFieldMapper.NAME, true);
     if (bloom.isPresent(uidBytes.result, 0, uidBytes.length)) {
       UidField.DocIdAndVersion candidate = UidField.loadDocIdAndVersion(segment, uid);
       // either -2 (its there, but no version associated), or an actual version
       if (candidate.docId != -1) {
         return candidate;
       }
     }
     // otherwise the filter says the uid is not in this reader: move on
   }
   return null;
 }
    /**
     * Blends the term across all {@code blendedFields}: one term and boost per field, combined
     * into a common-terms, boolean or dismax blended query depending on the configured cutoff
     * and tie breaker. Without blended fields the parent implementation is used.
     *
     * @param term term whose text is blended across the fields
     * @param fieldType field type forwarded to the parent implementation
     */
    @Override
    public Query blendTerm(Term term, MappedFieldType fieldType) {
      if (blendedFields == null) {
        return super.blendTerm(term, fieldType);
      }

      final int fieldCount = blendedFields.length;
      final Term[] perFieldTerms = new Term[fieldCount];
      final float[] perFieldBoosts = new float[fieldCount];
      int idx = 0;
      while (idx < fieldCount) {
        perFieldTerms[idx] = blendedFields[idx].newTerm(term.text());
        perFieldBoosts[idx] = blendedFields[idx].boost;
        idx++;
      }

      if (commonTermsCutoff != null) {
        return BlendedTermQuery.commonTermsBlendedQuery(
            perFieldTerms, perFieldBoosts, false, commonTermsCutoff);
      }
      if (tieBreaker != 1.0f) {
        return BlendedTermQuery.dismaxBlendedQuery(perFieldTerms, perFieldBoosts, tieBreaker);
      }
      return BlendedTermQuery.booleanBlendedQuery(perFieldTerms, perFieldBoosts, false);
    }
  /**
   * Process properties to query sparse content directly: the query string is parsed with the
   * Lucene query parser, every extracted term becomes a field/text property, and the content
   * manager is asked for content matching those properties.
   *
   * @param request request used to obtain the storage session
   * @param query query whose query string is parsed
   * @param asAnon not used by this method
   * @return result set wrapping the matching content
   * @throws StorageClientException if the storage layer fails
   * @throws AccessDeniedException if the session may not access the content
   * @throws ParseException if the query string cannot be parsed
   */
  private SolrSearchResultSet processSparseQuery(
      SlingHttpServletRequest request, Query query, boolean asAnon)
      throws StorageClientException, AccessDeniedException, ParseException {
    // use solr parsing to get the terms from the query string
    QueryParser parser =
        new QueryParser(Version.LUCENE_40, "id", new TextField().getQueryAnalyzer());
    org.apache.lucene.search.Query parsed = parser.parse(query.getQueryString());
    Set<Term> queryTerms = Sets.newHashSet();
    parsed.extractTerms(queryTerms);

    Map<String, Object> properties = Maps.newHashMap();
    for (Term queryTerm : queryTerms) {
      properties.put(queryTerm.field(), queryTerm.text());
    }

    javax.jcr.Session jcrSession =
        request.getResourceResolver().adaptTo(javax.jcr.Session.class);
    Session session = StorageClientUtils.adaptToSession(jcrSession);
    ContentManager contentManager = session.getContentManager();
    Iterable<Content> matches = contentManager.find(properties);
    return new SparseSearchResultSet(matches);
  }
Example #25
0
 /**
  * Utility method to dump out all fields (name and terms) for a given index. For every facet
  * name the field name is written followed by one line per term of that field.
  *
  * @param outFile File to dump to.
  * @throws IOException if the file cannot be opened or the term enumeration fails
  */
 public void dumpFields(File outFile) throws IOException {
   FileWriter writer = null;
   try {
     writer = new FileWriter(outFile);
     PrintWriter out = new PrintWriter(writer);
     Set<String> fieldNames = getFacetNames();
     for (String fieldName : fieldNames) {
       TermEnum te = terms(new Term(fieldName, ""));
       try {
         out.write(fieldName + ":\n");
         // terms(Term) is already positioned on the first candidate term, so it must be
         // inspected before next() is called; otherwise the first term of each field is lost
         do {
           Term term = te.term();
           if (term == null || !fieldName.equals(term.field())) {
             break;
           }
           out.write(term.text() + "\n");
         } while (te.next());
       } finally {
         te.close();
       }
       out.write("\n\n");
     }
   } finally {
     if (writer != null) {
       writer.close();
     }
   }
 }
 /**
  * Builds the queries for a single term expression. A plain term is analyzed and mapped to one
  * term query. A wildcard expression is either kept as one wildcard query ({@code collapse}) or
  * expanded into one term query per term extracted from its weight.
  *
  * @param termQuery the term expression, possibly containing wildcards
  * @param tokenType token type used to build terms
  * @param collapse whether all wildcard variants are treated as a single term
  * @return map from term (or expression) text to its query
  * @throws IOException if analysis or term extraction fails
  */
 private Map<String, Query> getSingleTermQueries(
     String termQuery, TokenType tokenType, boolean collapse) throws IOException {
   Map<String, Query> queriesMap = new HashMap<String, Query>();

   boolean hasWildcard =
       termQuery.contains(WILDCARD_ASTERISK) || termQuery.contains(WILDCARD_QUESTION);
   if (!hasWildcard) {
     // regular term (we hope): analyze it first
     Term analyzed = getAnalyzedTerm(tokenType, termQuery);
     queriesMap.put(termQuery, getTermQuery(analyzed));
     return queriesMap;
   }

   Term wildcardTerm = getTerm(termQuery, tokenType);
   Query wildcardQuery = getWildCardQuery(wildcardTerm);
   if (collapse) {
     // treat all wildcard variants as a single term
     queriesMap.put(termQuery, wildcardQuery);
     return queriesMap;
   }

   // separate each wildcard term into its own query
   Set<Term> expanded = new HashSet<Term>();
   Weight weight = wildcardQuery.createWeight(indexSearcher, false);
   weight.extractTerms(expanded);
   for (Term indexTerm : expanded) {
     // no analysis needed here since the term already comes from the index
     queriesMap.put(indexTerm.text(), getTermQuery(indexTerm));
   }
   return queriesMap;
 }
 /**
  * Creates a query on the term's field whose automaton is compiled from the term text as a
  * regular expression.
  *
  * @param term field to search plus the regular expression as its text
  * @param flags syntax flags handed to {@code RegExp}
  */
 DumbRegexpQuery(Term term, int flags) {
   super(term.field());
   automaton = new RegExp(term.text(), flags).toAutomaton();
 }
Example #28
0
  /**
   * Dumps every wanted term of the index together with its postings: for each term a header line
   * and then one {@code document, frequency, <positions>} line per document. When filters are
   * active, the documents of each term are also passed to {@code computeDocsIntersection}.
   *
   * @throws IOException if reading terms or positions fails
   */
  private void dumpTerms() throws IOException {
    outputBanner("Terms (in Term.compareTo() order)");

    TermEnum terms = mIndexReader.terms();
    try {
      int order = 0;

      while (terms.next()) {
        order++;
        Term term = terms.term();
        String field = term.field();
        String text = term.text();

        if (!wantThisTerm(field, text)) {
          continue;
        }

        outputLn(order + " " + field + ": " + text);

        /*
         * for each term, print the
         * <document, frequency, <position>* > tuples for a term.
         *
         * document:  document in which the Term appears
         * frequency: number of time the Term appears in the document
         * position:  position for each appearance in the document
         *
         * e.g. doc.add(new Field("field", "one two three two four five", Field.Store.YES, Field.Index.ANALYZED));
         *      then the tuple for Term("field", "two") in this document would be like:
         *      88, 2, <2, 4>
         *      where
         *      88 is the document number
         *      2  is the frequency this term appear in the document
         *      <2, 4> are the positions for each appearance in the document
         */
        // by TermPositions
        outputLn("    document, frequency, <position>*");

        // keep track of docs that appear in all terms that are filtered in.
        Set<Integer> docNums = null;
        if (hasFilters()) {
          docNums = new HashSet<Integer>();
        }

        TermPositions termPos = mIndexReader.termPositions(term);
        try {
          while (termPos.next()) {
            int docNum = termPos.doc();
            int freq = termPos.freq();

            if (docNums != null) {
              docNums.add(docNum);
            }

            output("    " + docNum + ", " + freq + ", <");

            for (int f = 0; f < freq; f++) {
              int positionInDoc = termPos.nextPosition();
              // positions are separated by a single space
              if (f > 0) {
                output(" ");
              }
              output(positionInDoc + "");
            }
            outputLn(">");
          }
        } finally {
          // release the positions enumeration even when output or reading fails
          termPos.close();
        }

        if (docNums != null) {
          computeDocsIntersection(docNums);
        }

        outputLn();

        if (order % 1000 == 0) {
          mConsole.debug("Dumped " + order + " terms");
        }
      }
    } finally {
      terms.close();
    }
  }
  /**
   * Adds {@code query} to {@code booleanQuery} unless an equal clause is already present.
   *
   * <p>Term queries are rebuilt from their extracted terms, as lower-cased
   * {@code *value*} wildcard queries when {@code like} is set, keeping the original boost.
   * Boolean queries are processed recursively into a fresh container that is only added when it
   * ends up non-empty. Any other query is added as is.
   *
   * @param booleanQuery target query receiving the clauses
   * @param like whether term queries are turned into wildcard queries
   * @param queryParser parser supplying the locale used for lower-casing
   * @param query query to include
   * @param occur occurrence used when adding to {@code booleanQuery}
   */
  private void _includeIfUnique(
      BooleanQuery booleanQuery,
      boolean like,
      QueryParser queryParser,
      Query query,
      BooleanClause.Occur occur) {

    if (query instanceof TermQuery) {
      TermQuery termQuery = (TermQuery) query;

      Set<Term> terms = new HashSet<Term>();

      termQuery.extractTerms(terms);

      float boost = termQuery.getBoost();

      for (Term term : terms) {
        Query termClause;

        if (like) {
          String termValue = term.text().toLowerCase(queryParser.getLocale());

          Term wildcardTerm =
              term.createTerm(StringPool.STAR.concat(termValue).concat(StringPool.STAR));

          termClause = new WildcardQuery(wildcardTerm);
        } else {
          termClause = new TermQuery(term);
        }

        termClause.setBoost(boost);

        _addIfAbsent(booleanQuery, termClause, occur);
      }
    } else if (query instanceof BooleanQuery) {
      BooleanQuery curBooleanQuery = (BooleanQuery) query;

      BooleanQuery containerBooleanQuery = new BooleanQuery();

      for (BooleanClause booleanClause : curBooleanQuery.getClauses()) {
        _includeIfUnique(
            containerBooleanQuery,
            like,
            queryParser,
            booleanClause.getQuery(),
            booleanClause.getOccur());
      }

      if (containerBooleanQuery.getClauses().length > 0) {
        booleanQuery.add(containerBooleanQuery, occur);
      }
    } else {
      _addIfAbsent(booleanQuery, query, occur);
    }
  }

  /** Adds {@code query} to {@code booleanQuery} unless one of its clauses already equals it. */
  private static void _addIfAbsent(
      BooleanQuery booleanQuery, Query query, BooleanClause.Occur occur) {

    for (BooleanClause booleanClause : booleanQuery.getClauses()) {
      if (query.equals(booleanClause.getQuery())) {
        return;
      }
    }

    booleanQuery.add(query, occur);
  }
  /**
   * Reloads the index statistics of a collection into {@code result}: number of terms and
   * documents, deletion/optimisation state, index version and last modification, per-field term
   * counts, and the top ranking terms of {@code topRankingField}.
   *
   * @param collectionName name of the collection to inspect; {@code null} yields {@code false}
   * @param topRankingField field whose terms are ranked; when {@code null} the collection's
   *     first default search field is used
   * @return {@code true} when the statistics were loaded, {@code false} when the collection or
   *     its default search field is unknown or the index could not be read
   */
  @Override
  public boolean reload(String collectionName, String topRankingField) {
    if (collectionName == null) {
      return false;
    }

    CrescentCollectionHandler collectionHandler =
        SpringApplicationContext.getBean(
            "crescentCollectionHandler", CrescentCollectionHandler.class);

    CrescentCollection collection =
        collectionHandler.getCrescentCollections().getCrescentCollection(collectionName);

    if (collection == null) {
      logger.debug("doesn't Collection Info => {}", collectionName);
      init(View.Overview);
      return false;
    }

    if (topRankingField == null) {
      // guard against a missing or empty list before touching its first element
      if (collection.getDefaultSearchFields() != null
          && !collection.getDefaultSearchFields().isEmpty()
          && collection.getDefaultSearchFields().get(0) != null) {
        topRankingField = collection.getDefaultSearchFields().get(0).getName();
      } else {
        logger.debug("doesn't defaultSearchField => {}", collectionName);
        init(View.Overview);
        return false;
      }
    }

    List<String> fieldName = new ArrayList<String>();
    for (CrescentCollectionField field : collection.getFields()) {
      fieldName.add(field.getName());
    }
    TopRankingQueue topRankingQueue =
        new TopRankingQueue(DEFAULT_TOPRANKING_TERM, new RankingTermComparator());

    try {
      Directory directory = FSDirectory.open(new File(collection.getIndexingDirectory()));
      try {
        IndexReader reader = IndexReader.open(directory);
        try {
          int termFreq = 0;
          int termCount = 0;
          Term beforeTerm = null;
          // init term count
          fieldTermCount.clear();
          for (CrescentCollectionField field : collection.getFields()) {
            fieldTermCount.put(field.getName(), 0);
          }
          topRankingQueue.clear();

          TermEnum terms = reader.terms();
          try {
            while (terms.next()) {
              Term currTerm = terms.term();
              if (beforeTerm == null) {
                beforeTerm = currTerm;
              }

              // compare field names by value rather than relying on string identity
              if (beforeTerm.field().equals(currTerm.field())) {
                termCount++;
              } else {
                fieldTermCount.put(beforeTerm.field(), termCount);
                termCount = 1;
                beforeTerm = currTerm;
              }

              // postings are only needed for terms of the ranked field
              if (currTerm.field().equals(topRankingField)) {
                TermDocs termDocs = reader.termDocs(currTerm);
                try {
                  while (termDocs.next()) {
                    RankingTerm e =
                        new RankingTerm(currTerm.text(), currTerm.field(), termDocs.freq());
                    topRankingQueue.add(e);
                  }
                } finally {
                  termDocs.close();
                }
              }
              termFreq++;
            }
          } finally {
            terms.close();
          }
          if (beforeTerm != null) {
            fieldTermCount.put(beforeTerm.field(), termCount);
          }

          result.put("numOfTerm", termFreq);
          result.put("numOfDoc", reader.numDocs());
          result.put("hasDel", reader.hasDeletions());
          result.put("isOptimize", reader.isOptimized());
          result.put("indexVersion", reader.getVersion());
          result.put("lastModify", new Date(IndexReader.lastModified(directory)));
        } finally {
          reader.close();
        }
      } finally {
        directory.close();
      }
    } catch (IOException e) {
      logger.error("failed to read index of collection => " + collectionName, e);
      return false;
    }
    if (topRankingQueue.size() != 0) {
      topRankingTerms = topRankingQueue.toArray();
      Arrays.sort(topRankingTerms);
    }
    result.put("collectionName", collectionName);
    result.put("indexName", collection.getIndexingDirectory());
    result.put("numOfField", collection.getFields().size());
    result.put("termCount", fieldTermCount);
    result.put("topRanking", topRankingTerms);
    result.put("fieldName", fieldName);

    return true;
  }