private void initBiggerDiagonal(IndexReader reader) throws IOException {
    logger.info("Initializing Spatial Indexes for Queries Strategies");
    if (biggerDiagonal == null) {
      biggerDiagonal = (Double) IndexReaderPersistentCache.get(reader, biggerDiagonalCacheKey);
      twiceBiggerDiagonal =
          (Double) IndexReaderPersistentCache.get(reader, twiceBiggerDiagonalCacheKey);
      if (biggerDiagonal == null || twiceBiggerDiagonal == null) {
        biggerDiagonal = 0.0;
        Term last = null;
        TermEnum termEnum = reader.terms(new Term(Globals.LUCENE_DIAGONAL_INDEX, ""));
        if (termEnum.term() != null
            && termEnum.term().field().equals(Globals.LUCENE_DIAGONAL_INDEX)) {
          last = termEnum.term();
        }
        if (termEnum.term() != null) {
          while (termEnum.next()) {
            if (termEnum.term().field().equals(Globals.LUCENE_DIAGONAL_INDEX)) {
              last = termEnum.term();
            }
          }
        }
        if (last != null) {
          biggerDiagonal = NumberUtils.SortableStr2double(last.text());
          logger.info("Found bigger spatial width:" + biggerDiagonal);
        }
        twiceBiggerDiagonal = 2 * biggerDiagonal;
        halfBiggerDiagonal = biggerDiagonal / 2.0;
        logger.info("Defining twice the biggest diagonal: " + twiceBiggerDiagonal);
        termEnum.close();
        IndexReaderPersistentCache.put(biggerDiagonalCacheKey, biggerDiagonal, reader);
        IndexReaderPersistentCache.put(twiceBiggerDiagonalCacheKey, twiceBiggerDiagonal, reader);
      }
    }

    if (biggerInternalCircleRadium == null) {
      biggerInternalCircleRadium =
          (Double) IndexReaderPersistentCache.get(reader, biggerRadiumCacheKey);
      if (biggerInternalCircleRadium == null) {
        biggerInternalCircleRadium = 0.0;
        Term last = null;
        TermEnum termEnum = reader.terms(new Term(Globals.LUCENE_RADIUM_INDEX, ""));
        if (termEnum.term() != null && termEnum.term().field().equals(Globals.LUCENE_RADIUM_INDEX))
          last = termEnum.term();
        if (termEnum.term() != null)
          while (termEnum.next())
            if (termEnum.term().field().equals(Globals.LUCENE_RADIUM_INDEX)) last = termEnum.term();
        if (last != null) {
          biggerInternalCircleRadium = NumberUtils.SortableStr2double(last.text());
          logger.info("Found bigger spatial width:" + biggerInternalCircleRadium);
        }
        termEnum.close();
        IndexReaderPersistentCache.put(biggerRadiumCacheKey, biggerInternalCircleRadium, reader);
      }
    }
  }
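The last-term scan above works because Solr's sortable string encoding makes lexicographic order match numeric order, so the final term in the field decodes to the maximum value. A minimal sketch of that property, assuming org.apache.solr.util.NumberUtils provides double2sortableStr as the encoder paired with the SortableStr2double decoder used above:

    String small = NumberUtils.double2sortableStr(3.5);
    String large = NumberUtils.double2sortableStr(12.0);
    // sortable encoding preserves numeric order lexicographically
    assert small.compareTo(large) < 0;
    // decoding round-trips back to the original value
    assert NumberUtils.SortableStr2double(large) == 12.0;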
  public static SimpleOrderedMap<Object> getIndexInfo(IndexReader reader, boolean countTerms)
      throws IOException {
    Directory dir = reader.directory();
    SimpleOrderedMap<Object> indexInfo = new SimpleOrderedMap<Object>();

    indexInfo.add("numDocs", reader.numDocs());
    indexInfo.add("maxDoc", reader.maxDoc());

    if (countTerms) {
      TermEnum te = null;
      try {
        te = reader.terms();
        int numTerms = 0;
        while (te.next()) {
          numTerms++;
        }
        indexInfo.add("numTerms", numTerms);
      } finally {
        if (te != null) te.close();
      }
    }

    indexInfo.add(
        "version",
        reader.getVersion()); // TODO? Is this different than IndexReader.getCurrentVersion(dir)?
    indexInfo.add("optimized", reader.isOptimized());
    indexInfo.add("current", reader.isCurrent());
    indexInfo.add("hasDeletions", reader.hasDeletions());
    indexInfo.add("directory", dir);
    indexInfo.add("lastModified", new Date(IndexReader.lastModified(dir)));
    return indexInfo;
  }
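A hedged usage sketch of getIndexInfo; the index path is illustrative, and countTerms=true walks every term in the index, which can be slow on large indexes:

    IndexReader reader = IndexReader.open(FSDirectory.open(new File("/path/to/index")), true);
    try {
      SimpleOrderedMap<Object> info = getIndexInfo(reader, true);
      System.out.println("numDocs=" + info.get("numDocs")
          + " numTerms=" + info.get("numTerms")
          + " optimized=" + info.get("optimized"));
    } finally {
      reader.close();
    }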
Example #3
  private static Map<String, List<String>> generate_result(Directory directory) {
    Map<String, List<String>> result_map = new HashMap<String, List<String>>();

    IndexReader reader = null;
    try {
      reader = IndexReader.open(directory);
      TermEnum termEnum = reader.terms();
      while (termEnum.next()) {
        String termEnumString = termEnum.term().toString();
        if (termEnumString.startsWith("content:")) {
          String term = termEnumString.substring(termEnumString.lastIndexOf(":") + 1);
          TermDocs termDocs = reader.termDocs(termEnum.term());
          while (termDocs.next()) {
            Document doc = reader.document(termDocs.doc());
            String relative_path = doc.get("relative_path");

            // create the list on first sight of a path, then always record the
            // term (the original version silently dropped the first term seen
            // for every new path)
            List<String> terms = result_map.get(relative_path);
            if (terms == null) {
              terms = new ArrayList<String>();
              result_map.put(relative_path, terms);
            }
            terms.add(term + termDocs.freq());
          }
          termDocs.close();
        }
      }
      termEnum.close();
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      if (reader != null) {
        try {
          reader.close();
        } catch (IOException e) {
          // ignore close failure
        }
      }
    }

    return result_map;
  }
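A hedged usage sketch of generate_result; the directory path is illustrative:

    Directory dir = FSDirectory.open(new File("/path/to/index"));
    Map<String, List<String>> termsByPath = generate_result(dir);
    for (Map.Entry<String, List<String>> e : termsByPath.entrySet()) {
      // each value holds the term+frequency strings built in the loop above
      System.out.println(e.getKey() + " -> " + e.getValue());
    }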
Example #4
 @Override
 public void visitMatchingTerms(IndexReader reader, String fieldName, MatchingTermVisitor mtv)
     throws IOException {
   boolean expanded = false;
   int prefixLength = prefix.length();
   TermEnum enumerator = reader.terms(new Term(fieldName, prefix));
   Matcher matcher = pattern.matcher("");
   try {
     do {
       Term term = enumerator.term();
       if (term != null) {
         String text = term.text();
         if ((!text.startsWith(prefix)) || (!term.field().equals(fieldName))) {
           break;
         } else {
           matcher.reset(text.substring(prefixLength));
           if (matcher.matches()) {
             mtv.visitMatchingTerm(term);
             expanded = true;
           }
         }
       }
     } while (enumerator.next());
   } finally {
     enumerator.close();
     matcher.reset();
   }
   if (!expanded) {
     System.out.println("No terms in " + fieldName + " field for: " + toString());
   }
 }
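A minimal caller sketch, assuming the MatchingTermVisitor callback interface from Lucene's surround query package; the reader and field name are illustrative:

    final List<Term> hits = new ArrayList<Term>();
    visitMatchingTerms(reader, "body", new MatchingTermVisitor() {
      public void visitMatchingTerm(Term term) throws IOException {
        hits.add(term); // collect every prefix+pattern match
      }
    });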
  public boolean skipTo(Term target) throws IOException {
    // already here
    if (t != null && t.equals(target)) return true;

    int startIdx = tindex.index.search(target.text());

    if (startIdx >= 0) {
      // we hit the term exactly... lucky us!
      if (tenum != null) tenum.close();
      tenum = reader.terms(target);
      pos = startIdx << tindex.intervalBits;
      return setTerm();
    }

    // we didn't hit the term exactly
    startIdx = -startIdx - 1;

    if (startIdx == 0) {
      // our target occurs *before* the first term
      if (tenum != null) tenum.close();
      tenum = reader.terms(target);
      pos = 0;
      return setTerm();
    }

    // back up to the start of the block
    startIdx--;

    if ((pos >> tindex.intervalBits) == startIdx
        && t != null
        && t.text().compareTo(target.text()) <= 0) {
      // we are already in the right block and the current term is before the term we want,
      // so we don't need to seek.
    } else {
      // seek to the right block
      if (tenum != null) tenum.close();
      tenum = reader.terms(target.createTerm(tindex.index.get(startIdx)));
      pos = startIdx << tindex.intervalBits;
      setTerm(); // should be true since it's in the index
    }

    while (t != null && t.text().compareTo(target.text()) < 0) {
      next();
    }

    return t != null;
  }
 TermNumEnumerator(IndexReader reader, TermIndex tindex, String termValue, int pos)
     throws IOException {
   this.reader = reader;
   this.tindex = tindex;
   this.pos = pos;
   tenum = reader.terms(tindex.createTerm(termValue));
   setTerm();
 }
  /**
   * Words in each input record are sorted by document frequency; if the first
   * ceil(prefix * length) tokens of two records share at least one token, the records are
   * placed in the same block.
   *
   * @param input input file, one record per line
   * @param lines number of lines to block
   * @param prefix prefix parameter
   * @param maxPrefixLength maximum number of prefix tokens taken from each record
   * @param maxDocFreq max document frequency for a token to be considered a rare feature
   * @param indexFolder temporary index folder
   * @param output output file for the generated blocks
   * @param report report file for blocking statistics
   * @throws Exception
   */
  public static void prefixBlockingWithLucene(
      String input,
      int lines,
      float prefix,
      int maxPrefixLength,
      int maxDocFreq,
      String indexFolder,
      String output,
      String report)
      throws Exception {
    long startTime = new Date().getTime();
    Common.indexPrefix(input, lines, prefix, maxPrefixLength, indexFolder);

    IndexReader ireader = IndexReader.open(indexFolder);
    IndexSearcher isearcher = new IndexSearcher(ireader);
    TermEnum te = ireader.terms();
    PrintWriter pw = IOFactory.getPrintWriter(output);
    int maxBlockSize = 0;
    int totalBlockSize = 0;
    int blockCount = 0;
    while (te.next()) {
      TopDocs td = isearcher.search(new TermQuery(te.term()), maxDocFreq + 1);

      // discard blocks with only one individual or of too frequent tokens
      if (td.scoreDocs.length <= 1 || td.scoreDocs.length > maxDocFreq) continue;

      if (td.scoreDocs.length > maxBlockSize) maxBlockSize = td.scoreDocs.length;
      totalBlockSize += td.scoreDocs.length;
      blockCount++;
      pw.print(ireader.document(td.scoreDocs[0].doc).get("id"));
      for (int i = 1; i < td.scoreDocs.length; i++) {
        pw.print(" " + ireader.document(td.scoreDocs[i].doc).get("id"));
      }
      pw.println();
      if (blockCount % 1000 == 0)
        System.out.println(new Date().toString() + " : " + blockCount + " blocks");
    }
    pw.close();
    te.close();
    isearcher.close();
    ireader.close();
    long time = new Date().getTime() - startTime;
    pw = IOFactory.getPrintWriter(report, true);
    pw.println(new Date().toString());
    pw.println("#individual: " + lines);
    pw.println("blocking parameter: " + prefix);
    pw.println("time: " + time);
    pw.println("#block: " + blockCount);
    pw.println("max block size: " + maxBlockSize);
    pw.println("avg block size: " + (totalBlockSize + 0.0) / blockCount);
    pw.close();
    Common.deleteFolder(new File(indexFolder));
    System.out.println(prefix + "\t" + lines + "\t" + time); // for speed test
  }
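A hedged invocation sketch; every argument value below is illustrative rather than a recommended setting:

    prefixBlockingWithLucene(
        "records.txt",       // input: one record per line, tokens sorted by doc frequency
        100000,              // lines to block
        0.2f,                // prefix parameter
        5,                   // maxPrefixLength
        50,                  // maxDocFreq: more frequent tokens are ignored
        "/tmp/prefix-index", // temporary index folder
        "blocks.txt",        // output
        "report.txt");       // report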
 private int getDocFreq(String term) {
   int result = 1;
   currentTerm = currentTerm.createTerm(term);
   try {
     TermEnum termEnum = reader.terms(currentTerm);
     try {
       // terms(t) is positioned at the first term >= t, but term() can be
       // null at the end of the index, so null-check term() rather than the enum
       if (termEnum.term() != null && termEnum.term().equals(currentTerm)) {
         result = termEnum.docFreq();
       }
     } finally {
       termEnum.close();
     }
   } catch (IOException e) {
     throw new RuntimeException(e);
   }
   return result;
 }
  /**
   * Computes a term frequency map for the index at the specified location. "Most frequent" is
   * defined as the terms whose frequencies are greater than or equal to topTermCutoff * the
   * frequency of the top term, where topTermCutoff is a number between 0 and 1.
   *
   * @return the top terms, ordered by descending document frequency
   * @throws CorruptIndexException
   * @throws IOException
   */
  protected ArrayList<String> retrieveTopTerms() throws CorruptIndexException, IOException {
    final Map<String, Integer> frequencyMap = new HashMap<String, Integer>();
    List<String> termlist = new ArrayList<String>();
    IndexReader reader = IndexReader.open(ramdir);
    TermEnum terms = reader.terms();
    while (terms.next()) {
      Term term = terms.term();
      String termText = term.text();
      int frequency = reader.docFreq(term);
      frequencyMap.put(termText, frequency);
      termlist.add(termText);
    }
    terms.close();
    reader.close();

    // sort the term map by frequency descending
    Collections.sort(
        termlist,
        new Comparator<String>() {
          @Override
          public int compare(String term1, String term2) {
            int term1Freq = frequencyMap.get(term1);
            int term2Freq = frequencyMap.get(term2);

            if (term1Freq < term2Freq) return 1;
            if (term1Freq > term2Freq) return -1;
            return 0;
          }
        });

    // retrieve the top terms based on topTermCutoff
    ArrayList<String> topTerms = new ArrayList<String>();
    double topFreq = -1.0;
    for (String term : termlist) {
      if (topFreq < 0.0) {
        // first term, capture the value
        topFreq = frequencyMap.get(term);
        topTerms.add(term);
      } else {
        // not the first term, compute the ratio and discard if below
        // topTermCutoff score
        double ratio = frequencyMap.get(term) / topFreq;
        if (ratio >= topTermCutoff) {
          topTerms.add(term);
        } else {
          break;
        }
      }
    }

    return topTerms;
  }
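For example, if the most frequent term appears in 40 documents and topTermCutoff is 0.5, every term with a document frequency of at least 20 is kept and the first term below that ends the loop (the list is sorted by frequency descending). A small check of that arithmetic:

    double topFreq = 40.0;
    double topTermCutoff = 0.5;               // assumed field value, between 0 and 1
    assert (20.0 / topFreq) >= topTermCutoff; // docFreq 20 is retained
    assert (19.0 / topFreq) < topTermCutoff;  // docFreq 19 stops the scan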
  public void getIndexInfo(String indexdir, int freqThreshold) {
    IndexReader reader = null;

    try {
      Directory dir = FSDirectory.open(new File(indexdir));
      System.out.println(dir);
      reader = IndexReader.open(dir);

      System.out.println("document num:" + reader.numDocs());
      System.out.println("======================");

      TermEnum terms = reader.terms();
      sortedTermQueue.clear();
      maxDocNum = reader.maxDoc();
      linkMap.clear();
      termList.clear();
      while (terms.next()) {
        TermDocs termDocs = reader.termDocs(terms.term());
        MyTerm temp = new MyTerm(terms.term(), termDocs, maxDocNum);
        if (temp.totalFreq < freqThreshold) {
          continue;
        } /*
           * if(temp.originTrem.text().length()==1){ continue; }
           */
        linkMap.put(temp.originTrem.text(), temp);
        sortedTermQueue.add(temp);
        termList.add(temp);
      }
      System.out.println("total Size:" + sortedTermQueue.size());
      System.out.println("mapsize:" + linkMap.keySet().size());
      // System.exit(0);
      int num = 0;
      this.maxFreq = sortedTermQueue.peek().totalFreq;
      while (!sortedTermQueue.isEmpty()) {
        num++;
        System.out.println(num + ":" + sortedTermQueue.poll());
      }
      System.out.println("read index info done");
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      try {
        if (reader != null) {
          reader.close(); // guard against the NPE when open() itself failed
        }
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
    @SuppressWarnings({"StringEquality"})
    @Override
    public void run() {
      TermDocs termDocs = null;
      TermEnum termEnum = null;
      try {
        BloomFilter filter = BloomFilterFactory.getFilter(reader.numDocs(), 15);
        termDocs = reader.termDocs();
        termEnum = reader.terms(new Term(field));
        do {
          Term term = termEnum.term();
          if (term == null || term.field() != field) break;

          // LUCENE MONITOR: 4.0, move to use bytes!
          UnicodeUtil.UTF8Result utf8Result = Unicode.fromStringAsUtf8(term.text());
          termDocs.seek(termEnum);
          while (termDocs.next()) {
            // when traversing, make sure to ignore deleted docs, so the key->docId will be correct
            if (!reader.isDeleted(termDocs.doc())) {
              filter.add(utf8Result.result, 0, utf8Result.length);
            }
          }
        } while (termEnum.next());
        ConcurrentMap<String, BloomFilterEntry> fieldCache = cache.get(reader.getFieldCacheKey());
        if (fieldCache != null) {
          if (fieldCache.containsKey(field)) {
            BloomFilterEntry filterEntry = new BloomFilterEntry(reader.numDocs(), filter);
            filterEntry.loading.set(false);
            fieldCache.put(field, filterEntry);
          }
        }
      } catch (Exception e) {
        logger.warn("failed to load bloom filter for [{}]", e, field);
      } finally {
        try {
          if (termDocs != null) {
            termDocs.close();
          }
        } catch (IOException e) {
          // ignore
        }
        try {
          if (termEnum != null) {
            termEnum.close();
          }
        } catch (IOException e) {
          // ignore
        }
      }
    }
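The term.field() != field comparison above is only safe because Lucene 3.x interns field names; a minimal illustration using the same StringHelper utility (values illustrative):

    String a = StringHelper.intern("user");
    String b = StringHelper.intern(new String("user"));
    // both resolve to one canonical instance, so reference equality works
    assert a == b;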
Example #12
    public MultiTermEnum(IndexReader topReader, IndexReader[] readers, int[] starts, Term t)
        throws IOException {
      this.topReader = topReader;
      queue = new SegmentMergeQueue(readers.length);
      matchingSegments = new SegmentMergeInfo[readers.length + 1];
      for (int i = 0; i < readers.length; i++) {
        IndexReader reader = readers[i];
        TermEnum termEnum;

        if (t != null) {
          termEnum = reader.terms(t);
        } else {
          termEnum = reader.terms();
        }

        SegmentMergeInfo smi = new SegmentMergeInfo(starts[i], termEnum, reader);
        smi.ord = i;
        if (t == null ? smi.next() : termEnum.term() != null) queue.add(smi); // initialize queue
        else smi.close();
      }

      if (t != null && queue.size() > 0) {
        next();
      }
    }
  /**
   * Gets the global term frequencies and writes them in the index directory.
   *
   * @throws Exception the exception
   */
  public void getGlobalTermFrequencies() throws Exception {
    String parentDir =
        Flags.rootDir + (Flags.positional ? "/positional-" : "/") + "lucene/" + Flags.suffix;
    File file = new File(parentDir);
    indexReader = IndexReader.open(FSDirectory.open(file));

    TermEnum terms = indexReader.terms();
    BufferedWriter out =
        new BufferedWriter(new FileWriter(new File(parentDir + "/globalTermFreq.txt")));
    while (terms.next()) {
      org.apache.lucene.index.Term term = terms.term();
      out.write(term.text() + " " + getGlobalTermFreq(term) + "\n");
    }
    out.close();
    terms.close();
    indexReader.close();
  }
  /**
   * Tests the FilterIndexReader implementation
   *
   * @throws Exception on error
   */
  public void testFilterIndexReader() throws Exception {
    Directory directory = newDirectory();
    IndexWriter writer =
        new IndexWriter(
            directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)));

    Document d1 = new Document();
    d1.add(newField("default", "one two", Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(d1);

    Document d2 = new Document();
    d2.add(newField("default", "one three", Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(d2);

    Document d3 = new Document();
    d3.add(newField("default", "two four", Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(d3);

    writer.close();

    IndexReader reader = new TestReader(IndexReader.open(directory, true));
    TermEnum terms = reader.terms();
    while (terms.next()) {
      assertTrue(terms.term().text().indexOf('e') != -1);
    }
    terms.close();

    TermPositions positions = reader.termPositions(new Term("default", "one"));
    while (positions.next()) {
      assertTrue((positions.doc() % 2) == 1);
    }

    int NUM_DOCS = 3;

    TermDocs td = reader.termDocs(null);
    for (int i = 0; i < NUM_DOCS; i++) {
      assertTrue(td.next());
      assertEquals(i, td.doc());
      assertEquals(1, td.freq());
    }
    td.close();
    reader.close();
    directory.close();
  }
  private static Map<String, TopTermQueue> getTopTerms(
      IndexReader reader, Set<String> fields, int numTerms, Set<String> junkWords)
      throws Exception {
    Map<String, TopTermQueue> info = new HashMap<String, TopTermQueue>();

    TermEnum terms = null;
    try {
      terms = reader.terms();
      while (terms.next()) {
        String field = terms.term().field();
        String t = terms.term().text();

        // Compute distinct terms for every field
        TopTermQueue tiq = info.get(field);
        if (tiq == null) {
          tiq = new TopTermQueue(numTerms + 1);
          info.put(field, tiq);
        }
        tiq.distinctTerms++;
        tiq.histogram.add(terms.docFreq()); // add the term to the histogram

        // Only save the distinct terms for fields we worry about
        if (fields != null && fields.size() > 0) {
          if (!fields.contains(field)) {
            continue;
          }
        }
        if (junkWords != null && junkWords.contains(t)) {
          continue;
        }

        if (terms.docFreq() > tiq.minFreq) {
          tiq.add(new TopTermQueue.TermInfo(terms.term(), terms.docFreq()));
          if (tiq.size() > numTerms) { // if tiq full
            tiq.pop(); // remove lowest in tiq
            tiq.minFreq = ((TopTermQueue.TermInfo) tiq.top()).docFreq; // reset minFreq
          }
        }
      }
    } finally {
      if (terms != null) terms.close();
    }
    return info;
  }
  public boolean skipTo(int termNumber) throws IOException {
    int delta = termNumber - pos;
    if (delta < 0 || delta > tindex.interval || tenum == null) {
      int idx = termNumber >>> tindex.intervalBits;
      String base = tindex.index.get(idx);
      pos = idx << tindex.intervalBits;
      delta = termNumber - pos;
      if (tenum != null) {
        tenum.close();
      }
      tenum = reader.terms(tindex.createTerm(base));
    }

    while (--delta >= 0) {
      boolean b = tenum.next();
      if (!b) {
        t = null;
        return false;
      }
      ++pos;
    }

    return setTerm();
  }
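A worked example of the block arithmetic above, assuming tindex.intervalBits == 7, i.e. 128 terms per indexed block (the value is illustrative):

    int intervalBits = 7;                  // assumed; the real value comes from tindex
    int termNumber = 300;
    int idx = termNumber >>> intervalBits; // 2: index entry whose block holds term 300
    int pos = idx << intervalBits;         // 256: term number at the block start
    int delta = termNumber - pos;          // 44: tenum.next() calls still needed
    assert idx == 2 && pos == 256 && delta == 44;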
Example #17
  private void dumpTerms() throws IOException {
    outputBanner("Terms (in Term.compareTo() order)");

    TermEnum terms = mIndexReader.terms();
    int order = 0;

    while (terms.next()) {
      order++;
      Term term = terms.term();
      String field = term.field();
      String text = term.text();

      if (!wantThisTerm(field, text)) {
        continue;
      }

      outputLn(order + " " + field + ": " + text);

      /*
       * for each term, print the
       * <document, frequency, <position>* > tuples for a term.
       *
       * document:  document in which the Term appears
       * frequency: number of time the Term appears in the document
       * position:  position for each appearance in the document
       *
       * e.g. doc.add(new Field("field", "one two three two four five", Field.Store.YES, Field.Index.ANALYZED));
       *      then the tuple for Term("field", "two") in this document would be like:
       *      88, 2, <2, 4>
       *      where
       *      88 is the document number
       *      2  is the frequency this term appear in the document
       *      <2, 4> are the positions for each appearance in the document
       */
      // by TermPositions
      outputLn("    document, frequency, <position>*");

      // keep track of docs that appear in all terms that are filtered in.
      Set<Integer> docNums = null;
      if (hasFilters()) {
        docNums = new HashSet<Integer>();
      }

      TermPositions termPos = mIndexReader.termPositions(term);
      while (termPos.next()) {
        int docNum = termPos.doc();
        int freq = termPos.freq();

        if (docNums != null) {
          docNums.add(docNum);
        }

        output("    " + docNum + ", " + freq + ", <");

        boolean first = true;
        for (int f = 0; f < freq; f++) {
          int positionInDoc = termPos.nextPosition();
          if (!first) {
            output(" ");
          } else {
            first = false;
          }
          output(positionInDoc + "");
        }
        outputLn(">");
      }
      termPos.close();

      if (docNums != null) {
        computeDocsIntersection(docNums);
      }

      outputLn();

      if (order % 1000 == 0) {
        mConsole.debug("Dumped " + order + " terms");
      }
    }

    terms.close();
  }
  private static float[] getFloats(FileFloatSource ffs, IndexReader reader) {
    float[] vals = new float[reader.maxDoc()];
    if (ffs.defVal != 0) {
      Arrays.fill(vals, ffs.defVal);
    }
    InputStream is;
    String fname = "external_" + ffs.field.getName();
    try {
      is = VersionedFile.getLatestFile(ffs.dataDir, fname);
    } catch (IOException e) {
      // log, use defaults
      SolrCore.log.error("Error opening external value source file: " + e);
      return vals;
    }

    BufferedReader r = new BufferedReader(new InputStreamReader(is));

    String idName = StringHelper.intern(ffs.keyField.getName());
    FieldType idType = ffs.keyField.getType();
    boolean sorted = true; // assume sorted until we discover it's not

    // warning: lucene's termEnum.skipTo() is not optimized... it simply does a next()
    // because of this, simply ask the reader for a new termEnum rather than
    // trying to use skipTo()

    List<String> notFound = new ArrayList<String>();
    int notFoundCount = 0;
    int otherErrors = 0;

    TermDocs termDocs = null;
    Term protoTerm = new Term(idName, "");
    TermEnum termEnum = null;
    // Number of times to try termEnum.next() before resorting to skip
    int numTimesNext = 10;

    char delimiter = '=';
    String termVal;
    boolean hasNext = true;
    String prevKey = "";

    String lastVal = "\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF";

    try {
      termDocs = reader.termDocs();
      termEnum = reader.terms(protoTerm);
      Term t = termEnum.term();
      if (t != null && t.field() == idName) { // intern'd comparison
        termVal = t.text();
      } else {
        termVal = lastVal;
      }

      for (String line; (line = r.readLine()) != null; ) {
        int delimIndex = line.indexOf(delimiter);
        if (delimIndex < 0) continue;

        int endIndex = line.length();
        /* EOLs should already be removed for BufferedReader.readLine()
        for(int endIndex = line.length();endIndex>delimIndex+1; endIndex--) {
          char ch = line.charAt(endIndex-1);
          if (ch!='\n' && ch!='\r') break;
        }
        */
        String key = line.substring(0, delimIndex);
        String val = line.substring(delimIndex + 1, endIndex);

        String internalKey = idType.toInternal(key);
        float fval;
        try {
          fval = Float.parseFloat(val);
        } catch (Exception e) {
          if (++otherErrors <= 10) {
            SolrCore.log.error(
                "Error loading external value source + fileName + "
                    + e
                    + (otherErrors < 10 ? "" : "\tSkipping future errors for this file."));
          }
          continue; // go to next line in file.. leave values as default.
        }

        if (sorted) {
          // make sure this key is greater than the previous key
          sorted = internalKey.compareTo(prevKey) >= 0;
          prevKey = internalKey;

          if (sorted) {
            int countNext = 0;
            for (; ; ) {
              int cmp = internalKey.compareTo(termVal);
              if (cmp == 0) {
                termDocs.seek(termEnum);
                while (termDocs.next()) {
                  vals[termDocs.doc()] = fval;
                }
                break;
              } else if (cmp < 0) {
                // term enum has already advanced past current key... we didn't find it.
                if (notFoundCount < 10) { // collect first 10 not found for logging
                  notFound.add(key);
                }
                notFoundCount++;
                break;
              } else {
                // termEnum is less than our current key, so skip ahead

                // try next() a few times to see if we hit or pass the target.
                // Lucene's termEnum.skipTo() is currently unoptimized (it just does next())
                // so the best thing is to simply ask the reader for a new termEnum(target)
                // if we really need to skip.
                if (++countNext > numTimesNext) {
                  termEnum = reader.terms(protoTerm.createTerm(internalKey));
                  t = termEnum.term();
                } else {
                  hasNext = termEnum.next();
                  t = hasNext ? termEnum.term() : null;
                }

                if (t != null && t.field() == idName) { // intern'd comparison
                  termVal = t.text();
                } else {
                  termVal = lastVal;
                }
              }
            } // end for(;;)
          }
        }

        if (!sorted) {
          termEnum = reader.terms(protoTerm.createTerm(internalKey));
          t = termEnum.term();
          if (t != null
              && t.field() == idName // intern'd comparison
              && internalKey.equals(t.text())) {
            termDocs.seek(termEnum);
            while (termDocs.next()) {
              vals[termDocs.doc()] = fval;
            }
          } else {
            if (notFoundCount < 10) { // collect first 10 not found for logging
              notFound.add(key);
            }
            notFoundCount++;
          }
        }
      }
    } catch (IOException e) {
      // log, use defaults
      SolrCore.log.error("Error loading external value source: " + e);
    } finally {
      // swallow exceptions on close so we don't override any
      // exceptions that happened in the loop
      if (termDocs != null)
        try {
          termDocs.close();
        } catch (Exception e) {
        }
      if (termEnum != null)
        try {
          termEnum.close();
        } catch (Exception e) {
        }
      try {
        r.close();
      } catch (Exception e) {
      }
    }

    SolrCore.log.info(
        "Loaded external value source "
            + fname
            + (notFoundCount == 0 ? "" : " :" + notFoundCount + " missing keys " + notFound));

    return vals;
  }
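The loader reads the newest external_<fieldName> file under dataDir, one key=value pair per line; keys already in index term order take the fast sequential path, anything else falls back to a fresh seek per line. A hypothetical file for a keyField of id:

    doc1=0.25
    doc2=1.5
    doc9=3.0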
 @Before
 public void setup() throws IOException {
   when(searcher.getIndexReader()).thenReturn(reader);
   when(reader.terms()).thenReturn(terms);
 }
  @Override
  public boolean reload(String collectionName, String topRankingField) {
    if (collectionName == null) {
      return false;
    }

    CrescentCollectionHandler collectionHandler =
        SpringApplicationContext.getBean(
            "crescentCollectionHandler", CrescentCollectionHandler.class);

    CrescentCollection collection =
        collectionHandler.getCrescentCollections().getCrescentCollection(collectionName);

    if (collection == null) {
      logger.debug("doesn't Collection Info => {}", collectionName);
      init(View.Overview);
      return false;
    }

    if (topRankingField == null) {
      if (collection.getDefaultSearchFields().get(0) != null) {
        topRankingField = collection.getDefaultSearchFields().get(0).getName();
      } else {
        logger.debug("doesn't defaultSearchField => {}", collectionName);
        init(View.Overview);
        return false;
      }
    }

    List<String> fieldName = new ArrayList<String>();
    for (CrescentCollectionField field : collection.getFields()) fieldName.add(field.getName());
    TopRankingQueue topRankingQueue =
        new TopRankingQueue(DEFAULT_TOPRANKING_TERM, new RankingTermComparator());

    try {
      Directory directory = FSDirectory.open(new File(collection.getIndexingDirectory()));
      IndexReader reader = IndexReader.open(directory);

      TermEnum terms = reader.terms();

      int termFreq = 0;
      int termCount = 0;
      Term beforeTerm = null;
      // init term count
      fieldTermCount.clear();
      for (CrescentCollectionField field : collection.getFields())
        fieldTermCount.put(field.getName(), 0);
      topRankingQueue.clear();

      while (terms.next()) {
        Term currTerm = terms.term();
        if (beforeTerm == null) {
          beforeTerm = currTerm;
        }

        if (beforeTerm.field() == currTerm.field()) {
          termCount++;
        } else {
          fieldTermCount.put(beforeTerm.field(), termCount);
          termCount = 1;
          beforeTerm = currTerm;
        }

        TermDocs termDocs = reader.termDocs(currTerm);

        while (termDocs.next()) {
          if (currTerm.field().equals(topRankingField)) {
            RankingTerm e = new RankingTerm(currTerm.text(), currTerm.field(), termDocs.freq());
            topRankingQueue.add(e);
          }
        }
        termFreq++;
      }
      if (beforeTerm != null) fieldTermCount.put(beforeTerm.field(), termCount);

      terms.close();
      result.put("numOfTerm", termFreq);
      result.put("numOfDoc", reader.numDocs());
      result.put("hasDel", reader.hasDeletions());
      result.put("isOptimize", reader.isOptimized());
      result.put("indexVersion", reader.getVersion());
      result.put("lastModify", new Date(IndexReader.lastModified(directory)));
    } catch (IOException e) {
      e.printStackTrace();
      return false;
    }
    if (topRankingQueue.size() != 0) {
      topRankingTerms = topRankingQueue.toArray();
      Arrays.sort(topRankingTerms);
    }
    result.put("collectionName", collectionName);
    result.put("indexName", collection.getIndexingDirectory());
    result.put("numOfField", collection.getFields().size());
    result.put("termCount", fieldTermCount);
    result.put("topRanking", topRankingTerms);
    result.put("fieldName", fieldName);

    return true;
  }
Example #21
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      System.err.println("TermDumper [-c|-v value] field <index...>");
      System.exit(1);
    }

    boolean count = false;
    String value = null;
    boolean all = false;

    int i = 0;
    for (; i < args.length; i++) {
      String arg = args[i];

      if ("-h".equals(arg) || "--help".equals(arg)) {
        System.err.println("TermDumper [-c|-v value] field <index...>");
        System.exit(1);
      } else if ("-c".equals(arg) || "--count".equals(arg)) {
        count = true;
      } else if ("-v".equals(arg) || "--vaue".equals(arg)) {
        value = args[++i];
      } else if ("-a".equals(arg) || "--all".equals(arg)) {
        all = true;
      } else {
        break;
      }
    }

    String field = args[i++];

    java.util.ArrayList<IndexReader> readers =
        new java.util.ArrayList<IndexReader>(args.length - 1);
    for (; i < args.length; i++) {
      String arg = args[i];
      try {
        IndexReader reader = IndexReader.open(new MMapDirectory(new File(arg)), true);

        readers.add(reader);
      } catch (IOException ioe) {
        System.err.println("Error reading: " + arg);
      }
    }

    for (IndexReader reader : readers) {
      TermDocs termDocs = reader.termDocs();
      TermEnum termEnum = reader.terms(new Term(field));

      try {
        do {
          Term term = termEnum.term();

          if (term == null || !field.equals(term.field())) break;

          if (value == null) {
            if (count) {
              termDocs.seek(termEnum);

              int c = 0;
              for (; termDocs.next(); c++) ;

              System.out.print(c + " ");
            }
            System.out.println(term.text());
          } else if (value.equals(term.text())) {
            termDocs.seek(termEnum);

            while (termDocs.next()) {
              if (all) {
                Document d = reader.document(termDocs.doc());
                System.out.println(termDocs.doc());
                for (Object o : d.getFields()) {
                  Field f = (Field) o;
                  System.out.println(f.name() + " " + d.get(f.name()));
                }
              } else {
                System.out.println(
                    termDocs.doc() + " " + reader.document(termDocs.doc()).get("url"));
              }
            }
          }
        } while (termEnum.next());
      } finally {
        termDocs.close();
        termEnum.close();
      }
    }
  }
Example #22
  /**
   * Returns a BitSet with true for documents which should be permitted in searchCallback results,
   * and false for those that should not.
   */
  public BitSet bits(IndexReader reader) throws IOException {
    long start = System.currentTimeMillis();

    BitSet bits = new BitSet(reader.maxDoc());
    TermEnum enumerator =
        (null != lowerTerm
            ? reader.terms(new Term(fieldName, lowerTerm))
            : reader.terms(new Term(fieldName, "")));

    try {

      if (enumerator.term() == null) {
        return bits;
      }

      // make adjustments to set to exclusive
      boolean checkLower = !includeLower;

      TermDocs termDocs = reader.termDocs();
      try {

        do {
          Term term = enumerator.term();
          if (term != null && term.field().equals(fieldName)) {
            if (!checkLower || null == lowerTerm || term.text().compareTo(lowerTerm) > 0) {
              checkLower = false;
              if (upperTerm != null) {
                int compare = upperTerm.compareTo(term.text());
                /* if beyond the upper term, or is exclusive and
                 * this is equal to the upper term, break out */
                if ((compare < 0) || (!includeUpper && compare == 0)) {
                  break;
                }
              }
              /* we have a good term, find the docs */

              termDocs.seek(enumerator.term());
              while (termDocs.next()) {
                bits.set(termDocs.doc());
              }
            }
          } else {
            break;
          }
        } while (enumerator.next());

      } finally {
        termDocs.close();
      }
    } finally {
      enumerator.close();
    }

    long end = System.currentTimeMillis();
    log.info("BoundaryBox Time Taken: " + (end - start));
    return bits;
  }
 /** @see LuceneIndexReader#terms(Term) */
 public TermEnum terms(Term term) throws IOException {
   return indexReader.terms(term);
 }
  public List<Pair<String, Integer>> getTopTerms(String field, int count) {
    ClosingIndexSearcher searcher = null;
    try {
      LinkedList<Pair<String, Integer>> answer = new LinkedList<Pair<String, Integer>>();
      searcher = getSearcher(indexer);
      IndexReader reader = searcher.getIndexReader();
      TermEnum terms = reader.terms(new Term(field, ""));
      do {
        Term term = terms.term();
        if (term != null) {
          if (!term.field().equals(field)) {
            break;
          }
          int freq = terms.docFreq();
          Pair<String, Integer> pair =
              new Pair<String, Integer>(term.text(), Integer.valueOf(freq));
          if (answer.size() < count) {
            if (answer.size() == 0) {
              answer.add(pair);
            } else if (answer.get(answer.size() - 1).getSecond().compareTo(pair.getSecond()) >= 0) {
              answer.add(pair);
            } else {
              for (ListIterator<Pair<String, Integer>> it = answer.listIterator();
                  it.hasNext(); /**/ ) {
                Pair<String, Integer> test = it.next();
                if (test.getSecond().compareTo(pair.getSecond()) < 0) {
                  it.previous();
                  it.add(pair);
                  break;
                }
              }
            }
          } else if (answer.get(count - 1).getSecond().compareTo(pair.getSecond()) < 0) {
            for (ListIterator<Pair<String, Integer>> it = answer.listIterator();
                it.hasNext(); /**/ ) {
              Pair<String, Integer> test = it.next();
              if (test.getSecond().compareTo(pair.getSecond()) < 0) {
                it.previous();
                it.add(pair);
                break;
              }
            }
            answer.removeLast();
          } else {
            // off the end
          }
        }
      } while (terms.next());
      terms.close();
      return answer;

    } catch (IOException e) {
      throw new SearcherException(e);
    } finally {
      if (searcher != null) {
        try {
          searcher.close();
        } catch (IOException e) {
          throw new SearcherException(e);
        }
      }
    }
  }
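A hedged usage sketch; Pair's getFirst accessor is assumed to mirror the getSecond used above:

    List<Pair<String, Integer>> top = getTopTerms("tags", 10);
    for (Pair<String, Integer> p : top) {
      System.out.println(p.getFirst() + " -> " + p.getSecond()); // term -> docFreq
    }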
  /**
   * loads multi-value facet data. This method uses a workarea to prepare loading.
   *
   * @param fieldName
   * @param reader
   * @param listFactory
   * @param workArea
   * @throws IOException
   */
  public void load(
      String fieldName, IndexReader reader, TermListFactory<T> listFactory, WorkArea workArea)
      throws IOException {
    long t0 = System.currentTimeMillis();
    int maxdoc = reader.maxDoc();
    BufferedLoader loader = getBufferedLoader(maxdoc, workArea);

    TermEnum tenum = null;
    TermDocs tdoc = null;
    TermValueList<T> list =
        (listFactory == null
            ? (TermValueList<T>) new TermStringList()
            : listFactory.createTermList());
    IntArrayList minIDList = new IntArrayList();
    IntArrayList maxIDList = new IntArrayList();
    IntArrayList freqList = new IntArrayList();
    OpenBitSet bitset = new OpenBitSet(maxdoc + 1);
    int negativeValueCount = getNegativeValueCount(reader, fieldName.intern());
    int t = 0; // current term number
    list.add(null);
    minIDList.add(-1);
    maxIDList.add(-1);
    freqList.add(0);
    t++;

    _overflow = false;
    try {
      tdoc = reader.termDocs();
      tenum = reader.terms(new Term(fieldName, ""));
      if (tenum != null) {
        do {
          Term term = tenum.term();
          if (term == null || !fieldName.equals(term.field())) break;

          String val = term.text();

          if (val != null) {
            list.add(val);

            tdoc.seek(tenum);
            // freqList.add(tenum.docFreq()); // removed because the df doesn't take into account
            // the num of deletedDocs
            int df = 0;
            int minID = -1;
            int maxID = -1;
            int valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
            if (tdoc.next()) {
              df++;
              int docid = tdoc.doc();
              if (!loader.add(docid, valId)) logOverflow(fieldName);
              minID = docid;
              bitset.fastSet(docid);
              while (tdoc.next()) {
                df++;
                docid = tdoc.doc();

                if (!loader.add(docid, valId)) logOverflow(fieldName);
                bitset.fastSet(docid);
              }
              maxID = docid;
            }
            freqList.add(df);
            minIDList.add(minID);
            maxIDList.add(maxID);
          }

          t++;
        } while (tenum.next());
      }
    } finally {
      try {
        if (tdoc != null) {
          tdoc.close();
        }
      } finally {
        if (tenum != null) {
          tenum.close();
        }
      }
    }

    list.seal();

    try {
      _nestedArray.load(maxdoc + 1, loader);
    } catch (IOException e) {
      throw e;
    } catch (Exception e) {
      throw new RuntimeException("failed to load due to " + e.toString(), e);
    }

    this.valArray = list;
    this.freqs = freqList.toIntArray();
    this.minIDs = minIDList.toIntArray();
    this.maxIDs = maxIDList.toIntArray();

    int doc = 0;
    while (doc <= maxdoc && !_nestedArray.contains(doc, 0, true)) {
      ++doc;
    }
    if (doc <= maxdoc) {
      this.minIDs[0] = doc;
      doc = maxdoc;
      while (doc > 0 && !_nestedArray.contains(doc, 0, true)) {
        --doc;
      }
      if (doc > 0) {
        this.maxIDs[0] = doc;
      }
    }
    this.freqs[0] = maxdoc + 1 - (int) bitset.cardinality();
  }
  public static DocVector[] getCosineSimilarityMatrix(List<String> fileSentences)
      throws IOException {

    RAMDirectory ramDir = new RAMDirectory();
    FileReader fr = new FileReader(new File("lib/stoplists/en.txt"));

    //	Set<String> stopWords = new HashSet<String>(FileUtils.readLines(new File("stop-words.txt")));
    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_36, fr);
    // Index the full text of both documents
    // IndexWriter writer = new IndexWriter(ramDir, new StandardAnalyzer(Version.LUCENE_36), true,
    // IndexWriter.MaxFieldLength.UNLIMITED);
    IndexWriter writer =
        new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_36, analyzer));
    for (String s : fileSentences) {
      Document doc1 = new Document();
      StringReader d1reader = new StringReader(s);
      doc1.add(new Field("contents", d1reader, TermVector.YES));
      writer.addDocument(doc1);
    }

    //  writer.commit();
    writer.close();

    DocVector[] docs = new DocVector[fileSentences.size()];
    // Build a term vector for each document
    IndexReader RAMreader = IndexReader.open(ramDir);
    Map<String, Integer> terms = new HashMap<String, Integer>();
    TermEnum termEnum = RAMreader.terms(new Term("contents"));

    // System.out.println(RAMreader.numDocs());
    // reader.terms(t) positions the enum at the first matching term, so use a
    // do/while loop; a plain while(next()) would skip that first term
    int pos = 0;
    do {
      Term term = termEnum.term();
      if (term == null || !"contents".equals(term.field())) break;
      terms.put(term.text(), pos++);
    } while (termEnum.next());
    termEnum.close();

    // System.out.println("Num terms:"+terms.size());

    for (int i = 0; i < fileSentences.size(); i++) {
      TermFreqVector[] tfvs = RAMreader.getTermFreqVectors(i);
      docs[i] = new DocVector(terms);
      if (tfvs == null) continue;
      for (TermFreqVector tfv : tfvs) {
        String[] termTexts = tfv.getTerms();
        int[] termFreqs = tfv.getTermFrequencies();
        for (int j = 0; j < termTexts.length; j++) {
          double idfValue = getIDF(RAMreader, termTexts[j]);
          double tfIdfValue = termFreqs[j] * idfValue;
          docs[i].setEntry(termTexts[j], tfIdfValue);
        }
      }
      docs[i].normalize();
    }

    RAMreader.close();
    ramDir.close();
    // ramDir.close();
    // System.out.println(RAMreader.numDocs());
    // System.out.println("Similarity:" + calcCosineSimilarity(docs[5], docs[19]));
    return docs;
  }
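A hedged usage sketch; calcCosineSimilarity is the same helper this class references in the commented-out line above:

    List<String> sentences =
        Arrays.asList("the quick brown fox", "a lazy dog", "quick quick fox");
    DocVector[] vecs = getCosineSimilarityMatrix(sentences);
    double sim = calcCosineSimilarity(vecs[0], vecs[2]); // the two fox sentences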
  /**
   * Calculates the cosine similarity between two documents.
   *
   * @param d1 the first document
   * @param d2 the second document
   * @return the cosine similarity
   * @throws IOException
   */
  public double getCosineSimilarity(String d1, String d2) throws IOException {

    RAMDirectory ramDir = new RAMDirectory();
    FileReader fr = new FileReader(new File(WikiHelper.getSpecificProperty("stopwordFile")));

    //	Set<String> stopWords = new HashSet<String>(FileUtils.readLines(new File("stop-words.txt")));
    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_36, fr);
    // Index the full text of both documents
    @SuppressWarnings("deprecation")
    // IndexWriter writer = new IndexWriter(ramDir, new StandardAnalyzer(Version.LUCENE_36), true,
    // IndexWriter.MaxFieldLength.UNLIMITED);
    IndexWriter writer =
        new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_36, analyzer));
    Document doc1 = new Document();
    StringReader d1reader = new StringReader(d1);
    doc1.add(new Field("contents", d1reader, TermVector.YES));

    writer.addDocument(doc1);
    Document doc2 = new Document();
    StringReader d2reader = new StringReader(d2);

    doc2.add(new Field("contents", d2reader, TermVector.YES));
    writer.addDocument(doc2);
    //  writer.commit();
    writer.close();

    DocVector[] docs = new DocVector[2];
    // Build a term vector for each document
    IndexReader RAMreader = IndexReader.open(ramDir);
    Map<String, Integer> terms = new HashMap<String, Integer>();

    // System.out.println(RAMreader.numDocs());
    TermFreqVector tfvs1 = RAMreader.getTermFreqVector(0, "contents");
    TermFreqVector tfvs2 = RAMreader.getTermFreqVector(1, "contents");
    // System.out.println(tfvs1.toString());
    if (tfvs1 == null || tfvs2 == null) {
      return 0.0;
    }

    String[] termTexts1 = tfvs1.getTerms();

    String[] termTexts2 = tfvs2.getTerms();

    // Store the terms and their positions in a hashmap - this represents the vocabulary
    int pos = 0;
    for (String term : termTexts1) {
      terms.put(term, pos++);
    }
    for (String term : termTexts2) {
      if (!terms.containsKey(term)) {
        terms.put(term, pos++);
      }
    }

    docs[0] = new DocVector(terms);
    docs[1] = new DocVector(terms);
    int[] termFreqs1 = tfvs1.getTermFrequencies();
    for (int j = 0; j < termTexts1.length; j++) {
      // System.out.println("termtext:"+termTexts1[j]);
      double idfValue = getIDF(RAMreader, termTexts1[j]);
      // System.out.println("idf:"+idfValue);
      double tfIdfValue = termFreqs1[j] * idfValue;
      // docs[i].setEntry(termTexts[j], termFreqs[j]);
      // System.out.println("TF IDF value "+termFreqs[j]+" "+termTexts[j]+"
      // "+idfValue+"\t"+tfIdfValue);
      docs[0].setEntry(termTexts1[j], tfIdfValue);
    }

    int[] termFreqs2 = tfvs2.getTermFrequencies();
    for (int j = 0; j < termTexts2.length; j++) {
      double idfValue = getIDF(RAMreader, termTexts2[j]);
      double tfIdfValue = termFreqs2[j] * idfValue;
      // docs[i].setEntry(termTexts[j], termFreqs[j]);
      // System.out.println("TF IDF value "+termFreqs[j]+" "+termTexts[j]+"
      // "+idfValue+"\t"+tfIdfValue);
      docs[1].setEntry(termTexts2[j], tfIdfValue);
    }

    //		System.out.println(terms.toString());
    //		System.out.println(docs[0]);
    //		System.out.println(docs[1]);
    RAMreader.close();
    ramDir.close();
    //        docs[0].normalize();
    //        docs[1].normalize();

    // Return the cosine similarity of the term vectors

    return calcCosineSimilarity(docs[0], docs[1]);
  }
Example #28
  public static void main(String[] args) throws Exception {
    // the IndexReader object is the main handle that will give you
    // all the documents, terms and inverted index
    IndexReader r = IndexReader.open(FSDirectory.open(new File("index")));

    // You can figure out the number of documents using the maxDoc() function
    System.out.println("The number of documents in this index is: " + r.maxDoc());

    int i = 0;
    // You can find out all the terms that have been indexed using the terms() function
    TermEnum t = r.terms();
    while (t.next()) {
      // Since there are so many terms, let us print only terms #100001-#100010
      if (i > 100000) System.out.println("[" + i + "] " + t.term().text());
      if (++i > 100010) break;
    }

    // You can create your own query terms by calling the Term constructor, with the field
    // 'contents'
    // In the following example, the query term is 'brute'
    Term te = new Term("contents", "brute");

    // You can also quickly find out the number of documents that have term t
    System.out.println("Number of documents with the word 'brute' is: " + r.docFreq(te));

    // You can use the inverted index to find out all the documents that contain the term 'brute'
    //  by using the termDocs function
    TermDocs td = r.termDocs(te);
    while (td.next()) {
      System.out.println(
          "Document number ["
              + td.doc()
              + "] contains the term 'brute' "
              + td.freq()
              + " time(s).");
    }

    // You can find the URL of a specific document number using the document() function
    // For example, the URL for document number 14191 is:
    Document d = r.document(14191);
    String url =
        d.getFieldable("path")
            .stringValue(); // the 'path' field of the Document object holds the URL
    System.out.println(url.replace("%%", "/"));

    // -------- Now let us use all of the functions above to make something useful --------
    // The following bit of code is a worked out example of how to get a bunch of documents
    // in response to a query and show them (without ranking them according to TF/IDF)
    Scanner sc = new Scanner(System.in);
    String str = "";
    System.out.print("query> ");
    while (!(str = sc.nextLine()).equals("quit")) {
      String[] terms = str.split("\\s+");
      for (String word : terms) {
        Term term = new Term("contents", word);
        TermDocs tdocs = r.termDocs(term);
        while (tdocs.next()) {
          String d_url =
              r.document(tdocs.doc()).getFieldable("path").stringValue().replace("%%", "/");
          System.out.println("[" + tdocs.doc() + "] " + d_url);
        }
      }
      System.out.print("query> ");
    }
  }