コード例 #1
0
ファイル: DocReconstructor.java プロジェクト: belle96/luke
 /**
  * Prepare a document reconstructor.
  *
  * @param reader IndexReader to read from.
  * @param fieldNames if non-null or not empty, data will be collected only from these fields,
  *     otherwise data will be collected from all fields
  * @param numTerms total number of terms in the index, or -1 if unknown (will be calculated)
  * @throws Exception
  */
 public DocReconstructor(IndexReader reader, String[] fieldNames, int numTerms) throws Exception {
   if (reader == null) {
     throw new Exception("IndexReader cannot be null.");
   }
   this.reader = reader;
   if (fieldNames == null || fieldNames.length == 0) {
     // collect fieldNames
     this.fieldNames = (String[]) reader.getFieldNames(FieldOption.ALL).toArray(new String[0]);
   } else {
     this.fieldNames = fieldNames;
   }
   if (numTerms == -1) {
     Fields fields = MultiFields.getFields(reader);
     numTerms = 0;
     FieldsEnum fe = fields.iterator();
     String fld = null;
     while ((fld = fe.next()) != null) {
       TermsEnum te = fe.terms();
       while (te.next() != null) {
         numTerms++;
       }
     }
     this.numTerms = numTerms;
   }
   deleted = MultiFields.getDeletedDocs(reader);
 }
コード例 #2
0
ファイル: IndexDatabase.java プロジェクト: battbeach/OpenGrok
  public void listTokens(int freq) throws IOException {
    IndexReader ireader = null;
    TermsEnum iter = null;
    Terms terms = null;

    try {
      ireader = DirectoryReader.open(indexDirectory);
      int numDocs = ireader.numDocs();
      if (numDocs > 0) {
        Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0);
        terms = uFields.terms(QueryBuilder.DEFS);
      }
      iter = terms.iterator(null); // init uid iterator
      while (iter.term() != null) {
        // if (iter.term().field().startsWith("f")) {
        if (iter.docFreq() > 16 && iter.term().utf8ToString().length() > freq) {
          log.warning(iter.term().utf8ToString());
        }
        iter.next();
        /*} else {
        break;
        }*/
      }
    } finally {

      if (ireader != null) {
        try {
          ireader.close();
        } catch (IOException e) {
          log.log(Level.WARNING, "An error occured while closing index reader", e);
        }
      }
    }
  }
コード例 #3
0
ファイル: IndexDatabase.java プロジェクト: battbeach/OpenGrok
  /**
   * List all of the files in this index database
   *
   * @throws IOException If an IO error occurs while reading from the database
   */
  public void listFiles() throws IOException {
    IndexReader ireader = null;
    TermsEnum iter;
    Terms terms = null;

    try {
      ireader = DirectoryReader.open(indexDirectory); // open existing index
      int numDocs = ireader.numDocs();
      if (numDocs > 0) {
        Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0);
        terms = uFields.terms(QueryBuilder.U);
      }
      iter = terms.iterator(null); // init uid iterator
      while (iter.term() != null) {
        log.fine(Util.uid2url(iter.term().utf8ToString()));
        iter.next();
      }
    } finally {

      if (ireader != null) {
        try {
          ireader.close();
        } catch (IOException e) {
          log.log(Level.WARNING, "An error occured while closing index reader", e);
        }
      }
    }
  }
コード例 #4
0
ファイル: HighFreqTerms.java プロジェクト: hutea/luke
  /**
   * @param reader
   * @param numTerms
   * @param field
   * @return TermStats[] ordered by terms with highest docFreq first.
   * @throws Exception
   */
  public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String[] fieldNames)
      throws Exception {
    TermStatsQueue tiq = null;
    TermsEnum te = null;

    if (fieldNames != null) {
      Fields fields = MultiFields.getFields(reader);
      if (fields == null) {
        LOG.info("Index with no fields - probably empty or corrupted");
        return EMPTY_STATS;
      }
      tiq = new TermStatsQueue(numTerms);
      for (String field : fieldNames) {
        Terms terms = fields.terms(field);
        if (terms != null) {
          te = terms.iterator(te);
          fillQueue(te, tiq, field);
        }
      }
    } else {
      Fields fields = MultiFields.getFields(reader);
      if (fields == null) {
        LOG.info("Index with no fields - probably empty or corrupted");
        return EMPTY_STATS;
      }
      tiq = new TermStatsQueue(numTerms);
      Iterator<String> fieldIterator = fields.iterator();
      while (fieldIterator.hasNext()) {
        String field = fieldIterator.next();
        Terms terms = fields.terms(field);
        if (terms != null) {
          te = terms.iterator(te);
          fillQueue(te, tiq, field);
        }
      }
    }

    TermStats[] result = new TermStats[tiq.size()];
    // we want highest first so we read the queue and populate the array
    // starting at the end and work backwards
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
      result[count] = tiq.pop();
      count--;
    }
    return result;
  }
コード例 #5
0
ファイル: HighFreqTerms.java プロジェクト: shaie/lucene-solr
  /** Returns TermStats[] ordered by terms with highest docFreq first. */
  public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String field)
      throws Exception {
    TermStatsQueue tiq = null;

    if (field != null) {
      Fields fields = MultiFields.getFields(reader);
      if (fields == null) {
        throw new RuntimeException("field " + field + " not found");
      }
      Terms terms = fields.terms(field);
      if (terms != null) {
        TermsEnum termsEnum = terms.iterator(null);
        tiq = new TermStatsQueue(numTerms);
        tiq.fill(field, termsEnum);
      }
    } else {
      Fields fields = MultiFields.getFields(reader);
      if (fields == null) {
        throw new RuntimeException("no fields found for this index");
      }
      tiq = new TermStatsQueue(numTerms);
      for (String fieldName : fields) {
        Terms terms = fields.terms(fieldName);
        if (terms != null) {
          tiq.fill(fieldName, terms.iterator(null));
        }
      }
    }

    TermStats[] result = new TermStats[tiq.size()];
    // we want highest first so we read the queue and populate the array
    // starting at the end and work backwards
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
      result[count] = tiq.pop();
      count--;
    }
    return result;
  }
コード例 #6
0
 private Fields generateTermVectors(
     Collection<GetField> getFields,
     boolean withOffsets,
     @Nullable Map<String, String> perFieldAnalyzer)
     throws IOException {
   /* store document in memory index */
   MemoryIndex index = new MemoryIndex(withOffsets);
   for (GetField getField : getFields) {
     String field = getField.getName();
     Analyzer analyzer = getAnalyzerAtField(field, perFieldAnalyzer);
     for (Object text : getField.getValues()) {
       index.addField(field, text.toString(), analyzer);
     }
   }
   /* and read vectors from it */
   return MultiFields.getFields(index.createSearcher().getIndexReader());
 }
コード例 #7
0
ファイル: SolrUtil.java プロジェクト: mayr/jate
  public static Terms getTermVector(String fieldname, SolrIndexSearcher solrIndexSearcher)
      throws JATEException {
    try {
      Fields fields = MultiFields.getFields(solrIndexSearcher.getLeafReader());

      Terms vector = fields.terms(fieldname);
      if (vector == null)
        throw new JATEException(String.format("Cannot find expected field: %s", fieldname));
      return vector;
    } catch (IOException ioe) {
      StringBuilder sb =
          new StringBuilder(
              String.format("Cannot find expected field: %s. Error stacktrack: \n", fieldname));
      sb.append(org.apache.commons.lang.exception.ExceptionUtils.getFullStackTrace(ioe));
      throw new JATEException(sb.toString());
    }
  }
コード例 #8
0
  @Override
  protected ShardTermlistResponse shardOperation(ShardTermlistRequest request)
      throws ElasticSearchException {
    synchronized (termlistMutex) {
      InternalIndexShard indexShard =
          (InternalIndexShard)
              indicesService.indexServiceSafe(request.index()).shardSafe(request.shardId());
      indexShard.store().directory();
      Engine.Searcher searcher = indexShard.searcher();
      try {
        Set<String> set = new CompactHashSet();

        Fields fields = MultiFields.getFields(searcher.reader());
        if (fields != null) {
          for (Iterator<String> it = fields.iterator(); it.hasNext(); ) {
            String field = it.next();
            if (field.charAt(0) == '_') {
              continue;
            }
            if (request.getField() == null || field.equals(request.getField())) {
              Terms terms = fields.terms(field);
              if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                  set.add(text.utf8ToString());
                  System.out.println("field=" + field + "; text=" + text.utf8ToString());
                }
              }
            }
          }
        }
        return new ShardTermlistResponse(request.index(), request.shardId(), set);
      } catch (IOException ex) {
        throw new ElasticSearchException(ex.getMessage(), ex);
      }
    }
  }
コード例 #9
0
  public void testCodec() throws Exception {
    Directory dir = new AppendingRAMDirectory(new RAMDirectory());
    IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_40, new MockAnalyzer());

    cfg.setCodecProvider(new AppendingCodecProvider());
    ((LogMergePolicy) cfg.getMergePolicy()).setUseCompoundFile(false);
    ((LogMergePolicy) cfg.getMergePolicy()).setUseCompoundDocStore(false);
    IndexWriter writer = new IndexWriter(dir, cfg);
    Document doc = new Document();
    doc.add(new Field("f", text, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
    writer.addDocument(doc);
    writer.commit();
    writer.addDocument(doc);
    writer.optimize();
    writer.close();
    IndexReader reader = IndexReader.open(dir, null, true, 1, new AppendingCodecProvider());
    assertEquals(2, reader.numDocs());
    doc = reader.document(0);
    assertEquals(text, doc.get("f"));
    Fields fields = MultiFields.getFields(reader);
    Terms terms = fields.terms("f");
    assertNotNull(terms);
    TermsEnum te = terms.iterator();
    assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("quick")));
    assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("brown")));
    assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("fox")));
    assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("jumped")));
    assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("over")));
    assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("lazy")));
    assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("dog")));
    assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("the")));
    DocsEnum de = te.docs(null, null);
    assertTrue(de.advance(0) != DocsEnum.NO_MORE_DOCS);
    assertEquals(2, de.freq());
    assertTrue(de.advance(1) != DocsEnum.NO_MORE_DOCS);
    assertTrue(de.advance(2) == DocsEnum.NO_MORE_DOCS);
    reader.close();
  }
コード例 #10
0
  @Test
  public void luceneNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader =
        CollectionReaderFactory.createReaderDescription(
            TextReader.class,
            TextReader.PARAM_SOURCE_LOCATION,
            "src/test/resources/data/",
            TextReader.PARAM_LANGUAGE,
            "en",
            TextReader.PARAM_PATTERNS,
            "text*.txt");

    AnalysisEngineDescription segmenter =
        AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);

    AnalysisEngineDescription metaCollector =
        AnalysisEngineFactory.createEngineDescription(
            LuceneNGramMetaCollector.class, LuceneNGramDFE.PARAM_LUCENE_DIR, tmpDir);

    for (JCas jcas : new JCasIterable(reader, segmenter, metaCollector)) {
      //            System.out.println(jcas.getDocumentText().length());
    }

    int i = 0;
    IndexReader index;
    try {
      index = DirectoryReader.open(FSDirectory.open(tmpDir));
      Fields fields = MultiFields.getFields(index);
      if (fields != null) {
        Terms terms = fields.terms(LuceneNGramDFE.LUCENE_NGRAM_FIELD);
        if (terms != null) {
          TermsEnum termsEnum = terms.iterator(null);
          //                    Bits liveDocs = MultiFields.getLiveDocs(index);
          //                    DocsEnum docs = termsEnum.docs(liveDocs, null);
          //                    int docId;
          //                    while((docId = docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
          //                        index.g
          //                    }
          BytesRef text = null;
          while ((text = termsEnum.next()) != null) {
            //                        System.out.println(text.utf8ToString() + " - " +
            // termsEnum.totalTermFreq());
            //                        System.out.println(termsEnum.docFreq());

            if (text.utf8ToString().equals("this")) {
              assertEquals(2, termsEnum.docFreq());
              assertEquals(3, termsEnum.totalTermFreq());
            }

            i++;
          }
        }
      }
    } catch (Exception e) {
      throw new ResourceInitializationException(e);
    }

    assertEquals(35, i);
  }
コード例 #11
0
ファイル: IndexDatabase.java プロジェクト: battbeach/OpenGrok
  /**
   * Update the content of this index database
   *
   * @throws IOException if an error occurs
   * @throws HistoryException if an error occurs when accessing the history
   */
  public void update() throws IOException, HistoryException {
    synchronized (lock) {
      if (running) {
        throw new IOException("Indexer already running!");
      }
      running = true;
      interrupted = false;
    }

    String ctgs = RuntimeEnvironment.getInstance().getCtags();
    if (ctgs != null) {
      ctags = new Ctags();
      ctags.setBinary(ctgs);
    }
    if (ctags == null) {
      log.severe("Unable to run ctags! searching definitions will not work!");
    }

    if (ctags != null) {
      String filename = RuntimeEnvironment.getInstance().getCTagsExtraOptionsFile();
      if (filename != null) {
        ctags.setCTagsExtraOptionsFile(filename);
      }
    }

    try {
      Analyzer analyzer = AnalyzerGuru.getAnalyzer();
      IndexWriterConfig iwc = new IndexWriterConfig(SearchEngine.LUCENE_VERSION, analyzer);
      iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
      // iwc.setRAMBufferSizeMB(256.0);  //TODO check what is the sweet spot
      writer = new IndexWriter(indexDirectory, iwc);
      writer.commit(); // to make sure index exists on the disk
      // writer.setMaxFieldLength(RuntimeEnvironment.getInstance().getIndexWordLimit());

      if (directories.isEmpty()) {
        if (project == null) {
          directories.add("");
        } else {
          directories.add(project.getPath());
        }
      }

      for (String dir : directories) {
        File sourceRoot;
        if ("".equals(dir)) {
          sourceRoot = RuntimeEnvironment.getInstance().getSourceRootFile();
        } else {
          sourceRoot = new File(RuntimeEnvironment.getInstance().getSourceRootFile(), dir);
        }

        HistoryGuru.getInstance().ensureHistoryCacheExists(sourceRoot);

        String startuid = Util.path2uid(dir, "");
        IndexReader reader = DirectoryReader.open(indexDirectory); // open existing index
        Terms terms = null;
        int numDocs = reader.numDocs();
        if (numDocs > 0) {
          Fields uFields = MultiFields.getFields(reader); // reader.getTermVectors(0);
          terms = uFields.terms(QueryBuilder.U);
        }

        try {
          if (numDocs > 0) {
            uidIter = terms.iterator(null);
            TermsEnum.SeekStatus stat = uidIter.seekCeil(new BytesRef(startuid), true); // init uid
            if (stat == TermsEnum.SeekStatus.END || stat == TermsEnum.SeekStatus.NOT_FOUND) {
              uidIter = null;
            }
          }
          // TODO below should be optional, since it traverses the tree once more to get total
          // count! :(
          int file_cnt = 0;
          if (RuntimeEnvironment.getInstance().isPrintProgress()) {
            log.log(Level.INFO, "Counting files in {0} ...", dir);
            file_cnt = indexDown(sourceRoot, dir, true, 0, 0);
            if (log.isLoggable(Level.INFO)) {
              log.log(
                  Level.INFO, "Need to process: {0} files for {1}", new Object[] {file_cnt, dir});
            }
          }

          indexDown(sourceRoot, dir, false, 0, file_cnt);

          while (uidIter != null
              && uidIter.term() != null
              && uidIter.term().utf8ToString().startsWith(startuid)) {
            removeFile();
            uidIter.next();
          }
        } finally {
          reader.close();
        }
      }
    } finally {
      if (writer != null) {
        try {
          writer.prepareCommit();
          writer.commit();
          writer.close();
        } catch (IOException e) {
          log.log(Level.WARNING, "An error occured while closing writer", e);
        }
      }

      if (ctags != null) {
        try {
          ctags.close();
        } catch (IOException e) {
          log.log(Level.WARNING, "An error occured while closing ctags process", e);
        }
      }

      synchronized (lock) {
        running = false;
      }
    }

    if (!isInterrupted() && isDirty()) {
      if (RuntimeEnvironment.getInstance().isOptimizeDatabase()) {
        optimize();
      }
      createSpellingSuggestions();
      RuntimeEnvironment env = RuntimeEnvironment.getInstance();
      File timestamp = new File(env.getDataRootFile(), "timestamp");
      if (timestamp.exists()) {
        if (!timestamp.setLastModified(System.currentTimeMillis())) {
          log.log(
              Level.WARNING,
              "Failed to set last modified time on ''{0}'', used for timestamping the index database.",
              timestamp.getAbsolutePath());
        }
      } else {
        if (!timestamp.createNewFile()) {
          log.log(
              Level.WARNING,
              "Failed to create file ''{0}'', used for timestamping the index database.",
              timestamp.getAbsolutePath());
        }
      }
    }
  }
コード例 #12
0
  public TermVectorResponse getTermVector(TermVectorRequest request, String concreteIndex) {
    final Engine.Searcher searcher = indexShard.acquireSearcher("term_vector");
    IndexReader topLevelReader = searcher.reader();
    final TermVectorResponse termVectorResponse =
        new TermVectorResponse(concreteIndex, request.type(), request.id());

    final Term uidTerm =
        new Term(UidFieldMapper.NAME, Uid.createUidAsBytes(request.type(), request.id()));
    Engine.GetResult get = indexShard.get(new Engine.Get(request.realtime(), uidTerm));
    boolean docFromTranslog = get.source() != null;
    AggregatedDfs dfs = null;

    /* fetched from translog is treated as an artificial document */
    if (docFromTranslog) {
      request.doc(get.source().source, false);
      termVectorResponse.setDocVersion(get.version());
    }

    /* handle potential wildcards in fields */
    if (request.selectedFields() != null) {
      handleFieldWildcards(request);
    }

    try {
      Fields topLevelFields = MultiFields.getFields(topLevelReader);
      Versions.DocIdAndVersion docIdAndVersion = get.docIdAndVersion();
      /* from an artificial document */
      if (request.doc() != null) {
        Fields termVectorsByField = generateTermVectorsFromDoc(request, !docFromTranslog);
        // if no document indexed in shard, take the queried document itself for stats
        if (topLevelFields == null) {
          topLevelFields = termVectorsByField;
        }
        if (termVectorsByField != null && useDfs(request)) {
          dfs = getAggregatedDfs(termVectorsByField, request);
        }
        termVectorResponse.setFields(
            termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields, dfs);
        termVectorResponse.setExists(true);
        termVectorResponse.setArtificial(!docFromTranslog);
      }
      /* or from an existing document */
      else if (docIdAndVersion != null) {
        // fields with stored term vectors
        Fields termVectorsByField =
            docIdAndVersion.context.reader().getTermVectors(docIdAndVersion.docId);
        Set<String> selectedFields = request.selectedFields();
        // generate tvs for fields where analyzer is overridden
        if (selectedFields == null && request.perFieldAnalyzer() != null) {
          selectedFields = getFieldsToGenerate(request.perFieldAnalyzer(), termVectorsByField);
        }
        // fields without term vectors
        if (selectedFields != null) {
          termVectorsByField =
              addGeneratedTermVectors(get, termVectorsByField, request, selectedFields);
        }
        if (termVectorsByField != null && useDfs(request)) {
          dfs = getAggregatedDfs(termVectorsByField, request);
        }
        termVectorResponse.setFields(
            termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields, dfs);
        termVectorResponse.setDocVersion(docIdAndVersion.version);
        termVectorResponse.setExists(true);
      } else {
        termVectorResponse.setExists(false);
      }
    } catch (Throwable ex) {
      throw new ElasticsearchException("failed to execute term vector request", ex);
    } finally {
      searcher.close();
      get.release();
    }
    return termVectorResponse;
  }
コード例 #13
0
  public static void main(String[] args) throws IOException {

    IndexReader reader = null;

    /*
     *  Opening the index first simplifies the processing of the
     *  rest of the command line arguments.
     */
    for (int i = 0; i < args.length; i++) {
      if (("-index".equals(args[i])) && ((i + 1) < args.length)) {
        reader = DirectoryReader.open(FSDirectory.open(new File(args[i + 1])));

        if (reader == null) {
          System.err.println("Error:  Can't open index " + args[i + 1]);
          System.exit(1);
        }
        ;

        break;
      }
      ;
    }
    ;

    if (reader == null) {
      System.err.println(usage);
      System.exit(1);
    }
    ;

    /*
     *  Process the command line arguments sequentially.
     */
    for (int i = 0; i < args.length; i++) {

      if ("-index".equals(args[i])) {

        /*
         *  Handled in the previous loop, so just skip the argument.
         */
        i++;

      } else if ("-list-edocid".equals(args[i])) {

        System.out.println("-list-edocid:");

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        Document d = reader.document(Integer.parseInt(args[i + 1]));

        System.out.println(
            "Internal docid --> External docid: " + args[i + 1] + " --> " + d.get("externalId"));

        i += 1;
      } else if ("-list-docids".equals(args[i])) {

        System.out.println("-list-docids:");

        for (int j = 0; j < reader.numDocs(); j++) {
          Document d = reader.document(j);
          System.out.println("Internal --> external docid: " + j + " --> " + d.get("externalId"));
        }
        ;

      } else if ("-list-fields".equals(args[i])) {

        Fields fields = MultiFields.getFields(reader);

        System.out.print("\nNumber of fields:  ");

        if (fields == null) System.out.println("0");
        else {
          System.out.println(fields.size());

          Iterator<String> is = fields.iterator();

          while (is.hasNext()) {
            System.out.println("\t" + is.next());
          }
          ;
        }
        ;

      } else if ("-list-postings".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listPostings(reader, args[i + 1], args[i + 2], Integer.MAX_VALUE);
        i += 2;

      } else if ("-list-postings-sample".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listPostings(reader, args[i + 1], args[i + 2], 5);
        i += 2;

      } else if ("-list-stats".equals(args[i])) {

        System.out.println("Corpus statistics:");
        System.out.println("\tnumdocs\t\t" + reader.numDocs());
        System.out.println(
            "\turl:\t"
                + "\tnumdocs="
                + reader.getDocCount("url")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("url")
                + "\tavglen="
                + reader.getSumTotalTermFreq("url") / (float) reader.getDocCount("url"));

        System.out.println(
            "\tkeywords:"
                + "\tnumdocs="
                + reader.getDocCount("keywords")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("keywords")
                + "\tavglen="
                + reader.getSumTotalTermFreq("keywords") / (float) reader.getDocCount("keywords"));

        System.out.println(
            "\ttitle:\t"
                + "\tnumdocs="
                + reader.getDocCount("title")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("title")
                + "\tavglen="
                + reader.getSumTotalTermFreq("title") / (float) reader.getDocCount("title"));

        System.out.println(
            "\tbody:\t"
                + "\tnumdocs="
                + reader.getDocCount("body")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("body")
                + "\tavglen="
                + reader.getSumTotalTermFreq("body") / (float) reader.getDocCount("body"));

        System.out.println(
            "\tinlink:\t"
                + "\tnumdocs="
                + reader.getDocCount("inlink")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("inlink")
                + "\tavglen="
                + reader.getSumTotalTermFreq("inlink") / (float) reader.getDocCount("inlink"));

      } else if ("-list-terms".equals(args[i])) {

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listTermDictionary(reader, args[i + 1]);
        i += 1;

      } else if ("-list-termvector".equals(args[i])) {

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listTermVectors(reader, args[i + 1]);
        i += 1;

      } else if ("-list-termvector-field".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listTermVectorField(reader, args[i + 1], args[i + 2]);
        i += 2;

      } else System.err.println("\nWarning:  Unknown argument " + args[i] + " ignored.");
    }
    ;

    /*
     *  Close the index and exit gracefully.
     */
    reader.close();
  }
コード例 #14
0
  private void duellReaders(CompositeReader other, LeafReader memIndexReader) throws IOException {
    Fields memFields = memIndexReader.fields();
    for (String field : MultiFields.getFields(other)) {
      Terms memTerms = memFields.terms(field);
      Terms iwTerms = memIndexReader.terms(field);
      if (iwTerms == null) {
        assertNull(memTerms);
      } else {
        NumericDocValues normValues = MultiDocValues.getNormValues(other, field);
        NumericDocValues memNormValues = memIndexReader.getNormValues(field);
        if (normValues != null) {
          // mem idx always computes norms on the fly
          assertNotNull(memNormValues);
          assertEquals(normValues.get(0), memNormValues.get(0));
        }

        assertNotNull(memTerms);
        assertEquals(iwTerms.getDocCount(), memTerms.getDocCount());
        assertEquals(iwTerms.getSumDocFreq(), memTerms.getSumDocFreq());
        assertEquals(iwTerms.getSumTotalTermFreq(), memTerms.getSumTotalTermFreq());
        TermsEnum iwTermsIter = iwTerms.iterator();
        TermsEnum memTermsIter = memTerms.iterator();
        if (iwTerms.hasPositions()) {
          final boolean offsets = iwTerms.hasOffsets() && memTerms.hasOffsets();

          while (iwTermsIter.next() != null) {
            assertNotNull(memTermsIter.next());
            assertEquals(iwTermsIter.term(), memTermsIter.term());
            PostingsEnum iwDocsAndPos = iwTermsIter.postings(null, PostingsEnum.ALL);
            PostingsEnum memDocsAndPos = memTermsIter.postings(null, PostingsEnum.ALL);
            while (iwDocsAndPos.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
              assertEquals(iwDocsAndPos.docID(), memDocsAndPos.nextDoc());
              assertEquals(iwDocsAndPos.freq(), memDocsAndPos.freq());
              for (int i = 0; i < iwDocsAndPos.freq(); i++) {
                assertEquals(
                    "term: " + iwTermsIter.term().utf8ToString(),
                    iwDocsAndPos.nextPosition(),
                    memDocsAndPos.nextPosition());
                if (offsets) {
                  assertEquals(iwDocsAndPos.startOffset(), memDocsAndPos.startOffset());
                  assertEquals(iwDocsAndPos.endOffset(), memDocsAndPos.endOffset());
                }

                if (iwTerms.hasPayloads()) {
                  assertEquals(iwDocsAndPos.getPayload(), memDocsAndPos.getPayload());
                }
              }
            }
          }
        } else {
          while (iwTermsIter.next() != null) {
            assertEquals(iwTermsIter.term(), memTermsIter.term());
            PostingsEnum iwDocsAndPos = iwTermsIter.postings(null);
            PostingsEnum memDocsAndPos = memTermsIter.postings(null);
            while (iwDocsAndPos.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
              assertEquals(iwDocsAndPos.docID(), memDocsAndPos.nextDoc());
              assertEquals(iwDocsAndPos.freq(), memDocsAndPos.freq());
            }
          }
        }
      }
    }
  }