Java IndexReader.getTermVectorの例

プログラミング言語: Java

名前空間/パッケージ名: org.apache.lucene.index

クラス/型: IndexReader

メソッド/関数: getTermVector

hotexamples.comのコード掲載数: 7

Java IndexReader.getTermVector - 7件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたJavaのorg.apache.lucene.index.IndexReader.getTermVectorの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示非表示

close(30)

open(30)

numDocs(30)

maxDoc(30)

document(30)

terms(28)

leaves(23)

termDocs(22)

docFreq(21)

isDeleted(14)

hasDeletions(13)

indexExists(12)

getTermFreqVector(8)

norms(8)

getSequentialSubReaders(7)

getContext(7)

getTermVector(7)

termPositions(6)

deleteDocument(6)

getFieldNames(6)

getVersion(5)

deleteDocuments(5)

isCurrent(4)

ensureOpen(4)

isOptimized(4)

flush(4)

directory(4)

lastModified(3)

getFieldCacheKey(3)

getTermVectors(3)

getSumTotalTermFreq(3)

getDocCount(3)

decRef(3)

undeleteAll(3)

openIfChanged(2)

reportCloseToParentReaders(2)

termDocsEnum(2)

termPositionsEnum(2)

totalTermFreq(2)

unlock(2)

hasNorms(2)

isLocked(2)

getDeletedDocs(2)

hashCode(2)

equals(2)

getCoreCacheKey(2)

getTermFreqVectors(2)

getCurrentVersion(2)

closedByChild(1)

doClose(1)

コード例 #1

ファイルを表示

ファイル: RocchioQueryExpansion.java プロジェクト: rbouadjenek/patent-search

  /**
   * Extracts termClaimsDescriptionAbstractTitles of the documents; Adds them to vector in the same
   * order
   *
   * @param hits
   * @param i
   * @param j
   * @return relevantDocsTerms docs must be in order
   * @throws java.io.IOException
   */
  public Map<TermFreqVector, String> getDocsTerms(TopDocs hits, int i, int j) throws IOException {
    Map<TermFreqVector, String> docsTerms = new HashMap<>();
    // Process each of the documents
    while (i < j && i < hits.totalHits && i >= 0) {
      ScoreDoc scoreDoc = hits.scoreDocs[i];
      if (sourceField.equals(PatentQuery.all)) {
        Terms termTitle = ir.getTermVector(scoreDoc.doc, PatentQuery.getFields()[1]);
        TermFreqVector docTermsTitle = new TermFreqVector(termTitle);
        docsTerms.put(docTermsTitle, PatentQuery.getFields()[1]);

        Terms termAbstract = ir.getTermVector(scoreDoc.doc, PatentQuery.getFields()[2]);
        TermFreqVector docTermsAbstract = new TermFreqVector(termAbstract);
        docsTerms.put(docTermsAbstract, PatentQuery.getFields()[2]);

        Terms termDescription = ir.getTermVector(scoreDoc.doc, PatentQuery.getFields()[3]);
        TermFreqVector docTermsDescription = new TermFreqVector(termDescription);
        docsTerms.put(docTermsDescription, PatentQuery.getFields()[3]);

        Terms termClaims = ir.getTermVector(scoreDoc.doc, PatentQuery.getFields()[5]);
        TermFreqVector docTermsClaims = new TermFreqVector(termClaims);
        docsTerms.put(docTermsClaims, PatentQuery.getFields()[5]);

      } else {
        Terms term = ir.getTermVector(scoreDoc.doc, sourceField); // get termvector for document
        // Create termVector and add it to vector
        TermFreqVector docTerms = new TermFreqVector(term);
        docsTerms.put(docTerms, sourceField);
      }
      i++;
    }
    return docsTerms;
  }

コード例 #2

ファイルを表示

ファイル: LuceneUtil.java プロジェクト: INL/BlackLab

  /**
   * Add term frequencies for a single document to a frequency map.
   *
   * @param reader the index
   * @param doc doc id
   * @param luceneName the index field from which to use the term vector
   * @param freq where to add to the token frequencies
   */
  public static void getFrequenciesFromTermVector(
      IndexReader reader, int doc, String luceneName, Map<String, Integer> freq) {
    try {
      org.apache.lucene.index.Terms terms = reader.getTermVector(doc, luceneName);
      if (terms == null) {
        throw new IllegalArgumentException("Field " + luceneName + " has no Terms");
      }
      TermsEnum termsEnum = terms.iterator();

      // Verzamel concordantiewoorden uit term vector
      PostingsEnum postingsEnum = null;
      while (termsEnum.next() != null) {
        postingsEnum = termsEnum.postings(null, postingsEnum, PostingsEnum.FREQS);
        String term = termsEnum.term().utf8ToString();
        Integer n = freq.get(term);
        if (n == null) {
          n = 0;
        }
        while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          n += termsEnum.docFreq();
        }
        freq.put(term, n);
      }
    } catch (Exception e) {
      throw ExUtil.wrapRuntimeException(e);
    }
  }

コード例 #3

ファイルを表示

ファイル: TweetsLtrDataGenerator.java プロジェクト: lintool/Anserini

  @Override
  public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
    IndexReader reader = context.getIndexSearcher().getIndexReader();

    for (int i = 0; i < docs.documents.length; i++) {
      Terms terms = null;
      try {
        terms = reader.getTermVector(docs.ids[i], StatusField.TEXT.name);
      } catch (IOException e) {
        continue;
      }

      String qid = context.getQueryId().replaceFirst("^MB0*", "");
      String docid = docs.documents[i].getField(StatusField.ID.name).stringValue();

      out.print(qrels.getRelevanceGrade(qid, docid));
      out.print(" qid:" + qid);
      out.print(" 1:" + docs.scores[i]);

      float[] intFeatures = this.extractorChain.extractAll(docs.documents[i], terms, context);

      for (int j = 0; j < intFeatures.length; j++) {
        out.print(" " + (j + 2) + ":" + intFeatures[j]);
      }

      out.print(" # docid:" + docid);
      out.print("\n");
    }

    return docs;
  }

コード例 #4

ファイルを表示

ファイル: InspectIndex.java プロジェクト: laosiaudi/CMU-11642-Project

  /*
   *  listTermVectorField displays the term vector for a field in
   *  a document in an index (specified by reader).
   */
  static void listTermVectorField(IndexReader reader, String docidString, String field)
      throws IOException {

    System.out.println("\nTermVector:  docid " + docidString + ", field " + field);

    int docid = Integer.parseInt(docidString);

    if ((docid < 0) || (docid >= reader.numDocs())) {
      System.out.println("ERROR:  " + docidString + " is a bad document id.");
      return;
    }
    ;

    Terms terms = reader.getTermVector(docid, field);
    termVectorDisplay(terms);
  }

コード例 #5

ファイルを表示

ファイル: TestMemoryIndexAgainstRAMDir.java プロジェクト: dennisgove/lucene-solr

  public void testDuelMemoryIndexCoreDirectoryWithArrayField() throws Exception {

    final String field_name = "text";
    MockAnalyzer mockAnalyzer = new MockAnalyzer(random());
    if (random().nextBoolean()) {
      mockAnalyzer.setOffsetGap(random().nextInt(100));
    }
    // index into a random directory
    FieldType type = new FieldType(TextField.TYPE_STORED);
    type.setStoreTermVectorOffsets(true);
    type.setStoreTermVectorPayloads(false);
    type.setStoreTermVectorPositions(true);
    type.setStoreTermVectors(true);
    type.freeze();

    Document doc = new Document();
    doc.add(new Field(field_name, "la la", type));
    doc.add(new Field(field_name, "foo bar foo bar foo", type));

    Directory dir = newDirectory();
    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(random(), mockAnalyzer));
    writer.updateDocument(new Term("id", "1"), doc);
    writer.commit();
    writer.close();
    DirectoryReader reader = DirectoryReader.open(dir);

    // Index document in Memory index
    MemoryIndex memIndex = new MemoryIndex(true);
    memIndex.addField(field_name, "la la", mockAnalyzer);
    memIndex.addField(field_name, "foo bar foo bar foo", mockAnalyzer);

    // compare term vectors
    Terms ramTv = reader.getTermVector(0, field_name);
    IndexReader memIndexReader = memIndex.createSearcher().getIndexReader();
    TestUtil.checkReader(memIndexReader);
    Terms memTv = memIndexReader.getTermVector(0, field_name);

    compareTermVectors(ramTv, memTv, field_name);
    memIndexReader.close();
    reader.close();
    dir.close();
  }

コード例 #6

ファイルを表示

ファイル: LuceneUtil.java プロジェクト: INL/BlackLab

  /**
   * Get all words between the specified start and end positions from the term vector.
   *
   * <p>NOTE: this may return an array of less than the size requested, if the document ends before
   * the requested end position.
   *
   * @param reader the index
   * @param doc doc id
   * @param luceneName the index field from which to use the term vector
   * @param start start position (first word we want to request)
   * @param end end position (last word we want to request)
   * @param partialOk is it okay if we're missing words in the middle, or do we need them all?
   *     (debug)
   * @return the words found, in order
   */
  public static String[] getWordsFromTermVector(
      IndexReader reader, int doc, String luceneName, int start, int end, boolean partialOk) {

    // Retrieve the term position vector of the contents of this document.
    // NOTE: might be faster to retrieve all term vectors at once

    try {
      org.apache.lucene.index.Terms terms = reader.getTermVector(doc, luceneName);
      if (terms == null) {
        throw new IllegalArgumentException("Field " + luceneName + " has no Terms");
      }
      if (!terms.hasPositions())
        throw new IllegalArgumentException(
            "Field " + luceneName + " has no character postion information");
      // String[] docTerms = new String[(int) terms.size()];
      // final List<BytesRef> termsList = new ArrayList<BytesRef>();
      TermsEnum termsEnum = terms.iterator();

      // Verzamel concordantiewoorden uit term vector
      PostingsEnum docPosEnum = null;
      int numFound = 0;
      String[] concordanceWords = new String[end - start + 1];
      while (termsEnum.next() != null) {
        docPosEnum = termsEnum.postings(null, docPosEnum, PostingsEnum.POSITIONS);
        while (docPosEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          // NOTE: .docId() will always return 0 in this case
          // if (docPosEnum.docID() != doc)
          //	throw new RuntimeException("Wrong doc id: " + docPosEnum.docID() + " (expected " + doc
          // + ")");
          for (int i = 0; i < docPosEnum.freq(); i++) {
            int position = docPosEnum.nextPosition();
            if (position == -1)
              throw new RuntimeException(
                  "Unexpected missing position (i="
                      + i
                      + ", docPosEnum.freq() = "
                      + docPosEnum.freq()
                      + ")");
            if (position >= start && position <= end) {
              if (concordanceWords[position - start] == null)
                concordanceWords[position - start] = termsEnum.term().utf8ToString();
              else concordanceWords[position - start] += "|" + termsEnum.term().utf8ToString();
              numFound++;
            }
          }
          if (numFound == concordanceWords.length) return concordanceWords;
        }
      }

      if (numFound < concordanceWords.length && !partialOk) {
        // If we simply ran into the end of the document, that's okay;
        // but if words are missing in the middle, that's not.
        String[] partial = new String[numFound];
        for (int i = 0; i < numFound; i++) {
          partial[i] = concordanceWords[i];
          if (partial[i] == null) {
            throw new RuntimeException(
                "Not all words found ("
                    + numFound
                    + " out of "
                    + concordanceWords.length
                    + "); missing words in the middle of concordance!");
          }
        }
        return partial;
      }
      return concordanceWords;
    } catch (Exception e) {
      throw ExUtil.wrapRuntimeException(e);
    }
  }

コード例 #7

ファイルを表示

ファイル: TermVectorComponent.java プロジェクト: netboynb/search-core

  @Override
  public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(COMPONENT_NAME, false)) {
      return;
    }

    NamedList<Object> termVectors = new NamedList<Object>();
    rb.rsp.add(TERM_VECTORS, termVectors);

    IndexSchema schema = rb.req.getSchema();
    SchemaField keyField = schema.getUniqueKeyField();
    String uniqFieldName = null;
    if (keyField != null) {
      uniqFieldName = keyField.getName();
      termVectors.add("uniqueKeyFieldName", uniqFieldName);
    }

    FieldOptions allFields = new FieldOptions();
    // figure out what options we have, and try to get the appropriate vector
    allFields.termFreq = params.getBool(TermVectorParams.TF, false);
    allFields.positions = params.getBool(TermVectorParams.POSITIONS, false);
    allFields.offsets = params.getBool(TermVectorParams.OFFSETS, false);
    allFields.docFreq = params.getBool(TermVectorParams.DF, false);
    allFields.tfIdf = params.getBool(TermVectorParams.TF_IDF, false);
    // boolean cacheIdf = params.getBool(TermVectorParams.IDF, false);
    // short cut to all values.
    if (params.getBool(TermVectorParams.ALL, false)) {
      allFields.termFreq = true;
      allFields.positions = true;
      allFields.offsets = true;
      allFields.docFreq = true;
      allFields.tfIdf = true;
    }

    // Build up our per field mapping
    Map<String, FieldOptions> fieldOptions = new HashMap<String, FieldOptions>();
    NamedList<List<String>> warnings = new NamedList<List<String>>();
    List<String> noTV = new ArrayList<String>();
    List<String> noPos = new ArrayList<String>();
    List<String> noOff = new ArrayList<String>();

    Set<String> fields = getFields(rb);
    if (null != fields) {
      // we have specific fields to retrieve, or no fields
      for (String field : fields) {

        // workarround SOLR-3523
        if (null == field || "score".equals(field)) continue;

        // we don't want to issue warnings about the uniqueKey field
        // since it can cause lots of confusion in distributed requests
        // where the uniqueKey field is injected into the fl for merging
        final boolean fieldIsUniqueKey = field.equals(uniqFieldName);

        SchemaField sf = schema.getFieldOrNull(field);
        if (sf != null) {
          if (sf.storeTermVector()) {
            FieldOptions option = fieldOptions.get(field);
            if (option == null) {
              option = new FieldOptions();
              option.fieldName = field;
              fieldOptions.put(field, option);
            }
            // get the per field mappings
            option.termFreq = params.getFieldBool(field, TermVectorParams.TF, allFields.termFreq);
            option.docFreq = params.getFieldBool(field, TermVectorParams.DF, allFields.docFreq);
            option.tfIdf = params.getFieldBool(field, TermVectorParams.TF_IDF, allFields.tfIdf);
            // Validate these are even an option
            option.positions =
                params.getFieldBool(field, TermVectorParams.POSITIONS, allFields.positions);
            if (option.positions && !sf.storeTermPositions() && !fieldIsUniqueKey) {
              noPos.add(field);
            }
            option.offsets =
                params.getFieldBool(field, TermVectorParams.OFFSETS, allFields.offsets);
            if (option.offsets && !sf.storeTermOffsets() && !fieldIsUniqueKey) {
              noOff.add(field);
            }
          } else { // field doesn't have term vectors
            if (!fieldIsUniqueKey) noTV.add(field);
          }
        } else {
          // field doesn't exist
          throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "undefined field: " + field);
        }
      }
    } // else, deal with all fields

    // NOTE: currently all typs of warnings are schema driven, and garunteed
    // to be consistent across all shards - if additional types of warnings
    // are added that might be differnet between shards, finishStage() needs
    // to be changed to account for that.
    boolean hasWarnings = false;
    if (!noTV.isEmpty()) {
      warnings.add("noTermVectors", noTV);
      hasWarnings = true;
    }
    if (!noPos.isEmpty()) {
      warnings.add("noPositions", noPos);
      hasWarnings = true;
    }
    if (!noOff.isEmpty()) {
      warnings.add("noOffsets", noOff);
      hasWarnings = true;
    }
    if (hasWarnings) {
      termVectors.add("warnings", warnings);
    }

    DocListAndSet listAndSet = rb.getResults();
    List<Integer> docIds = getInts(params.getParams(TermVectorParams.DOC_IDS));
    Iterator<Integer> iter;
    if (docIds != null && !docIds.isEmpty()) {
      iter = docIds.iterator();
    } else {
      DocList list = listAndSet.docList;
      iter = list.iterator();
    }
    SolrIndexSearcher searcher = rb.req.getSearcher();

    IndexReader reader = searcher.getIndexReader();
    // the TVMapper is a TermVectorMapper which can be used to optimize loading of Term Vectors

    // Only load the id field to get the uniqueKey of that
    // field

    final String finalUniqFieldName = uniqFieldName;

    final List<String> uniqValues = new ArrayList<String>();

    // TODO: is this required to be single-valued? if so, we should STOP
    // once we find it...
    final StoredFieldVisitor getUniqValue =
        new StoredFieldVisitor() {
          @Override
          public void stringField(FieldInfo fieldInfo, String value) {
            uniqValues.add(value);
          }

          @Override
          public void intField(FieldInfo fieldInfo, int value) {
            uniqValues.add(Integer.toString(value));
          }

          @Override
          public void longField(FieldInfo fieldInfo, long value) {
            uniqValues.add(Long.toString(value));
          }

          @Override
          public Status needsField(FieldInfo fieldInfo) {
            return (fieldInfo.name.equals(finalUniqFieldName)) ? Status.YES : Status.NO;
          }
        };

    TermsEnum termsEnum = null;

    while (iter.hasNext()) {
      Integer docId = iter.next();
      NamedList<Object> docNL = new NamedList<Object>();

      if (keyField != null) {
        reader.document(docId, getUniqValue);
        String uniqVal = null;
        if (uniqValues.size() != 0) {
          uniqVal = uniqValues.get(0);
          uniqValues.clear();
          docNL.add("uniqueKey", uniqVal);
          termVectors.add(uniqVal, docNL);
        }
      } else {
        // support for schemas w/o a unique key,
        termVectors.add("doc-" + docId, docNL);
      }

      if (null != fields) {
        for (Map.Entry<String, FieldOptions> entry : fieldOptions.entrySet()) {
          final String field = entry.getKey();
          final Terms vector = reader.getTermVector(docId, field);
          if (vector != null) {
            termsEnum = vector.iterator(termsEnum);
            mapOneVector(docNL, entry.getValue(), reader, docId, vector.iterator(termsEnum), field);
          }
        }
      } else {
        // extract all fields
        final Fields vectors = reader.getTermVectors(docId);
        for (String field : vectors) {
          Terms terms = vectors.terms(field);
          if (terms != null) {
            termsEnum = terms.iterator(termsEnum);
            mapOneVector(docNL, allFields, reader, docId, termsEnum, field);
          }
        }
      }
    }
  }