コード例 #1
0
  /**
   * Extracts termClaimsDescriptionAbstractTitles of the documents; Adds them to vector in the same
   * order
   *
   * @param hits
   * @param i
   * @param j
   * @return relevantDocsTerms docs must be in order
   * @throws java.io.IOException
   */
  public Map<TermFreqVector, String> getDocsTerms(TopDocs hits, int i, int j) throws IOException {
    Map<TermFreqVector, String> docsTerms = new HashMap<>();
    // Process each of the documents
    while (i < j && i < hits.totalHits && i >= 0) {
      ScoreDoc scoreDoc = hits.scoreDocs[i];
      if (sourceField.equals(PatentQuery.all)) {
        Terms termTitle = ir.getTermVector(scoreDoc.doc, PatentQuery.getFields()[1]);
        TermFreqVector docTermsTitle = new TermFreqVector(termTitle);
        docsTerms.put(docTermsTitle, PatentQuery.getFields()[1]);

        Terms termAbstract = ir.getTermVector(scoreDoc.doc, PatentQuery.getFields()[2]);
        TermFreqVector docTermsAbstract = new TermFreqVector(termAbstract);
        docsTerms.put(docTermsAbstract, PatentQuery.getFields()[2]);

        Terms termDescription = ir.getTermVector(scoreDoc.doc, PatentQuery.getFields()[3]);
        TermFreqVector docTermsDescription = new TermFreqVector(termDescription);
        docsTerms.put(docTermsDescription, PatentQuery.getFields()[3]);

        Terms termClaims = ir.getTermVector(scoreDoc.doc, PatentQuery.getFields()[5]);
        TermFreqVector docTermsClaims = new TermFreqVector(termClaims);
        docsTerms.put(docTermsClaims, PatentQuery.getFields()[5]);

      } else {
        Terms term = ir.getTermVector(scoreDoc.doc, sourceField); // get termvector for document
        // Create termVector and add it to vector
        TermFreqVector docTerms = new TermFreqVector(term);
        docsTerms.put(docTerms, sourceField);
      }
      i++;
    }
    return docsTerms;
  }
コード例 #2
0
ファイル: LuceneUtil.java プロジェクト: INL/BlackLab
  /**
   * Add term frequencies for a single document to a frequency map.
   *
   * @param reader the index
   * @param doc doc id
   * @param luceneName the index field from which to use the term vector
   * @param freq where to add to the token frequencies
   */
  public static void getFrequenciesFromTermVector(
      IndexReader reader, int doc, String luceneName, Map<String, Integer> freq) {
    try {
      org.apache.lucene.index.Terms terms = reader.getTermVector(doc, luceneName);
      if (terms == null) {
        throw new IllegalArgumentException("Field " + luceneName + " has no Terms");
      }
      TermsEnum termsEnum = terms.iterator();

      // Verzamel concordantiewoorden uit term vector
      PostingsEnum postingsEnum = null;
      while (termsEnum.next() != null) {
        postingsEnum = termsEnum.postings(null, postingsEnum, PostingsEnum.FREQS);
        String term = termsEnum.term().utf8ToString();
        Integer n = freq.get(term);
        if (n == null) {
          n = 0;
        }
        while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          n += termsEnum.docFreq();
        }
        freq.put(term, n);
      }
    } catch (Exception e) {
      throw ExUtil.wrapRuntimeException(e);
    }
  }
コード例 #3
0
  @Override
  public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
    IndexReader reader = context.getIndexSearcher().getIndexReader();

    for (int i = 0; i < docs.documents.length; i++) {
      Terms terms = null;
      try {
        terms = reader.getTermVector(docs.ids[i], StatusField.TEXT.name);
      } catch (IOException e) {
        continue;
      }

      String qid = context.getQueryId().replaceFirst("^MB0*", "");
      String docid = docs.documents[i].getField(StatusField.ID.name).stringValue();

      out.print(qrels.getRelevanceGrade(qid, docid));
      out.print(" qid:" + qid);
      out.print(" 1:" + docs.scores[i]);

      float[] intFeatures = this.extractorChain.extractAll(docs.documents[i], terms, context);

      for (int j = 0; j < intFeatures.length; j++) {
        out.print(" " + (j + 2) + ":" + intFeatures[j]);
      }

      out.print(" # docid:" + docid);
      out.print("\n");
    }

    return docs;
  }
コード例 #4
0
  /*
   *  listTermVectorField displays the term vector for a field in
   *  a document in an index (specified by reader).
   */
  static void listTermVectorField(IndexReader reader, String docidString, String field)
      throws IOException {

    System.out.println("\nTermVector:  docid " + docidString + ", field " + field);

    int docid = Integer.parseInt(docidString);

    if ((docid < 0) || (docid >= reader.numDocs())) {
      System.out.println("ERROR:  " + docidString + " is a bad document id.");
      return;
    }
    ;

    Terms terms = reader.getTermVector(docid, field);
    termVectorDisplay(terms);
  }
コード例 #5
0
  public void testDuelMemoryIndexCoreDirectoryWithArrayField() throws Exception {

    final String field_name = "text";
    MockAnalyzer mockAnalyzer = new MockAnalyzer(random());
    if (random().nextBoolean()) {
      mockAnalyzer.setOffsetGap(random().nextInt(100));
    }
    // index into a random directory
    FieldType type = new FieldType(TextField.TYPE_STORED);
    type.setStoreTermVectorOffsets(true);
    type.setStoreTermVectorPayloads(false);
    type.setStoreTermVectorPositions(true);
    type.setStoreTermVectors(true);
    type.freeze();

    Document doc = new Document();
    doc.add(new Field(field_name, "la la", type));
    doc.add(new Field(field_name, "foo bar foo bar foo", type));

    Directory dir = newDirectory();
    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(random(), mockAnalyzer));
    writer.updateDocument(new Term("id", "1"), doc);
    writer.commit();
    writer.close();
    DirectoryReader reader = DirectoryReader.open(dir);

    // Index document in Memory index
    MemoryIndex memIndex = new MemoryIndex(true);
    memIndex.addField(field_name, "la la", mockAnalyzer);
    memIndex.addField(field_name, "foo bar foo bar foo", mockAnalyzer);

    // compare term vectors
    Terms ramTv = reader.getTermVector(0, field_name);
    IndexReader memIndexReader = memIndex.createSearcher().getIndexReader();
    TestUtil.checkReader(memIndexReader);
    Terms memTv = memIndexReader.getTermVector(0, field_name);

    compareTermVectors(ramTv, memTv, field_name);
    memIndexReader.close();
    reader.close();
    dir.close();
  }
コード例 #6
0
ファイル: LuceneUtil.java プロジェクト: INL/BlackLab
  /**
   * Get all words between the specified start and end positions from the term vector.
   *
   * <p>NOTE: this may return an array of less than the size requested, if the document ends before
   * the requested end position.
   *
   * @param reader the index
   * @param doc doc id
   * @param luceneName the index field from which to use the term vector
   * @param start start position (first word we want to request)
   * @param end end position (last word we want to request)
   * @param partialOk is it okay if we're missing words in the middle, or do we need them all?
   *     (debug)
   * @return the words found, in order
   */
  public static String[] getWordsFromTermVector(
      IndexReader reader, int doc, String luceneName, int start, int end, boolean partialOk) {

    // Retrieve the term position vector of the contents of this document.
    // NOTE: might be faster to retrieve all term vectors at once

    try {
      org.apache.lucene.index.Terms terms = reader.getTermVector(doc, luceneName);
      if (terms == null) {
        throw new IllegalArgumentException("Field " + luceneName + " has no Terms");
      }
      if (!terms.hasPositions())
        throw new IllegalArgumentException(
            "Field " + luceneName + " has no character postion information");
      // String[] docTerms = new String[(int) terms.size()];
      // final List<BytesRef> termsList = new ArrayList<BytesRef>();
      TermsEnum termsEnum = terms.iterator();

      // Verzamel concordantiewoorden uit term vector
      PostingsEnum docPosEnum = null;
      int numFound = 0;
      String[] concordanceWords = new String[end - start + 1];
      while (termsEnum.next() != null) {
        docPosEnum = termsEnum.postings(null, docPosEnum, PostingsEnum.POSITIONS);
        while (docPosEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          // NOTE: .docId() will always return 0 in this case
          // if (docPosEnum.docID() != doc)
          //	throw new RuntimeException("Wrong doc id: " + docPosEnum.docID() + " (expected " + doc
          // + ")");
          for (int i = 0; i < docPosEnum.freq(); i++) {
            int position = docPosEnum.nextPosition();
            if (position == -1)
              throw new RuntimeException(
                  "Unexpected missing position (i="
                      + i
                      + ", docPosEnum.freq() = "
                      + docPosEnum.freq()
                      + ")");
            if (position >= start && position <= end) {
              if (concordanceWords[position - start] == null)
                concordanceWords[position - start] = termsEnum.term().utf8ToString();
              else concordanceWords[position - start] += "|" + termsEnum.term().utf8ToString();
              numFound++;
            }
          }
          if (numFound == concordanceWords.length) return concordanceWords;
        }
      }

      if (numFound < concordanceWords.length && !partialOk) {
        // If we simply ran into the end of the document, that's okay;
        // but if words are missing in the middle, that's not.
        String[] partial = new String[numFound];
        for (int i = 0; i < numFound; i++) {
          partial[i] = concordanceWords[i];
          if (partial[i] == null) {
            throw new RuntimeException(
                "Not all words found ("
                    + numFound
                    + " out of "
                    + concordanceWords.length
                    + "); missing words in the middle of concordance!");
          }
        }
        return partial;
      }
      return concordanceWords;
    } catch (Exception e) {
      throw ExUtil.wrapRuntimeException(e);
    }
  }
コード例 #7
0
  @Override
  public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(COMPONENT_NAME, false)) {
      return;
    }

    NamedList<Object> termVectors = new NamedList<Object>();
    rb.rsp.add(TERM_VECTORS, termVectors);

    IndexSchema schema = rb.req.getSchema();
    SchemaField keyField = schema.getUniqueKeyField();
    String uniqFieldName = null;
    if (keyField != null) {
      uniqFieldName = keyField.getName();
      termVectors.add("uniqueKeyFieldName", uniqFieldName);
    }

    FieldOptions allFields = new FieldOptions();
    // figure out what options we have, and try to get the appropriate vector
    allFields.termFreq = params.getBool(TermVectorParams.TF, false);
    allFields.positions = params.getBool(TermVectorParams.POSITIONS, false);
    allFields.offsets = params.getBool(TermVectorParams.OFFSETS, false);
    allFields.docFreq = params.getBool(TermVectorParams.DF, false);
    allFields.tfIdf = params.getBool(TermVectorParams.TF_IDF, false);
    // boolean cacheIdf = params.getBool(TermVectorParams.IDF, false);
    // short cut to all values.
    if (params.getBool(TermVectorParams.ALL, false)) {
      allFields.termFreq = true;
      allFields.positions = true;
      allFields.offsets = true;
      allFields.docFreq = true;
      allFields.tfIdf = true;
    }

    // Build up our per field mapping
    Map<String, FieldOptions> fieldOptions = new HashMap<String, FieldOptions>();
    NamedList<List<String>> warnings = new NamedList<List<String>>();
    List<String> noTV = new ArrayList<String>();
    List<String> noPos = new ArrayList<String>();
    List<String> noOff = new ArrayList<String>();

    Set<String> fields = getFields(rb);
    if (null != fields) {
      // we have specific fields to retrieve, or no fields
      for (String field : fields) {

        // workarround SOLR-3523
        if (null == field || "score".equals(field)) continue;

        // we don't want to issue warnings about the uniqueKey field
        // since it can cause lots of confusion in distributed requests
        // where the uniqueKey field is injected into the fl for merging
        final boolean fieldIsUniqueKey = field.equals(uniqFieldName);

        SchemaField sf = schema.getFieldOrNull(field);
        if (sf != null) {
          if (sf.storeTermVector()) {
            FieldOptions option = fieldOptions.get(field);
            if (option == null) {
              option = new FieldOptions();
              option.fieldName = field;
              fieldOptions.put(field, option);
            }
            // get the per field mappings
            option.termFreq = params.getFieldBool(field, TermVectorParams.TF, allFields.termFreq);
            option.docFreq = params.getFieldBool(field, TermVectorParams.DF, allFields.docFreq);
            option.tfIdf = params.getFieldBool(field, TermVectorParams.TF_IDF, allFields.tfIdf);
            // Validate these are even an option
            option.positions =
                params.getFieldBool(field, TermVectorParams.POSITIONS, allFields.positions);
            if (option.positions && !sf.storeTermPositions() && !fieldIsUniqueKey) {
              noPos.add(field);
            }
            option.offsets =
                params.getFieldBool(field, TermVectorParams.OFFSETS, allFields.offsets);
            if (option.offsets && !sf.storeTermOffsets() && !fieldIsUniqueKey) {
              noOff.add(field);
            }
          } else { // field doesn't have term vectors
            if (!fieldIsUniqueKey) noTV.add(field);
          }
        } else {
          // field doesn't exist
          throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "undefined field: " + field);
        }
      }
    } // else, deal with all fields

    // NOTE: currently all typs of warnings are schema driven, and garunteed
    // to be consistent across all shards - if additional types of warnings
    // are added that might be differnet between shards, finishStage() needs
    // to be changed to account for that.
    boolean hasWarnings = false;
    if (!noTV.isEmpty()) {
      warnings.add("noTermVectors", noTV);
      hasWarnings = true;
    }
    if (!noPos.isEmpty()) {
      warnings.add("noPositions", noPos);
      hasWarnings = true;
    }
    if (!noOff.isEmpty()) {
      warnings.add("noOffsets", noOff);
      hasWarnings = true;
    }
    if (hasWarnings) {
      termVectors.add("warnings", warnings);
    }

    DocListAndSet listAndSet = rb.getResults();
    List<Integer> docIds = getInts(params.getParams(TermVectorParams.DOC_IDS));
    Iterator<Integer> iter;
    if (docIds != null && !docIds.isEmpty()) {
      iter = docIds.iterator();
    } else {
      DocList list = listAndSet.docList;
      iter = list.iterator();
    }
    SolrIndexSearcher searcher = rb.req.getSearcher();

    IndexReader reader = searcher.getIndexReader();
    // the TVMapper is a TermVectorMapper which can be used to optimize loading of Term Vectors

    // Only load the id field to get the uniqueKey of that
    // field

    final String finalUniqFieldName = uniqFieldName;

    final List<String> uniqValues = new ArrayList<String>();

    // TODO: is this required to be single-valued? if so, we should STOP
    // once we find it...
    final StoredFieldVisitor getUniqValue =
        new StoredFieldVisitor() {
          @Override
          public void stringField(FieldInfo fieldInfo, String value) {
            uniqValues.add(value);
          }

          @Override
          public void intField(FieldInfo fieldInfo, int value) {
            uniqValues.add(Integer.toString(value));
          }

          @Override
          public void longField(FieldInfo fieldInfo, long value) {
            uniqValues.add(Long.toString(value));
          }

          @Override
          public Status needsField(FieldInfo fieldInfo) {
            return (fieldInfo.name.equals(finalUniqFieldName)) ? Status.YES : Status.NO;
          }
        };

    TermsEnum termsEnum = null;

    while (iter.hasNext()) {
      Integer docId = iter.next();
      NamedList<Object> docNL = new NamedList<Object>();

      if (keyField != null) {
        reader.document(docId, getUniqValue);
        String uniqVal = null;
        if (uniqValues.size() != 0) {
          uniqVal = uniqValues.get(0);
          uniqValues.clear();
          docNL.add("uniqueKey", uniqVal);
          termVectors.add(uniqVal, docNL);
        }
      } else {
        // support for schemas w/o a unique key,
        termVectors.add("doc-" + docId, docNL);
      }

      if (null != fields) {
        for (Map.Entry<String, FieldOptions> entry : fieldOptions.entrySet()) {
          final String field = entry.getKey();
          final Terms vector = reader.getTermVector(docId, field);
          if (vector != null) {
            termsEnum = vector.iterator(termsEnum);
            mapOneVector(docNL, entry.getValue(), reader, docId, vector.iterator(termsEnum), field);
          }
        }
      } else {
        // extract all fields
        final Fields vectors = reader.getTermVectors(docId);
        for (String field : vectors) {
          Terms terms = vectors.terms(field);
          if (terms != null) {
            termsEnum = terms.iterator(termsEnum);
            mapOneVector(docNL, allFields, reader, docId, termsEnum, field);
          }
        }
      }
    }
  }