예제 #1
0
 /**
  * Prepare a document reconstructor.
  *
  * @param reader IndexReader to read from.
  * @param fieldNames if non-null or not empty, data will be collected only from these fields,
  *     otherwise data will be collected from all fields
  * @param numTerms total number of terms in the index, or -1 if unknown (will be calculated)
  * @throws Exception
  */
 public DocReconstructor(IndexReader reader, String[] fieldNames, int numTerms) throws Exception {
   if (reader == null) {
     throw new Exception("IndexReader cannot be null.");
   }
   this.reader = reader;
   if (fieldNames == null || fieldNames.length == 0) {
     // collect fieldNames
     this.fieldNames = (String[]) reader.getFieldNames(FieldOption.ALL).toArray(new String[0]);
   } else {
     this.fieldNames = fieldNames;
   }
   if (numTerms == -1) {
     Fields fields = MultiFields.getFields(reader);
     numTerms = 0;
     FieldsEnum fe = fields.iterator();
     String fld = null;
     while ((fld = fe.next()) != null) {
       TermsEnum te = fe.terms();
       while (te.next() != null) {
         numTerms++;
       }
     }
     this.numTerms = numTerms;
   }
   deleted = MultiFields.getDeletedDocs(reader);
 }
 private Fields generateTermVectors(
     Collection<GetField> getFields,
     boolean withOffsets,
     @Nullable Map<String, String> perFieldAnalyzer)
     throws IOException {
   /* store document in memory index */
   MemoryIndex index = new MemoryIndex(withOffsets);
   for (GetField getField : getFields) {
     String field = getField.getName();
     Analyzer analyzer = getAnalyzerAtField(field, perFieldAnalyzer);
     for (Object text : getField.getValues()) {
       index.addField(field, text.toString(), analyzer);
     }
   }
   /* and read vectors from it */
   return MultiFields.getFields(index.createSearcher().getIndexReader());
 }
  public TermVectorResponse getTermVector(TermVectorRequest request, String concreteIndex) {
    final Engine.Searcher searcher = indexShard.acquireSearcher("term_vector");
    IndexReader topLevelReader = searcher.reader();
    final TermVectorResponse termVectorResponse =
        new TermVectorResponse(concreteIndex, request.type(), request.id());

    final Term uidTerm =
        new Term(UidFieldMapper.NAME, Uid.createUidAsBytes(request.type(), request.id()));
    Engine.GetResult get = indexShard.get(new Engine.Get(request.realtime(), uidTerm));
    boolean docFromTranslog = get.source() != null;
    AggregatedDfs dfs = null;

    /* fetched from translog is treated as an artificial document */
    if (docFromTranslog) {
      request.doc(get.source().source, false);
      termVectorResponse.setDocVersion(get.version());
    }

    /* handle potential wildcards in fields */
    if (request.selectedFields() != null) {
      handleFieldWildcards(request);
    }

    try {
      Fields topLevelFields = MultiFields.getFields(topLevelReader);
      Versions.DocIdAndVersion docIdAndVersion = get.docIdAndVersion();
      /* from an artificial document */
      if (request.doc() != null) {
        Fields termVectorsByField = generateTermVectorsFromDoc(request, !docFromTranslog);
        // if no document indexed in shard, take the queried document itself for stats
        if (topLevelFields == null) {
          topLevelFields = termVectorsByField;
        }
        if (termVectorsByField != null && useDfs(request)) {
          dfs = getAggregatedDfs(termVectorsByField, request);
        }
        termVectorResponse.setFields(
            termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields, dfs);
        termVectorResponse.setExists(true);
        termVectorResponse.setArtificial(!docFromTranslog);
      }
      /* or from an existing document */
      else if (docIdAndVersion != null) {
        // fields with stored term vectors
        Fields termVectorsByField =
            docIdAndVersion.context.reader().getTermVectors(docIdAndVersion.docId);
        Set<String> selectedFields = request.selectedFields();
        // generate tvs for fields where analyzer is overridden
        if (selectedFields == null && request.perFieldAnalyzer() != null) {
          selectedFields = getFieldsToGenerate(request.perFieldAnalyzer(), termVectorsByField);
        }
        // fields without term vectors
        if (selectedFields != null) {
          termVectorsByField =
              addGeneratedTermVectors(get, termVectorsByField, request, selectedFields);
        }
        if (termVectorsByField != null && useDfs(request)) {
          dfs = getAggregatedDfs(termVectorsByField, request);
        }
        termVectorResponse.setFields(
            termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields, dfs);
        termVectorResponse.setDocVersion(docIdAndVersion.version);
        termVectorResponse.setExists(true);
      } else {
        termVectorResponse.setExists(false);
      }
    } catch (Throwable ex) {
      throw new ElasticsearchException("failed to execute term vector request", ex);
    } finally {
      searcher.close();
      get.release();
    }
    return termVectorResponse;
  }
예제 #4
0
  public static void main(String[] args) throws IOException {

    IndexReader reader = null;

    /*
     *  Opening the index first simplifies the processing of the
     *  rest of the command line arguments.
     */
    for (int i = 0; i < args.length; i++) {
      if (("-index".equals(args[i])) && ((i + 1) < args.length)) {
        reader = DirectoryReader.open(FSDirectory.open(new File(args[i + 1])));

        if (reader == null) {
          System.err.println("Error:  Can't open index " + args[i + 1]);
          System.exit(1);
        }
        ;

        break;
      }
      ;
    }
    ;

    if (reader == null) {
      System.err.println(usage);
      System.exit(1);
    }
    ;

    /*
     *  Process the command line arguments sequentially.
     */
    for (int i = 0; i < args.length; i++) {

      if ("-index".equals(args[i])) {

        /*
         *  Handled in the previous loop, so just skip the argument.
         */
        i++;

      } else if ("-list-edocid".equals(args[i])) {

        System.out.println("-list-edocid:");

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        Document d = reader.document(Integer.parseInt(args[i + 1]));

        System.out.println(
            "Internal docid --> External docid: " + args[i + 1] + " --> " + d.get("externalId"));

        i += 1;
      } else if ("-list-docids".equals(args[i])) {

        System.out.println("-list-docids:");

        for (int j = 0; j < reader.numDocs(); j++) {
          Document d = reader.document(j);
          System.out.println("Internal --> external docid: " + j + " --> " + d.get("externalId"));
        }
        ;

      } else if ("-list-fields".equals(args[i])) {

        Fields fields = MultiFields.getFields(reader);

        System.out.print("\nNumber of fields:  ");

        if (fields == null) System.out.println("0");
        else {
          System.out.println(fields.size());

          Iterator<String> is = fields.iterator();

          while (is.hasNext()) {
            System.out.println("\t" + is.next());
          }
          ;
        }
        ;

      } else if ("-list-postings".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listPostings(reader, args[i + 1], args[i + 2], Integer.MAX_VALUE);
        i += 2;

      } else if ("-list-postings-sample".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listPostings(reader, args[i + 1], args[i + 2], 5);
        i += 2;

      } else if ("-list-stats".equals(args[i])) {

        System.out.println("Corpus statistics:");
        System.out.println("\tnumdocs\t\t" + reader.numDocs());
        System.out.println(
            "\turl:\t"
                + "\tnumdocs="
                + reader.getDocCount("url")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("url")
                + "\tavglen="
                + reader.getSumTotalTermFreq("url") / (float) reader.getDocCount("url"));

        System.out.println(
            "\tkeywords:"
                + "\tnumdocs="
                + reader.getDocCount("keywords")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("keywords")
                + "\tavglen="
                + reader.getSumTotalTermFreq("keywords") / (float) reader.getDocCount("keywords"));

        System.out.println(
            "\ttitle:\t"
                + "\tnumdocs="
                + reader.getDocCount("title")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("title")
                + "\tavglen="
                + reader.getSumTotalTermFreq("title") / (float) reader.getDocCount("title"));

        System.out.println(
            "\tbody:\t"
                + "\tnumdocs="
                + reader.getDocCount("body")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("body")
                + "\tavglen="
                + reader.getSumTotalTermFreq("body") / (float) reader.getDocCount("body"));

        System.out.println(
            "\tinlink:\t"
                + "\tnumdocs="
                + reader.getDocCount("inlink")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("inlink")
                + "\tavglen="
                + reader.getSumTotalTermFreq("inlink") / (float) reader.getDocCount("inlink"));

      } else if ("-list-terms".equals(args[i])) {

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listTermDictionary(reader, args[i + 1]);
        i += 1;

      } else if ("-list-termvector".equals(args[i])) {

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listTermVectors(reader, args[i + 1]);
        i += 1;

      } else if ("-list-termvector-field".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listTermVectorField(reader, args[i + 1], args[i + 2]);
        i += 2;

      } else System.err.println("\nWarning:  Unknown argument " + args[i] + " ignored.");
    }
    ;

    /*
     *  Close the index and exit gracefully.
     */
    reader.close();
  }