コード例 #1
0
  /*
   *  listTermVectors displays the term vectors for all of the fields
   *  in a document in an index (specified by reader).
   */
  static void listTermVectors(IndexReader reader, String docidString) throws IOException {

    System.out.println("\nTermVector:  docid " + docidString);

    int docid = Integer.parseInt(docidString);

    if ((docid < 0) || (docid >= reader.numDocs())) {
      System.out.println("ERROR:  " + docidString + " is a bad document id.");
      return;
    }
    ;

    /*
     *  Iterate over the fields in this document.
     */
    Fields fields = reader.getTermVectors(docid);
    Iterator<String> fieldIterator = fields.iterator();

    while (fieldIterator.hasNext()) {
      String fieldName = fieldIterator.next();
      System.out.println("  Field: " + fieldName);

      Terms terms = fields.terms(fieldName);
      termVectorDisplay(terms);
    }
    ;
  }
コード例 #2
0
  /**
   * Create a PriorityQueue from a word->tf map.
   *
   * @param words a map of words keyed on the word(String) with Int objects as the values.
   * @param fieldNames an array of field names to override defaults.
   */
  private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words, String... fieldNames)
      throws IOException {
    // have collected all words in doc and their freqs
    int numDocs = ir.numDocs();
    final int limit = Math.min(maxQueryTerms, words.size());
    FreqQ queue = new FreqQ(limit); // will order words by score

    for (String word : words.keySet()) { // for every word
      int tf = words.get(word).x; // term freq in the source doc
      if (minTermFreq > 0 && tf < minTermFreq) {
        continue; // filter out words that don't occur enough times in the source
      }

      // go through all the fields and find the largest document frequency
      String topField = fieldNames[0];
      int docFreq = 0;
      for (String fieldName : fieldNames) {
        int freq = ir.docFreq(new Term(fieldName, word));
        topField = (freq > docFreq) ? fieldName : topField;
        docFreq = (freq > docFreq) ? freq : docFreq;
      }

      if (minDocFreq > 0 && docFreq < minDocFreq) {
        continue; // filter out words that don't occur in enough docs
      }

      if (docFreq > maxDocFreq) {
        continue; // filter out words that occur in too many docs
      }

      if (docFreq == 0) {
        continue; // index update problem?
      }

      float idf = similarity.idf(docFreq, numDocs);
      float score = tf * idf;

      if (queue.size() < limit) {
        // there is still space in the queue
        queue.add(new ScoreTerm(word, topField, score, idf, docFreq, tf));
      } else {
        ScoreTerm term = queue.top();
        if (term.score < score) { // update the smallest in the queue in place and update the queue.
          term.update(word, topField, score, idf, docFreq, tf);
          queue.updateTop();
        }
      }
    }
    return queue;
  }
コード例 #3
0
  /*
   *  listTermVectorField displays the term vector for a field in
   *  a document in an index (specified by reader).
   */
  static void listTermVectorField(IndexReader reader, String docidString, String field)
      throws IOException {

    System.out.println("\nTermVector:  docid " + docidString + ", field " + field);

    int docid = Integer.parseInt(docidString);

    if ((docid < 0) || (docid >= reader.numDocs())) {
      System.out.println("ERROR:  " + docidString + " is a bad document id.");
      return;
    }
    ;

    Terms terms = reader.getTermVector(docid, field);
    termVectorDisplay(terms);
  }
コード例 #4
0
  public static void main(String[] args) throws IOException {

    IndexReader reader = null;

    /*
     *  Opening the index first simplifies the processing of the
     *  rest of the command line arguments.
     */
    for (int i = 0; i < args.length; i++) {
      if (("-index".equals(args[i])) && ((i + 1) < args.length)) {
        reader = DirectoryReader.open(FSDirectory.open(new File(args[i + 1])));

        if (reader == null) {
          System.err.println("Error:  Can't open index " + args[i + 1]);
          System.exit(1);
        }
        ;

        break;
      }
      ;
    }
    ;

    if (reader == null) {
      System.err.println(usage);
      System.exit(1);
    }
    ;

    /*
     *  Process the command line arguments sequentially.
     */
    for (int i = 0; i < args.length; i++) {

      if ("-index".equals(args[i])) {

        /*
         *  Handled in the previous loop, so just skip the argument.
         */
        i++;

      } else if ("-list-edocid".equals(args[i])) {

        System.out.println("-list-edocid:");

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        Document d = reader.document(Integer.parseInt(args[i + 1]));

        System.out.println(
            "Internal docid --> External docid: " + args[i + 1] + " --> " + d.get("externalId"));

        i += 1;
      } else if ("-list-docids".equals(args[i])) {

        System.out.println("-list-docids:");

        for (int j = 0; j < reader.numDocs(); j++) {
          Document d = reader.document(j);
          System.out.println("Internal --> external docid: " + j + " --> " + d.get("externalId"));
        }
        ;

      } else if ("-list-fields".equals(args[i])) {

        Fields fields = MultiFields.getFields(reader);

        System.out.print("\nNumber of fields:  ");

        if (fields == null) System.out.println("0");
        else {
          System.out.println(fields.size());

          Iterator<String> is = fields.iterator();

          while (is.hasNext()) {
            System.out.println("\t" + is.next());
          }
          ;
        }
        ;

      } else if ("-list-postings".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listPostings(reader, args[i + 1], args[i + 2], Integer.MAX_VALUE);
        i += 2;

      } else if ("-list-postings-sample".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listPostings(reader, args[i + 1], args[i + 2], 5);
        i += 2;

      } else if ("-list-stats".equals(args[i])) {

        System.out.println("Corpus statistics:");
        System.out.println("\tnumdocs\t\t" + reader.numDocs());
        System.out.println(
            "\turl:\t"
                + "\tnumdocs="
                + reader.getDocCount("url")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("url")
                + "\tavglen="
                + reader.getSumTotalTermFreq("url") / (float) reader.getDocCount("url"));

        System.out.println(
            "\tkeywords:"
                + "\tnumdocs="
                + reader.getDocCount("keywords")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("keywords")
                + "\tavglen="
                + reader.getSumTotalTermFreq("keywords") / (float) reader.getDocCount("keywords"));

        System.out.println(
            "\ttitle:\t"
                + "\tnumdocs="
                + reader.getDocCount("title")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("title")
                + "\tavglen="
                + reader.getSumTotalTermFreq("title") / (float) reader.getDocCount("title"));

        System.out.println(
            "\tbody:\t"
                + "\tnumdocs="
                + reader.getDocCount("body")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("body")
                + "\tavglen="
                + reader.getSumTotalTermFreq("body") / (float) reader.getDocCount("body"));

        System.out.println(
            "\tinlink:\t"
                + "\tnumdocs="
                + reader.getDocCount("inlink")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("inlink")
                + "\tavglen="
                + reader.getSumTotalTermFreq("inlink") / (float) reader.getDocCount("inlink"));

      } else if ("-list-terms".equals(args[i])) {

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listTermDictionary(reader, args[i + 1]);
        i += 1;

      } else if ("-list-termvector".equals(args[i])) {

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listTermVectors(reader, args[i + 1]);
        i += 1;

      } else if ("-list-termvector-field".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listTermVectorField(reader, args[i + 1], args[i + 2]);
        i += 2;

      } else System.err.println("\nWarning:  Unknown argument " + args[i] + " ignored.");
    }
    ;

    /*
     *  Close the index and exit gracefully.
     */
    reader.close();
  }
コード例 #5
0
 /**
  * Set the maximum percentage in which words may still appear. Words that appear in more than this
  * many percent of all docs will be ignored.
  *
  * @param maxPercentage the maximum percentage of documents (0-100) that a term may appear in to
  *     be still considered relevant
  */
 public void setMaxDocFreqPct(int maxPercentage) {
   this.maxDocFreq = maxPercentage * ir.numDocs() / 100;
 }