Exemplo n.º 1
0
  /**
   * Command-line entry point for inspecting an index.
   *
   * <p>The index is located with {@code -index <path>}; every other argument is a listing command
   * that is executed in the order given:
   *
   * <ul>
   *   <li>{@code -list-edocid <docid>}: print the "externalId" stored field of one document
   *   <li>{@code -list-docids}: print the "externalId" stored field of each document
   *   <li>{@code -list-fields}: print the number of fields and their names
   *   <li>{@code -list-postings <arg1> <arg2>}: delegate to {@code listPostings} with no limit
   *   <li>{@code -list-postings-sample <arg1> <arg2>}: delegate to {@code listPostings}, limit 5
   *   <li>{@code -list-stats}: print per-field corpus statistics
   *   <li>{@code -list-terms <arg>}: delegate to {@code listTermDictionary}
   *   <li>{@code -list-termvector <arg>}: delegate to {@code listTermVectors}
   *   <li>{@code -list-termvector-field <arg1> <arg2>}: delegate to {@code listTermVectorField}
   * </ul>
   *
   * <p>If no index is supplied, or it cannot be opened, a message is written to standard error and
   * the JVM exits with status 1. A command that is missing its operands prints the usage text and
   * stops further processing. Unknown arguments produce a warning and are skipped.
   *
   * @param args the command-line arguments
   * @throws IOException if reading from the opened index fails
   */
  public static void main(String[] args) throws IOException {

    IndexReader reader = null;

    /*
     *  Opening the index first simplifies the processing of the
     *  rest of the command line arguments.
     */
    for (int i = 0; i < args.length; i++) {
      if (("-index".equals(args[i])) && ((i + 1) < args.length)) {
        try {
          reader = DirectoryReader.open(FSDirectory.open(new File(args[i + 1])));
        } catch (IOException e) {
          // A failed open is reported by exception rather than by a null return,
          // so this is the only place the diagnostic can be produced.
          System.err.println(
              "Error:  Can't open index " + args[i + 1] + " (" + e.getMessage() + ")");
          System.exit(1);
        }

        break;
      }
    }

    if (reader == null) {
      System.err.println(usage);
      System.exit(1);
    }

    /*
     *  Process the command line arguments sequentially.  The finally
     *  clause guarantees that the index is closed even when one of the
     *  commands fails with an exception.
     */
    try {
      for (int i = 0; i < args.length; i++) {

        if ("-index".equals(args[i])) {

          /*
           *  Handled in the previous loop, so just skip the argument.
           */
          i++;

        } else if ("-list-edocid".equals(args[i])) {

          System.out.println("-list-edocid:");

          if ((i + 1) >= args.length) {
            System.out.println(usage);
            break;
          }

          Document d = reader.document(Integer.parseInt(args[i + 1]));

          System.out.println(
              "Internal docid --> External docid: " + args[i + 1] + " --> " + d.get("externalId"));

          i += 1;

        } else if ("-list-docids".equals(args[i])) {

          System.out.println("-list-docids:");

          // NOTE(review): this walks docids 0..numDocs()-1.  If the index can
          // contain deleted documents, numDocs() may be smaller than the docid
          // range -- confirm whether maxDoc() is what is intended here.
          for (int j = 0; j < reader.numDocs(); j++) {
            Document d = reader.document(j);
            System.out.println(
                "Internal --> external docid: " + j + " --> " + d.get("externalId"));
          }

        } else if ("-list-fields".equals(args[i])) {

          Fields fields = MultiFields.getFields(reader);

          System.out.print("\nNumber of fields:  ");

          if (fields == null) {
            System.out.println("0");
          } else {
            System.out.println(fields.size());

            for (String fieldName : fields) {
              System.out.println("\t" + fieldName);
            }
          }

        } else if ("-list-postings".equals(args[i])) {

          if ((i + 2) >= args.length) {
            System.out.println(usage);
            break;
          }

          listPostings(reader, args[i + 1], args[i + 2], Integer.MAX_VALUE);
          i += 2;

        } else if ("-list-postings-sample".equals(args[i])) {

          if ((i + 2) >= args.length) {
            System.out.println(usage);
            break;
          }

          listPostings(reader, args[i + 1], args[i + 2], 5);
          i += 2;

        } else if ("-list-stats".equals(args[i])) {

          System.out.println("Corpus statistics:");
          System.out.println("\tnumdocs\t\t" + reader.numDocs());

          // The labels carry their own tab padding so that the columns line up.
          printFieldStats(reader, "\turl:\t", "url");
          printFieldStats(reader, "\tkeywords:", "keywords");
          printFieldStats(reader, "\ttitle:\t", "title");
          printFieldStats(reader, "\tbody:\t", "body");
          printFieldStats(reader, "\tinlink:\t", "inlink");

        } else if ("-list-terms".equals(args[i])) {

          if ((i + 1) >= args.length) {
            System.out.println(usage);
            break;
          }

          listTermDictionary(reader, args[i + 1]);
          i += 1;

        } else if ("-list-termvector".equals(args[i])) {

          if ((i + 1) >= args.length) {
            System.out.println(usage);
            break;
          }

          listTermVectors(reader, args[i + 1]);
          i += 1;

        } else if ("-list-termvector-field".equals(args[i])) {

          if ((i + 2) >= args.length) {
            System.out.println(usage);
            break;
          }

          listTermVectorField(reader, args[i + 1], args[i + 2]);
          i += 2;

        } else {
          System.err.println("\nWarning:  Unknown argument " + args[i] + " ignored.");
        }
      }
    } finally {
      /*
       *  Close the index and exit gracefully.
       */
      reader.close();
    }
  }

  /**
   * Prints one line of corpus statistics for a field: the number of documents that contain it,
   * the sum of its total term frequencies, and their ratio (reported as the average length).
   *
   * @param reader the open index to query
   * @param label the already tab-padded label that starts the output line
   * @param field the name of the field whose statistics are printed
   * @throws IOException if the statistics cannot be read from the index
   */
  private static void printFieldStats(IndexReader reader, String label, String field)
      throws IOException {
    int docCount = reader.getDocCount(field);
    long sumTotalTermFreq = reader.getSumTotalTermFreq(field);

    // Floating-point division: a zero document count yields NaN/Infinity, not an exception.
    System.out.println(
        label
            + "\tnumdocs="
            + docCount
            + "\tsumTotalTF="
            + sumTotalTermFreq
            + "\tavglen="
            + sumTotalTermFreq / (float) docCount);
  }
  /**
   * Asserts that two {@link Fields} instances expose the same content.
   *
   * <p>The fields are walked in parallel and must have equal names. Within each field the terms
   * are walked in parallel and must be equal, with equal total term frequencies. For every term
   * the first posting of each side is compared: frequencies must match and, when positions are
   * available, so must each position and (if an {@link OffsetAttribute} is present) each start
   * and end offset. Each term is expected to have exactly one posting on both sides. Document
   * ids themselves are deliberately not compared.
   *
   * <p>When {@code d1} is {@code null}, {@code d2} must be {@code null} or have size zero.
   *
   * @param d1 the reference fields, possibly {@code null}
   * @param d2 the fields checked against {@code d1}
   * @throws IOException if reading either side fails
   */
  public static void verifyEquals(Fields d1, Fields d2) throws IOException {
    if (d1 == null) {
      // Nothing on the reference side: the other side must be absent or empty.
      boolean otherIsEmpty = (d2 == null) || (d2.size() == 0);
      assertTrue(otherIsEmpty);
      return;
    }
    boolean otherIsPresent = (d2 != null);
    assertTrue(otherIsPresent);

    Iterator<String> actualFieldNames = d2.iterator();
    Iterator<String> expectedFieldNames = d1.iterator();

    while (expectedFieldNames.hasNext()) {
      String expectedField = expectedFieldNames.next();
      String actualField = actualFieldNames.next();
      assertEquals(expectedField, actualField);

      Terms expectedTerms = d1.terms(expectedField);
      assertNotNull(expectedTerms);
      TermsEnum expectedTermsEnum = expectedTerms.iterator(null);

      Terms actualTerms = d2.terms(actualField);
      assertNotNull(actualTerms);
      TermsEnum actualTermsEnum = actualTerms.iterator(null);

      // Postings enums are kept across terms so that they can be handed back for reuse.
      DocsAndPositionsEnum expectedPositions = null;
      DocsAndPositionsEnum actualPositions = null;
      DocsEnum expectedDocs = null;
      DocsEnum actualDocs = null;

      for (BytesRef expectedTerm = expectedTermsEnum.next();
          expectedTerm != null;
          expectedTerm = expectedTermsEnum.next()) {
        BytesRef actualTerm = actualTermsEnum.next();
        assertEquals(expectedTerm, actualTerm);
        assertEquals(expectedTermsEnum.totalTermFreq(), actualTermsEnum.totalTermFreq());

        expectedPositions = expectedTermsEnum.docsAndPositions(null, expectedPositions);
        actualPositions = actualTermsEnum.docsAndPositions(null, actualPositions);

        if (expectedPositions == null) {
          // No positions available: fall back to comparing frequencies only.
          expectedDocs =
              TestUtil.docs(random(), expectedTermsEnum, null, expectedDocs, DocsEnum.FLAG_FREQS);
          actualDocs =
              TestUtil.docs(random(), actualTermsEnum, null, actualDocs, DocsEnum.FLAG_FREQS);
          assertNotNull(expectedDocs);
          assertNotNull(actualDocs);

          // Both sides are advanced, but docIDs are not supposed to be equal,
          // so only the reference side is checked for having a document.
          int firstExpectedDoc = expectedDocs.nextDoc();
          actualDocs.nextDoc();
          assertTrue(firstExpectedDoc != DocIdSetIterator.NO_MORE_DOCS);

          int expectedFreq = expectedDocs.freq();
          int actualFreq = actualDocs.freq();
          assertEquals(expectedFreq, actualFreq);

          // Exactly one posting per term on each side.
          assertEquals(DocIdSetIterator.NO_MORE_DOCS, expectedDocs.nextDoc());
          assertEquals(DocIdSetIterator.NO_MORE_DOCS, actualDocs.nextDoc());
        } else {
          assertNotNull(actualPositions);

          // Both sides are advanced, but docIDs are not supposed to be equal,
          // so only the reference side is checked for having a document.
          int firstExpectedDoc = expectedPositions.nextDoc();
          actualPositions.nextDoc();
          assertTrue(firstExpectedDoc != DocIdSetIterator.NO_MORE_DOCS);

          int expectedFreq = expectedPositions.freq();
          int actualFreq = actualPositions.freq();
          assertEquals(expectedFreq, actualFreq);

          OffsetAttribute expectedOffsets = null;
          if (expectedPositions.attributes().hasAttribute(OffsetAttribute.class)) {
            expectedOffsets = expectedPositions.attributes().getAttribute(OffsetAttribute.class);
          }
          OffsetAttribute actualOffsets = null;
          if (actualPositions.attributes().hasAttribute(OffsetAttribute.class)) {
            actualOffsets = actualPositions.attributes().getAttribute(OffsetAttribute.class);
          }

          // Offsets must be available on both sides or on neither.
          if (expectedOffsets == null) {
            assertNull(actualOffsets);
          } else {
            assertNotNull(actualOffsets);
          }

          for (int remaining = expectedFreq; remaining > 0; remaining--) {
            int expectedPos = expectedPositions.nextPosition();
            int actualPos = actualPositions.nextPosition();
            assertEquals(expectedPos, actualPos);
            if (expectedOffsets != null) {
              assertEquals(expectedOffsets.startOffset(), actualOffsets.startOffset());
              assertEquals(expectedOffsets.endOffset(), actualOffsets.endOffset());
            }
          }

          // Exactly one posting per term on each side.
          assertEquals(DocIdSetIterator.NO_MORE_DOCS, expectedPositions.nextDoc());
          assertEquals(DocIdSetIterator.NO_MORE_DOCS, actualPositions.nextDoc());
        }
      }

      // The compared side must not have any terms left over.
      assertNull(actualTermsEnum.next());
    }

    // The compared side must not have any fields left over.
    assertFalse(actualFieldNames.hasNext());
  }