Exemple #1
0
  private static Map<String, List<String>> generate_result(Directory directory) {
    Map<String, List<String>> result_map = new HashMap<String, List<String>>();

    try {
      IndexReader reader = IndexReader.open(directory);
      TermEnum termEnum = reader.terms();
      while (termEnum.next()) {
        String termEnumString = termEnum.term().toString();
        if (termEnumString.startsWith("content:")) {
          String term = termEnumString.substring(termEnumString.lastIndexOf(":") + 1);
          TermDocs termDocs = reader.termDocs(termEnum.term());
          while (termDocs.next()) {
            Document doc = reader.document(termDocs.doc());
            String relative_path = doc.get("relative_path");

            if (result_map.containsKey(relative_path)) {
              result_map.get(relative_path).add(term + termDocs.freq());
            } else {
              result_map.put(relative_path, new ArrayList<String>());
            }
          }
        }
      }
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
    }

    return result_map;
  }
  /*
   *  listTermVectors displays the term vectors for all of the fields
   *  in a document in an index (specified by reader).
   */
  static void listTermVectors(IndexReader reader, String docidString) throws IOException {

    System.out.println("\nTermVector:  docid " + docidString);

    int docid = Integer.parseInt(docidString);

    if ((docid < 0) || (docid >= reader.numDocs())) {
      System.out.println("ERROR:  " + docidString + " is a bad document id.");
      return;
    }
    ;

    /*
     *  Iterate over the fields in this document.
     */
    Fields fields = reader.getTermVectors(docid);
    Iterator<String> fieldIterator = fields.iterator();

    while (fieldIterator.hasNext()) {
      String fieldName = fieldIterator.next();
      System.out.println("  Field: " + fieldName);

      Terms terms = fields.terms(fieldName);
      termVectorDisplay(terms);
    }
    ;
  }
  /*
   *  listPostings displays the first n postings for a term in a
   *  field in an index (specified by reader).  Set n to MAX_VALUE
   *  to display all postings.
   */
  static void listPostings(IndexReader reader, String termString, String field, Integer n)
      throws IOException {

    System.out.println("\nPostings:  " + termString + " " + field);

    /*
     *  Prepare to access the index.
     */
    BytesRef termBytes = new BytesRef(termString);
    Term term = new Term(field, termBytes);
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    /*
     *  Lookup the collection term frequency (ctf).
     */
    long df = reader.docFreq(term);
    System.out.println("\tdf:  " + df);

    long ctf = reader.totalTermFreq(term);
    System.out.println("\tctf:  " + ctf);

    if (df < 1) return;

    /*
     *  Lookup the inverted list.
     */
    DocsAndPositionsEnum postings =
        MultiFields.getTermPositionsEnum(reader, liveDocs, field, termBytes);

    /*
     *  Iterate through the first n postings.
     */
    long count = 0;

    while ((count < n) && (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS)) {

      System.out.println("\tdocid: " + postings.docID());
      int tf = postings.freq();
      System.out.println("\ttf: " + tf);
      System.out.print("\tPositions: ");

      for (int j = 0; j < tf; j++) {
        int pos = postings.nextPosition();
        System.out.print(pos + " ");
      }

      System.out.println("");

      count++;
    }
    ;

    return;
  }
  /*
   *  listTermVectorField displays the term vector for a field in
   *  a document in an index (specified by reader).
   */
  static void listTermVectorField(IndexReader reader, String docidString, String field)
      throws IOException {

    System.out.println("\nTermVector:  docid " + docidString + ", field " + field);

    int docid = Integer.parseInt(docidString);

    if ((docid < 0) || (docid >= reader.numDocs())) {
      System.out.println("ERROR:  " + docidString + " is a bad document id.");
      return;
    }
    ;

    Terms terms = reader.getTermVector(docid, field);
    termVectorDisplay(terms);
  }
  public void getIndexInfo(String indexdir, int freqThreshold) {
    IndexReader reader = null;

    try {
      Directory dir = FSDirectory.open(new File(indexdir));
      System.out.println(dir);
      reader = IndexReader.open(dir);

      System.out.println("document num:" + reader.numDocs());
      System.out.println("======================");

      TermEnum terms = reader.terms();
      sortedTermQueue.clear();
      maxDocNum = reader.maxDoc();
      linkMap.clear();
      termList.clear();
      while (terms.next()) {
        // System.out.print(terms.term() + "\tDocFreq:" +
        TermDocs termDocs = reader.termDocs(terms.term());
        MyTerm temp = new MyTerm(terms.term(), termDocs, maxDocNum);
        if (temp.totalFreq < freqThreshold) {
          continue;
        } /*
           * if(temp.originTrem.text().length()==1){ continue; }
           */
        linkMap.put(temp.originTrem.text(), temp);
        sortedTermQueue.add(temp);
        termList.add(temp);
      }
      System.out.println("total Size:" + sortedTermQueue.size());
      System.out.println("mapsize:" + linkMap.keySet().size());
      // System.exit(0);
      int num = 0;
      this.maxFreq = sortedTermQueue.peek().totalFreq;
      while (!sortedTermQueue.isEmpty()) {
        num++;
        System.out.println(num + ":" + sortedTermQueue.poll());
      }
      System.out.println("read index info done");
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      try {
        reader.close();

      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
  public static void main(String[] args) throws IOException {

    IndexReader reader = null;

    /*
     *  Opening the index first simplifies the processing of the
     *  rest of the command line arguments.
     */
    for (int i = 0; i < args.length; i++) {
      if (("-index".equals(args[i])) && ((i + 1) < args.length)) {
        reader = DirectoryReader.open(FSDirectory.open(new File(args[i + 1])));

        if (reader == null) {
          System.err.println("Error:  Can't open index " + args[i + 1]);
          System.exit(1);
        }
        ;

        break;
      }
      ;
    }
    ;

    if (reader == null) {
      System.err.println(usage);
      System.exit(1);
    }
    ;

    /*
     *  Process the command line arguments sequentially.
     */
    for (int i = 0; i < args.length; i++) {

      if ("-index".equals(args[i])) {

        /*
         *  Handled in the previous loop, so just skip the argument.
         */
        i++;

      } else if ("-list-edocid".equals(args[i])) {

        System.out.println("-list-edocid:");

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        Document d = reader.document(Integer.parseInt(args[i + 1]));

        System.out.println(
            "Internal docid --> External docid: " + args[i + 1] + " --> " + d.get("externalId"));

        i += 1;
      } else if ("-list-docids".equals(args[i])) {

        System.out.println("-list-docids:");

        for (int j = 0; j < reader.numDocs(); j++) {
          Document d = reader.document(j);
          System.out.println("Internal --> external docid: " + j + " --> " + d.get("externalId"));
        }
        ;

      } else if ("-list-fields".equals(args[i])) {

        Fields fields = MultiFields.getFields(reader);

        System.out.print("\nNumber of fields:  ");

        if (fields == null) System.out.println("0");
        else {
          System.out.println(fields.size());

          Iterator<String> is = fields.iterator();

          while (is.hasNext()) {
            System.out.println("\t" + is.next());
          }
          ;
        }
        ;

      } else if ("-list-postings".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listPostings(reader, args[i + 1], args[i + 2], Integer.MAX_VALUE);
        i += 2;

      } else if ("-list-postings-sample".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listPostings(reader, args[i + 1], args[i + 2], 5);
        i += 2;

      } else if ("-list-stats".equals(args[i])) {

        System.out.println("Corpus statistics:");
        System.out.println("\tnumdocs\t\t" + reader.numDocs());
        System.out.println(
            "\turl:\t"
                + "\tnumdocs="
                + reader.getDocCount("url")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("url")
                + "\tavglen="
                + reader.getSumTotalTermFreq("url") / (float) reader.getDocCount("url"));

        System.out.println(
            "\tkeywords:"
                + "\tnumdocs="
                + reader.getDocCount("keywords")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("keywords")
                + "\tavglen="
                + reader.getSumTotalTermFreq("keywords") / (float) reader.getDocCount("keywords"));

        System.out.println(
            "\ttitle:\t"
                + "\tnumdocs="
                + reader.getDocCount("title")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("title")
                + "\tavglen="
                + reader.getSumTotalTermFreq("title") / (float) reader.getDocCount("title"));

        System.out.println(
            "\tbody:\t"
                + "\tnumdocs="
                + reader.getDocCount("body")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("body")
                + "\tavglen="
                + reader.getSumTotalTermFreq("body") / (float) reader.getDocCount("body"));

        System.out.println(
            "\tinlink:\t"
                + "\tnumdocs="
                + reader.getDocCount("inlink")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("inlink")
                + "\tavglen="
                + reader.getSumTotalTermFreq("inlink") / (float) reader.getDocCount("inlink"));

      } else if ("-list-terms".equals(args[i])) {

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listTermDictionary(reader, args[i + 1]);
        i += 1;

      } else if ("-list-termvector".equals(args[i])) {

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listTermVectors(reader, args[i + 1]);
        i += 1;

      } else if ("-list-termvector-field".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listTermVectorField(reader, args[i + 1], args[i + 2]);
        i += 2;

      } else System.err.println("\nWarning:  Unknown argument " + args[i] + " ignored.");
    }
    ;

    /*
     *  Close the index and exit gracefully.
     */
    reader.close();
  }
  /**
   * Searches pages using a particular combination of flags.
   *
   * @param query The query to perform in Lucene query language
   * @param flags A set of flags
   * @return A Collection of SearchResult instances
   * @throws ProviderException if there is a problem with the backend
   */
  public Collection findPages(String query, int flags) throws ProviderException {
    IndexSearcher searcher = null;
    ArrayList<SearchResult> list = null;
    Highlighter highlighter = null;

    try {
      String[] queryfields = {
        LUCENE_PAGE_CONTENTS, LUCENE_PAGE_NAME, LUCENE_AUTHOR, LUCENE_ATTACHMENTS
      };
      QueryParser qp =
          new MultiFieldQueryParser(Version.LUCENE_36, queryfields, getLuceneAnalyzer());

      // QueryParser qp = new QueryParser( LUCENE_PAGE_CONTENTS, getLuceneAnalyzer() );
      Query luceneQuery = qp.parse(query);

      if ((flags & FLAG_CONTEXTS) != 0) {
        highlighter =
            new Highlighter(
                new SimpleHTMLFormatter("<span class=\"searchmatch\">", "</span>"),
                new SimpleHTMLEncoder(),
                new QueryScorer(luceneQuery));
      }

      try {
        File dir = new File(m_luceneDirectory);
        Directory luceneDir = new SimpleFSDirectory(dir, null);
        IndexReader reader = IndexReader.open(luceneDir);
        searcher = new IndexSearcher(reader);
      } catch (Exception ex) {
        log.info("Lucene not yet ready; indexing not started", ex);
        return null;
      }

      ScoreDoc[] hits = searcher.search(luceneQuery, MAX_SEARCH_HITS).scoreDocs;

      list = new ArrayList<SearchResult>(hits.length);
      for (int curr = 0; curr < hits.length; curr++) {
        int docID = hits[curr].doc;
        Document doc = searcher.doc(docID);
        String pageName = doc.get(LUCENE_ID);
        WikiPage page = m_engine.getPage(pageName, WikiPageProvider.LATEST_VERSION);

        if (page != null) {
          if (page instanceof Attachment) {
            // Currently attachments don't look nice on the search-results page
            // When the search-results are cleaned up this can be enabled again.
          }

          int score = (int) (hits[curr].score * 100);

          // Get highlighted search contexts
          String text = doc.get(LUCENE_PAGE_CONTENTS);

          String[] fragments = new String[0];
          if (text != null && highlighter != null) {
            TokenStream tokenStream =
                getLuceneAnalyzer().tokenStream(LUCENE_PAGE_CONTENTS, new StringReader(text));
            fragments = highlighter.getBestFragments(tokenStream, text, MAX_FRAGMENTS);
          }

          SearchResult result = new SearchResultImpl(page, score, fragments);
          list.add(result);
        } else {
          log.error(
              "Lucene found a result page '"
                  + pageName
                  + "' that could not be loaded, removing from Lucene cache");
          pageRemoved(new WikiPage(m_engine, pageName));
        }
      }
    } catch (IOException e) {
      log.error("Failed during lucene search", e);
    } catch (ParseException e) {
      log.info("Broken query; cannot parse query ", e);

      throw new ProviderException(
          "You have entered a query Lucene cannot process: " + e.getMessage());
    } catch (InvalidTokenOffsetsException e) {
      log.error("Tokens are incompatible with provided text ", e);
    } finally {
      if (searcher != null) {
        try {
          searcher.close();
        } catch (IOException e) {
          log.error(e);
        }
      }
    }

    return list;
  }
Exemple #8
0
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      System.err.println("TermDumper [-c|-v value] field <index...>");
      System.exit(1);
    }

    boolean count = false;
    String value = null;
    boolean all = false;

    int i = 0;
    for (; i < args.length; i++) {
      String arg = args[i];

      if ("-h".equals(arg) || "--help".equals(arg)) {
        System.err.println("TermDumper [-c|-v value] field <index...>");
        System.exit(1);
      } else if ("-c".equals(arg) || "--count".equals(arg)) {
        count = true;
      } else if ("-v".equals(arg) || "--vaue".equals(arg)) {
        value = args[++i];
      } else if ("-a".equals(arg) || "--all".equals(arg)) {
        all = true;
      } else {
        break;
      }
    }

    String field = args[i++];

    java.util.ArrayList<IndexReader> readers =
        new java.util.ArrayList<IndexReader>(args.length - 1);
    for (; i < args.length; i++) {
      String arg = args[i];
      try {
        IndexReader reader = IndexReader.open(new MMapDirectory(new File(arg)), true);

        readers.add(reader);
      } catch (IOException ioe) {
        System.err.println("Error reading: " + arg);
      }
    }

    for (IndexReader reader : readers) {
      TermDocs termDocs = reader.termDocs();
      TermEnum termEnum = reader.terms(new Term(field));

      try {
        do {
          Term term = termEnum.term();

          if (term == null || !field.equals(term.field())) break;

          if (value == null) {
            if (count) {
              termDocs.seek(termEnum);

              int c = 0;
              for (; termDocs.next(); c++) ;

              System.out.print(c + " ");
            }
            System.out.println(term.text());
          } else if (value.equals(term.text())) {
            termDocs.seek(termEnum);

            while (termDocs.next()) {
              if (all) {
                Document d = reader.document(termDocs.doc());
                System.out.println(termDocs.doc());
                for (Object o : d.getFields()) {
                  Field f = (Field) o;
                  System.out.println(f.name() + " " + d.get(f.name()));
                }
              } else {
                System.out.println(
                    termDocs.doc() + " " + reader.document(termDocs.doc()).get("url"));
              }
            }
          }
        } while (termEnum.next());
      } finally {
        termDocs.close();
        termEnum.close();
      }
    }
  }