Esempio n. 1
0
 /** Get an alphabetically sorted list of field names. */
 public List<String> getFieldNames() {
   HashSet<String> names = new HashSet<String>();
   names.addAll(storedFields.keySet());
   names.addAll(reconstructedFields.keySet());
   ArrayList<String> res = new ArrayList<String>(names.size());
   res.addAll(names);
   Collections.sort(res);
   return res;
 }
Esempio n. 2
0
  public void getIndexInfo(String indexdir, int freqThreshold) {
    IndexReader reader = null;

    try {
      Directory dir = FSDirectory.open(new File(indexdir));
      System.out.println(dir);
      reader = IndexReader.open(dir);

      System.out.println("document num:" + reader.numDocs());
      System.out.println("======================");

      TermEnum terms = reader.terms();
      sortedTermQueue.clear();
      maxDocNum = reader.maxDoc();
      linkMap.clear();
      termList.clear();
      while (terms.next()) {
        // System.out.print(terms.term() + "\tDocFreq:" +
        TermDocs termDocs = reader.termDocs(terms.term());
        MyTerm temp = new MyTerm(terms.term(), termDocs, maxDocNum);
        if (temp.totalFreq < freqThreshold) {
          continue;
        } /*
           * if(temp.originTrem.text().length()==1){ continue; }
           */
        linkMap.put(temp.originTrem.text(), temp);
        sortedTermQueue.add(temp);
        termList.add(temp);
      }
      System.out.println("total Size:" + sortedTermQueue.size());
      System.out.println("mapsize:" + linkMap.keySet().size());
      // System.exit(0);
      int num = 0;
      this.maxFreq = sortedTermQueue.peek().totalFreq;
      while (!sortedTermQueue.isEmpty()) {
        num++;
        System.out.println(num + ":" + sortedTermQueue.poll());
      }
      System.out.println("read index info done");
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      try {
        reader.close();

      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
 /**
  * Convenience routine to make it easy to return the most interesting words in a document. More
  * advanced users will call {@link #retrieveTerms(Reader, String) retrieveTerms()} directly.
  *
  * @param r the source document
  * @param fieldName field passed to analyzer to use when analyzing the content
  * @return the most interesting words in the document
  * @see #retrieveTerms(java.io.Reader, String)
  * @see #setMaxQueryTerms
  */
 public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException {
   ArrayList<Object> al = new ArrayList<>(maxQueryTerms);
   PriorityQueue<ScoreTerm> pq = retrieveTerms(r, fieldName);
   ScoreTerm scoreTerm;
   int lim =
       maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably
                      // not useful to our caller...
   // we just want to return the top words
   while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
     al.add(scoreTerm.word); // the 1st entry is the interesting word
   }
   String[] res = new String[al.size()];
   return al.toArray(res);
 }
  protected TestConfig[] generateTestConfigs(
      int numberOfTests, TestDoc[] testDocs, TestFieldSetting[] fieldSettings) {
    ArrayList<TestConfig> configs = new ArrayList<TestConfig>();
    for (int i = 0; i < numberOfTests; i++) {

      ArrayList<String> selectedFields = null;
      if (randomBoolean()) {
        // used field selection
        selectedFields = new ArrayList<String>();
        if (randomBoolean()) {
          selectedFields.add("Doesnt_exist"); // this will be ignored.
        }
        for (TestFieldSetting field : fieldSettings)
          if (randomBoolean()) {
            selectedFields.add(field.name);
          }

        if (selectedFields.size() == 0) {
          selectedFields = null; // 0 length set is not supported.
        }
      }
      TestConfig config =
          new TestConfig(
              testDocs[randomInt(testDocs.length - 1)],
              selectedFields == null ? null : selectedFields.toArray(new String[] {}),
              randomBoolean(),
              randomBoolean(),
              randomBoolean());

      configs.add(config);
    }
    // always adds a test that fails
    configs.add(
        new TestConfig(
                new TestDoc("doesnt_exist", new TestFieldSetting[] {}, new String[] {})
                    .index("doesn't_exist"),
                new String[] {"doesnt_exist"},
                true,
                true,
                true)
            .expectedException(IndexMissingException.class));

    refresh();

    return configs.toArray(new TestConfig[] {});
  }
Esempio n. 5
0
  public void generateWekaFile(ArrayList<MyTerm> myTerms, int maxDocNum, String wekaFilePath)
      throws IOException {

    String text = "@relation interest\n";
    text += "@attribute text string\n";
    for (int i = 0; i < maxDocNum; i++) {
      text += "@attribute doc" + i + "\treal\n";
    }
    text += "@data\n";
    for (int j = 0; j < myTerms.size(); j++) {
      MyTerm term = myTerms.get(j);
      String line = "";
      line += term.originTrem.text();
      for (int i = 0; i < term.vector.length; i++) {
        line += "," + term.vector[i];
      }
      line += "\n";
      text += line;
    }
    // System.out.println(text);
    PrintWriter Pout = new PrintWriter(new FileWriter(wekaFilePath));
    Pout.println(text);
    Pout.close();
  }
Esempio n. 6
0
  public JSONArray Cluster(String wekaFilePath, int clusterNum) throws Exception {
    File inputFile = new File(wekaFilePath);
    ArffLoader arf = new ArffLoader();
    arf.setFile(inputFile);
    Instances originIns = arf.getDataSet();
    Instances insTest = new Instances(originIns);
    insTest.deleteStringAttributes();
    int totalNum = insTest.numInstances();

    // SimpleKMeans sm = new SimpleKMeans();
    EM em = new EM();
    em.setNumClusters(clusterNum);
    MakeDensityBasedClusterer sm = new MakeDensityBasedClusterer();
    sm.setClusterer(em);
    sm.buildClusterer(insTest);

    System.out.println("totalNum:" + insTest.numInstances());
    System.out.println("============================");
    System.out.println(sm.toString());
    Map<Integer, ArrayList<String>> result = new HashMap<Integer, ArrayList<String>>();
    for (int i = 0; i < clusterNum; i++) {
      result.put(i, new ArrayList<String>());
    }

    for (int i = 0; i < totalNum; i++) {
      Instance ins = originIns.instance(i);
      String word = ins.stringValue(0);
      Instance tempIns = new Instance(ins);
      tempIns.deleteAttributeAt(0);
      int cluster = sm.clusterInstance(tempIns);
      result.get(cluster).add(word);
    }

    // print the result
    ArrayList<String> words = new ArrayList<String>();
    JSONArray keyWords = new JSONArray();
    for (int k : result.keySet()) {
      words = result.get(k);
      PriorityQueue<MyTerm> clusterQueue = new PriorityQueue<MyTerm>(1, MyTermCompare);
      for (int i = 0; i < words.size(); i++) {
        String s = words.get(i);
        assert linkMap.containsKey(s);
        int freq = linkMap.get(s).totalFreq;
        clusterQueue.add(linkMap.get(s));
        words.set(i, "(" + s + ":" + freq + ")");
      }

      JSONArray clusterArray = new JSONArray();
      int num = clusterQueue.size() / 10 + 1; // 5%
      int totalFreq = 0;
      int totalLength = 0;
      for (int i = 0; i < num && !clusterQueue.isEmpty(); ) {
        JSONObject mem = new JSONObject();
        MyTerm myTerm = clusterQueue.poll();
        String word = myTerm.originTrem.text();
        if (word.length() == 1) {
          continue;
        }
        mem.put("text", word);
        mem.put("freq", myTerm.totalFreq);
        clusterArray.put(mem);
        i++;
        totalFreq += myTerm.totalFreq;
        totalLength += word.length();
      }

      double averFreq = totalFreq * 1.0 / num;
      double averLength = totalLength * 1.0 / num;
      int count = 0;
      while (!clusterQueue.isEmpty() && count < num) {
        MyTerm myTerm = clusterQueue.poll();
        String word = myTerm.originTrem.text();
        int freq = myTerm.totalFreq;
        int times = (int) (word.length() / averFreq) + 1;
        if (freq > averFreq / times) {
          JSONObject mem = new JSONObject();
          mem.put("text", word);
          mem.put("freq", freq);
          mem.put("extra", true);
          clusterArray.put(mem);
        }
      }

      keyWords.put(clusterArray);
      System.out.println(
          "cluster" + k + ":" + words.size() + ":\t" + (int) (words.size() * 1.0 / totalNum * 100));
      if (result.get(k).size() < 100) {
        System.out.println(result.get(k));
      }
    }
    // System.out.println("errorNum:"+errorNum);
    return keyWords;
  }
  /**
   * Searches pages using a particular combination of flags.
   *
   * @param query The query to perform in Lucene query language
   * @param flags A set of flags
   * @return A Collection of SearchResult instances
   * @throws ProviderException if there is a problem with the backend
   */
  public Collection findPages(String query, int flags) throws ProviderException {
    IndexSearcher searcher = null;
    ArrayList<SearchResult> list = null;
    Highlighter highlighter = null;

    try {
      String[] queryfields = {
        LUCENE_PAGE_CONTENTS, LUCENE_PAGE_NAME, LUCENE_AUTHOR, LUCENE_ATTACHMENTS
      };
      QueryParser qp =
          new MultiFieldQueryParser(Version.LUCENE_36, queryfields, getLuceneAnalyzer());

      // QueryParser qp = new QueryParser( LUCENE_PAGE_CONTENTS, getLuceneAnalyzer() );
      Query luceneQuery = qp.parse(query);

      if ((flags & FLAG_CONTEXTS) != 0) {
        highlighter =
            new Highlighter(
                new SimpleHTMLFormatter("<span class=\"searchmatch\">", "</span>"),
                new SimpleHTMLEncoder(),
                new QueryScorer(luceneQuery));
      }

      try {
        File dir = new File(m_luceneDirectory);
        Directory luceneDir = new SimpleFSDirectory(dir, null);
        IndexReader reader = IndexReader.open(luceneDir);
        searcher = new IndexSearcher(reader);
      } catch (Exception ex) {
        log.info("Lucene not yet ready; indexing not started", ex);
        return null;
      }

      ScoreDoc[] hits = searcher.search(luceneQuery, MAX_SEARCH_HITS).scoreDocs;

      list = new ArrayList<SearchResult>(hits.length);
      for (int curr = 0; curr < hits.length; curr++) {
        int docID = hits[curr].doc;
        Document doc = searcher.doc(docID);
        String pageName = doc.get(LUCENE_ID);
        WikiPage page = m_engine.getPage(pageName, WikiPageProvider.LATEST_VERSION);

        if (page != null) {
          if (page instanceof Attachment) {
            // Currently attachments don't look nice on the search-results page
            // When the search-results are cleaned up this can be enabled again.
          }

          int score = (int) (hits[curr].score * 100);

          // Get highlighted search contexts
          String text = doc.get(LUCENE_PAGE_CONTENTS);

          String[] fragments = new String[0];
          if (text != null && highlighter != null) {
            TokenStream tokenStream =
                getLuceneAnalyzer().tokenStream(LUCENE_PAGE_CONTENTS, new StringReader(text));
            fragments = highlighter.getBestFragments(tokenStream, text, MAX_FRAGMENTS);
          }

          SearchResult result = new SearchResultImpl(page, score, fragments);
          list.add(result);
        } else {
          log.error(
              "Lucene found a result page '"
                  + pageName
                  + "' that could not be loaded, removing from Lucene cache");
          pageRemoved(new WikiPage(m_engine, pageName));
        }
      }
    } catch (IOException e) {
      log.error("Failed during lucene search", e);
    } catch (ParseException e) {
      log.info("Broken query; cannot parse query ", e);

      throw new ProviderException(
          "You have entered a query Lucene cannot process: " + e.getMessage());
    } catch (InvalidTokenOffsetsException e) {
      log.error("Tokens are incompatible with provided text ", e);
    } finally {
      if (searcher != null) {
        try {
          searcher.close();
        } catch (IOException e) {
          log.error(e);
        }
      }
    }

    return list;
  }