/**
  * Collects the top-ranked words of a document into a plain array. Callers that want the full
  * {@code ScoreTerm} entries rather than just the words can call
  * {@link #retrieveTerms(Reader, String) retrieveTerms()} directly.
  *
  * <p>Entries are taken from the queue produced by {@code retrieveTerms} in pop order, and at
  * most {@code maxQueryTerms} of them are returned.
  *
  * @param r the source document
  * @param fieldName field passed to analyzer to use when analyzing the content
  * @return the words of the first entries popped from the queue, never more than {@code
  *     maxQueryTerms} of them
  * @throws IOException declared by this method; presumably propagated from {@code
  *     retrieveTerms} when reading {@code r} fails
  * @see #retrieveTerms(java.io.Reader, String)
  * @see #setMaxQueryTerms
  */
 public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException {
   ArrayList<Object> words = new ArrayList<>(maxQueryTerms);
   PriorityQueue<ScoreTerm> queue = retrieveTerms(r, fieldName);
   // The queue may hold more entries than the caller wants, so stop after maxQueryTerms of them.
   int remaining = maxQueryTerms;
   for (ScoreTerm term = queue.pop(); term != null && remaining-- > 0; term = queue.pop()) {
     words.add(term.word);
   }
   return words.toArray(new String[words.size()]);
 }
  /**
   * Builds {@code numberOfTests} randomized test configurations followed by one configuration
   * that is expected to fail with {@code IndexMissingException}, then calls {@code refresh()}.
   *
   * <p>For each randomized configuration a document is picked at random from {@code testDocs},
   * a field selection is optionally drawn from {@code fieldSettings}, and three random booleans
   * are passed as the remaining constructor flags.
   *
   * @param numberOfTests number of randomized configurations to create (the failing one is extra)
   * @param testDocs candidate documents; one is chosen at random per configuration
   * @param fieldSettings candidate fields; each is included in a selection with a coin flip
   * @return the randomized configurations followed by the always-failing one
   */
  protected TestConfig[] generateTestConfigs(
      int numberOfTests, TestDoc[] testDocs, TestFieldSetting[] fieldSettings) {
    ArrayList<TestConfig> generated = new ArrayList<TestConfig>();
    for (int testIndex = 0; testIndex < numberOfTests; testIndex++) {
      // null means "no explicit field selection" for this configuration
      String[] fieldSelection = null;
      if (randomBoolean()) {
        ArrayList<String> chosenNames = new ArrayList<String>();
        if (randomBoolean()) {
          // original author's note: this unknown field name will be ignored
          chosenNames.add("Doesnt_exist");
        }
        for (TestFieldSetting setting : fieldSettings) {
          if (randomBoolean()) {
            chosenNames.add(setting.name);
          }
        }
        // original author's note: a zero-length selection is not supported, so keep null then
        if (!chosenNames.isEmpty()) {
          fieldSelection = chosenNames.toArray(new String[0]);
        }
      }

      generated.add(
          new TestConfig(
              testDocs[randomInt(testDocs.length - 1)],
              fieldSelection,
              randomBoolean(),
              randomBoolean(),
              randomBoolean()));
    }

    // Always append one configuration that targets a missing index and therefore must fail.
    TestConfig failingConfig =
        new TestConfig(
                new TestDoc("doesnt_exist", new TestFieldSetting[] {}, new String[] {})
                    .index("doesn't_exist"),
                new String[] {"doesnt_exist"},
                true,
                true,
                true)
            .expectedException(IndexMissingException.class);
    generated.add(failingConfig);

    refresh();

    return generated.toArray(new TestConfig[0]);
  }
Example #3
0
  /**
   * Writes the given terms to {@code wekaFilePath} as ARFF-style text.
   *
   * <p>The output is, in order: an {@code @relation interest} line, a string attribute named
   * {@code text}, {@code maxDocNum} attributes {@code doc0 .. doc(maxDocNum-1)} declared {@code
   * real}, an {@code @data} line, and then one row per term made of the term text followed by
   * every element of its vector, comma separated. The whole text is assembled in memory first, so
   * the target file is only opened once all rows were built successfully.
   *
   * <p>NOTE(review): the term text is written without quoting or escaping, and nothing checks
   * that {@code term.vector.length} equals {@code maxDocNum}; confirm callers guarantee both.
   *
   * @param myTerms terms to export, one data row each
   * @param maxDocNum number of {@code docN} attributes to declare in the header
   * @param wekaFilePath path of the file to create or overwrite
   * @throws IOException if the file cannot be opened or the writer reports a write error
   */
  public void generateWekaFile(ArrayList<MyTerm> myTerms, int maxDocNum, String wekaFilePath)
      throws IOException {

    // StringBuilder avoids re-copying the whole document on every appended cell.
    StringBuilder text = new StringBuilder("@relation interest\n");
    text.append("@attribute text string\n");
    for (int i = 0; i < maxDocNum; i++) {
      text.append("@attribute doc").append(i).append("\treal\n");
    }
    text.append("@data\n");
    for (MyTerm term : myTerms) {
      text.append(term.originTrem.text());
      for (int i = 0; i < term.vector.length; i++) {
        text.append(',').append(term.vector[i]);
      }
      text.append('\n');
    }

    // try-with-resources closes the file even if writing fails part-way.
    try (PrintWriter out = new PrintWriter(new FileWriter(wekaFilePath))) {
      out.println(text.toString());
      // PrintWriter never throws on write failures; ask it explicitly so errors are not lost.
      if (out.checkError()) {
        throw new IOException("Failed to write Weka file: " + wekaFilePath);
      }
    }
  }
Example #4
0
  /**
   * Clusters the rows of an ARFF file and returns the key terms of every cluster as JSON.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>Loads {@code wekaFilePath} with {@code ArffLoader}. A copy with its string attributes
   *       removed is used to build an {@code EM} clusterer (wrapped in {@code
   *       MakeDensityBasedClusterer}) configured with {@code clusterNum} clusters.
   *   <li>Assigns every instance to a cluster. Attribute 0 of the original instance is read as
   *       the term text and removed before the instance is handed to the clusterer.
   *   <li>For each cluster, orders its terms with {@code MyTermCompare} and emits a JSON array:
   *       first the leading {@code size / 10 + 1} terms (single-character terms are skipped and
   *       do not count), then every remaining term whose frequency exceeds a threshold derived
   *       from the average frequency of the leading terms, flagged with {@code "extra": true}.
   * </ol>
   *
   * <p>A summary of the clusterer and of each cluster is printed to {@code System.out}. The
   * per-cluster word lists are rewritten in place to {@code "(word:freq)"} for that printout.
   *
   * @param wekaFilePath path of the ARFF file; attribute 0 is read as a string holding the term
   *     text, and every such term is looked up in {@code linkMap}
   * @param clusterNum cluster count passed to {@code EM.setNumClusters}
   * @return one inner {@code JSONArray} per cluster, each holding objects with {@code "text"},
   *     {@code "freq"} and, for threshold-selected terms, {@code "extra"}
   * @throws Exception if loading the file, building the clusterer, classifying an instance or
   *     building the JSON fails
   */
  public JSONArray Cluster(String wekaFilePath, int clusterNum) throws Exception {
    ArffLoader loader = new ArffLoader();
    loader.setFile(new File(wekaFilePath));
    Instances originIns = loader.getDataSet();
    // Cluster on a copy without string attributes; the original keeps the term text.
    Instances insTest = new Instances(originIns);
    insTest.deleteStringAttributes();
    int totalNum = insTest.numInstances();

    EM em = new EM();
    em.setNumClusters(clusterNum);
    MakeDensityBasedClusterer sm = new MakeDensityBasedClusterer();
    sm.setClusterer(em);
    sm.buildClusterer(insTest);

    System.out.println("totalNum:" + insTest.numInstances());
    System.out.println("============================");
    System.out.println(sm.toString());

    // Pre-create one bucket per requested cluster so empty clusters still appear in the output.
    Map<Integer, ArrayList<String>> result = new HashMap<Integer, ArrayList<String>>();
    for (int i = 0; i < clusterNum; i++) {
      result.put(i, new ArrayList<String>());
    }

    for (int i = 0; i < totalNum; i++) {
      Instance ins = originIns.instance(i);
      String word = ins.stringValue(0);
      Instance tempIns = new Instance(ins);
      tempIns.deleteAttributeAt(0);
      int cluster = sm.clusterInstance(tempIns);
      // The clusterer may report an index that has no pre-created bucket (e.g. a negative
      // clusterNum, which Weka's EM documents as automatic selection); create it on demand
      // instead of failing with a NullPointerException.
      ArrayList<String> bucket = result.get(cluster);
      if (bucket == null) {
        bucket = new ArrayList<String>();
        result.put(cluster, bucket);
      }
      bucket.add(word);
    }

    JSONArray keyWords = new JSONArray();
    for (Map.Entry<Integer, ArrayList<String>> entry : result.entrySet()) {
      Integer k = entry.getKey();
      ArrayList<String> words = entry.getValue();

      // Rank the cluster's terms and, for the printout, decorate each word with its frequency.
      PriorityQueue<MyTerm> clusterQueue = new PriorityQueue<MyTerm>(1, MyTermCompare);
      for (int i = 0; i < words.size(); i++) {
        String s = words.get(i);
        assert linkMap.containsKey(s);
        MyTerm term = linkMap.get(s);
        int freq = term.totalFreq;
        clusterQueue.add(term);
        words.set(i, "(" + s + ":" + freq + ")");
      }

      JSONArray clusterArray = new JSONArray();
      // Leading terms: one tenth of the cluster (integer division) plus one, so at least one.
      int num = clusterQueue.size() / 10 + 1;
      int totalFreq = 0;
      int picked = 0;
      while (picked < num && !clusterQueue.isEmpty()) {
        MyTerm myTerm = clusterQueue.poll();
        String word = myTerm.originTrem.text();
        if (word.length() == 1) {
          continue; // single-character terms are dropped and do not use up a slot
        }
        JSONObject mem = new JSONObject();
        mem.put("text", word);
        mem.put("freq", myTerm.totalFreq);
        clusterArray.put(mem);
        picked++;
        totalFreq += myTerm.totalFreq;
      }

      // Averaged over num, even when fewer than num terms were actually picked.
      double averFreq = totalFreq * 1.0 / num;

      // Extra terms: everything left in the queue is examined.
      // NOTE(review): the original loop was also guarded by "count < num", but count was never
      // incremented, so the guard had no effect; the behaviour is kept as it was. Confirm
      // whether the number of extras was meant to be capped at num.
      // NOTE(review): the original also computed an average word length that was never used,
      // while "times" divides the word length by the average *frequency*; possibly the average
      // length was intended. Behaviour is kept as it was; confirm.
      while (!clusterQueue.isEmpty()) {
        MyTerm myTerm = clusterQueue.poll();
        String word = myTerm.originTrem.text();
        int freq = myTerm.totalFreq;
        int times = (int) (word.length() / averFreq) + 1;
        if (freq > averFreq / times) {
          JSONObject mem = new JSONObject();
          mem.put("text", word);
          mem.put("freq", freq);
          mem.put("extra", true);
          clusterArray.put(mem);
        }
      }

      keyWords.put(clusterArray);
      System.out.println(
          "cluster" + k + ":" + words.size() + ":\t" + (int) (words.size() * 1.0 / totalNum * 100));
      if (words.size() < 100) {
        System.out.println(words);
      }
    }
    return keyWords;
  }