/**
 * Get an alphabetically sorted list of field names.
 *
 * <p>The result is the union of the stored and reconstructed field-name sets,
 * de-duplicated and in natural (alphabetical) String order.
 */
public List<String> getFieldNames() {
  // A TreeSet de-duplicates and keeps natural (alphabetical) order in one pass,
  // replacing the HashSet + Collections.sort combination.
  TreeSet<String> sorted = new TreeSet<String>(storedFields.keySet());
  sorted.addAll(reconstructedFields.keySet());
  return new ArrayList<String>(sorted);
}
public void getIndexInfo(String indexdir, int freqThreshold) { IndexReader reader = null; try { Directory dir = FSDirectory.open(new File(indexdir)); System.out.println(dir); reader = IndexReader.open(dir); System.out.println("document num:" + reader.numDocs()); System.out.println("======================"); TermEnum terms = reader.terms(); sortedTermQueue.clear(); maxDocNum = reader.maxDoc(); linkMap.clear(); termList.clear(); while (terms.next()) { // System.out.print(terms.term() + "\tDocFreq:" + TermDocs termDocs = reader.termDocs(terms.term()); MyTerm temp = new MyTerm(terms.term(), termDocs, maxDocNum); if (temp.totalFreq < freqThreshold) { continue; } /* * if(temp.originTrem.text().length()==1){ continue; } */ linkMap.put(temp.originTrem.text(), temp); sortedTermQueue.add(temp); termList.add(temp); } System.out.println("total Size:" + sortedTermQueue.size()); System.out.println("mapsize:" + linkMap.keySet().size()); // System.exit(0); int num = 0; this.maxFreq = sortedTermQueue.peek().totalFreq; while (!sortedTermQueue.isEmpty()) { num++; System.out.println(num + ":" + sortedTermQueue.poll()); } System.out.println("read index info done"); } catch (IOException e) { e.printStackTrace(); } finally { try { reader.close(); } catch (IOException e) { e.printStackTrace(); } } }
/** * Convenience routine to make it easy to return the most interesting words in a document. More * advanced users will call {@link #retrieveTerms(Reader, String) retrieveTerms()} directly. * * @param r the source document * @param fieldName field passed to analyzer to use when analyzing the content * @return the most interesting words in the document * @see #retrieveTerms(java.io.Reader, String) * @see #setMaxQueryTerms */ public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException { ArrayList<Object> al = new ArrayList<>(maxQueryTerms); PriorityQueue<ScoreTerm> pq = retrieveTerms(r, fieldName); ScoreTerm scoreTerm; int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably // not useful to our caller... // we just want to return the top words while (((scoreTerm = pq.pop()) != null) && lim-- > 0) { al.add(scoreTerm.word); // the 1st entry is the interesting word } String[] res = new String[al.size()]; return al.toArray(res); }
protected TestConfig[] generateTestConfigs( int numberOfTests, TestDoc[] testDocs, TestFieldSetting[] fieldSettings) { ArrayList<TestConfig> configs = new ArrayList<TestConfig>(); for (int i = 0; i < numberOfTests; i++) { ArrayList<String> selectedFields = null; if (randomBoolean()) { // used field selection selectedFields = new ArrayList<String>(); if (randomBoolean()) { selectedFields.add("Doesnt_exist"); // this will be ignored. } for (TestFieldSetting field : fieldSettings) if (randomBoolean()) { selectedFields.add(field.name); } if (selectedFields.size() == 0) { selectedFields = null; // 0 length set is not supported. } } TestConfig config = new TestConfig( testDocs[randomInt(testDocs.length - 1)], selectedFields == null ? null : selectedFields.toArray(new String[] {}), randomBoolean(), randomBoolean(), randomBoolean()); configs.add(config); } // always adds a test that fails configs.add( new TestConfig( new TestDoc("doesnt_exist", new TestFieldSetting[] {}, new String[] {}) .index("doesn't_exist"), new String[] {"doesnt_exist"}, true, true, true) .expectedException(IndexMissingException.class)); refresh(); return configs.toArray(new TestConfig[] {}); }
public void generateWekaFile(ArrayList<MyTerm> myTerms, int maxDocNum, String wekaFilePath) throws IOException { String text = "@relation interest\n"; text += "@attribute text string\n"; for (int i = 0; i < maxDocNum; i++) { text += "@attribute doc" + i + "\treal\n"; } text += "@data\n"; for (int j = 0; j < myTerms.size(); j++) { MyTerm term = myTerms.get(j); String line = ""; line += term.originTrem.text(); for (int i = 0; i < term.vector.length; i++) { line += "," + term.vector[i]; } line += "\n"; text += line; } // System.out.println(text); PrintWriter Pout = new PrintWriter(new FileWriter(wekaFilePath)); Pout.println(text); Pout.close(); }
public JSONArray Cluster(String wekaFilePath, int clusterNum) throws Exception { File inputFile = new File(wekaFilePath); ArffLoader arf = new ArffLoader(); arf.setFile(inputFile); Instances originIns = arf.getDataSet(); Instances insTest = new Instances(originIns); insTest.deleteStringAttributes(); int totalNum = insTest.numInstances(); // SimpleKMeans sm = new SimpleKMeans(); EM em = new EM(); em.setNumClusters(clusterNum); MakeDensityBasedClusterer sm = new MakeDensityBasedClusterer(); sm.setClusterer(em); sm.buildClusterer(insTest); System.out.println("totalNum:" + insTest.numInstances()); System.out.println("============================"); System.out.println(sm.toString()); Map<Integer, ArrayList<String>> result = new HashMap<Integer, ArrayList<String>>(); for (int i = 0; i < clusterNum; i++) { result.put(i, new ArrayList<String>()); } for (int i = 0; i < totalNum; i++) { Instance ins = originIns.instance(i); String word = ins.stringValue(0); Instance tempIns = new Instance(ins); tempIns.deleteAttributeAt(0); int cluster = sm.clusterInstance(tempIns); result.get(cluster).add(word); } // print the result ArrayList<String> words = new ArrayList<String>(); JSONArray keyWords = new JSONArray(); for (int k : result.keySet()) { words = result.get(k); PriorityQueue<MyTerm> clusterQueue = new PriorityQueue<MyTerm>(1, MyTermCompare); for (int i = 0; i < words.size(); i++) { String s = words.get(i); assert linkMap.containsKey(s); int freq = linkMap.get(s).totalFreq; clusterQueue.add(linkMap.get(s)); words.set(i, "(" + s + ":" + freq + ")"); } JSONArray clusterArray = new JSONArray(); int num = clusterQueue.size() / 10 + 1; // 5% int totalFreq = 0; int totalLength = 0; for (int i = 0; i < num && !clusterQueue.isEmpty(); ) { JSONObject mem = new JSONObject(); MyTerm myTerm = clusterQueue.poll(); String word = myTerm.originTrem.text(); if (word.length() == 1) { continue; } mem.put("text", word); mem.put("freq", myTerm.totalFreq); clusterArray.put(mem); i++; totalFreq += 
myTerm.totalFreq; totalLength += word.length(); } double averFreq = totalFreq * 1.0 / num; double averLength = totalLength * 1.0 / num; int count = 0; while (!clusterQueue.isEmpty() && count < num) { MyTerm myTerm = clusterQueue.poll(); String word = myTerm.originTrem.text(); int freq = myTerm.totalFreq; int times = (int) (word.length() / averFreq) + 1; if (freq > averFreq / times) { JSONObject mem = new JSONObject(); mem.put("text", word); mem.put("freq", freq); mem.put("extra", true); clusterArray.put(mem); } } keyWords.put(clusterArray); System.out.println( "cluster" + k + ":" + words.size() + ":\t" + (int) (words.size() * 1.0 / totalNum * 100)); if (result.get(k).size() < 100) { System.out.println(result.get(k)); } } // System.out.println("errorNum:"+errorNum); return keyWords; }
/** * Searches pages using a particular combination of flags. * * @param query The query to perform in Lucene query language * @param flags A set of flags * @return A Collection of SearchResult instances * @throws ProviderException if there is a problem with the backend */ public Collection findPages(String query, int flags) throws ProviderException { IndexSearcher searcher = null; ArrayList<SearchResult> list = null; Highlighter highlighter = null; try { String[] queryfields = { LUCENE_PAGE_CONTENTS, LUCENE_PAGE_NAME, LUCENE_AUTHOR, LUCENE_ATTACHMENTS }; QueryParser qp = new MultiFieldQueryParser(Version.LUCENE_36, queryfields, getLuceneAnalyzer()); // QueryParser qp = new QueryParser( LUCENE_PAGE_CONTENTS, getLuceneAnalyzer() ); Query luceneQuery = qp.parse(query); if ((flags & FLAG_CONTEXTS) != 0) { highlighter = new Highlighter( new SimpleHTMLFormatter("<span class=\"searchmatch\">", "</span>"), new SimpleHTMLEncoder(), new QueryScorer(luceneQuery)); } try { File dir = new File(m_luceneDirectory); Directory luceneDir = new SimpleFSDirectory(dir, null); IndexReader reader = IndexReader.open(luceneDir); searcher = new IndexSearcher(reader); } catch (Exception ex) { log.info("Lucene not yet ready; indexing not started", ex); return null; } ScoreDoc[] hits = searcher.search(luceneQuery, MAX_SEARCH_HITS).scoreDocs; list = new ArrayList<SearchResult>(hits.length); for (int curr = 0; curr < hits.length; curr++) { int docID = hits[curr].doc; Document doc = searcher.doc(docID); String pageName = doc.get(LUCENE_ID); WikiPage page = m_engine.getPage(pageName, WikiPageProvider.LATEST_VERSION); if (page != null) { if (page instanceof Attachment) { // Currently attachments don't look nice on the search-results page // When the search-results are cleaned up this can be enabled again. 
} int score = (int) (hits[curr].score * 100); // Get highlighted search contexts String text = doc.get(LUCENE_PAGE_CONTENTS); String[] fragments = new String[0]; if (text != null && highlighter != null) { TokenStream tokenStream = getLuceneAnalyzer().tokenStream(LUCENE_PAGE_CONTENTS, new StringReader(text)); fragments = highlighter.getBestFragments(tokenStream, text, MAX_FRAGMENTS); } SearchResult result = new SearchResultImpl(page, score, fragments); list.add(result); } else { log.error( "Lucene found a result page '" + pageName + "' that could not be loaded, removing from Lucene cache"); pageRemoved(new WikiPage(m_engine, pageName)); } } } catch (IOException e) { log.error("Failed during lucene search", e); } catch (ParseException e) { log.info("Broken query; cannot parse query ", e); throw new ProviderException( "You have entered a query Lucene cannot process: " + e.getMessage()); } catch (InvalidTokenOffsetsException e) { log.error("Tokens are incompatible with provided text ", e); } finally { if (searcher != null) { try { searcher.close(); } catch (IOException e) { log.error(e); } } } return list; }