/** @return the indexes reconstructed from all documents in the reader */
public List<Index> getIndexes() {
  List<Index> indexes = new ArrayList<Index>();
  int numDocs = reader.numDocs();
  for (int i = 0; i < numDocs; i++) {
    try {
      Document document = reader.document(i);
      List<Fieldable> fields = document.getFields();
      Index index = new Index();
      for (Fieldable fieldable : fields) {
        Field field = (Field) fieldable;
        // Copy each stored field into the matching Index setter via reflection,
        // e.g. a field named "Name" is passed to Index.setName(String).
        Method m = Index.class.getDeclaredMethod("set" + field.name(), new Class[] {String.class});
        m.invoke(index, new Object[] {field.stringValue()});
      }
      indexes.add(index);
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
  return indexes;
}
public void startSearch(String searchString) throws IOException {
  try {
    Directory directory = FSDirectory.open(new File(".//Index")); // location of the index
    IndexSearcher is = new IndexSearcher(directory); // search object
    QueryParser parser =
        new QueryParser(
            Version.LUCENE_31,
            "name",
            new RussianAnalyzer(Version.LUCENE_31)); // search field + analyzer
    Query query = parser.parse(searchString); // what we are searching for
    // Run the search, limited to the top 10 documents.
    TopDocs results = is.search(query, null, 10);
    // getMaxScore() is the best score; totalHits is the number of matching documents.
    System.out.println(
        "getMaxScore()=" + results.getMaxScore() + " totalHits=" + results.totalHits);
    for (ScoreDoc hits : results.scoreDocs) { // iterate over the hits
      Document doc = is.doc(hits.doc); // fetch the stored document by its internal doc id
      for (Proposal proposal :
          proposalFacade.findPropolsalsByProduct(Long.valueOf(doc.get("recid")))) {
        proposalController.getProposalList().add(proposal);
        _log.info(
            "Proposal found: "
                + proposal.getRecid().toString()
                + ", product: "
                + doc.get("recid")
                + ", "
                + doc.get("name"));
      }
    }
    directory.close();
  } catch (ParseException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
  addMessage("Search completed");
}
public SearchResult search(String field, String query) {
  SearchResult searchResult = new SearchResult();
  try {
    Analyzer analyzer = new StandardAnalyzer();
    QueryParser queryParser = new QueryParser(field, analyzer);
    Query q = queryParser.parse(query);
    long start = System.currentTimeMillis();
    IndexSearcher searcher = getSearcher();
    TopDocs hits = searcher.search(q, 50);
    searchResult.setTotalHits(hits.totalHits);
    long end = System.currentTimeMillis();
    searchResult.setTime(end - start);
    System.err.println(
        "Found " + hits.totalHits + " document(s) (in " + (end - start)
            + " milliseconds) that matched query '" + q + "':");
    for (ScoreDoc scoreDoc : hits.scoreDocs) {
      Document doc = searcher.doc(scoreDoc.doc);
      ResultDocument document = new ResultDocument();
      document.setFullpath("\"" + doc.get("fullpath").replace("\"", "") + "\"");
      document.setFilename("\"" + doc.get("filename").replace("\"", "") + "\"");
      document.setTeaser("\"" + doc.get("teaser") + "\"");
      searchResult.addDocumnent(document);
    }
    close();
  } catch (Exception e) {
    e.printStackTrace();
  }
  return searchResult;
}
public void testMoreThan32ProhibitedClauses() throws Exception {
  final Directory d = newDirectory();
  final RandomIndexWriter w = new RandomIndexWriter(random(), d);
  Document doc = new Document();
  doc.add(
      new TextField(
          "field",
          "0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33",
          Field.Store.NO));
  w.addDocument(doc);
  doc = new Document();
  doc.add(new TextField("field", "33", Field.Store.NO));
  w.addDocument(doc);
  final IndexReader r = w.getReader();
  w.close();
  final IndexSearcher s = newSearcher(r);
  final BooleanQuery q = new BooleanQuery();
  for (int term = 0; term < 33; term++) {
    q.add(
        new BooleanClause(
            new TermQuery(new Term("field", "" + term)), BooleanClause.Occur.MUST_NOT));
  }
  q.add(new BooleanClause(new TermQuery(new Term("field", "33")), BooleanClause.Occur.SHOULD));
  final int[] count = new int[1];
  s.search(
      q,
      new Collector() {
        private Scorer scorer;

        @Override
        public void setScorer(Scorer scorer) {
          // Make sure we got BooleanScorer:
          this.scorer = scorer;
          assertEquals(
              "Scorer is implemented by wrong class",
              BooleanScorer.class.getName() + "$BucketScorer",
              scorer.getClass().getName());
        }

        @Override
        public void collect(int doc) {
          count[0]++;
        }

        @Override
        public void setNextReader(AtomicReaderContext context) {}

        @Override
        public boolean acceptsDocsOutOfOrder() {
          return true;
        }
      });
  assertEquals(1, count[0]);
  r.close();
  d.close();
}
@Test
public void testSimpleMapper() throws Exception {
  DocumentMapperParser mapperParser = MapperTests.newParser();
  DocumentMapper docMapper =
      doc(
              "test",
              rootObject("person")
                  .add(object("name").add(stringField("first").store(YES).index(Field.Index.NO))))
          .build(mapperParser);
  BytesReference json =
      new BytesArray(
          copyToBytesFromClasspath("/org/elasticsearch/test/unit/index/mapper/simple/test1.json"));
  Document doc = docMapper.parse("person", "1", json).rootDoc();
  assertThat((double) doc.getBoost(), closeTo(3.7, 0.01));
  assertThat(
      doc.get(docMapper.mappers().name("first").mapper().names().indexName()), equalTo("shay"));
  assertThat(docMapper.mappers().name("first").mapper().names().fullName(), equalTo("name.first"));
  // System.out.println("Document: " + doc);
  // System.out.println("Json: " + docMapper.sourceMapper().value(doc));
  doc = docMapper.parse(json).rootDoc();
  // System.out.println("Document: " + doc);
  // System.out.println("Json: " + docMapper.sourceMapper().value(doc));
}
@Override
public void index(List<AgeObject> aol) {
  try {
    IndexWriter iWriter =
        new IndexWriter(
            index, analyzer, objectList == null, IndexWriter.MaxFieldLength.UNLIMITED);

    if (objectList == null) objectList = aol;
    else objectList.addAll(aol);

    for (AgeObject ao : aol) {
      Document doc = new Document();

      for (TextFieldExtractor tfe : extractors)
        doc.add(
            new Field(
                tfe.getName(),
                tfe.getExtractor().getValue(ao),
                Field.Store.NO,
                Field.Index.ANALYZED));

      iWriter.addDocument(doc);
    }

    iWriter.close();

    defaultFieldName = extractors.iterator().next().getName();
  } catch (CorruptIndexException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
  } catch (IOException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
  }
}
public ImageSearchHits search(Document doc, IndexReader reader) throws IOException {
  ScalableColor sc = null;
  ColorLayout cl = null;
  EdgeHistogram eh = null;

  String[] cls = doc.getValues(DocumentBuilder.FIELD_NAME_COLORLAYOUT);
  if (cls != null && cls.length > 0) {
    cl = new ColorLayout();
    cl.setStringRepresentation(cls[0]);
  }
  String[] scs = doc.getValues(DocumentBuilder.FIELD_NAME_SCALABLECOLOR);
  if (scs != null && scs.length > 0) {
    sc = new ScalableColor();
    sc.setStringRepresentation(scs[0]);
  }
  String[] ehs = doc.getValues(DocumentBuilder.FIELD_NAME_EDGEHISTOGRAM);
  if (ehs != null && ehs.length > 0) {
    eh = new EdgeHistogram();
    eh.setStringRepresentation(ehs[0]);
  }

  float maxDistance = findSimilar(reader, cl, sc, eh);
  return new SimpleImageSearchHits(this.docs, maxDistance);
}
public void queryIndex() {
  Query q;
  try {
    q = new MultiFieldQueryParser(new String[] {"title", "name"}, analyzer).parse("s*");

    // searching ...
    int hitsPerPage = 10;
    IndexSearcher searcher = new IndexSearcher(indexDirectory);
    TopDocCollector collector = new TopDocCollector(hitsPerPage);
    searcher.search(q, collector);
    ScoreDoc[] hits = collector.topDocs().scoreDocs;

    // output results
    System.out.println("Found " + hits.length + " hits.");
    for (int i = 0; i < hits.length; ++i) {
      int docId = hits[i].doc;
      Document d = searcher.doc(docId);
      System.out.println((i + 1) + ". " + d.get("name") + ": " + d.get("title"));
    }
  } catch (ParseException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
  } catch (CorruptIndexException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
  } catch (IOException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
  }
}
@Override
public AddResponse add(Collection<InputDocument> inputDocuments) {
  try {
    if (logger.isDebugEnabled()) {
      logger.debug("adding documents...");
    }
    for (InputDocument inputDocument : inputDocuments) {
      assertIdExist(inputDocument);
    }
    for (Document document : DocumentTransformUtil.toLuceneDocuments(inputDocuments, schema)) {
      indexWriter.updateDocument(
          new Term(schema.getIdName(), document.getFieldable(schema.getIdName()).stringValue()),
          document,
          schema.getAnalyzer());
    }
    updateCount.addAndGet(inputDocuments.size());
    if (logger.isDebugEnabled()) {
      logger.debug("add documents finish.");
    }
  } catch (Exception e) {
    logger.error("add documents error", e);
    return new AddResponse(e.getMessage(), ResultCodes.COMMON_ERROR);
  }
  return new AddResponse();
}
public void createIndex() {
  loadTweets("datasets/sentiment-short.csv", 100);
  directory = new RAMDirectory();
  try {
    IndexWriter writer = getWriter();
    for (int i = 0; i < tweets.size(); i++) {
      Document doc = new Document();
      doc.add(
          new Field(
              "tweet",
              tweets.get(i).getText(),
              Field.Store.YES,
              Field.Index.ANALYZED,
              TermVector.YES));
      writer.addDocument(doc);
    }
    System.out.println("Docs: " + writer.numDocs());
    writer.close();
  } catch (Exception e) {
    e.printStackTrace();
  }
}
public String searchIndexDocTextReturn(String searchString, String termString) {
  System.out.println("Searching for '" + searchString + "'");
  // Directory directory = FSDirectory.getDirectory();
  IndexReader indexReader;
  try {
    indexReader = IndexReader.open(indexDirectory);
    IndexSearcher indexSearcher = new IndexSearcher(indexReader);
    Term term = new Term(termString, searchString);
    TermQuery query = new TermQuery(term);
    TopDocs topDocs = indexSearcher.search(query, 10);
    if (topDocs.scoreDocs.length > 0) {
      // Keep only the text of the best-scoring document.
      int docID = topDocs.scoreDocs[0].doc;
      Document doc = indexSearcher.doc(docID);
      textOfURL = doc.get("text");
      // sourceCodeOfURL = doc.get("html");
      this.docId = docID;
    }
  } catch (CorruptIndexException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
  } catch (IOException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
  }
  return textOfURL;
}
public void createRandomTerms(int nDocs, int nTerms, double power, Directory dir)
    throws Exception {
  int[] freq = new int[nTerms];
  terms = new Term[nTerms];
  for (int i = 0; i < nTerms; i++) {
    int f = (nTerms + 1) - i; // make first terms less frequent
    freq[i] = (int) Math.ceil(Math.pow(f, power));
    terms[i] = new Term("f", Character.toString((char) ('A' + i)));
  }
  IndexWriter iw =
      new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
  for (int i = 0; i < nDocs; i++) {
    Document d = new Document();
    for (int j = 0; j < nTerms; j++) {
      if (r.nextInt(freq[j]) == 0) {
        d.add(new Field("f", terms[j].text(), Field.Store.NO, Field.Index.NOT_ANALYZED));
        // System.out.println(d);
      }
    }
    iw.addDocument(d);
  }
  iw.optimize();
  iw.close();
}
@Override
public Document document(int docid) throws IOException {
  if (_subReaders != null) {
    int readerIndex = readerIndex(docid, _starts, _subReaders.length);
    BoboIndexReader subReader = _subReaders[readerIndex];
    return subReader.document(docid - _starts[readerIndex]);
  } else {
    Document doc = super.document(docid);
    Collection<FacetHandler<?>> facetHandlers = _facetHandlerMap.values();
    for (FacetHandler<?> facetHandler : facetHandlers) {
      String[] vals = facetHandler.getFieldValues(this, docid);
      if (vals != null) {
        String[] values = doc.getValues(facetHandler.getName());
        Set<String> storedVals = new HashSet<String>(Arrays.asList(values));
        for (String val : vals) {
          storedVals.add(val);
        }
        doc.removeField(facetHandler.getName());
        for (String val : storedVals) {
          doc.add(
              new Field(facetHandler.getName(), val, Field.Store.NO, Field.Index.NOT_ANALYZED));
        }
      }
    }
    return doc;
  }
}
@Test
public void testFuzzyQuery() throws Exception {
  Analyzer analyzer = new MockAnalyzer(random());
  RandomIndexWriter iw =
      new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
  Document document = new Document();
  document.add(new SuggestField("suggest_field", "suggestion", 2));
  document.add(new SuggestField("suggest_field", "suaggestion", 4));
  document.add(new SuggestField("suggest_field", "ssuggestion", 1));
  iw.addDocument(document);
  document = new Document();
  document.add(new SuggestField("suggest_field", "sugfoo", 1));
  iw.addDocument(document);
  if (rarely()) {
    iw.commit();
  }
  DirectoryReader reader = iw.getReader();
  SuggestIndexSearcher suggestIndexSearcher = new SuggestIndexSearcher(reader);
  CompletionQuery query = new FuzzyCompletionQuery(analyzer, new Term("suggest_field", "sugg"));
  TopSuggestDocs suggest = suggestIndexSearcher.suggest(query, 4);
  assertSuggestions(
      suggest,
      new Entry("suaggestion", 4 * 2),
      new Entry("suggestion", 2 * 3),
      new Entry("sugfoo", 1 * 3),
      new Entry("ssuggestion", 1 * 1));
  reader.close();
  iw.close();
}
// Verifies no *.nrm exists when all fields omit norms:
public void testNoNrmFile() throws Throwable {
  Directory ram = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random());
  IndexWriter writer =
      new IndexWriter(
          ram,
          newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer)
              .setMaxBufferedDocs(3)
              .setMergePolicy(newLogMergePolicy()));
  LogMergePolicy lmp = (LogMergePolicy) writer.getConfig().getMergePolicy();
  lmp.setMergeFactor(2);
  lmp.setNoCFSRatio(0.0);
  Document d = new Document();

  FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
  customType.setOmitNorms(true);
  Field f1 = newField("f1", "This field has no norms", customType);
  d.add(f1);

  for (int i = 0; i < 30; i++) {
    writer.addDocument(d);
  }

  writer.commit();

  assertNoNrm(ram);

  // force merge
  writer.forceMerge(1);
  // flush
  writer.close();

  assertNoNrm(ram);
  ram.close();
}
/**
 * Give the id list of matching sentences from the Lucene index.
 *
 * @param input input word
 * @param catalogName catalog (domain) name which we'd like to search in
 * @param limit how many hits are needed (0 means all)
 */
public List<String> query(String input, String catalogName, int limit) {
  List<String> res = new ArrayList<String>();
  try {
    catalog c = catalogs.get(catalogName);
    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(c.indexPath)));
    IndexSearcher searcher = new IndexSearcher(reader);
    QueryParser parser = new QueryParser("contents", analyzer);
    Query query = parser.parse(QueryParser.escape(input));
    int n = limit > 0 ? limit : searcher.count(query);
    if (n == 0) n = 1;
    TopDocs results = searcher.search(query, n);
    int endPos = limit;
    if (limit != 0) endPos = Math.min(results.totalHits, limit); // first n hits
    else endPos = results.totalHits; // all hits
    for (int i = 0; i < endPos; i++) {
      int id = results.scoreDocs[i].doc;
      Document doc = searcher.doc(id);
      res.add(doc.get("filename"));
    }
    reader.close();
    return res;
  } catch (ParseException e) {
    log(e.getMessage());
  } catch (IOException e) {
    log(e.getMessage());
  }
  return res;
}
/**
 * Find words for a more-like-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
  Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
  for (String fieldName : fieldNames) {
    final Fields vectors = ir.getTermVectors(docNum);
    final Terms vector;
    if (vectors != null) {
      vector = vectors.terms(fieldName);
    } else {
      vector = null;
    }

    // field does not store term vector info
    if (vector == null) {
      Document d = ir.document(docNum);
      IndexableField[] fields = d.getFields(fieldName);
      for (IndexableField field : fields) {
        final String stringValue = field.stringValue();
        if (stringValue != null) {
          addTermFrequencies(new StringReader(stringValue), field2termFreqMap, fieldName);
        }
      }
    } else {
      addTermFrequencies(field2termFreqMap, vector, fieldName);
    }
  }
  return createQueue(field2termFreqMap);
}
@SuppressWarnings("unchecked") @Override public List<String> getClusterByCarrot2(String query) { // TODO Auto-generated method stub List<String> strs = new ArrayList<String>(); final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class); final List<org.carrot2.core.Document> documents = Lists.newArrayList(); try { q = getParser().parse(QueryParserUtil.escape(query)); docs = getIndexSearcher().search(q, Integer.MAX_VALUE); hits = docs.scoreDocs; for (int i = 0; i < hits.length; i++) { Document doc = getIndexSearcher().doc(hits[i].doc); documents.add( new org.carrot2.core.Document( doc.get(CONTENTS_FIELD), doc.get(TITLE_FIELD), doc.get(USER_FIELD))); } final ProcessingResult byTopicClusters = controller.process(documents, query, LingoClusteringAlgorithm.class); final List<Cluster> clustersByTopic = byTopicClusters.getClusters(); final ProcessingResult byDomainClusters = controller.process(documents, query, ByUrlClusteringAlgorithm.class); final List<Cluster> clustersByDomain = byDomainClusters.getClusters(); for (Cluster c : clustersByDomain) { strs.add(c.getLabel()); } for (Cluster c : clustersByTopic) { strs.add(c.getLabel()); } } catch (Exception ex) { } return strs; }
public void testFarsiRangeFilterCollating(
    Analyzer analyzer, String firstBeg, String firstEnd, String secondBeg, String secondEnd)
    throws Exception {
  Directory dir = newDirectory();
  IndexWriter writer =
      new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
  Document doc = new Document();
  doc.add(new Field("content", "\u0633\u0627\u0628", Field.Store.YES, Field.Index.ANALYZED));
  doc.add(new Field("body", "body", Field.Store.YES, Field.Index.NOT_ANALYZED));
  writer.addDocument(doc);
  writer.close();
  IndexReader reader = IndexReader.open(dir);
  IndexSearcher searcher = new IndexSearcher(reader);
  Query query = new TermQuery(new Term("body", "body"));

  // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
  // orders the U+0698 character before the U+0633 character, so the single
  // index Term below should NOT be returned by a TermRangeFilter with a Farsi
  // Collator (or an Arabic one for the case when Farsi searcher not
  // supported).
  ScoreDoc[] result =
      searcher.search(query, new TermRangeFilter("content", firstBeg, firstEnd, true, true), 1)
          .scoreDocs;
  assertEquals("The index Term should not be included.", 0, result.length);

  result =
      searcher.search(query, new TermRangeFilter("content", secondBeg, secondEnd, true, true), 1)
          .scoreDocs;
  assertEquals("The index Term should be included.", 1, result.length);

  searcher.close();
  reader.close();
  dir.close();
}
public void testSpanNot() throws Exception {
  SpanQuery[] clauses = new SpanQuery[2];
  clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "one"));
  clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "three"));
  SpanQuery spq = new SpanNearQuery(clauses, 5, true);
  SpanNotQuery snq =
      new SpanNotQuery(spq, new SpanTermQuery(new Term(PayloadHelper.FIELD, "two")));

  Directory directory = newDirectory();
  RandomIndexWriter writer =
      new RandomIndexWriter(
          random(),
          directory,
          newIndexWriterConfig(new PayloadAnalyzer()).setSimilarity(similarity));
  Document doc = new Document();
  doc.add(newTextField(PayloadHelper.FIELD, "one two three one four three", Field.Store.YES));
  writer.addDocument(doc);
  IndexReader reader = writer.getReader();
  writer.close();

  checkSpans(MultiSpansWrapper.wrap(reader, snq, SpanWeight.Postings.PAYLOADS), 1, new int[] {2});
  reader.close();
  directory.close();
}
/** Test that core cache key (needed for NRT) is working */
public void testCoreCacheKey() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = new IndexWriterConfig(null);
  iwc.setMaxBufferedDocs(100);
  iwc.setMergePolicy(NoMergePolicy.INSTANCE);
  IndexWriter iw = new IndexWriter(dir, iwc);

  // add two docs, id:0 and id:1
  Document doc = new Document();
  Field idField = new StringField("id", "", Field.Store.NO);
  doc.add(idField);
  idField.setStringValue("0");
  iw.addDocument(doc);
  idField.setStringValue("1");
  iw.addDocument(doc);

  // open reader
  ShardId shardId = new ShardId("fake", "_na_", 1);
  DirectoryReader ir = ElasticsearchDirectoryReader.wrap(DirectoryReader.open(iw, true), shardId);
  assertEquals(2, ir.numDocs());
  assertEquals(1, ir.leaves().size());

  // delete id:0 and reopen
  iw.deleteDocuments(new Term("id", "0"));
  DirectoryReader ir2 = DirectoryReader.openIfChanged(ir);

  // we should have the same cache key as before
  assertEquals(1, ir2.numDocs());
  assertEquals(1, ir2.leaves().size());
  assertSame(
      ir.leaves().get(0).reader().getCoreCacheKey(),
      ir2.leaves().get(0).reader().getCoreCacheKey());

  IOUtils.close(ir, ir2, iw, dir);
}
private IndexSearcher getSearcher() throws Exception {
  directory = newDirectory();
  String[] docs =
      new String[] {
        "xx rr yy mm pp",
        "xx yy mm rr pp",
        "nopayload qq ss pp np",
        "one two three four five six seven eight nine ten eleven",
        "nine one two three four five six seven eight eleven ten"
      };
  RandomIndexWriter writer =
      new RandomIndexWriter(
          random(),
          directory,
          newIndexWriterConfig(new PayloadAnalyzer()).setSimilarity(similarity));

  Document doc = null;
  for (int i = 0; i < docs.length; i++) {
    doc = new Document();
    String docText = docs[i];
    doc.add(newTextField(PayloadHelper.FIELD, docText, Field.Store.YES));
    writer.addDocument(doc);
  }

  closeIndexReader = writer.getReader();
  writer.close();
  IndexSearcher searcher = newSearcher(closeIndexReader);
  return searcher;
}
public void testMethod() throws Exception {
  Directory directory = newDirectory();
  String[] values = new String[] {"1", "2", "3", "4"};

  RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
  for (int i = 0; i < values.length; i++) {
    Document doc = new Document();
    doc.add(newStringField(FIELD, values[i], Field.Store.YES));
    writer.addDocument(doc);
  }
  IndexReader ir = writer.getReader();
  writer.close();

  BooleanQuery booleanQuery1 = new BooleanQuery();
  booleanQuery1.add(new TermQuery(new Term(FIELD, "1")), BooleanClause.Occur.SHOULD);
  booleanQuery1.add(new TermQuery(new Term(FIELD, "2")), BooleanClause.Occur.SHOULD);

  BooleanQuery query = new BooleanQuery();
  query.add(booleanQuery1, BooleanClause.Occur.MUST);
  query.add(new TermQuery(new Term(FIELD, "9")), BooleanClause.Occur.MUST_NOT);

  IndexSearcher indexSearcher = newSearcher(ir);
  ScoreDoc[] hits = indexSearcher.search(query, null, 1000).scoreDocs;
  assertEquals("Number of matched documents", 2, hits.length);
  ir.close();
  directory.close();
}
private static void index_h(String prefix, File file, IndexWriter indexWriter)
    throws IOException {
  Document doc = null;
  if (file.isDirectory()) {
    File[] files = file.listFiles();
    for (File file1 : files) {
      index_h(prefix + FILE_SEPARATOR + file.getName(), file1, indexWriter);
    }
  } else {
    String content = FileUtils.readFileToString(file, "utf-8");

    System.out.println("==============================================================");
    System.out.println("index_h " + content);
    System.out.println("==============================================================");

    String filename = prefix + FILE_SEPARATOR + file.getName();
    String path = file.getAbsolutePath();

    doc = new Document();
    doc.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("relative_path", filename, Field.Store.YES, Field.Index.NOT_ANALYZED));
    indexWriter.addDocument(doc);
  }
}
private void dumpDocument(int docNum, Document doc) throws IOException {
  outputLn();
  outputLn("Document " + docNum);
  if (doc == null) {
    outputLn(" deleted");
    return;
  }

  // note: only stored fields will be returned
  for (Fieldable field : doc.getFields()) {
    String fieldName = field.name();

    boolean isDate = "l.date".equals(fieldName);

    outputLn(" Field [" + fieldName + "]: " + field.toString());
    String[] values = doc.getValues(fieldName);
    if (values != null) {
      int i = 0;
      for (String value : values) {
        output(" " + "(" + i++ + ") " + value);
        if (isDate) {
          try {
            Date date = DateTools.stringToDate(value);
            output(" (" + date.toString() + " (" + date.getTime() + "))");
          } catch (java.text.ParseException e) {
            assert false;
          }
        }
        outputLn();
      }
    }
  }
}
private static Map<String, List<String>> generate_result(Directory directory) {
  Map<String, List<String>> result_map = new HashMap<String, List<String>>();
  try {
    IndexReader reader = IndexReader.open(directory);
    TermEnum termEnum = reader.terms();
    while (termEnum.next()) {
      String termEnumString = termEnum.term().toString();
      if (termEnumString.startsWith("content:")) {
        String term = termEnumString.substring(termEnumString.lastIndexOf(":") + 1);
        TermDocs termDocs = reader.termDocs(termEnum.term());
        while (termDocs.next()) {
          Document doc = reader.document(termDocs.doc());
          String relative_path = doc.get("relative_path");
          // Create the entry on first sight, then always record the term so the
          // first term seen for a path is not silently dropped.
          if (!result_map.containsKey(relative_path)) {
            result_map.put(relative_path, new ArrayList<String>());
          }
          result_map.get(relative_path).add(term + termDocs.freq());
        }
      }
    }
  } catch (IOException e) {
    e.printStackTrace();
  }
  return result_map;
}
private static void search(IndexSearcher searcher, Query query, IndexWriter out, String field)
    throws IOException {
  /* Carlos's hack: search once to learn the total hit count, then search again to fetch all hits */
  int hitsPerPage = 1;
  System.out.println("Query: " + query);
  TopDocs results = searcher.search(query, hitsPerPage);
  int numTotalHits = results.totalHits;
  if (numTotalHits > 0) results = searcher.search(query, numTotalHits);
  ScoreDoc[] hits = results.scoreDocs;
  /* End hack */
  for (int i = 0; i < numTotalHits; i++) {
    Document doc = searcher.doc(hits[i].doc);
    // System.out.println("Title: " + doc.get("title"));
    if (field != null) {
      System.out.println(hits[i].doc + "\t" + hits[i].score + "\t" + doc.get(field));
    }
    if (out != null) {
      out.addDocument(doc);
    }
  }
  System.out.println("Results: " + numTotalHits);
}
// TODO: randomize
public IndexSearcher setUp(Random random, Similarity similarity, int numDocs) throws IOException {
  Directory directory = new MockDirectoryWrapper(random, new RAMDirectory());
  PayloadAnalyzer analyzer = new PayloadAnalyzer();
  IndexWriter writer =
      new IndexWriter(
          directory,
          new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer).setSimilarity(similarity));
  // writer.infoStream = System.out;
  for (int i = 0; i < numDocs; i++) {
    Document doc = new Document();
    doc.add(new Field(FIELD, English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED));
    doc.add(
        new Field(
            MULTI_FIELD,
            English.intToEnglish(i) + " " + English.intToEnglish(i),
            Field.Store.YES,
            Field.Index.ANALYZED));
    doc.add(
        new Field(
            NO_PAYLOAD_FIELD, English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(doc);
  }
  reader = IndexReader.open(writer, true);
  writer.close();

  IndexSearcher searcher = LuceneTestCase.newSearcher(reader);
  searcher.setSimilarity(similarity);
  return searcher;
}
public Document createDocument(BufferedImage image, String identifier) {
  assert (image != null);
  BufferedImage bimg = image;
  // Scaling the image is especially important for the correlogram features!
  // All images are scaled to guarantee a certain upper limit for indexing.
  if (Math.max(image.getHeight(), image.getWidth()) > MAX_IMAGE_DIMENSION) {
    bimg = ImageUtils.scaleImage(image, MAX_IMAGE_DIMENSION);
  }
  Document doc = null;
  logger.finer("Starting extraction from image [CEDD - fast].");
  CEDD vd = new CEDD();
  vd.extract(bimg);
  logger.fine("Extraction finished [CEDD - fast].");
  doc = new Document();
  doc.add(new Field(DocumentBuilder.FIELD_NAME_CEDD, vd.getByteArrayRepresentation()));
  if (identifier != null)
    doc.add(
        new Field(
            DocumentBuilder.FIELD_NAME_IDENTIFIER,
            identifier,
            Field.Store.YES,
            Field.Index.NOT_ANALYZED));
  return doc;
}
public List<Document> searchDocuments(String text) {
  List<Document> documents = new ArrayList<Document>();
  try {
    TokenStream tokenStream = analyzer.tokenStream("text", text);
    CharTermAttribute charTermAtt = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    BooleanQuery bQuery = new BooleanQuery();
    while (tokenStream.incrementToken()) {
      String token = charTermAtt.toString();
      TermQuery tq = new TermQuery(new Term("text", token));
      tq.setBoost(2f);
      bQuery.add(tq, Occur.MUST);
    }
    tokenStream.close();

    TopDocs results = searcher.search(bQuery, 100000);
    ScoreDoc[] hits = results.scoreDocs;
    for (ScoreDoc hit : hits) {
      Document doc = searcher.doc(hit.doc);
      doc.add(new FloatField("score", hit.score, FloatField.TYPE_STORED));
      documents.add(doc);
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
  return documents;
}