public SearchItem toResult(int documentId) throws IOException {
  Document document = searcher.doc(documentId);
  String type = document.getFieldable(FieldNames.TYPE).stringValue();
  NumericField date = (NumericField) document.getFieldable(FieldNames.DATE);
  Fieldable path = document.getFieldable(FieldNames.PATH);
  NumericField version = (NumericField) document.getFieldable(FieldNames.VERSION);
  return new SearchItem(
      Integer.parseInt(type),
      path.stringValue(),
      (version != null) ? version.getNumericValue().intValue() : -1,
      new Date(date.getNumericValue().longValue()));
}
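// Hedged usage sketch (not part of the original source): assumes the same Lucene 3.x
// IndexSearcher field named "searcher" and the toResult(int) mapper above; the method
// name, the Query argument, and the result list are illustrative assumptions only.
public List<SearchItem> search(Query query, int maxHits) throws IOException {
  TopDocs topDocs = searcher.search(query, maxHits);
  List<SearchItem> items = new ArrayList<SearchItem>(topDocs.scoreDocs.length);
  for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
    // Each hit is converted to a SearchItem via the field lookups shown above.
    items.add(toResult(scoreDoc.doc));
  }
  return items;
}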
@Override
public AddResponse add(Collection<InputDocument> inputDocuments) {
  try {
    if (logger.isDebugEnabled()) {
      logger.debug("adding documents...");
    }
    for (InputDocument inputDocument : inputDocuments) {
      assertIdExist(inputDocument);
    }
    for (Document document : DocumentTransformUtil.toLuceneDocuments(inputDocuments, schema)) {
      indexWriter.updateDocument(
          new Term(schema.getIdName(), document.getFieldable(schema.getIdName()).stringValue()),
          document,
          schema.getAnalyzer());
    }
    updateCount.addAndGet(inputDocuments.size());
    if (logger.isDebugEnabled()) {
      logger.debug("add documents finished.");
    }
  } catch (Exception e) {
    logger.error("add documents error", e);
    return new AddResponse(e.getMessage(), ResultCodes.COMMON_ERROR);
  }
  return new AddResponse();
}
/**
 * Ensures the content and content blob fields are loaded lazily.
 *
 * @throws Exception in case the test fails
 */
public void testLazyContentFields() throws Exception {
  echo("Testing lazy status of content fields in search index");
  String fileName = "/sites/default/test/master.pdf";
  CmsSearchIndex searchIndex = OpenCms.getSearchManager().getIndex(INDEX_SPECIAL);
  Document doc = searchIndex.getDocument(CmsSearchField.FIELD_PATH, fileName);
  assertNotNull("Document '" + fileName + "' not found", doc);
  assertNotNull("No 'content' field available", doc.getFieldable(CmsSearchField.FIELD_CONTENT));
  assertTrue("Content field not lazy", doc.getFieldable(CmsSearchField.FIELD_CONTENT).isLazy());
  assertNotNull(
      "No 'content blob' field available", doc.getFieldable(CmsSearchField.FIELD_CONTENT_BLOB));
  assertTrue(
      "Content blob field not lazy", doc.getFieldable(CmsSearchField.FIELD_CONTENT_BLOB).isLazy());
}
private byte[] extractSource(Document doc, DocumentMapper documentMapper) {
  byte[] source = null;
  Fieldable sourceField = doc.getFieldable(documentMapper.sourceMapper().names().indexName());
  if (sourceField != null) {
    source = documentMapper.sourceMapper().nativeValue(sourceField);
    doc.removeField(documentMapper.sourceMapper().names().indexName());
  }
  return source;
}
@Override
public AddResponse add(InputDocument inputDocument) {
  try {
    if (logger.isDebugEnabled()) {
      logger.debug("adding document...");
    }
    assertIdExist(inputDocument);
    Document document = DocumentTransformUtil.toLuceneDocument(inputDocument, schema);
    indexWriter.updateDocument(
        new Term(schema.getIdName(), document.getFieldable(schema.getIdName()).stringValue()),
        document,
        schema.getAnalyzer());
    updateCount.incrementAndGet();
    if (logger.isDebugEnabled()) {
      logger.debug("add document finished.");
    }
  } catch (IOException e) {
    // Log the failure before returning the error response, mirroring the collection variant.
    logger.error("add document error", e);
    return new AddResponse(e.getMessage(), ResultCodes.COMMON_ERROR);
  }
  return new AddResponse();
}
protected Taxon[] findTaxon(String fieldName, String fieldValue) throws IOException {
  Taxon[] terms = new TaxonImpl[0];
  if (StringUtils.isNotBlank(fieldValue) && indexSearcher != null) {
    PhraseQuery query = new PhraseQuery();
    query.add(new Term(fieldName, fieldValue));
    int maxHits = 3;
    TopDocs docs = indexSearcher.search(query, maxHits);
    if (docs.totalHits > 0) {
      // Size the result array by the hits actually returned, not totalHits,
      // so it never contains trailing null entries.
      int hits = Math.min(docs.totalHits, maxHits);
      terms = new TaxonImpl[hits];
      for (int i = 0; i < hits; i++) {
        ScoreDoc scoreDoc = docs.scoreDocs[i];
        Document foundDoc = indexSearcher.doc(scoreDoc.doc);
        Taxon term = new TaxonImpl();
        Fieldable idField = foundDoc.getFieldable(FIELD_ID);
        if (idField != null) {
          term.setExternalId(idField.stringValue());
        }
        Fieldable rankPathField = foundDoc.getFieldable(FIELD_RANK_PATH);
        if (rankPathField != null) {
          term.setPath(rankPathField.stringValue());
        }
        Fieldable rankPathIdsField = foundDoc.getFieldable(FIELD_RANK_PATH_IDS);
        if (rankPathIdsField != null) {
          term.setPathIds(rankPathIdsField.stringValue());
        }
        Fieldable rankPathNamesField = foundDoc.getFieldable(FIELD_RANK_PATH_NAMES);
        if (rankPathNamesField != null) {
          term.setPathNames(rankPathNamesField.stringValue());
        }
        Fieldable commonNamesField = foundDoc.getFieldable(FIELD_COMMON_NAMES);
        if (commonNamesField != null) {
          term.setCommonNames(commonNamesField.stringValue());
        }
        Fieldable recommendedNameField = foundDoc.getFieldable(FIELD_RECOMMENDED_NAME);
        if (recommendedNameField != null) {
          term.setName(recommendedNameField.stringValue());
        }
        terms[i] = term;
      }
    }
  }
  return terms;
}
public String value(Document document) {
  Fieldable field = document.getFieldable(names.indexName());
  return field == null ? null : value(field);
}
/**
 * Reads the named field from the document and returns it as a BigDecimal.
 *
 * @param name field name
 * @param document the Lucene document to read from
 * @return the field value as a BigDecimal
 */
public Object get(String name, Document document) {
  return new BigDecimal(document.getFieldable(name).stringValue());
}
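// Hedged counterpart sketch (not part of the original source): for get(...) above to work,
// the decimal value must have been stored as a plain, parseable string at index time.
// The helper name and the choice of Store/Index flags are illustrative assumptions.
public void addDecimalField(Document document, String name, BigDecimal value) {
  // Store the exact decimal representation so new BigDecimal(field.stringValue()) round-trips.
  document.add(new Field(name, value.toPlainString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
}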
/**
 * Gets matching entries for a query.
 *
 * @param query the query string to look up
 * @return list of per-cluster result lists from the index for this query
 * @throws Exception
 */
public List<List<String>> searchIndex(String query) throws Exception {
  Map<Integer, List<String>> suggestionCls = new HashMap<Integer, List<String>>();
  Map<Integer, Integer> clusterOrder = new HashMap<Integer, Integer>();
  List<List<String>> suggestionClsLists = new ArrayList<List<String>>();
  String searchCriteria = IndexUtils.KEY_QUERY + ":" + "\"" + query + "\"";
  Query luceneQuery = null;
  try {
    luceneQuery = parser.parse(searchCriteria);
  } catch (ParseException e) {
    System.err.println("Lucene could not parse query: " + searchCriteria);
    e.printStackTrace();
  }
  // TopDocs results = idxSearcher.search(query, 10);
  // TODO sort also by clusterId
  // Sort by refinement count (descending), then by cluster id (ascending).
  Sort clRefSort =
      new Sort(
          new SortField[] {
            new SortField(IndexUtils.KEY_REF_COUNT, SortField.INT, true),
            new SortField(IndexUtils.KEY_CLUSTER_ID, SortField.INT, false)
          });
  int clusterId;
  String refinement;
  int refCount;
  TopDocs docs = idxSearcher.search(luceneQuery, 1000, clRefSort);
  int clusterNum = 0;
  for (ScoreDoc match : docs.scoreDocs) {
    Document d = idxSearcher.doc(match.doc);
    clusterId = (Integer) ((NumericField) d.getFieldable(IndexUtils.KEY_CLUSTER_ID)).getNumericValue();
    refinement = d.get(IndexUtils.KEY_REF);
    refCount = (Integer) ((NumericField) d.getFieldable(IndexUtils.KEY_REF_COUNT)).getNumericValue();
    // Add the refinement to the list for its cluster, creating a new list on first sight.
    if (clusterOrder.containsKey(clusterId)) {
      suggestionClsLists.get(clusterOrder.get(clusterId)).add(refinement);
    } else {
      clusterOrder.put(clusterId, clusterNum);
      suggestionClsLists.add(new ArrayList<String>());
      suggestionClsLists.get(clusterOrder.get(clusterId)).add(refinement);
      clusterNum++;
    }
    // Also track the refinements per cluster id in a map.
    if (suggestionCls.containsKey(clusterId)) {
      suggestionCls.get(clusterId).add(refinement);
    } else {
      List<String> clRefs = new ArrayList<String>();
      clRefs.add(refinement);
      suggestionCls.put(clusterId, clRefs);
    }
    // System.out.println(clusterId + "\t" + refinement + "\t" + refCount);
  }
  // return suggestionCls;
  return suggestionClsLists;
}
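// Hedged usage sketch (not part of the original source): shows how the clustered
// suggestions returned by searchIndex(...) might be consumed; the method name and the
// printed format are illustrative assumptions.
public void printSuggestions(String query) throws Exception {
  List<List<String>> clusters = searchIndex(query);
  for (int i = 0; i < clusters.size(); i++) {
    // Each inner list holds the refinements of one cluster, ordered by refinement count.
    System.out.println("cluster " + i + ": " + clusters.get(i));
  }
}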
public NamedList get(String[] fields, DocSet baseDocs) throws IOException, ParseException {
  if (this.crcget == null) {
    this.container =
        this.parse.createContainer(fields, baseDocs, this.reader, this.searcher, this.req);
    DocIterator iter = baseDocs.iterator();
    this.recordCount.inc(baseDocs.size());
    Doclist res = new Doclist(this.parse.limit_offset);
    int doc = -1;
    while (iter.hasNext()) {
      doc = iter.nextDoc();
      res.add(doc);
      if (res.index >= this.parse.limit_offset) {
        break;
      }
    }
    PriorityQueue<SelectDetailRow> topItems = this.transGroupValue(res, fields);
    this.container.free();
    return this.toNameList(topItems);
  }

  String hostkey = String.valueOf(this.getkeyCrc());
  ConcurrentHashMap<Long, String> cache = MdrillUtils.CRC_CACHE_SIZE.remove(crcget + "@" + hostkey);
  NamedList rtn = new NamedList();
  Map<Long, String> crcvalue = new HashMap<Long, String>();
  rtn.add("fdtcre", crcvalue);
  if (cache != null) {
    MapFieldSelector selector = new MapFieldSelector(fields);
    FieldType[] ftlist = new FieldType[fields.length];
    IndexSchema schema = this.searcher.getSchema();
    for (int j = 0; j < fields.length; j++) {
      ftlist[j] = schema.getFieldType(fields[j]);
    }
    String crcliststr = params.get("mdrill.crc.key.get.crclist");
    if (crcliststr != null) {
      String[] crclist = crcliststr.split(",");
      for (String s : crclist) {
        Long crc = Long.parseLong(s);
        String v = cache.get(crc);
        if (v != null) {
          String[] cols = v.split(UniqConfig.GroupJoinString(), -1);
          if (cols.length >= 2) {
            int doc = Integer.parseInt(cols[0]);
            SortGroupVal buff = new SortGroupVal();
            buff.groupbuff.append("-");
            buff.groupbuff.append(UniqConfig.GroupJoinString());
            buff.groupbuff.append("-");
            Document docfields = this.reader.document(doc, selector);
            if (docfields == null) {
              for (int j = 0; j < fields.length; j++) {
                buff.groupbuff.append(UniqConfig.GroupJoinString());
                buff.groupbuff.append(EncodeUtils.encode("-"));
              }
              if (!crcvalue.containsKey(crc)) {
                crcvalue.put(crc, buff.groupbuff.toString());
              }
            } else {
              for (int j = 0; j < fields.length; j++) {
                buff.groupbuff.append(UniqConfig.GroupJoinString());
                Fieldable fv = docfields.getFieldable(fields[j]);
                if (fv != null) {
                  buff.groupbuff.append(ftlist[j].toExternal(fv));
                } else {
                  buff.groupbuff.append(EncodeUtils.encode("-"));
                }
              }
              crcvalue.put(crc, buff.groupbuff.toString());
            }
          }
        }
      }
    }
  }
  return rtn;
}
public static void main(String[] args) throws Exception {
  // The IndexReader object is the main handle that will give you
  // all the documents, terms and the inverted index.
  IndexReader r = IndexReader.open(FSDirectory.open(new File("index")));

  // You can find out the number of documents using the maxDoc() function.
  System.out.println("The number of documents in this index is: " + r.maxDoc());

  int i = 0;
  // You can enumerate all the terms that have been indexed using the terms() function.
  TermEnum t = r.terms();
  while (t.next()) {
    // Since there are so many terms, only print terms #100000-#100010.
    if (i > 100000) System.out.println("[" + i + "] " + t.term().text());
    if (++i > 100010) break;
  }

  // You can create your own query terms by calling the Term constructor with the field
  // 'contents'. In the following example, the query term is 'brute'.
  Term te = new Term("contents", "brute");
  // You can also quickly find out the number of documents that contain the term.
  System.out.println("Number of documents with the word 'brute' is: " + r.docFreq(te));

  // You can use the inverted index to find all the documents that contain the term 'brute'
  // by using the termDocs function.
  TermDocs td = r.termDocs(te);
  while (td.next()) {
    System.out.println(
        "Document number [" + td.doc() + "] contains the term 'brute' " + td.freq() + " time(s).");
  }

  // You can find the URL of a specific document number using the document() function.
  // For example, the URL for document number 14191 is:
  Document d = r.document(14191);
  String url =
      d.getFieldable("path").stringValue(); // the 'path' field of the Document holds the URL
  System.out.println(url.replace("%%", "/"));

  // -------- Now let us use all of the functions above to make something useful --------
  // The following code is a worked example of how to get a bunch of documents in response
  // to a query and show them (without ranking them according to TF/IDF).
  Scanner sc = new Scanner(System.in);
  String str = "";
  System.out.print("query> ");
  while (!(str = sc.nextLine()).equals("quit")) {
    String[] terms = str.split("\\s+");
    for (String word : terms) {
      Term term = new Term("contents", word);
      TermDocs tdocs = r.termDocs(term);
      while (tdocs.next()) {
        String d_url =
            r.document(tdocs.doc()).getFieldable("path").stringValue().replace("%%", "/");
        System.out.println("[" + tdocs.doc() + "] " + d_url);
      }
    }
    System.out.print("query> ");
  }
}
@Override
public void process(ResponseBuilder rb) throws IOException {
  SolrParams params = rb.req.getParams();
  if (!params.getBool(COMPONENT_NAME, false)) {
    return;
  }
  NamedList termVectors = new NamedList();
  rb.rsp.add(TERM_VECTORS, termVectors);
  FieldOptions allFields = new FieldOptions();
  // Figure out what options we have, and try to get the appropriate vector.
  allFields.termFreq = params.getBool(TermVectorParams.TF, false);
  allFields.positions = params.getBool(TermVectorParams.POSITIONS, false);
  allFields.offsets = params.getBool(TermVectorParams.OFFSETS, false);
  allFields.docFreq = params.getBool(TermVectorParams.DF, false);
  allFields.tfIdf = params.getBool(TermVectorParams.TF_IDF, false);
  // boolean cacheIdf = params.getBool(TermVectorParams.IDF, false);
  // Shortcut to all values.
  boolean all = params.getBool(TermVectorParams.ALL, false);
  if (all) {
    allFields.termFreq = true;
    allFields.positions = true;
    allFields.offsets = true;
    allFields.docFreq = true;
    allFields.tfIdf = true;
  }

  String fldLst = params.get(TermVectorParams.FIELDS);
  if (fldLst == null) {
    fldLst = params.get(CommonParams.FL);
  }

  // Use this to validate our fields.
  IndexSchema schema = rb.req.getSchema();
  // Build up our per-field mapping.
  Map<String, FieldOptions> fieldOptions = new HashMap<String, FieldOptions>();
  NamedList warnings = new NamedList();
  List<String> noTV = new ArrayList<String>();
  List<String> noPos = new ArrayList<String>();
  List<String> noOff = new ArrayList<String>();

  // We have specific fields to retrieve.
  if (fldLst != null) {
    String[] fields = SolrPluginUtils.split(fldLst);
    for (String field : fields) {
      SchemaField sf = schema.getFieldOrNull(field);
      if (sf != null) {
        if (sf.storeTermVector()) {
          FieldOptions option = fieldOptions.get(field);
          if (option == null) {
            option = new FieldOptions();
            option.fieldName = field;
            fieldOptions.put(field, option);
          }
          // Get the per-field mappings.
          option.termFreq = params.getFieldBool(field, TermVectorParams.TF, allFields.termFreq);
          option.docFreq = params.getFieldBool(field, TermVectorParams.DF, allFields.docFreq);
          option.tfIdf = params.getFieldBool(field, TermVectorParams.TF_IDF, allFields.tfIdf);
          // Validate these are even an option.
          option.positions =
              params.getFieldBool(field, TermVectorParams.POSITIONS, allFields.positions);
          if (option.positions && !sf.storeTermPositions()) {
            noPos.add(field);
          }
          option.offsets = params.getFieldBool(field, TermVectorParams.OFFSETS, allFields.offsets);
          if (option.offsets && !sf.storeTermOffsets()) {
            noOff.add(field);
          }
        } else {
          // Field doesn't have term vectors.
          noTV.add(field);
        }
      } else {
        // Field doesn't exist.
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "undefined field: " + field);
      }
    }
  } // else, deal with all fields

  boolean hasWarnings = false;
  if (!noTV.isEmpty()) {
    warnings.add("noTermVectors", noTV);
    hasWarnings = true;
  }
  if (!noPos.isEmpty()) {
    warnings.add("noPositions", noPos);
    hasWarnings = true;
  }
  if (!noOff.isEmpty()) {
    warnings.add("noOffsets", noOff);
    hasWarnings = true;
  }
  if (hasWarnings) {
    termVectors.add("warnings", warnings);
  }

  DocListAndSet listAndSet = rb.getResults();
  List<Integer> docIds = getInts(params.getParams(TermVectorParams.DOC_IDS));
  Iterator<Integer> iter;
  if (docIds != null && !docIds.isEmpty()) {
    iter = docIds.iterator();
  } else {
    DocList list = listAndSet.docList;
    iter = list.iterator();
  }
  SolrIndexSearcher searcher = rb.req.getSearcher();
  IndexReader reader = searcher.getReader();
  // The TVMapper is a TermVectorMapper which can be used to optimize loading of Term Vectors.
  SchemaField keyField = schema.getUniqueKeyField();
  String uniqFieldName = null;
  if (keyField != null) {
    uniqFieldName = keyField.getName();
  }
  // Only load the id field to get the uniqueKey of that field.
  SetBasedFieldSelector fieldSelector =
      new SetBasedFieldSelector(Collections.singleton(uniqFieldName), Collections.<String>emptySet());
  TVMapper mapper = new TVMapper(reader);
  // This will only stay set if fieldOptions.isEmpty() (in other words, only if the
  // user didn't set any fields).
  mapper.fieldOptions = allFields;
  while (iter.hasNext()) {
    Integer docId = iter.next();
    NamedList docNL = new NamedList();
    mapper.docNL = docNL;
    termVectors.add("doc-" + docId, docNL);
    if (keyField != null) {
      Document document = reader.document(docId, fieldSelector);
      Fieldable uniqId = document.getFieldable(uniqFieldName);
      String uniqVal = null;
      if (uniqId != null) {
        uniqVal = keyField.getType().storedToReadable(uniqId);
      }
      if (uniqVal != null) {
        docNL.add("uniqueKey", uniqVal);
        termVectors.add("uniqueKeyFieldName", uniqFieldName);
      }
    }
    if (!fieldOptions.isEmpty()) {
      for (Map.Entry<String, FieldOptions> entry : fieldOptions.entrySet()) {
        mapper.fieldOptions = entry.getValue();
        reader.getTermFreqVector(docId, entry.getKey(), mapper);
      }
    } else {
      // Deal with all fields by using the allFields mapper.
      reader.getTermFreqVector(docId, mapper);
    }
  }
}
@SuppressWarnings("unchecked") private static SimpleOrderedMap<Object> getIndexedFieldsInfo( final SolrIndexSearcher searcher, final Set<String> fields, final int numTerms) throws Exception { IndexReader reader = searcher.getReader(); IndexSchema schema = searcher.getSchema(); // Walk the term enum and keep a priority queue for each map in our set Map<String, TopTermQueue> ttinfo = null; if (numTerms > 0) { ttinfo = getTopTerms(reader, fields, numTerms, null); } SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<Object>(); Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL); for (String fieldName : fieldNames) { if (fields != null && !fields.contains(fieldName)) { continue; // if a field is specified, only them } SimpleOrderedMap<Object> f = new SimpleOrderedMap<Object>(); SchemaField sfield = schema.getFieldOrNull(fieldName); FieldType ftype = (sfield == null) ? null : sfield.getType(); f.add("type", (ftype == null) ? null : ftype.getTypeName()); f.add("schema", getFieldFlags(sfield)); if (sfield != null && schema.isDynamicField(sfield.getName()) && schema.getDynamicPattern(sfield.getName()) != null) { f.add("dynamicBase", schema.getDynamicPattern(sfield.getName())); } // If numTerms==0, the call is just asking for a quick field list if (ttinfo != null && sfield != null && sfield.indexed()) { Query q = new TermRangeQuery(fieldName, null, null, false, false); TopDocs top = searcher.search(q, 1); if (top.totalHits > 0) { // Find a document with this field try { Document doc = searcher.doc(top.scoreDocs[0].doc); Fieldable fld = doc.getFieldable(fieldName); if (fld != null) { f.add("index", getFieldFlags(fld)); } else { // it is a non-stored field... f.add("index", "(unstored field)"); } } catch (Exception ex) { log.warn("error reading field: " + fieldName); } } f.add("docs", top.totalHits); TopTermQueue topTerms = ttinfo.get(fieldName); if (topTerms != null) { f.add("distinct", topTerms.distinctTerms); // Include top terms f.add("topTerms", topTerms.toNamedList(searcher.getSchema())); // Add a histogram f.add("histogram", topTerms.histogram.toNamedList()); } } // Add the field finfo.add(fieldName, f); } return finfo; }