Example #1
  public SearchItem toResult(int documentId) throws IOException {
    Document document = searcher.doc(documentId);

    String type = document.getFieldable(FieldNames.TYPE).stringValue();
    NumericField date = (NumericField) document.getFieldable(FieldNames.DATE);
    Fieldable path = document.getFieldable(FieldNames.PATH);
    NumericField version = (NumericField) document.getFieldable(FieldNames.VERSION);
    return new SearchItem(
        Integer.parseInt(type),
        path.stringValue(),
        (version != null) ? version.getNumericValue().intValue() : -1,
        new Date(date.getNumericValue().longValue()));
  }
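Since getFieldable returns null for fields that are absent or unstored, the date field above would throw a NullPointerException if it were ever missing. A minimal null-safe helper, assuming the same Lucene 3.x API as the snippet (the helper name is illustrative):

  private static String stringValueOrDefault(Document document, String name, String fallback) {
    // getFieldable returns null when the field is absent or was not stored
    Fieldable field = document.getFieldable(name);
    return (field != null) ? field.stringValue() : fallback;
  }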
 @Override
 public AddResponse add(Collection<InputDocument> inputDocuments) {
   try {
     if (logger.isDebugEnabled()) {
       logger.debug("adding documents...");
     }
     for (InputDocument inputDocument : inputDocuments) {
       assertIdExist(inputDocument);
     }
     for (Document document : DocumentTransformUtil.toLuceneDocuments(inputDocuments, schema)) {
       indexWriter.updateDocument(
           new Term(schema.getIdName(), document.getFieldable(schema.getIdName()).stringValue()),
           document,
           schema.getAnalyzer());
     }
     updateCount.addAndGet(inputDocuments.size());
     if (logger.isDebugEnabled()) {
       logger.debug("add documents finish.");
     }
   } catch (Exception e) {
     logger.error("add documents error", e);
     return new AddResponse(e.getMessage(), ResultCodes.COMMON_ERROR);
   }
   return new AddResponse();
 }
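The add() method above relies on IndexWriter.updateDocument for its upsert semantics. A minimal standalone sketch of that call, assuming Lucene 3.x with an already-open Directory and Analyzer (the "id" and "body" field names here are illustrative, not from the original code):

  static void upsertById(Directory directory, Analyzer analyzer) throws IOException {
    IndexWriter writer =
        new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_36, analyzer));
    Document document = new Document();
    document.add(new Field("id", "42", Field.Store.YES, Field.Index.NOT_ANALYZED));
    document.add(new Field("body", "some text", Field.Store.YES, Field.Index.ANALYZED));
    // updateDocument atomically deletes every document matching the id term,
    // then adds the new document -- an upsert keyed on the id field
    writer.updateDocument(new Term("id", "42"), document);
    writer.close();
  }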
  /**
   * Ensures that the content and content blob fields are loaded lazily.
   *
   * @throws Exception in case the test fails
   */
  public void testLazyContentFields() throws Exception {

    echo("Testing lazy status of content fields in search index");

    String fileName = "/sites/default/test/master.pdf";

    CmsSearchIndex searchIndex = OpenCms.getSearchManager().getIndex(INDEX_SPECIAL);
    Document doc = searchIndex.getDocument(CmsSearchField.FIELD_PATH, fileName);

    assertNotNull("Document '" + fileName + "' not found", doc);
    assertNotNull("No 'content' field available", doc.getFieldable(CmsSearchField.FIELD_CONTENT));
    assertTrue("Content field not lazy", doc.getFieldable(CmsSearchField.FIELD_CONTENT).isLazy());
    assertNotNull(
        "No 'content blob' field available", doc.getFieldable(CmsSearchField.FIELD_CONTENT_BLOB));
    assertTrue(
        "Content blob field not lazy",
        doc.getFieldable(CmsSearchField.FIELD_CONTENT_BLOB).isLazy());
  }
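For context, lazy loading is opted into at read time in Lucene 3.x via a FieldSelector; the assertions above hold only if the index code loads documents that way. A sketch, assuming the usual Lucene imports (the eager/lazy field choice follows the constants used in the test):

  static Fieldable loadContentLazily(IndexReader reader, int docId) throws IOException {
    // fields in the second set are loaded lazily: their values are fetched
    // on first access instead of when the document is read
    FieldSelector selector =
        new SetBasedFieldSelector(
            Collections.singleton(CmsSearchField.FIELD_PATH),
            Collections.singleton(CmsSearchField.FIELD_CONTENT));
    Document document = reader.document(docId, selector);
    return document.getFieldable(CmsSearchField.FIELD_CONTENT); // isLazy() == true
  }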
 private byte[] extractSource(Document doc, DocumentMapper documentMapper) {
   byte[] source = null;
   Fieldable sourceField = doc.getFieldable(documentMapper.sourceMapper().names().indexName());
   if (sourceField != null) {
     source = documentMapper.sourceMapper().nativeValue(sourceField);
     doc.removeField(documentMapper.sourceMapper().names().indexName());
   }
   return source;
 }
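One nuance in extractSource: Document.removeField removes only the first field stored under the name. If the source field could be multi-valued, removeFields is the safer call; a trivial sketch:

  static void removeAllValues(Document document, String name) {
    // removeFields drops every field with the given name, whereas
    // removeField (used above) drops only the first occurrence
    document.removeFields(name);
  }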
 @Override
 public AddResponse add(InputDocument inputDocument) {
   try {
     if (logger.isDebugEnabled()) {
       logger.debug("adding document...");
     }
     assertIdExist(inputDocument);
     Document document = DocumentTransformUtil.toLuceneDocument(inputDocument, schema);
     indexWriter.updateDocument(
         new Term(schema.getIdName(), document.getFieldable(schema.getIdName()).stringValue()),
         document,
         schema.getAnalyzer());
     updateCount.incrementAndGet();
     if (logger.isDebugEnabled()) {
       logger.debug("add document finish.");
     }
    } catch (IOException e) {
      // log the failure, matching the batch add() variant above
      logger.error("add document error", e);
      return new AddResponse(e.getMessage(), ResultCodes.COMMON_ERROR);
    }
   return new AddResponse();
 }
  protected Taxon[] findTaxon(String fieldName, String fieldValue) throws IOException {
    Taxon[] terms = new TaxonImpl[0];
    if (StringUtils.isNotBlank(fieldValue) && indexSearcher != null) {
      PhraseQuery query = new PhraseQuery();
      query.add(new Term(fieldName, fieldValue));
      int maxHits = 3;
      TopDocs docs = indexSearcher.search(query, maxHits);

      if (docs.totalHits > 0) {
        // size the array to the hits actually returned (at most maxHits);
        // sizing it to totalHits would leave null entries beyond maxHits
        terms = new TaxonImpl[docs.scoreDocs.length];
        for (int i = 0; i < docs.scoreDocs.length; i++) {
          ScoreDoc scoreDoc = docs.scoreDocs[i];
          Document foundDoc = indexSearcher.doc(scoreDoc.doc);
          Taxon term = new TaxonImpl();
          Fieldable idField = foundDoc.getFieldable(FIELD_ID);
          if (idField != null) {
            term.setExternalId(idField.stringValue());
          }
          Fieldable rankPathField = foundDoc.getFieldable(FIELD_RANK_PATH);
          if (rankPathField != null) {
            term.setPath(rankPathField.stringValue());
          }
          Fieldable rankPathIdsField = foundDoc.getFieldable(FIELD_RANK_PATH_IDS);
          if (rankPathIdsField != null) {
            term.setPathIds(rankPathIdsField.stringValue());
          }
          Fieldable rankPathNamesField = foundDoc.getFieldable(FIELD_RANK_PATH_NAMES);
          if (rankPathNamesField != null) {
            term.setPathNames(rankPathNamesField.stringValue());
          }
          Fieldable commonNamesFields = foundDoc.getFieldable(FIELD_COMMON_NAMES);
          if (commonNamesFields != null) {
            term.setCommonNames(commonNamesFields.stringValue());
          }
          Fieldable recommendedNameField = foundDoc.getFieldable(FIELD_RECOMMENDED_NAME);
          if (recommendedNameField != null) {
            term.setName(recommendedNameField.stringValue());
          }
          terms[i] = term;
        }
      }
    }
    return terms;
  }
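A PhraseQuery holding a single term matches the same documents as a plain TermQuery, so the lookup above can be written more directly when only one exact term is involved. A sketch against the same Lucene 3.x API (the helper name is illustrative):

  static TopDocs findExactTerm(IndexSearcher searcher, String field, String value, int maxHits)
      throws IOException {
    // equivalent to a one-term PhraseQuery, without the phrase machinery
    return searcher.search(new TermQuery(new Term(field, value)), maxHits);
  }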
 public String value(Document document) {
   Fieldable field = document.getFieldable(names.indexName());
   return field == null ? null : value(field);
 }
  /**
   * Gets the value of the named field as a BigDecimal.
   *
   * @param name the field name
   * @param document the document to read from
   * @return the BigDecimal value
   */
 public Object get(String name, Document document) {
   return new BigDecimal(document.getFieldable(name).stringValue());
 }
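As written, get() throws a NullPointerException when the field is absent and a NumberFormatException when the stored value is not a valid decimal. A hedged, null-safe variant might look like this (the method name is illustrative):

  public Object getOrNull(String name, Document document) {
    Fieldable field = document.getFieldable(name);
    if (field == null || field.stringValue() == null) {
      return null; // field absent or not stored
    }
    try {
      return new BigDecimal(field.stringValue());
    } catch (NumberFormatException e) {
      return null; // stored value was not a parseable decimal
    }
  }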
  /**
   * Gets matching entries for a query.
   *
   * @param query the query string to look up
   * @return lists of cluster results from the index for this query, one list per cluster
   * @throws Exception if the search fails
   */
  public List<List<String>> searchIndex(String query) throws Exception {

    Map<Integer, List<String>> suggestionCls = new HashMap<Integer, List<String>>();

    Map<Integer, Integer> clusterOrder = new HashMap<Integer, Integer>();
    List<List<String>> suggestionClsLists = new ArrayList<List<String>>();

    String searchCriteria = IndexUtils.KEY_QUERY + ":" + "\"" + query + "\"";

    Query luceneQuery = null;
    try {
      luceneQuery = parser.parse(searchCriteria);
    } catch (ParseException e) {
      System.err.println("Lucene could not parse query: " + searchCriteria);
      e.printStackTrace();
      // searching with a null query would throw a NullPointerException below
      return suggestionClsLists;
    }
    // TopDocs results = idxSearcher.search(query, 10);

    // TODO sort also by clusterId
    // sort after refinement counts
    Sort clRefSort =
        new Sort(
            new SortField[] {
              new SortField(IndexUtils.KEY_REF_COUNT, SortField.INT, true),
              new SortField(IndexUtils.KEY_CLUSTER_ID, SortField.INT, false)
            });

    int clusterId;
    String refinement;
    int refCount;

    TopDocs docs = idxSearcher.search(luceneQuery, 1000, clRefSort);

    int clusterNum = 0;

    for (ScoreDoc match : docs.scoreDocs) {
      Document d = idxSearcher.doc(match.doc);

      clusterId =
          (Integer) ((NumericField) d.getFieldable(IndexUtils.KEY_CLUSTER_ID)).getNumericValue();
      refinement = d.get(IndexUtils.KEY_REF);
      refCount =
          (Integer) ((NumericField) d.getFieldable(IndexUtils.KEY_REF_COUNT)).getNumericValue();

      //  add results to right list
      if (clusterOrder.containsKey(clusterId)) {
        // add to right list
        suggestionClsLists.get(clusterOrder.get(clusterId)).add(refinement);
      } else {
        // add new list
        clusterOrder.put(clusterId, clusterNum);
        suggestionClsLists.add(new ArrayList<String>());
        suggestionClsLists.get(clusterOrder.get(clusterId)).add(refinement);
        clusterNum++;
      }

      // add results to map
      if (suggestionCls.containsKey(clusterId)) {

        suggestionCls.get(clusterId).add(refinement);
      } else {
        // for new cluster add new list
        List<String> clRefs = new ArrayList<String>();
        clRefs.add(refinement);
        suggestionCls.put(clusterId, clRefs);
      }

      //      System.out.println(clusterId + "\t" + refinement + "\t" + refCount);
    }

    //    return suggestionCls;
    return suggestionClsLists;
  }
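Note that searchIndex interpolates the raw query between quotation marks, so a query containing a quote or other QueryParser syntax would break the parse. A defensive sketch, assuming the classic Lucene 3.x QueryParser:

    // QueryParser.escape backslash-escapes every character that is special to
    // the query syntax, including embedded quotation marks
    String searchCriteria =
        IndexUtils.KEY_QUERY + ":\"" + QueryParser.escape(query) + "\"";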
Example #10
  public NamedList get(String[] fields, DocSet baseDocs) throws IOException, ParseException {
    if (this.crcget == null) {
      this.container =
          this.parse.createContainer(fields, baseDocs, this.reader, this.searcher, this.req);
      DocIterator iter = baseDocs.iterator();
      this.recordCount.inc(baseDocs.size());
      Doclist res = new Doclist(this.parse.limit_offset);
      int doc = -1;
      while (iter.hasNext()) {
        doc = iter.nextDoc();
        res.add(doc);
        if (res.index >= this.parse.limit_offset) {
          break;
        }
      }
      PriorityQueue<SelectDetailRow> topItems = this.transGroupValue(res, fields);
      this.container.free();
      return this.toNameList(topItems);
    }
    String hostkey = String.valueOf(this.getkeyCrc());

    ConcurrentHashMap<Long, String> cache =
        MdrillUtils.CRC_CACHE_SIZE.remove(crcget + "@" + hostkey);

    NamedList rtn = new NamedList();
    Map<Long, String> crcvalue = new HashMap<Long, String>();
    rtn.add("fdtcre", crcvalue);
    if (cache != null) {
      MapFieldSelector selector = new MapFieldSelector(fields);
      FieldType[] ftlist = new FieldType[fields.length];
      IndexSchema schema = this.searcher.getSchema();

      for (int j = 0; j < fields.length; j++) {
        ftlist[j] = schema.getFieldType(fields[j]);
      }

      String crcliststr = params.get("mdrill.crc.key.get.crclist");
      if (crcliststr != null) {
        String[] crclist = crcliststr.split(",");
        for (String s : crclist) {
          Long crc = Long.parseLong(s);
          String v = cache.get(crc);
          if (v != null) {
            String[] cols = v.split(UniqConfig.GroupJoinString(), -1);
            if (cols.length >= 2) {
              int doc = Integer.parseInt(cols[0]);

              SortGroupVal buff = new SortGroupVal();
              buff.groupbuff.append("-");
              buff.groupbuff.append(UniqConfig.GroupJoinString());
              buff.groupbuff.append("-");
              Document docfields = this.reader.document(doc, selector);
              if (docfields == null) {
                for (int j = 0; j < fields.length; j++) {
                  buff.groupbuff.append(UniqConfig.GroupJoinString());
                  buff.groupbuff.append(EncodeUtils.encode("-"));
                }
                if (!crcvalue.containsKey(crc)) {
                  crcvalue.put(crc, buff.groupbuff.toString());
                }
              } else {

                for (int j = 0; j < fields.length; j++) {
                  buff.groupbuff.append(UniqConfig.GroupJoinString());

                  Fieldable fv = docfields.getFieldable(fields[j]);

                  if (fv != null) {
                    buff.groupbuff.append(ftlist[j].toExternal(fv));
                  } else {
                    buff.groupbuff.append(EncodeUtils.encode("-"));
                  }
                }
                crcvalue.put(crc, buff.groupbuff.toString());
              }
            }
          }
        }
      }
    }
    return rtn;
  }
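The MapFieldSelector used above restricts document loading to the listed stored fields, which keeps the per-crc lookups cheap. A minimal Lucene 3.x sketch (the field names are illustrative):

  static Document loadOnlySomeFields(IndexReader reader, int docId) throws IOException {
    // only "colA" and "colB" are materialized; other stored fields are skipped
    FieldSelector selector = new MapFieldSelector(new String[] {"colA", "colB"});
    return reader.document(docId, selector);
  }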
Example #11
  public static void main(String[] args) throws Exception {
    // the IndexReader object is the main handle that will give you
    // all the documents, terms and inverted index
    IndexReader r = IndexReader.open(FSDirectory.open(new File("index")));

    // You can figure out the number of documents using the maxDoc() function
    System.out.println("The number of documents in this index is: " + r.maxDoc());

    int i = 0;
    // You can find out all the terms that have been indexed using the terms() function
    TermEnum t = r.terms();
    while (t.next()) {
      // Since there are so many terms, let us print only terms #100000-#100010
      if (i >= 100000) System.out.println("[" + i + "] " + t.term().text());
      if (++i > 100010) break;
    }

    // You can create your own query terms by calling the Term constructor, with the field
    // 'contents'
    // In the following example, the query term is 'brute'
    Term te = new Term("contents", "brute");

    // You can also quickly find out the number of documents that have term t
    System.out.println("Number of documents with the word 'brute' is: " + r.docFreq(te));

    // You can use the inverted index to find out all the documents that contain the term 'brute'
    //  by using the termDocs function
    TermDocs td = r.termDocs(te);
    while (td.next()) {
      System.out.println(
          "Document number ["
              + td.doc()
              + "] contains the term 'brute' "
              + td.freq()
              + " time(s).");
    }

    // You can find the URL of a specific document number using the document() function
    // For example, the URL for document number 14191 is:
    Document d = r.document(14191);
    String url =
        d.getFieldable("path")
            .stringValue(); // the 'path' field of the Document object holds the URL
    System.out.println(url.replace("%%", "/"));

    // -------- Now let us use all of the functions above to make something useful --------
    // The following bit of code is a worked out example of how to get a bunch of documents
    // in response to a query and show them (without ranking them according to TF/IDF)
    Scanner sc = new Scanner(System.in);
    String str = "";
    System.out.print("query> ");
    while (!(str = sc.nextLine()).equals("quit")) {
      String[] terms = str.split("\\s+");
      for (String word : terms) {
        Term term = new Term("contents", word);
        TermDocs tdocs = r.termDocs(term);
        while (tdocs.next()) {
          String d_url =
              r.document(tdocs.doc()).getFieldable("path").stringValue().replace("%%", "/");
          System.out.println("[" + tdocs.doc() + "] " + d_url);
        }
      }
      System.out.print("query> ");
    }
    // release the scanner and the reader's file handles
    sc.close();
    r.close();
  }
  @Override
  public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(COMPONENT_NAME, false)) {
      return;
    }

    NamedList termVectors = new NamedList();
    rb.rsp.add(TERM_VECTORS, termVectors);
    FieldOptions allFields = new FieldOptions();
    // figure out what options we have, and try to get the appropriate vector
    allFields.termFreq = params.getBool(TermVectorParams.TF, false);
    allFields.positions = params.getBool(TermVectorParams.POSITIONS, false);
    allFields.offsets = params.getBool(TermVectorParams.OFFSETS, false);
    allFields.docFreq = params.getBool(TermVectorParams.DF, false);
    allFields.tfIdf = params.getBool(TermVectorParams.TF_IDF, false);
    // boolean cacheIdf = params.getBool(TermVectorParams.IDF, false);
    // short cut to all values.
    boolean all = params.getBool(TermVectorParams.ALL, false);
    if (all == true) {
      allFields.termFreq = true;
      allFields.positions = true;
      allFields.offsets = true;
      allFields.docFreq = true;
      allFields.tfIdf = true;
    }

    String fldLst = params.get(TermVectorParams.FIELDS);
    if (fldLst == null) {
      fldLst = params.get(CommonParams.FL);
    }

    // use this to validate our fields
    IndexSchema schema = rb.req.getSchema();
    // Build up our per field mapping
    Map<String, FieldOptions> fieldOptions = new HashMap<String, FieldOptions>();
    NamedList warnings = new NamedList();
    List<String> noTV = new ArrayList<String>();
    List<String> noPos = new ArrayList<String>();
    List<String> noOff = new ArrayList<String>();

    // we have specific fields to retrieve
    if (fldLst != null) {
      String[] fields = SolrPluginUtils.split(fldLst);
      for (String field : fields) {
        SchemaField sf = schema.getFieldOrNull(field);
        if (sf != null) {
          if (sf.storeTermVector()) {
            FieldOptions option = fieldOptions.get(field);
            if (option == null) {
              option = new FieldOptions();
              option.fieldName = field;
              fieldOptions.put(field, option);
            }
            // get the per field mappings
            option.termFreq = params.getFieldBool(field, TermVectorParams.TF, allFields.termFreq);
            option.docFreq = params.getFieldBool(field, TermVectorParams.DF, allFields.docFreq);
            option.tfIdf = params.getFieldBool(field, TermVectorParams.TF_IDF, allFields.tfIdf);
            // Validate these are even an option
            option.positions =
                params.getFieldBool(field, TermVectorParams.POSITIONS, allFields.positions);
            if (option.positions == true && sf.storeTermPositions() == false) {
              noPos.add(field);
            }
            option.offsets =
                params.getFieldBool(field, TermVectorParams.OFFSETS, allFields.offsets);
            if (option.offsets == true && sf.storeTermOffsets() == false) {
              noOff.add(field);
            }
          } else { // field doesn't have term vectors
            noTV.add(field);
          }
        } else {
          // field doesn't exist
          throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "undefined field: " + field);
        }
      }
    } // else, deal with all fields
    boolean hasWarnings = false;
    if (noTV.isEmpty() == false) {
      warnings.add("noTermVectors", noTV);
      hasWarnings = true;
    }
    if (noPos.isEmpty() == false) {
      warnings.add("noPositions", noPos);
      hasWarnings = true;
    }
    if (noOff.isEmpty() == false) {
      warnings.add("noOffsets", noOff);
      hasWarnings = true;
    }
    if (hasWarnings == true) {
      termVectors.add("warnings", warnings);
    }

    DocListAndSet listAndSet = rb.getResults();
    List<Integer> docIds = getInts(params.getParams(TermVectorParams.DOC_IDS));
    Iterator<Integer> iter;
    if (docIds != null && docIds.isEmpty() == false) {
      iter = docIds.iterator();
    } else {
      DocList list = listAndSet.docList;
      iter = list.iterator();
    }
    SolrIndexSearcher searcher = rb.req.getSearcher();

    IndexReader reader = searcher.getReader();
    // the TVMapper is a TermVectorMapper which can be used to optimize loading of Term Vectors
    SchemaField keyField = schema.getUniqueKeyField();
    String uniqFieldName = null;
    if (keyField != null) {
      uniqFieldName = keyField.getName();
    }
    // Only load the id field to get the uniqueKey of that field
    SetBasedFieldSelector fieldSelector =
        new SetBasedFieldSelector(
            Collections.singleton(uniqFieldName), Collections.<String>emptySet());
    TVMapper mapper = new TVMapper(reader);
    mapper.fieldOptions =
        allFields; // this will only stay set if fieldOptions.isEmpty() (in other words, only if the
                   // user didn't set any fields)
    while (iter.hasNext()) {
      Integer docId = iter.next();
      NamedList docNL = new NamedList();
      mapper.docNL = docNL;
      termVectors.add("doc-" + docId, docNL);

      if (keyField != null) {
        Document document = reader.document(docId, fieldSelector);
        Fieldable uniqId = document.getFieldable(uniqFieldName);
        String uniqVal = null;
        if (uniqId != null) {
          uniqVal = keyField.getType().storedToReadable(uniqId);
        }
        if (uniqVal != null) {
          docNL.add("uniqueKey", uniqVal);
          termVectors.add("uniqueKeyFieldName", uniqFieldName);
        }
      }
      if (fieldOptions.isEmpty() == false) {
        for (Map.Entry<String, FieldOptions> entry : fieldOptions.entrySet()) {
          mapper.fieldOptions = entry.getValue();
          reader.getTermFreqVector(docId, entry.getKey(), mapper);
        }
      } else {
        // deal with all fields by using the allFieldMapper
        reader.getTermFreqVector(docId, mapper);
      }
    }
  }
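As a usage sketch for the component above (assuming SolrJ on the classpath; the query and field name are illustrative, and the parameter strings correspond to the TermVectorParams constants in the standard Solr distribution):

  // enables the term-vector component and asks for term frequencies and
  // positions on the "body" field; without tv=true the early return above
  // skips the whole component
  SolrQuery query = new SolrQuery("id:42");
  query.set("tv", true);
  query.set("tv.fl", "body");
  query.set("tv.tf", true);
  query.set("tv.positions", true);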
  @SuppressWarnings("unchecked")
  private static SimpleOrderedMap<Object> getIndexedFieldsInfo(
      final SolrIndexSearcher searcher, final Set<String> fields, final int numTerms)
      throws Exception {

    IndexReader reader = searcher.getReader();
    IndexSchema schema = searcher.getSchema();

    // Walk the term enum and keep a priority queue for each map in our set
    Map<String, TopTermQueue> ttinfo = null;
    if (numTerms > 0) {
      ttinfo = getTopTerms(reader, fields, numTerms, null);
    }
    SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<Object>();
    Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);
    for (String fieldName : fieldNames) {
      if (fields != null && !fields.contains(fieldName)) {
        continue; // if specific fields were requested, only include those
      }

      SimpleOrderedMap<Object> f = new SimpleOrderedMap<Object>();

      SchemaField sfield = schema.getFieldOrNull(fieldName);
      FieldType ftype = (sfield == null) ? null : sfield.getType();

      f.add("type", (ftype == null) ? null : ftype.getTypeName());
      f.add("schema", getFieldFlags(sfield));
      if (sfield != null
          && schema.isDynamicField(sfield.getName())
          && schema.getDynamicPattern(sfield.getName()) != null) {
        f.add("dynamicBase", schema.getDynamicPattern(sfield.getName()));
      }

      // If numTerms==0, the call is just asking for a quick field list
      if (ttinfo != null && sfield != null && sfield.indexed()) {
        Query q = new TermRangeQuery(fieldName, null, null, false, false);
        TopDocs top = searcher.search(q, 1);
        if (top.totalHits > 0) {
          // Find a document with this field
          try {
            Document doc = searcher.doc(top.scoreDocs[0].doc);
            Fieldable fld = doc.getFieldable(fieldName);
            if (fld != null) {
              f.add("index", getFieldFlags(fld));
            } else {
              // it is a non-stored field...
              f.add("index", "(unstored field)");
            }
          } catch (Exception ex) {
            log.warn("error reading field: " + fieldName);
          }
        }
        f.add("docs", top.totalHits);

        TopTermQueue topTerms = ttinfo.get(fieldName);
        if (topTerms != null) {
          f.add("distinct", topTerms.distinctTerms);

          // Include top terms
          f.add("topTerms", topTerms.toNamedList(searcher.getSchema()));

          // Add a histogram
          f.add("histogram", topTerms.histogram.toNamedList());
        }
      }

      // Add the field
      finfo.add(fieldName, f);
    }
    return finfo;
  }
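The open-ended TermRangeQuery above is what makes the "docs" count meaningful: with null bounds it matches every document that has at least one indexed term in the field. A sketch, assuming an open IndexSearcher (the "title" field is illustrative):

  // matches all documents containing any term in the field, so totalHits is
  // effectively a "documents with this field indexed" count
  Query anyValue = new TermRangeQuery("title", null, null, false, false);
  TopDocs top = searcher.search(anyValue, 1);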