public TermFreqVector searchIndexReturnFreqTerms(String searchString, String termString) {
    System.out.println("Searching for '" + searchString + "'");
    IndexReader indexReader;
    TermFreqVector termFreqDoc = null;
    try {
      indexReader = IndexReader.open(indexDirectory);
      IndexSearcher indexSearcher = new IndexSearcher(indexReader);
      // Exact-match term query: 'termString' is the field name, 'searchString' the term text.
      Term term = new Term(termString, searchString);
      TermQuery query = new TermQuery(term);
      TopDocs topDocs = indexSearcher.search(query, 10);
      if (topDocs.scoreDocs.length > 0) {
        // Take the best-scoring hit and return the term frequency vector of its "text" field.
        int docId = topDocs.scoreDocs[0].doc;
        termFreqDoc = indexReader.getTermFreqVector(docId, "text");
      }
    } catch (CorruptIndexException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }

    return termFreqDoc;
  }
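  // A minimal usage sketch (hypothetical values; assumes the documents were indexed with
  // term vectors enabled on the "text" field, otherwise getTermFreqVector returns null):
  //
  //   TermFreqVector vector = searchIndexReturnFreqTerms("lucene", "text");
  //   if (vector != null) {
  //     String[] terms = vector.getTerms();
  //     int[] freqs = vector.getTermFrequencies();
  //   }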
  @Test(groups = "ch12")
  public void vectorTest() throws Exception {
    FullTextSession session = Search.getFullTextSession(openSession());
    Transaction tx = session.beginTransaction();
    buildIndex(session, tx);

    try {
      // buildIndex presumably committed its transaction, so begin a fresh one for the query.
      tx = session.beginTransaction();

      Query query = new TermQuery(new Term("content", "properties"));
      System.out.println(query.toString());

      FullTextQuery hibQuery = session.createFullTextQuery(query, ElectricalProperties.class);
      hibQuery.setProjection(
          FullTextQuery.DOCUMENT, FullTextQuery.DOCUMENT_ID, FullTextQuery.SCORE);

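      // 'reader' and 'provider' are fields of the test class; getReader presumably borrows
      // an IndexReader via the SearchFactory's ReaderProvider (released in the finally block).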
      reader = getReader(session);

      List<Object[]> results = hibQuery.list();

      assert results.size() > 0 : "no results returned";
      for (int x = 0; x < results.size(); x++) {

        Integer docId = (Integer) results.get(x)[1];
        TermPositionVector vector = (TermPositionVector) reader.getTermFreqVector(docId, "content");
        String[] terms = vector.getTerms();
        int[] f = vector.getTermFrequencies();

        System.out.println(results.get(x)[2]); // the document score (FullTextQuery.SCORE projection)
        for (int y = 0; y < vector.size(); y++) {
          System.out.print("docID# =>" + docId);
          System.out.print(" term => " + terms[y]);
          System.out.print(" freq => " + f[y]);

          int[] positions = vector.getTermPositions(y);
          TermVectorOffsetInfo[] offsets = vector.getOffsets(y);
          for (int z = 0; z < positions.length; z++) {
            System.out.print(" position => " + positions[z]);
            System.out.print(" starting offset => " + offsets[z].getStartOffset());
            System.out.println(" ending offset => " + offsets[z].getEndOffset());
          }
          System.out.println("---------------");
        }
      }
      // Clean up: delete the indexed entities so the test leaves no data behind.
      for (Object element :
          session.createQuery("from " + ElectricalProperties.class.getName()).list()) {
        session.delete(element);
      }

      tx.commit();
    } finally {
      session.close();
      if (provider != null) {
        provider.closeReader(reader);
      }
    }
  }
Example #3
  private static SimpleOrderedMap<Object> getDocumentFieldsInfo(
      Document doc, int docId, IndexReader reader, IndexSchema schema) throws IOException {
    SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<Object>();
    for (Object o : doc.getFields()) {
      Fieldable fieldable = (Fieldable) o;
      SimpleOrderedMap<Object> f = new SimpleOrderedMap<Object>();

      SchemaField sfield = schema.getFieldOrNull(fieldable.name());
      FieldType ftype = (sfield == null) ? null : sfield.getType();

      f.add("type", (ftype == null) ? null : ftype.getTypeName());
      f.add("schema", getFieldFlags(sfield));
      f.add("flags", getFieldFlags(fieldable));

      Term t =
          new Term(
              fieldable.name(),
              ftype != null ? ftype.storedToIndexed(fieldable) : fieldable.stringValue());

      f.add("value", (ftype == null) ? null : ftype.toExternal(fieldable));

      // TODO: this really should be "stored"
      f.add("internal", fieldable.stringValue()); // may be a binary number

      byte[] arr = fieldable.getBinaryValue();
      if (arr != null) {
        f.add("binary", Base64.byteArrayToBase64(arr, 0, arr.length));
      }
      f.add("boost", fieldable.getBoost());
      f.add(
          "docFreq",
          t.text() == null ? 0 : reader.docFreq(t)); // this can be 0 for non-indexed fields

      // If we have a term vector, return that
      if (fieldable.isTermVectorStored()) {
        try {
          TermFreqVector v = reader.getTermFreqVector(docId, fieldable.name());
          if (v != null) {
            SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<Integer>();
            for (int i = 0; i < v.size(); i++) {
              tfv.add(v.getTerms()[i], v.getTermFrequencies()[i]);
            }
            f.add("termVector", tfv);
          }
        } catch (Exception ex) {
          log.warn("error writing term vector", ex);
        }
      }

      finfo.add(fieldable.name(), f);
    }
    return finfo;
  }
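  // Note: this helper appears to come from Solr's LukeRequestHandler; the map it returns
  // backs the per-document field listing of the /admin/luke response.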
Example #4
  public static void main(String[] args) {
    TestIndexing t = new TestIndexing();
    t.createIndex();

    try {
      IndexReader reader = t.getReader();

      TweetVocabulary vocabulary = new TweetVocabulary(reader, "tweet");
      System.out.println(vocabulary); // dump the vocabulary built over the "tweet" field

      for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
          continue; // skip documents that have been deleted but not yet merged away
        }

        TermFreqVector tfv = reader.getTermFreqVector(i, "tweet");
        System.out.println(tfv); // null if "tweet" was indexed without term vectors
      }

    } catch (Exception e) {
      e.printStackTrace();
    }
  }
Example #5
 /**
  * Reconstruct document fields.
  *
  * @param docNum document number. If the document is deleted but the index has not been
  *     optimized yet, the reconstruction process may still yield field content even from
  *     the deleted document.
  * @return the reconstructed document
  * @throws Exception if the document number is out of range or the document is deleted
  */
 public Reconstructed reconstruct(int docNum) throws Exception {
   // Valid document numbers run from 0 to maxDoc() - 1, so maxDoc() itself is out of range.
   if (docNum < 0 || docNum >= reader.maxDoc()) {
     throw new Exception("Document number outside of valid range.");
   }
   Reconstructed res = new Reconstructed();
   if (deleted != null && deleted.get(docNum)) {
     throw new Exception("Document is deleted.");
   } else {
     Document doc = reader.document(docNum);
     for (int i = 0; i < fieldNames.length; i++) {
       Field[] fs = doc.getFields(fieldNames[i]);
       if (fs != null && fs.length > 0) {
         res.getStoredFields().put(fieldNames[i], fs);
       }
     }
   }
   // collect values from unstored fields
   HashSet<String> fields = new HashSet<String>(Arrays.asList(fieldNames));
   // try to use term vectors if available
   progress.maxValue = fieldNames.length;
   progress.curValue = 0;
   progress.minValue = 0;
   for (int i = 0; i < fieldNames.length; i++) {
     TermFreqVector tvf = reader.getTermFreqVector(docNum, fieldNames[i]);
     if (tvf != null && tvf.size() > 0 && (tvf instanceof TermPositionVector)) {
       TermPositionVector tpv = (TermPositionVector) tvf;
       progress.message = "Reading term vectors ...";
       progress.curValue = i;
       setChanged();
       notifyObservers(progress);
       BytesRef[] tv = tpv.getTerms();
       for (int k = 0; k < tv.length; k++) {
         // do we have positions?
         int[] posArr = tpv.getTermPositions(k);
         if (posArr == null) {
           // only offsets
           TermVectorOffsetInfo[] offsets = tpv.getOffsets(k);
           if (offsets.length == 0) {
             continue;
           }
           // convert offsets into positions
           posArr = convertOffsets(offsets);
         }
         GrowableStringArray gsa = res.getReconstructedFields().get(fieldNames[i]);
         if (gsa == null) {
           gsa = new GrowableStringArray();
           res.getReconstructedFields().put(fieldNames[i], gsa);
         }
         for (int m = 0; m < posArr.length; m++) {
           gsa.append(posArr[m], "|", tv[k].utf8ToString());
         }
       }
       fields.remove(fieldNames[i]); // got what we wanted
     }
   }
   // this loop collects data only from left-over fields
   // not yet collected through term vectors
   progress.maxValue = fields.size();
   progress.curValue = 0;
   progress.minValue = 0;
   for (String fld : fields) {
     progress.message = "Collecting terms in " + fld + " ...";
     progress.curValue++;
     setChanged();
     notifyObservers(progress);
     Terms terms = MultiFields.getTerms(reader, fld);
     if (terms == null) { // no terms in this field
       continue;
     }
     TermsEnum te = terms.iterator();
     while (te.next() != null) {
       DocsAndPositionsEnum dpe = te.docsAndPositions(deleted, null);
       if (dpe == null) { // no position info for this field
         break;
       }
       int num = dpe.advance(docNum);
       if (num != docNum) { // either greater than or NO_MORE_DOCS
         continue; // no data for this term in this doc
       }
       String term = te.term().utf8ToString();
       GrowableStringArray gsa = (GrowableStringArray) res.getReconstructedFields().get(fld);
       if (gsa == null) {
         gsa = new GrowableStringArray();
         res.getReconstructedFields().put(fld, gsa);
       }
       for (int k = 0; k < dpe.freq(); k++) {
         int pos = dpe.nextPosition();
         gsa.append(pos, "|", term);
       }
     }
   }
   progress.message = "Done.";
   progress.curValue = 100;
   setChanged();
   notifyObservers(progress);
   return res;
 }
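 // A minimal usage sketch (accessor behavior beyond what is shown above is an assumption
 // about the surrounding Luke code):
 //
 //   Reconstructed rec = reconstruct(0);
 //   // rec.getStoredFields() maps field names to their stored Field[] values;
 //   // rec.getReconstructedFields() maps field names to GrowableStringArray instances
 //   // holding the terms observed at each position, joined by '|'.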
  /**
   * Calculates the cosine similarity between two documents.
   *
   * @param d1 the first document
   * @param d2 the second document
   * @return the cosine similarity
   * @throws IOException
   */
  public double getCosineSimilarity(String d1, String d2) throws IOException {

    RAMDirectory ramDir = new RAMDirectory();
    FileReader fr = new FileReader(new File(WikiHelper.getSpecificProperty("stopwordFile")));

    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_36, fr);

    // Index the full text of both documents, storing term vectors so they can be read back.
    @SuppressWarnings("deprecation")
    IndexWriter writer =
        new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_36, analyzer));
    Document doc1 = new Document();
    StringReader d1reader = new StringReader(d1);
    doc1.add(new Field("contents", d1reader, TermVector.YES));

    writer.addDocument(doc1);
    Document doc2 = new Document();
    StringReader d2reader = new StringReader(d2);
    doc2.add(new Field("contents", d2reader, TermVector.YES));
    writer.addDocument(doc2);
    writer.close(); // close() commits, so both documents are visible to the reader below

    DocVector[] docs = new DocVector[2];
    // Read the term frequency vector of each document back from the index.
    IndexReader RAMreader = IndexReader.open(ramDir);
    Map<String, Integer> terms = new HashMap<String, Integer>();

    TermFreqVector tfvs1 = RAMreader.getTermFreqVector(0, "contents");
    TermFreqVector tfvs2 = RAMreader.getTermFreqVector(1, "contents");
    if (tfvs1 == null || tfvs2 == null) {
      RAMreader.close();
      ramDir.close();
      return 0.0;
    }

    String[] termTexts1 = tfvs1.getTerms();
    String[] termTexts2 = tfvs2.getTerms();

    // Assign each distinct term an index in the map - the map is the shared vocabulary.
    int pos = 0;
    for (String term : termTexts1) {
      terms.put(term, pos++);
    }
    for (String term : termTexts2) {
      if (!terms.containsKey(term)) {
        terms.put(term, pos++);
      }
    }

    // Weight each document's entries by TF-IDF over the shared vocabulary.
    docs[0] = new DocVector(terms);
    docs[1] = new DocVector(terms);
    int[] termFreqs1 = tfvs1.getTermFrequencies();
    for (int j = 0; j < termTexts1.length; j++) {
      double idfValue = getIDF(RAMreader, termTexts1[j]);
      double tfIdfValue = termFreqs1[j] * idfValue;
      docs[0].setEntry(termTexts1[j], tfIdfValue);
    }

    int[] termFreqs2 = tfvs2.getTermFrequencies();
    for (int j = 0; j < termTexts2.length; j++) {
      double idfValue = getIDF(RAMreader, termTexts2[j]);
      double tfIdfValue = termFreqs2[j] * idfValue;
      docs[1].setEntry(termTexts2[j], tfIdfValue);
    }

    RAMreader.close();
    ramDir.close();

    // Return the cosine similarity of the two TF-IDF weighted document vectors.
    return calcCosineSimilarity(docs[0], docs[1]);
  }
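  /*
   * The DocVector and calcCosineSimilarity helpers are referenced above but not shown.
   * Below is a minimal sketch of the cosine computation they imply; the public
   * double[] 'vector' field on DocVector is an assumption, as is the exact layout:
   * cosine(a, b) = (a . b) / (|a| * |b|).
   */
  private static double calcCosineSimilarity(DocVector d1, DocVector d2) {
    double dot = 0.0, norm1 = 0.0, norm2 = 0.0;
    for (int i = 0; i < d1.vector.length; i++) { // both vectors share the same vocabulary size
      dot += d1.vector[i] * d2.vector[i];        // accumulate the dot product
      norm1 += d1.vector[i] * d1.vector[i];      // squared length of d1
      norm2 += d2.vector[i] * d2.vector[i];      // squared length of d2
    }
    if (norm1 == 0.0 || norm2 == 0.0) {
      return 0.0; // avoid division by zero for empty documents
    }
    return dot / (Math.sqrt(norm1) * Math.sqrt(norm2));
  }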
  @Override
  public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(COMPONENT_NAME, false)) {
      return;
    }

    NamedList termVectors = new NamedList();
    rb.rsp.add(TERM_VECTORS, termVectors);
    FieldOptions allFields = new FieldOptions();
    // figure out what options we have, and try to get the appropriate vector
    allFields.termFreq = params.getBool(TermVectorParams.TF, false);
    allFields.positions = params.getBool(TermVectorParams.POSITIONS, false);
    allFields.offsets = params.getBool(TermVectorParams.OFFSETS, false);
    allFields.docFreq = params.getBool(TermVectorParams.DF, false);
    allFields.tfIdf = params.getBool(TermVectorParams.TF_IDF, false);
    // Shortcut to enable all values.
    boolean all = params.getBool(TermVectorParams.ALL, false);
    if (all == true) {
      allFields.termFreq = true;
      allFields.positions = true;
      allFields.offsets = true;
      allFields.docFreq = true;
      allFields.tfIdf = true;
    }

    String fldLst = params.get(TermVectorParams.FIELDS);
    if (fldLst == null) {
      fldLst = params.get(CommonParams.FL);
    }

    // use this to validate our fields
    IndexSchema schema = rb.req.getSchema();
    // Build up our per field mapping
    Map<String, FieldOptions> fieldOptions = new HashMap<String, FieldOptions>();
    NamedList warnings = new NamedList();
    List<String> noTV = new ArrayList<String>();
    List<String> noPos = new ArrayList<String>();
    List<String> noOff = new ArrayList<String>();

    // we have specific fields to retrieve
    if (fldLst != null) {
      String[] fields = SolrPluginUtils.split(fldLst);
      for (String field : fields) {
        SchemaField sf = schema.getFieldOrNull(field);
        if (sf != null) {
          if (sf.storeTermVector()) {
            FieldOptions option = fieldOptions.get(field);
            if (option == null) {
              option = new FieldOptions();
              option.fieldName = field;
              fieldOptions.put(field, option);
            }
            // get the per field mappings
            option.termFreq = params.getFieldBool(field, TermVectorParams.TF, allFields.termFreq);
            option.docFreq = params.getFieldBool(field, TermVectorParams.DF, allFields.docFreq);
            option.tfIdf = params.getFieldBool(field, TermVectorParams.TF_IDF, allFields.tfIdf);
            // Validate these are even an option
            option.positions =
                params.getFieldBool(field, TermVectorParams.POSITIONS, allFields.positions);
            if (option.positions == true && sf.storeTermPositions() == false) {
              noPos.add(field);
            }
            option.offsets =
                params.getFieldBool(field, TermVectorParams.OFFSETS, allFields.offsets);
            if (option.offsets == true && sf.storeTermOffsets() == false) {
              noOff.add(field);
            }
          } else { // field doesn't have term vectors
            noTV.add(field);
          }
        } else {
          // field doesn't exist
          throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "undefined field: " + field);
        }
      }
    } // else, deal with all fields
    boolean hasWarnings = false;
    if (noTV.isEmpty() == false) {
      warnings.add("noTermVectors", noTV);
      hasWarnings = true;
    }
    if (noPos.isEmpty() == false) {
      warnings.add("noPositions", noPos);
      hasWarnings = true;
    }
    if (noOff.isEmpty() == false) {
      warnings.add("noOffsets", noOff);
      hasWarnings = true;
    }
    if (hasWarnings == true) {
      termVectors.add("warnings", warnings);
    }

    DocListAndSet listAndSet = rb.getResults();
    List<Integer> docIds = getInts(params.getParams(TermVectorParams.DOC_IDS));
    Iterator<Integer> iter;
    if (docIds != null && docIds.isEmpty() == false) {
      iter = docIds.iterator();
    } else {
      DocList list = listAndSet.docList;
      iter = list.iterator();
    }
    SolrIndexSearcher searcher = rb.req.getSearcher();

    IndexReader reader = searcher.getReader();
    // the TVMapper is a TermVectorMapper which can be used to optimize loading of Term Vectors
    SchemaField keyField = schema.getUniqueKeyField();
    String uniqFieldName = null;
    if (keyField != null) {
      uniqFieldName = keyField.getName();
    }
    // Only load the id field to get the uniqueKey of that field
    SetBasedFieldSelector fieldSelector =
        new SetBasedFieldSelector(
            Collections.singleton(uniqFieldName), Collections.<String>emptySet());
    TVMapper mapper = new TVMapper(reader);
    mapper.fieldOptions =
        allFields; // this will only stay set if fieldOptions.isEmpty() (in other words, only if the
                   // user didn't set any fields)
    while (iter.hasNext()) {
      Integer docId = iter.next();
      NamedList docNL = new NamedList();
      mapper.docNL = docNL;
      termVectors.add("doc-" + docId, docNL);

      if (keyField != null) {
        Document document = reader.document(docId, fieldSelector);
        Fieldable uniqId = document.getFieldable(uniqFieldName);
        String uniqVal = null;
        if (uniqId != null) {
          uniqVal = keyField.getType().storedToReadable(uniqId);
        }
        if (uniqVal != null) {
          docNL.add("uniqueKey", uniqVal);
          termVectors.add("uniqueKeyFieldName", uniqFieldName);
        }
      }
      if (fieldOptions.isEmpty() == false) {
        for (Map.Entry<String, FieldOptions> entry : fieldOptions.entrySet()) {
          mapper.fieldOptions = entry.getValue();
          reader.getTermFreqVector(docId, entry.getKey(), mapper);
        }
      } else {
        // deal with all fields by using the allFieldMapper
        reader.getTermFreqVector(docId, mapper);
      }
    }
  }
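  // Example request exercising this component (a sketch; assumes the component is registered
  // on the request handler - parameter names come from TermVectorParams):
  //
  //   /select?q=*:*&tv=true&tv.all=true&tv.fl=content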
 /** @see LuceneIndexReader#getTermFreqVector(int, String) */
 public TermFreqVector getTermFreqVector(int docNumber, String field) throws IOException {
   return indexReader.getTermFreqVector(docNumber, field);
 }
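 // Note: getTermFreqVector was removed in Lucene 4.0. A rough sketch of the modern
 // equivalent (Lucene 4.x signatures) iterates the Terms returned by getTermVector:
 //
 //   Terms terms = indexReader.getTermVector(docNumber, field);
 //   if (terms != null) {
 //     TermsEnum te = terms.iterator(null);
 //     BytesRef term;
 //     while ((term = te.next()) != null) {
 //       long freqInDoc = te.totalTermFreq(); // per-document frequency in a term vector
 //     }
 //   }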