/**
   * Describes every field of a document as a nested, ordered map.
   *
   * <p>For each field the entry contains, in this order: the schema type name ({@code "type"}),
   * the schema flags ({@code "schema"}), the flags of the field instance ({@code "flags"}), the
   * external value ({@code "value"}), the raw string value ({@code "internal"}), optionally the
   * Base64 encoded binary value ({@code "binary"}), the boost ({@code "boost"}), the document
   * frequency of the field's term ({@code "docFreq"}) and, when the field stores a term vector
   * that can be read, the term frequencies ({@code "termVector"}).
   *
   * @param doc the document whose fields are described
   * @param docId the id used to look up term vectors in {@code reader}
   * @param reader the index reader used for document frequencies and term vectors
   * @param schema the schema used to resolve field types; fields unknown to it get {@code null}
   *     for {@code "type"} and {@code "value"}
   * @return a map from field name to the description of that field
   * @throws IOException if the document frequency cannot be read from the index
   */
  private static SimpleOrderedMap<Object> getDocumentFieldsInfo(
      Document doc, int docId, IndexReader reader, IndexSchema schema) throws IOException {
    SimpleOrderedMap<Object> fieldsInfo = new SimpleOrderedMap<Object>();
    for (Object rawField : doc.getFields()) {
      Fieldable field = (Fieldable) rawField;
      String fieldName = field.name();

      SchemaField schemaField = schema.getFieldOrNull(fieldName);
      FieldType fieldType = null;
      if (schemaField != null) {
        fieldType = schemaField.getType();
      }
      boolean hasType = fieldType != null;

      SimpleOrderedMap<Object> entry = new SimpleOrderedMap<Object>();
      entry.add("type", hasType ? fieldType.getTypeName() : null);
      entry.add("schema", getFieldFlags(schemaField));
      entry.add("flags", getFieldFlags(field));

      // The term whose document frequency is reported further down.
      String indexedText = hasType ? fieldType.storedToIndexed(field) : field.stringValue();
      Term term = new Term(fieldName, indexedText);

      entry.add("value", hasType ? fieldType.toExternal(field) : null);

      // TODO: this really should be "stored"
      entry.add("internal", field.stringValue()); // may be a binary number

      byte[] binary = field.getBinaryValue();
      if (binary != null) {
        entry.add("binary", Base64.byteArrayToBase64(binary, 0, binary.length));
      }
      entry.add("boost", field.getBoost());

      // This can be 0 for non-indexed fields.
      int docFreq = 0;
      if (term.text() != null) {
        docFreq = reader.docFreq(term);
      }
      entry.add("docFreq", docFreq);

      // If we have a term vector, return that
      if (field.isTermVectorStored()) {
        try {
          TermFreqVector vector = reader.getTermFreqVector(docId, fieldName);
          if (vector != null) {
            String[] vectorTerms = vector.getTerms();
            int[] vectorFreqs = vector.getTermFrequencies();
            SimpleOrderedMap<Integer> termCounts = new SimpleOrderedMap<Integer>();
            for (int idx = 0; idx < vector.size(); idx++) {
              termCounts.add(vectorTerms[idx], vectorFreqs[idx]);
            }
            entry.add("termVector", termCounts);
          }
        } catch (Exception ex) {
          // Best effort: the field is still reported, just without its term vector.
          log.warn("error writing term vector", ex);
        }
      }

      fieldsInfo.add(fieldName, entry);
    }
    return fieldsInfo;
  }
  /**
   * Computes the cosine similarity of two term frequency vectors.
   *
   * <p>The union of the terms of both vectors forms the vocabulary. Each input is turned into a
   * {@code DocVector} over that vocabulary, weighted with its raw term frequencies, and normalized
   * before the similarity is computed.
   *
   * @param vec1 the first TermFreqVector
   * @param vec2 the second TermFreqVector
   * @return the cosine similarity of the TermFreqVectors
   */
  public double cosineSimilarity(TermFreqVector vec1, TermFreqVector vec2) throws IOException {
    // Get all of the terms and term frequencies in the two vectors
    String[] firstTerms = vec1.getTerms();
    String[] secondTerms = vec2.getTerms();
    int[] firstFreqs = vec1.getTermFrequencies();
    int[] secondFreqs = vec2.getTermFrequencies();

    // Vocabulary: every term mapped to its position in the document vectors
    HashMap<String, Integer> vocabulary = new HashMap<String, Integer>();
    int nextPosition = 0;
    for (int i = 0; i < firstTerms.length; i++) {
      vocabulary.put(firstTerms[i], nextPosition++);
    }
    for (int i = 0; i < secondTerms.length; i++) {
      if (vocabulary.containsKey(secondTerms[i])) {
        continue; // already positioned by the first vector
      }
      vocabulary.put(secondTerms[i], nextPosition++);
    }

    // One vector per document, both defined over the shared vocabulary
    DocVector first = new DocVector(vocabulary);
    DocVector second = new DocVector(vocabulary);

    // The term weights are the raw term frequencies
    for (int i = 0; i < firstTerms.length; i++) {
      first.setEntry(firstTerms[i], firstFreqs[i]);
    }
    for (int i = 0; i < secondTerms.length; i++) {
      second.setEntry(secondTerms[i], secondFreqs[i]);
    }

    first.normalize();
    second.normalize();

    // Cosine similarity: dot product divided by the product of the norms
    double dotProduct = first.vector.dotProduct(second.vector);
    double normProduct = first.vector.getNorm() * second.vector.getNorm();
    return dotProduct / normProduct;
  }
 /**
  * Returns the terms of a field of a document, trying the cache before the index.
  *
  * <p>The key built from {@code field} and {@code docId} is looked up with {@code
  * getAndPromote}. On a miss the term vector is read from {@code reader} and its terms are
  * stored with {@code put} before being returned.
  *
  * @param reader the reader the term vector is loaded from on a cache miss
  * @param docId the document id
  * @param field the field name
  * @param timer not used by this method
  * @return the terms, or {@code null} if the document has no term vector for the field or the
  *     vector has no terms array
  */
 public final String[] get(
     final ReaderLocal reader, final int docId, final String field, final Timer timer)
     throws IOException, ParseException, SyntaxError {
   final FieldContentCacheKey cacheKey = new FieldContentCacheKey(field, docId);
   final String[] cached = getAndPromote(cacheKey);
   if (cached != null) {
     return cached;
   }
   final TermFreqVector vector = reader.getTermFreqVector(docId, field);
   final String[] loaded = (vector == null) ? null : vector.getTerms();
   if (loaded == null) {
     // Nothing to cache: missing results are looked up again on the next call.
     return null;
   }
   put(cacheKey, loaded);
   return loaded;
 }
 // Example #4
 // 0
 /**
  * Counts, for every term of a field, the number of documents whose term vector contains it.
  *
  * <p>Documents without a term vector for the field, or whose vector has no terms or
  * frequencies array, are skipped. A term only counts for a document when its frequency in that
  * document is greater than zero.
  *
  * @param reader the reader the term vectors are read from
  * @param fieldName the field whose term vectors are inspected
  * @param docIdInterface the documents to inspect
  * @return a map from term to its facet item; empty when there are no documents
  */
 private static final Map<String, FacetItem> computeMultivaluedTFV(
     ReaderAbstract reader, String fieldName, DocIdInterface docIdInterface)
     throws IOException, SearchLibException {
   final Map<String, FacetItem> facets = new HashMap<String, FacetItem>();
   if (docIdInterface.getSize() == 0) {
     return facets;
   }
   for (int docId : docIdInterface.getIds()) {
     final TermFreqVector vector = reader.getTermFreqVector(docId, fieldName);
     if (vector == null) {
       continue;
     }
     final String[] terms = vector.getTerms();
     final int[] freqs = vector.getTermFrequencies();
     if (terms == null || freqs == null) {
       continue;
     }
     for (int idx = 0; idx < terms.length; idx++) {
       if (freqs[idx] <= 0) {
         continue;
       }
       final String term = terms[idx];
       final FacetItem existing = facets.get(term);
       if (existing != null) {
         existing.count++;
       } else {
         facets.put(term, new FacetItem(term, 1));
       }
     }
   }
   return facets;
 }
  /**
   * Builds a TF-IDF document vector for every sentence.
   *
   * <p>Each sentence is indexed as its own document (field {@code "contents"}, term vectors
   * enabled) into a temporary in-memory index, analyzed with a {@code StopAnalyzer} whose stop
   * words are read from {@code lib/stoplists/en.txt}. The vocabulary is the set of all terms of
   * the {@code "contents"} field. Every document vector is defined over that vocabulary, filled
   * with {@code tf * idf} weights and normalized.
   *
   * @param fileSentences the sentences to vectorize, one index document per element
   * @return one vector per sentence, where element {@code i} belongs to the index document with
   *     id {@code i}; a document without stored term vectors keeps an untouched, non-normalized
   *     vector
   * @throws IOException if the stop word file cannot be read or the index cannot be written or
   *     read
   */
  public static DocVector[] getCosineSimilarityMatrix(List<String> fileSentences)
      throws IOException {

    DocVector[] docs = new DocVector[fileSentences.size()];
    RAMDirectory ramDir = new RAMDirectory();
    try {
      FileReader fr = new FileReader(new File("lib/stoplists/en.txt"));
      Analyzer analyzer;
      try {
        analyzer = new StopAnalyzer(Version.LUCENE_36, fr);
      } finally {
        // The stop word list is only needed while the analyzer is constructed; closing a reader
        // that is already closed has no effect.
        fr.close();
      }

      // Index every sentence as a separate document
      IndexWriter writer =
          new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_36, analyzer));
      try {
        for (String sentence : fileSentences) {
          Document doc = new Document();
          doc.add(new Field("contents", new StringReader(sentence), TermVector.YES));
          writer.addDocument(doc);
        }
      } finally {
        writer.close();
      }

      IndexReader ramReader = IndexReader.open(ramDir);
      try {
        // Vocabulary: every term of the "contents" field mapped to its vector position
        Map<String, Integer> terms = new HashMap<String, Integer>();
        TermEnum termEnum = ramReader.terms(new Term("contents"));
        try {
          int pos = 0;
          // terms(Term) returns an enumeration that is already positioned on the first term at or
          // after the requested one, so the current term has to be read BEFORE next() is called;
          // otherwise the first vocabulary term is skipped.
          do {
            Term term = termEnum.term();
            if (term == null || !"contents".equals(term.field())) {
              break;
            }
            terms.put(term.text(), pos++);
          } while (termEnum.next());
        } finally {
          termEnum.close();
        }

        // Build a TF-IDF vector for each document
        for (int i = 0; i < fileSentences.size(); i++) {
          TermFreqVector[] tfvs = ramReader.getTermFreqVectors(i);
          docs[i] = new DocVector(terms);
          if (tfvs == null) {
            continue;
          }
          for (TermFreqVector tfv : tfvs) {
            String[] termTexts = tfv.getTerms();
            int[] termFreqs = tfv.getTermFrequencies();
            for (int j = 0; j < termTexts.length; j++) {
              double idfValue = getIDF(ramReader, termTexts[j]);
              double tfIdfValue = termFreqs[j] * idfValue;
              docs[i].setEntry(termTexts[j], tfIdfValue);
            }
          }
          docs[i].normalize();
        }
      } finally {
        ramReader.close();
      }
    } finally {
      ramDir.close();
    }
    return docs;
  }
  /**
   * Calculates the cosine similarity between two documents.
   *
   * <p>Both texts are indexed into a temporary in-memory index (field {@code "contents"}, term
   * vectors enabled), analyzed with a {@code StopAnalyzer} whose stop word file is named by the
   * {@code "stopwordFile"} property. The union of the terms of both documents forms the
   * vocabulary. Each document becomes a vector of {@code tf * idf} weights over it, and the two
   * vectors are handed, without normalization, to {@code calcCosineSimilarity}.
   *
   * @param d1 the first document
   * @param d2 the second document
   * @return the cosine similarity, or {@code 0.0} if either document has no term vector
   * @throws IOException if the stop word file cannot be read or the index cannot be written or
   *     read
   */
  public double getCosineSimilarity(String d1, String d2) throws IOException {

    DocVector[] docs = new DocVector[2];
    RAMDirectory ramDir = new RAMDirectory();
    try {
      FileReader fr = new FileReader(new File(WikiHelper.getSpecificProperty("stopwordFile")));
      Analyzer analyzer;
      try {
        analyzer = new StopAnalyzer(Version.LUCENE_36, fr);
      } finally {
        // The stop word list is only needed while the analyzer is constructed; closing a reader
        // that is already closed has no effect.
        fr.close();
      }

      // Index the full text of both documents
      IndexWriter writer =
          new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_36, analyzer));
      try {
        Document doc1 = new Document();
        doc1.add(new Field("contents", new StringReader(d1), TermVector.YES));
        writer.addDocument(doc1);

        Document doc2 = new Document();
        doc2.add(new Field("contents", new StringReader(d2), TermVector.YES));
        writer.addDocument(doc2);
      } finally {
        writer.close();
      }

      // Build a term vector for each document
      IndexReader ramReader = IndexReader.open(ramDir);
      try {
        TermFreqVector tfv1 = ramReader.getTermFreqVector(0, "contents");
        TermFreqVector tfv2 = ramReader.getTermFreqVector(1, "contents");
        if (tfv1 == null || tfv2 == null) {
          // The finally blocks still release the reader and the directory on this early exit.
          return 0.0;
        }

        String[] termTexts1 = tfv1.getTerms();
        String[] termTexts2 = tfv2.getTerms();

        // Store the terms and their positions in a hashmap - this represents the vocabulary
        Map<String, Integer> terms = new HashMap<String, Integer>();
        int pos = 0;
        for (String term : termTexts1) {
          terms.put(term, pos++);
        }
        for (String term : termTexts2) {
          if (!terms.containsKey(term)) {
            terms.put(term, pos++);
          }
        }

        docs[0] = new DocVector(terms);
        docs[1] = new DocVector(terms);

        // Weight every term of the first document with tf * idf
        int[] termFreqs1 = tfv1.getTermFrequencies();
        for (int j = 0; j < termTexts1.length; j++) {
          double idfValue = getIDF(ramReader, termTexts1[j]);
          double tfIdfValue = termFreqs1[j] * idfValue;
          docs[0].setEntry(termTexts1[j], tfIdfValue);
        }

        // Weight every term of the second document with tf * idf
        int[] termFreqs2 = tfv2.getTermFrequencies();
        for (int j = 0; j < termTexts2.length; j++) {
          double idfValue = getIDF(ramReader, termTexts2[j]);
          double tfIdfValue = termFreqs2[j] * idfValue;
          docs[1].setEntry(termTexts2[j], tfIdfValue);
        }
      } finally {
        ramReader.close();
      }
    } finally {
      ramDir.close();
    }

    // Return the cosine similarity of the term vectors
    return calcCosineSimilarity(docs[0], docs[1]);
  }