예제 #1
0
파일: Lexicon.java 프로젝트: ePADD/muse
  /**
   * Returns docs matching ALL of the given sentiments (results are ANDed together).
   *
   * <p>Special cases: if {@code sentiments} is a single-element array containing "None", all
   * documents with no sentiments are returned; if it contains "all", all documents with any
   * sentiment are returned.
   *
   * @param sentiments sentiment names to match; null or empty yields null
   * @param indexer index that has already been built over the docs
   * @param docs docs the result is restricted to
   * @param cluster unused here; kept for interface compatibility
   * @param originalContentOnly if true, match only against original (non-quoted) content
   * @param captions lexicon captions to consider (none = all)
   * @return docs matching all given sentiments, intersected with {@code docs}
   */
  public Collection<Document> getDocsWithSentiments(
      String sentiments[],
      Indexer indexer,
      Collection<Document> docs,
      int cluster,
      boolean originalContentOnly,
      String... captions) {
    // note: multiple sentiments are possible; they are ANDed
    if (sentiments == null || sentiments.length == 0) return null;

    Set<Document> docs_set = Util.castOrCloneAsSet(docs);
    if (sentiments.length == 1 && "all".equalsIgnoreCase(sentiments[0]))
      return getDocsWithAnyEmotions(indexer, docs_set, originalContentOnly);

    // note: we pass null for docs and intersect with the given set of docs at the end,
    // otherwise we'd redo the restriction for each category and lexer
    Map<String, Collection<Document>> map =
        getEmotions(indexer, null, false, originalContentOnly, captions);
    Collection<Document> result = null;
    for (String sentiment : sentiments) {
      Collection<Document> matches =
          "None".equalsIgnoreCase(sentiment)
              ? getDocsWithNoEmotions(indexer, docs_set, originalContentOnly)
              : map.get(sentiment);
      // no docs for one of the ANDed terms => overall result is empty
      if (matches == null) return new LinkedHashSet<Document>();
      // copy before intersecting: calling retainAll directly on `matches` would mutate the
      // collection owned by the emotions map / getDocsWithNoEmotions
      if (result == null) result = new LinkedHashSet<Document>(matches);
      else result.retainAll(matches);
    }
    return Util.setIntersection(result, docs_set);
  }
예제 #2
0
파일: Lexicon.java 프로젝트: ePADD/muse
  /**
   * Core sentiment detection: aggregates per-language lexicon results into a single
   * caption -> docs map. doNota = "none of the above".
   *
   * @param indexer index that has already been built
   * @param docs docs to restrict to (null = all docs in the indexer)
   * @param doNota if true, compute the "none of the above" breakdown instead of per-caption
   * @param originalContentOnly if true, match only against original (non-quoted) content
   * @param captions captions to compute (null/none = all)
   * @return map from caption to matching docs, aggregated across languages
   */
  public Map<String, Collection<Document>> getEmotions(
      Indexer indexer,
      Collection<Document> docs,
      boolean doNota,
      boolean originalContentOnly,
      String... captions) {
    Collection<Lexicon1Lang> lexicons = getRelevantLexicon1Langs(docs);
    Map<String, Collection<Document>> result = new LinkedHashMap<>();
    Set<Document> docs_set = Util.castOrCloneAsSet(docs);
    // aggregate results for each lang into result
    for (Lexicon1Lang lex : lexicons) {
      Map<String, Collection<Document>> resultsForThisLang =
          doNota
              ? lex.getEmotionsWithNOTA(indexer, docs_set, originalContentOnly)
              : lex.getEmotions(indexer, docs_set, originalContentOnly, captions);
      if (resultsForThisLang == null) continue;

      for (Map.Entry<String, Collection<Document>> entry : resultsForThisLang.entrySet()) {
        // accumulate into a set we own: the original stored the per-lang collection directly
        // and later called addAll on it, mutating the lexicon's own result collection
        result
            .computeIfAbsent(entry.getKey(), k -> new LinkedHashSet<>())
            .addAll(entry.getValue());
      }
    }
    // TODO: the result can be cached at server to avoid redundant computation (by concurrent
    // users, which are few for now)
    return result;
  }
예제 #3
0
파일: Lexicon.java 프로젝트: ePADD/muse
    /**
     * Main entry point: returns a caption -> matching-docs map for each (non-empty) category in
     * the current captionToExpandedQuery map.
     *
     * <p>vihari: this is a weird name for a method that returns documents with emotions instead
     * of emotions.
     *
     * @param indexer must already have been run
     * @param docs results are restricted to these docs; assumes all docs if null or empty
     * @param originalContentOnly if true, match only against original (non-quoted) content
     * @param captions captions to evaluate (null/none = all known captions)
     * @return map containing only the captions that matched at least one doc
     */
    public Map<String, Collection<Document>> getEmotions(
        Indexer indexer,
        Collection<Document> docs,
        boolean originalContentOnly,
        String... captions) {
      Map<String, Collection<Document>> result = new LinkedHashMap<String, Collection<Document>>();
      Set<Document> docs_set = Util.castOrCloneAsSet(docs);
      // the contract allows a null captions array ("null/none = all"), but the original
      // dereferenced captions.length unconditionally and would NPE; guard it here
      String[] selected_captions =
          (captions != null && captions.length > 0)
              ? captions
              : captionToExpandedQuery.keySet().toArray(new String[0]);
      for (String caption : selected_captions) {
        String query = captionToExpandedQuery.get(caption);
        if (query == null) {
          log.warn("Skipping unknown caption '" + caption + "'");
          continue;
        }

        // query is simply word1|word2|word3 etc. for that sentiment; threshold 1 means a
        // single occurrence is enough for a doc to match (could be raised in the future)
        Indexer.QueryOptions options = new Indexer.QueryOptions();
        options.setThreshold(1);
        options.setQueryType(Indexer.QueryType.ORIGINAL);
        Collection<Document> docsForCaption = indexer.docsForQuery(query, options);

        // if docs is present, retain only those docs that match; otherwise retain all
        if (!Util.nullOrEmpty(docs_set))
          docsForCaption = Util.listIntersection(docsForCaption, docs_set);

        // record the caption only if at least one doc matched
        if (docsForCaption.size() > 0) result.put(caption, docsForCaption);
      }
      return result;
    }
예제 #4
0
파일: Lexicon.java 프로젝트: ePADD/muse
  // TODO: it is possible to write a generic accumulator that sums over all the languages
  /**
   * Accumulates the match counts returned by the lexicons of each language.
   *
   * @param indexer index whose docs are counted
   * @param originalContentOnly if true, count only matches in original (non-quoted) content
   * @return map from caption to its total count summed across all relevant languages
   */
  public Map<String, Integer> getLexiconCounts(Indexer indexer, boolean originalContentOnly) {
    List<Document> docs = indexer.docs;
    Collection<Lexicon1Lang> lexicons = getRelevantLexicon1Langs(docs);
    Map<String, Integer> result = new LinkedHashMap<String, Integer>();
    // (the original also built an unused docs_set copy here; removed)
    // aggregate counts for each lang into result
    for (Lexicon1Lang lex : lexicons) {
      Map<String, Integer> resultsForThisLang = lex.getLexiconCounts(indexer, originalContentOnly);
      if (resultsForThisLang == null) continue;

      // sum counts for captions that appear in more than one language
      for (Map.Entry<String, Integer> entry : resultsForThisLang.entrySet())
        result.merge(entry.getKey(), entry.getValue(), Integer::sum);
    }
    return result;
  }