/** * returns docs with ALL given sentiments. special cases: sentiments can be an array of length 1 * and be "None", in which case all documents with no sentiments are returned. special cases: * sentiments can be an array of length 1 and be "all", in which case all documents with any * sentiments are returned. * * @param captions */ public Collection<Document> getDocsWithSentiments( String sentiments[], Indexer indexer, Collection<Document> docs, int cluster, boolean originalContentOnly, String... captions) { Collection<Document> result = null; // note: multiple sentiments are possible, they are ANDED if (sentiments == null || sentiments.length == 0) return result; Set<Document> docs_set = Util.castOrCloneAsSet(docs); if (sentiments.length == 1 && "all".equalsIgnoreCase(sentiments[0])) return getDocsWithAnyEmotions(indexer, docs_set, originalContentOnly); // note: we'll pass in null for docs, and intersect with the given set of docs later // otherwise we'd just be doing it again and again for each category and lexer Map<String, Collection<Document>> map = getEmotions(indexer, null, false, originalContentOnly, captions); for (int i = 0; i < sentiments.length; i++) { Collection<Document> temp1 = ("None".equalsIgnoreCase(sentiments[i])) ? getDocsWithNoEmotions(indexer, docs_set, originalContentOnly) : map.get(sentiments[i]); if (temp1 == null) { // no matches, just return result = new LinkedHashSet<Document>(); return result; } if (result == null) result = temp1; else result.retainAll(temp1); } // result.retainAll(docs); return Util.setIntersection(result, docs_set); }
/** * Core sentiment detection method. doNota = none of the above * * @param captions (null/none = all) */ public Map<String, Collection<Document>> getEmotions( Indexer indexer, Collection<Document> docs, boolean doNota, boolean originalContentOnly, String... captions) { Collection<Lexicon1Lang> lexicons = getRelevantLexicon1Langs(docs); Map<String, Collection<Document>> result = new LinkedHashMap<>(); Set<Document> docs_set = Util.castOrCloneAsSet(docs); // aggregate results for each lang into result for (Lexicon1Lang lex : lexicons) { Map<String, Collection<Document>> resultsForThisLang = (doNota ? lex.getEmotionsWithNOTA(indexer, docs_set, originalContentOnly) : lex.getEmotions(indexer, docs_set, originalContentOnly, captions)); if (resultsForThisLang == null) continue; for (String caption : resultsForThisLang.keySet()) { Collection<Document> resultDocsThisLang = resultsForThisLang.get(caption); Collection<Document> resultDocs = result.get(caption); // if caption doesn't exist already, create a new entry, or else add to the existing set of // docs that match this caption if (resultDocs == null) result.put(caption, resultDocsThisLang); else resultDocs.addAll(resultDocsThisLang); } } // TODO: the result can be cached at server to avoid redundant computation (by concurrent users, // which are few for now) return result; }
/** * main entry point: returns a category -> docs map for each (non-zero) category in the current * captionToQueryMap. * * @indexer must already have run * @docs results are restrictes to these docs. assumes all docs if docs is null or empty. * @captions (null/none = all) * <p>vihari This is a weird name for a method that returns documents with emotions instead * of emotions. */ public Map<String, Collection<Document>> getEmotions( Indexer indexer, Collection<Document> docs, boolean originalContentOnly, String... captions) { Map<String, Collection<Document>> result = new LinkedHashMap<String, Collection<Document>>(); Set<Document> docs_set = Util.castOrCloneAsSet(docs); // for (String[] emotion: emotionsData) String[] selected_captions = captions.length > 0 ? captions : captionToExpandedQuery.keySet().toArray(new String[0]); for (String caption : selected_captions) { String query = captionToExpandedQuery.get(caption); if (query == null) { log.warn("Skipping unknown caption '" + caption + "'"); continue; } // query is simply word1|word2|word3 etc for that sentiment // the -1 indicates that we want all docs in the indexer that match the query int threshold = 1; Indexer.QueryOptions options = new Indexer.QueryOptions(); options.setThreshold(threshold); options.setQueryType(Indexer.QueryType.ORIGINAL); Collection<Document> docsForCaption = indexer.docsForQuery(query, options); /* log.info (docsForCaption.size() + " before"); threshold = 2; docsForCaption = indexer.docsForQuery(query, -1, threshold); log.info (docsForCaption.size() + " after"); */ // Set<Document> docs = indexer.docsWithPhraseThreshold(query, -1, 2); // in future, we // may have a higher threshold for sentiment matching // if @param docs is present, retain only those docs that match, otherwise retain all if (!Util.nullOrEmpty(docs_set)) // docsForCaption.retainAll(docs_set); docsForCaption = Util.listIntersection(docsForCaption, docs_set); // put it in the result only if at least 1 doc matches if (docsForCaption.size() > 0) result.put(caption, docsForCaption); } return result; }
// accumulates counts returned by lexicons in each language // TODO: It is possible to write a generic accumulator that accumulates sum over all the languages public Map<String, Integer> getLexiconCounts(Indexer indexer, boolean originalContentOnly) { List<Document> docs = indexer.docs; Collection<Lexicon1Lang> lexicons = getRelevantLexicon1Langs(docs); Map<String, Integer> result = new LinkedHashMap<String, Integer>(); Set<Document> docs_set = Util.castOrCloneAsSet(docs); // aggregate results for each lang into result for (Lexicon1Lang lex : lexicons) { Map<String, Integer> resultsForThisLang = lex.getLexiconCounts(indexer, originalContentOnly); if (resultsForThisLang == null) continue; for (String caption : resultsForThisLang.keySet()) { Integer resultCountsThisLang = resultsForThisLang.get(caption); Integer resultCounts = result.get(caption); // if caption doesn't exist already, create a new entry, or else add to the existing set of // docs that match this caption if (resultCounts == null) result.put(caption, resultCountsThisLang); else result.put(caption, resultCounts + resultCountsThisLang); } } return result; }