/**
 * Restricts {@code scores} to the elements contained in {@code restrictionSet}.
 *
 * <p>{@code scores} is never modified: when a restriction applies, a copy of it is filtered and
 * returned; when no restriction applies, {@code scores} itself (not a copy) is returned.
 *
 * @param scores multiset to be restricted
 * @param restrictionSet elements to retain; {@code null} or empty means "no restriction"
 * @return a new multiset holding only those elements of {@code scores} that are present in
 *     {@code restrictionSet} (if {@code restrictionSet} is non-null and non-empty); otherwise
 *     {@code scores} itself
 */
public static Multiset<Integer> intersect(
    final Multiset<Integer> scores, final Set<Integer> restrictionSet) {
  if (restrictionSet == null || restrictionSet.isEmpty()) {
    return scores;
  }

  final int prevSize = scores.size();
  final Multiset<Integer> intersection = HashMultiset.create(scores);
  intersection.retainAll(restrictionSet);

  // Nothing to report (and nothing to divide by) when the input was empty.
  if (prevSize != 0) {
    // Computed in long: (prevSize - newSize) * 100 can overflow int for very large multisets.
    final long savedPercent = ((long) (prevSize - intersection.size()) * 100L) / prevSize;
    log.debug("Size saving by retainAll = " + savedPercent + "%");
  }
  return intersection;
}
/** * @param orConditions StatisticsQueryOrConditions<StatisticsQueryCondition> * @param statisticsStorage * @param scoringExps Set of experiments that have at least one non-zero score for * statisticsQuery. This is used retrieving efos to be displayed in heatmap when no query efvs * exist (c.f. atlasStatisticsQueryService.getScoringAttributesForGenes()) * @return Multiset<Integer> containing experiment counts corresponding to all attributes in each * StatisticsQueryCondition in orConditions */ private static Multiset<Integer> getScoresForOrConditions( final StatisticsQueryOrConditions<StatisticsQueryCondition> orConditions, StatisticsStorage statisticsStorage, Set<ExperimentInfo> scoringExps) { Multiset<Integer> scores = HashMultiset.create(); for (StatisticsQueryCondition orCondition : orConditions.getConditions()) { orCondition.setBioEntityIdRestrictionSet(orConditions.getBioEntityIdRestrictionSet()); scores.addAll(scoreQuery(orCondition, statisticsStorage, scoringExps)); } // Now apply orConditions' min experiments restriction to scores Multiset<Integer> qualifyingScores = HashMultiset.create(); for (Multiset.Entry<Integer> entry : scores.entrySet()) { if (entry.getCount() >= orConditions.getMinExperiments()) { qualifyingScores.setCount(entry.getElement(), entry.getCount()); } } return qualifyingScores; }
/**
 * Scores {@code statsQuery} via {@code scoreQuery} and, when the result is non-empty, logs the
 * query, the number of distinct result elements and the elapsed time at debug level.
 *
 * @param statsQuery the query to score
 * @param statisticsStorage passed through to {@code scoreQuery}
 * @param scoringExps passed through to {@code scoreQuery}: set of experiments that have at least
 *     one non-zero score for the query. This is used when retrieving efos to be displayed in
 *     heatmap when no query efvs exist (c.f.
 *     atlasStatisticsQueryService.getScoringAttributesForGenes())
 * @return experiment counts for {@code statsQuery}, exactly as returned by {@code scoreQuery}
 */
public static Multiset<Integer> getExperimentCounts(
    StatisticsQueryCondition statsQuery,
    StatisticsStorage statisticsStorage,
    Set<ExperimentInfo> scoringExps) {
  // nanoTime is monotonic; currentTimeMillis follows the wall clock and can jump (even
  // backwards) when the system time is adjusted, which would corrupt the reported duration.
  final long startNanos = System.nanoTime();
  final Multiset<Integer> counts =
      StatisticsQueryUtils.scoreQuery(statsQuery, statisticsStorage, scoringExps);
  final long elapsedNanos = System.nanoTime() - startNanos;

  final int numOfGenesWithCounts = counts.elementSet().size();
  if (numOfGenesWithCounts > 0) {
    final long dur = elapsedNanos / 1000000L; // nanoseconds -> milliseconds
    log.debug(
        "StatisticsQuery: "
            + statsQuery.prettyPrint()
            + " ==> result set size: "
            + numOfGenesWithCounts
            + " (duration: "
            + dur
            + " ms)");
  }
  return counts;
}
/** * The core scoring method for statistics queries * * @param statisticsQuery query to be peformed on statisticsStorage * @param statisticsStorage core data for Statistics qeries * @param scoringExps an out parameter. * <p>- If null, experiment counts result of statisticsQuery should be returned. if - If * non-null, it serves as a flag that an optimised statisticsQuery should be performed to just * collect Experiments for which non-zero counts exist for Statistics query. A typical call * scenario in this case is just one efv per statisticsQuery, in which we can both: 1. check * if the efv Attribute itself is a scoring one 2. map this Attribute and Experimeants in * scoringExps to efo terms - via the reverse mapping efv-experiment-> efo term in EfoIndex * (c.f. atlasStatisticsQueryService.getScoringAttributesForGenes()) * @return Multiset of aggregated experiment counts, where the set of scores genes is intersected * across statisticsQuery.getConditions(), and union-ed across attributes within each * condition in statisticsQuery.getConditions(). 
*/ public static Multiset<Integer> scoreQuery( StatisticsQueryCondition statisticsQuery, final StatisticsStorage statisticsStorage, Set<ExperimentInfo> scoringExps) { // gatherScoringExpsOnly -> experiment counts should be calculated for statisticsQuery // !gatherScoringExpsOnly -> scoring experiments should be collected (into scoringExps) only boolean gatherScoringExpsOnly = scoringExps != null; Set<StatisticsQueryOrConditions<StatisticsQueryCondition>> andStatisticsQueryConditions = statisticsQuery.getConditions(); Multiset<Integer> results = null; if (andStatisticsQueryConditions.isEmpty()) { // End of recursion Set<Integer> bioEntityIdRestrictionSet = statisticsQuery.getBioEntityIdRestrictionSet(); Set<EfAttribute> attributes = statisticsQuery.getAttributes(); if (attributes.isEmpty()) { // No attributes were provided - we have to use pre-computed scores across all attributes Multiset<Integer> scoresAcrossAllEfos = statisticsStorage.getScoresAcrossAllEfos(statisticsQuery.getStatisticsType()); results = intersect(scoresAcrossAllEfos, bioEntityIdRestrictionSet); } else { results = HashMultiset.create(); setQueryExperiments(statisticsQuery, statisticsStorage); // For each experiment in the query, traverse through all attributes and add all gene // indexes into one ConciseSet. This way a gene can score // only once for a single experiment - across all OR attributes in this query. 
Once all // attributes have been traversed for a single experiment, // add ConciseSet to Multiset results for (ExperimentInfo exp : statisticsQuery.getExperiments()) { FastSet statsForExperiment = new FastSet(); for (EfAttribute attr : attributes) { Map<ExperimentInfo, ConciseSet> expsToStats = getStatisticsForAttribute( statisticsQuery.getStatisticsType(), attr, statisticsStorage); if (expsToStats != null) { if (expsToStats.isEmpty()) { log.debug( "Failed to retrieve stats for stat: " + statisticsQuery.getStatisticsType() + " and attr: " + attr); } else { if (expsToStats.get(exp) != null) { if (!gatherScoringExpsOnly) { statsForExperiment.addAll( intersect(expsToStats.get(exp), bioEntityIdRestrictionSet)); } else if (containsAtLeastOne(expsToStats.get(exp), bioEntityIdRestrictionSet)) { // exp contains at least one non-zero score for at least one gene index in // bioEntityIdRestrictionSet -> add it to scoringExps scoringExps.add(exp); } } else { log.debug( "Failed to retrieve stats for stat: " + statisticsQuery.getStatisticsType() + " exp: " + exp.getAccession() + " and attr: " + attr); } } } } if (!gatherScoringExpsOnly) { results.addAll(statsForExperiment); } } } } else { // run over all AND conditions, do "OR" inside (cf. 
scoreOrStatisticsQueryConditions()) , // "AND"'ing over the whole thing for (StatisticsQueryOrConditions<StatisticsQueryCondition> orConditions : andStatisticsQueryConditions) { // Pass gene restriction set down to orConditions orConditions.setGeneRestrictionSet(statisticsQuery.getBioEntityIdRestrictionSet()); // process OR conditions Multiset<Integer> condGenes = getScoresForOrConditions(orConditions, statisticsStorage, scoringExps); if (results == null) results = condGenes; else { Iterator<Multiset.Entry<Integer>> resultGenes = results.entrySet().iterator(); while (resultGenes.hasNext()) { Multiset.Entry<Integer> entry = resultGenes.next(); if (!condGenes.contains( entry.getElement())) // AND operation between different top query conditions resultGenes.remove(); else // for all gene ids belonging to intersection of all conditions seen so far, we // accumulate experiment counts results.setCount( entry.getElement(), entry.getCount() + condGenes.count(entry.getElement())); } } } } if (results == null) { results = HashMultiset.create(); } return results; }