/** * This will group by the rank, inspect higher taxa and attempt to merge where possible. In simple * terms, grouped at rank with common values d: a,-,c,d a,b,-,d -,b,-,d will merge into a,b,c,d * where NO CONFLICTS exist. Introducing a conflict: a,-,c,d a,b,-,d -,b,-,d e,-,-,d will produce: * a,b,c,d e,-,-,d * * @param rank The rank being operated on * @param denorm To merge across. Must be sorted to the rank declared before calling */ public void merge( final LINNEAN_RANK rank, List<DenormClassification> denorm, Map<LINNEAN_RANK, Set<String>> homonyms) { if (LOG.isDebugEnabled()) { LOG.debug("Starting merging classifications[{}] at rank[{}]", denorm.size(), rank); DenormClassificationUtils.debug(denorm); } // holds the group of things to merge List<DenormClassification> workingGroup = new ArrayList<DenormClassification>(); // the previous record DenormClassification prev = null; for (DenormClassification curr : denorm) { if (prev == null) { // first record workingGroup.add(curr); } else { if (!StringUtils.equals(curr.get(rank), prev.get(rank))) { // there is a change, so perform any necessary merging and copy to merged inferHigherTaxa(denorm, workingGroup, rank, homonyms); // refresh the working group workingGroup.clear(); } workingGroup.add(curr); } prev = curr; } // ensure the last row is handled inferHigherTaxa(denorm, workingGroup, rank, homonyms); }
/** * Inspects the group which should all have the same value at the declared rank. This will infer * higher taxa for each row where there are no conflicts. For each rank starting kingdom then * phylum etc: a) for each that is null at that rank, consider any with a non-null value at that * rank as candidate for copying the rank value b) check each candidate and remove it from the * potential candidates if there is a conflict c) if there is 1 candidate at the end use it, * otherwise, it cannot be used * * @param source To infer what is possible * @param rank the most significant rank being operated on (inclusive) Passing rank of genus, * means you infer k,p,g,o,f and genus */ private void inferHigherTaxa( List<DenormClassification> taxonomy, List<DenormClassification> group, final LINNEAN_RANK rank, Map<LINNEAN_RANK, Set<String>> homonyms) { if (group.size() <= 1) { LOG.debug( "Nothing to merge at rank[{}] since there is/are {} classification(s)", rank, group.size()); } else if (StringUtils.isBlank(group.get(0).get(rank))) { LOG.debug( "Skipping merging of group since the group represents a group with null at the rank"); } else { LOG.info( "Merging classifications[{}] at rank[{}] for group: {}", new Object[] {group.size(), rank, group.get(0).get(rank)}); if (LOG.isDebugEnabled()) { DenormClassificationUtils.debug(group); } // we know we will receive a lot of duplicates, so extract them for performance Map<String, DenormClassification> distinctClassifications = distinctClassifications(group, rank); // inspect from the highest rank to the working rank in order, inferring as we go for (LINNEAN_RANK r : LinneanRank.ranksHigherThan(rank, false)) { // get the "sparse" classifications (e.g. with a null at the rank in question) List<DenormClassification> sparseRecords = new ArrayList<DenormClassification>(); for (DenormClassification d : group) { if (StringUtils.isBlank(d.get(r))) { sparseRecords.add(d); } } // don't continue if there are no sparse records if (sparseRecords.size() < 1) { if (LOG.isDebugEnabled()) { LOG.debug("No classification(s) is/are empty at rank[{}]", r); DenormClassificationUtils.debug(group); } } else { if (LOG.isDebugEnabled()) { LOG.debug(sparseRecords.size() + " classification(s) is/are empty at rank[{}]", r); DenormClassificationUtils.debug(group); } // a classification with a value is a candidate List<DenormClassification> candidates = new ArrayList<DenormClassification>(); for (DenormClassification dc : distinctClassifications.values()) { if (StringUtils.isNotBlank(dc.get(r))) { candidates.add(dc); } } LOG.debug( "{} classification(s) is/are potential candidate(s) from which rank[{}] might be inferred", candidates.size(), r); // for each, check against those with values at the rank if (!candidates.isEmpty()) { for (DenormClassification d : sparseRecords) { LOG.debug("Attempting to infer rank[{}] for: {}", r, d); // Check each candidate, and add it to the options Set<String> potentials = new HashSet<String>(); for (DenormClassification candidate : candidates) { // if they conflict in higher taxonomy, remove it from the candidate list if (DenormClassificationUtils.haveConflict(d, candidate, r)) { LOG.debug( "Ignoring candidate from potential options due to a conflict: {}", candidate); } else if (DenormClassificationUtils.shareHigherTaxonomy(d, candidate, r)) { LOG.debug( "Adding option[{}] for rank[{}] from candidate: {}", new Object[] {candidate.get(r), r, candidate}); potentials.add(candidate.get(r)); } else { // since we are doing ranks in order, the higher taxa must be identical, or // we have ambiguity. Consider: // a,b,c // -,-,c // d,-,c // If we are on the middle rank and the second row, we see no conflict to // a,b,c but cannot assume b LOG.debug( "Ignoring candidate from potential options due to ambiguity: {}", candidate); } } // within this rank we have now the candidates, but consider working at the 3rd column // in: // a - c d -> this row could be "b" but not definitely because: // - b c d -> this row could be "e" or "a" // e - - d // we would now set the a or b happily, ignorant of the homonym at a lower rank with // empty // 3rd column // To counter this, we hunt for homonyms at any lower rank with a null at the rank we // are working at boolean homonymFound = false; if (StringUtils.isNotBlank(d.get(rank))) { // homonymFound = homonymInLowerRankScan(taxonomy,rank, d); homonymFound = homonymInLowerRank(rank, homonyms, d); } if (homonymFound) { // if homonyms exist, one cannot make inferences LOG.debug("Homonyms found, so rank[{}] cannot be inferred for: {}", r, d); } else if (potentials.size() == 1) { // if there is only one option, use it String value = potentials.iterator().next(); LOG.debug( "{} classification(s) unanimously provided option[{}] at rank[{}] for: {}", new Object[] {candidates.size(), value, r, d}); d.set(r, value); } else { LOG.debug( "{} classification(s) provided {} options at rank[{}], so cannot be inferred for: {}", new Object[] {candidates.size(), potentials.size(), r, d}); // this means this group represents a homonym which should be stored for future // decisions merging // higher taxa if (StringUtils.isNotBlank(d.get(rank))) { LOG.debug("Adding homonym[{}] to rank[{}]", d.get(rank), rank); homonyms.get(rank).add(d.get(rank)); } } } } } } } }