/** * This implementation does the following: a) infers missing genera b) sorts to the scientific * name c) does a homonym aware merge to fill holes in a classification: a,-,c,d a,b,-,d would * merge to a,b,c,d. */ public List<NormClassification> normalize(List<DenormClassification> denorm) { List<NormClassification> result = new ArrayList<NormClassification>(); // infer missing values DenormClassificationUtils.inferSpecies(denorm); DenormClassificationUtils.inferGenera(denorm); Map<LINNEAN_RANK, Set<String>> homonyms = createHomonymCache(); // respecting homonymns, merge higher classification into as few as possible // a,-,c,d // a,b,-,d // would merge to a,b,c,d for example long time = System.currentTimeMillis(); sortAndMerge(LINNEAN_RANK.SS, denorm, homonyms); sortAndMerge(LINNEAN_RANK.S, denorm, homonyms); sortAndMerge(LINNEAN_RANK.G, denorm, homonyms); sortAndMerge(LINNEAN_RANK.F, denorm, homonyms); sortAndMerge(LINNEAN_RANK.O, denorm, homonyms); sortAndMerge(LINNEAN_RANK.C, denorm, homonyms); sortAndMerge(LINNEAN_RANK.P, denorm, homonyms); sortAndMerge(LINNEAN_RANK.K, denorm, homonyms); LOG.info( "Completed classification merging at all ranks in {} sec(s)", (1 + System.currentTimeMillis() - time) / 1000); // now resort to ensure correct ordering from the bottom up Collections.sort(denorm, DenormClassificationUtils.FULL_COMPARATOR); time = System.currentTimeMillis(); LOG.info("Building normalized tree structure for {} classifications", denorm.size()); int id = 1; Map<Integer, NormClassification> norm = new HashMap<Integer, NormClassification>(); Map<LINNEAN_RANK, Integer> parentIds = new HashMap<LINNEAN_RANK, Integer>(); DenormClassification prev = null; for (DenormClassification curr : denorm) { // capture first row boolean change = prev == null; // find where they differ LINNEAN_RANK deviation = change ? LINNEAN_RANK.K : DenormClassificationUtils.rankOfDeviation(curr, prev); if (LOG.isDebugEnabled()) { LOG.debug( "Deviation with previous is at rank[{}] for curr[{}] prev[{}]", new Object[] {deviation, curr, prev}); } for (LINNEAN_RANK r : LinneanRank.ranksLowerThan(deviation, true)) { // clear parentIds not of interest now parentIds.put(r, null); String name = curr.get(r); if (StringUtils.isNotBlank(name)) { // find the parent id to use for this taxon Integer parentId = null; for (LINNEAN_RANK p : LinneanRank.ranksHigherThan(r, false)) { parentId = parentIds.get(p) == null ? parentId : parentIds.get(p); } // create the taxon String author = null; if (LINNEAN_RANK.S == r && StringUtils.isBlank(curr.get(LINNEAN_RANK.SS)) || LINNEAN_RANK.SS == r) { author = curr.getAuthor(); } NormClassification nc = new NormClassification(id, parentId, name, author, r.toString()); norm.put(id, nc); parentIds.put(r, id); id++; // we have just created the concept, but if this is the most significant taxa, // then we need to track any payloads on the newly create concept boolean more = false; for (LINNEAN_RANK r2 : LinneanRank.ranksLowerThan(r, false)) { more |= StringUtils.isNotBlank(curr.get(r2)); } if (!more) { LOG.debug("Adding payloads from [{}] into [{}]", curr.toString(), nc.toString()); nc.getPayloads().addAll(curr.getPayloads()); } } } // handle the special case when you have // "a",null,null,null,null,"f","g","h","i")); // "a",null,null,null,null,"f","g","j","i")); // "a",null,null,null,null,"f","g",null,"i")); // on the 3rd row, we have already created the species, but need to apply the author and // update the payloads if (LINNEAN_RANK.SS == deviation && prev != null && StringUtils.isBlank(curr.getSubspecies()) && // we don't want second row to go in here StringUtils.equals(curr.get(LINNEAN_RANK.S), prev.get(LINNEAN_RANK.S))) { NormClassification prevNorm = norm.get(id - 1); // iterate back to the species concept while (prevNorm != null && !StringUtils.equals(prevNorm.getRank(), LINNEAN_RANK.S.toString())) { if (prevNorm.getParentId() == null) { prevNorm = null; break; } else { prevNorm = norm.get(prevNorm.getParentId()); } } LOG.debug("Previous species: " + prevNorm); if (prevNorm != null && StringUtils.equals(prevNorm.getRank(), LINNEAN_RANK.S.toString())) { LOG.debug( "Updating previous species concept with new author[{}]: {}", curr.getAuthor(), prevNorm); prevNorm.setAuthor(curr.getAuthor()); LOG.debug( "Updating previous payloads from [{}] into previous [{}]", curr.toString(), prevNorm.toString()); prevNorm.getPayloads().addAll(curr.getPayloads()); } } prev = curr; } result.addAll(norm.values()); Collections.sort( result, new Comparator<NormClassification>() { @Override public int compare(NormClassification o1, NormClassification o2) { return o1.getId().compareTo(o2.getId()); } }); LOG.info( "Built normalized tree structure for {} classifications in {} sec(s)", denorm.size(), (1 + System.currentTimeMillis() - time) / 1000); return result; }