/**
 * Persists a batch of Protein instances by delegating to {@link #insertNewProteins(Collection)}.
 *
 * @param newInstances the instances to persist.
 * @return a single Collection holding the proteins reported as newly persisted, followed by the
 *     proteins reported as pre-existing. This MAY NOT contain the same objects as have been
 *     passed in, because a pre-existing database object is returned in place of a candidate
 *     with a matching MD5.
 */
@Override
@Transactional
public Collection<Protein> insert(Collection<Protein> newInstances) {
  final PersistedProteins outcome = insertNewProteins(newInstances);

  // Newly persisted proteins first, then the ones that were already stored.
  final Collection<Protein> combined = new ArrayList<Protein>();
  combined.addAll(outcome.getNewProteins());
  combined.addAll(outcome.getPreExistingProteins());

  entityManager.flush();
  return combined;
}
/** * Inserts new Proteins. If there are Protein objects with the same MD5 / sequence in the * database, this method updates these proteins, rather than inserting the new ones. * * <p>Note that this method inserts the new Protein objects AND and new Xrefs (possibly updating * an existing Protein object if necessary with the new Xref.) * * @param newProteins being a List of new Protein objects to insert * @return a new List<Protein> containing all of the inserted / updated Protein objects. (Allows * the caller to retrieve the primary keys for the proteins). */ @Transactional @SuppressWarnings("unchecked") public PersistedProteins insertNewProteins(Collection<Protein> newProteins) { PersistedProteins persistentProteins = new PersistedProteins(); if (newProteins.size() > 0) { // Create a List of MD5s (just as Strings) to query the database with final List<String> newMd5s = new ArrayList<String>(newProteins.size()); for (Protein newProtein : newProteins) { newMd5s.add(newProtein.getMd5()); if (LOGGER.isDebugEnabled()) { LOGGER.debug("MD5 of new protein: " + newProtein.getMd5()); } } // Retrieve any proteins AND associated xrefs that have the same MD5 as one of the 'new' // proteins // being inserted and place in a Map of MD5 to Protein object. final Map<String, Protein> md5ToExistingProtein = new HashMap<String, Protein>(); final Query query = entityManager.createQuery( "select p from Protein p left outer join fetch P.crossReferences where p.md5 in (:md5)"); query.setParameter("md5", newMd5s); for (Protein existingProtein : (List<Protein>) query.getResultList()) { if (LOGGER.isDebugEnabled()) { LOGGER.debug("Found 1 existing protein with MD5: " + existingProtein.getMd5()); } md5ToExistingProtein.put(existingProtein.getMd5(), existingProtein); } // Now have the List of 'new' proteins, and a list of existing proteins that match // them. Insert / update proteins as appropriate. for (Protein candidate : newProteins) { // PROTEIN ALREADY EXISTS in the DB. 
- update cross references and save. if (md5ToExistingProtein.keySet().contains(candidate.getMd5())) { // This protein is already in the database - add any new Xrefs and update. Protein existingProtein = md5ToExistingProtein.get(candidate.getMd5()); boolean updateRequired = false; if (candidate.getCrossReferences() != null) { if (LOGGER.isTraceEnabled()) { LOGGER.trace("Protein TO BE STORED has xrefs:"); } for (ProteinXref xref : candidate.getCrossReferences()) { if (LOGGER.isTraceEnabled()) { LOGGER.trace(xref.getIdentifier()); } // Add any NEW cross references. if (!existingProtein.getCrossReferences().contains(xref)) { if (LOGGER.isTraceEnabled()) { LOGGER.trace( "Adding " + xref.getIdentifier() + " and setting updateRequired = true"); } existingProtein.addCrossReference(xref); updateRequired = true; } } } if (updateRequired) { // PROTEIN is NOT new, but CHANGED (new Xrefs) if (LOGGER.isTraceEnabled()) { LOGGER.trace("Merging protein with new Xrefs: " + existingProtein.getMd5()); } entityManager.merge(existingProtein); } persistentProteins.addPreExistingProtein(existingProtein); } // PROTEIN IS NEW - save it. else { if (LOGGER.isTraceEnabled()) { LOGGER.trace("Saving new protein: " + candidate.getMd5()); } entityManager.persist(candidate); persistentProteins.addNewProtein(candidate); // Check for this new protein next time through the loop, just in case the new source of // proteins is redundant (e.g. a FASTA file with sequences repeated). md5ToExistingProtein.put(candidate.getMd5(), candidate); } } } // Finally return all the persisted Protein objects (new or existing) entityManager.flush(); return persistentProteins; }