/** TODO */ private OrgDisambiguatedEntity findByDetails(RDFOrganization org) { Iso3166Country country = StringUtils.isBlank(org.country) ? null : Iso3166Country.valueOf(org.country); // Find the org by name, city, country and state OrgDisambiguatedEntity existingEntity = orgDisambiguatedDao.findByNameCityRegionCountryAndSourceType( org.name, org.stateCode, org.stateCode, country, FUNDREF_SOURCE_TYPE); // If no match is found, try with the doi and source type if (existingEntity == null) { existingEntity = orgDisambiguatedDao.findBySourceIdAndSourceType(org.doi, FUNDREF_SOURCE_TYPE); } return existingEntity; }
/**
 * Creates a disambiguated ORG in the org_disambiguated table.
 *
 * <p>The new entity takes its name, city, region (state code), country and org type from the
 * given RDF organization. The organization's DOI is stored as both the source id and the
 * source url, and the source type is the FundRef source type.
 *
 * @param organization the organization read from the RDF file
 * @return the entity that was handed to the DAO for persisting
 */
private OrgDisambiguatedEntity createDisambiguatedOrg(RDFOrganization organization) {
    LOGGER.info("Creating disambiguated org {}", organization.name);

    // The org type is "type", or "type/subtype" when a non-empty subtype is present.
    String subtypeSuffix;
    if (StringUtils.isEmpty(organization.subtype)) {
        subtypeSuffix = "";
    } else {
        subtypeSuffix = "/" + organization.subtype;
    }
    String combinedType = organization.type + subtypeSuffix;

    // A blank country is stored as null.
    Iso3166Country resolvedCountry = null;
    if (StringUtils.isNotBlank(organization.country)) {
        resolvedCountry = Iso3166Country.fromValue(organization.country);
    }

    OrgDisambiguatedEntity created = new OrgDisambiguatedEntity();

    // Descriptive details
    created.setName(organization.name);
    created.setOrgType(combinedType);

    // Location
    created.setCity(organization.city);
    created.setRegion(organization.stateCode);
    created.setCountry(resolvedCountry);

    // Provenance: the DOI is used for both the source id and the source url
    created.setSourceType(FUNDREF_SOURCE_TYPE);
    created.setSourceId(organization.doi);
    created.setSourceUrl(organization.doi);

    orgDisambiguatedDao.persist(created);
    return created;
}
/** Executes the import process */ private void execute() { try { long start = System.currentTimeMillis(); FileInputStream file = new FileInputStream(fileToLoad); DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = builderFactory.newDocumentBuilder(); Document xmlDocument = builder.parse(file); // Parent node NodeList nodeList = (NodeList) xPath.compile(conceptsExpression).evaluate(xmlDocument, XPathConstants.NODESET); for (int i = 0; i < nodeList.getLength(); i++) { RDFOrganization rdfOrganization = getOrganization(xmlDocument, nodeList.item(i).getAttributes()); LOGGER.info( "Processing organization from RDF, doi:{}, name:{}, country:{}, state:{}, stateCode:{}, type:{}, subtype:{}", new String[] { rdfOrganization.doi, rdfOrganization.name, rdfOrganization.country, rdfOrganization.state, rdfOrganization.stateCode, rdfOrganization.type, rdfOrganization.subtype }); // #1: Look for an existing org OrgDisambiguatedEntity existingEntity = findByDetails(rdfOrganization); if (existingEntity != null) { // #2: If the name, city or region changed, update those values if (entityChanged(rdfOrganization, existingEntity)) { existingEntity.setCity(rdfOrganization.city); Iso3166Country country = StringUtils.isNotBlank(rdfOrganization.country) ? Iso3166Country.fromValue(rdfOrganization.country) : null; existingEntity.setCountry(country); existingEntity.setName(rdfOrganization.name); String orgType = rdfOrganization.type + (StringUtils.isNotBlank(rdfOrganization.subtype) ? 
('/' + rdfOrganization.subtype) : ""); existingEntity.setOrgType(orgType); existingEntity.setRegion(rdfOrganization.stateCode); existingEntity.setSourceId(rdfOrganization.doi); existingEntity.setSourceType(FUNDREF_SOURCE_TYPE); existingEntity.setSourceUrl(rdfOrganization.doi); orgDisambiguatedDao.merge(existingEntity); updatedOrgs += 1; } else if (idChanged(rdfOrganization, existingEntity)) { // #3: If the ID changed, create an external identifier createExternalIdentifier(existingEntity, rdfOrganization.doi); addedExternalIdentifiers += 1; } } else { // #4: Else, create the new org OrgDisambiguatedEntity newOrg = createDisambiguatedOrg(rdfOrganization); addedDisambiguatedOrgs += 1; } } long end = System.currentTimeMillis(); LOGGER.info("Time taken to process the files: {}", (end - start)); } catch (FileNotFoundException fne) { LOGGER.error("Unable to read file {}", fileToLoad); } catch (ParserConfigurationException pce) { LOGGER.error("Unable to initialize the DocumentBuilder"); } catch (IOException ioe) { LOGGER.error("Unable to parse document {}", fileToLoad); } catch (SAXException se) { LOGGER.error("Unable to parse document {}", fileToLoad); } catch (XPathExpressionException xpe) { LOGGER.error("XPathExpressionException {}", xpe.getMessage()); } finally { LOGGER.info( "Number new Disambiguated Orgs={}, Updated Orgs={}, new External Identifiers={}", new Object[] {addedDisambiguatedOrgs, updatedOrgs, addedExternalIdentifiers, getTotal()}); } }