/** * Returns the normalized Authority value for an author based on the name passed in. If no * authority exists, null will be returned. * * @param author the author to get the authority information for * @return the normalized authority information or null if no authority exists. */ public static String getNormalizedAuthorAuthorityFromDatabase(String author) { if (!connectToDatabase()) { return null; } else { try { getPreferredAuthorByOriginalNameStmt.setString(1, author); // First check without normalization ResultSet originalNameResults = getPreferredAuthorByOriginalNameStmt.executeQuery(); if (originalNameResults.next()) { String authority = originalNameResults.getString("normalizedName"); // Found a match originalNameResults.close(); return authority; } else { // No match, check alternate names for the author String normalizedAuthor = AuthorNormalizer.getNormalizedName(author); getPreferredAuthorByAlternateNameStmt.setString(1, normalizedAuthor); ResultSet alternateNameResults = getPreferredAuthorByAlternateNameStmt.executeQuery(); if (alternateNameResults.next()) { String authority = alternateNameResults.getString("normalizedName"); alternateNameResults.close(); return authority; } } } catch (Exception e) { logger.error("Error loading authority information from database", e); } } return null; }
public static void loadAuthoritiesFromVIAF() { if (!connectToDatabase()) { return; } try { PreparedStatement addPreferredAuthorStmt = authoritiesConn.prepareStatement( "INSERT INTO preferred_authors (viafId, originalName, normalizedName, wikipediaLink) VALUES (?, ?, ?, ?) ON DUPLICATE KEY UPDATE originalName = VALUES(originalName), normalizedName = VALUES(normalizedName), wikipediaLink = VALUES(wikipediaLink)"); PreparedStatement addAlternateAuthorStmt = authoritiesConn.prepareStatement( "INSERT IGNORE alternate_authors (viafId, alternateName) VALUES (?, ?)"); File viafFile = new File("d:/data/vufind-plus/viaf/viaf-20150115-clusters-rdf.xml"); // Read data from the file one line at a time since the whole thing is HUGE try { BufferedReader reader = new BufferedReader(new FileReader(viafFile)); // Setup the XML processor DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); XPath xpath = XPathFactory.newInstance().newXPath(); XPathExpression viafIdExpression = xpath.compile("RDF/Description"); XPathExpression conceptsExpression = xpath.compile("RDF/Concept"); String curLine = reader.readLine(); while (curLine != null) { if (!curLine.startsWith("<")) { curLine = curLine.substring(curLine.indexOf('<')); } // Get XMl for the line Document doc = dBuilder.parse(new InputSource(new StringReader(curLine))); // Load VIAF ID, Preferred authority, and alternate labels, also wikipedia article? Element viafIdNode = (Element) viafIdExpression.evaluate(doc, XPathConstants.NODE); String viafIdStr = viafIdNode.getAttribute("rdf:about"); viafIdStr = viafIdStr.replace("http://viaf.org/viaf/", ""); Long viafId = Long.parseLong(viafIdStr); NodeList concepts = (NodeList) conceptsExpression.evaluate(doc, XPathConstants.NODESET); HashSet<String> altLabels = new HashSet<String>(); String preferredLabel = null; String wikipediaLink = null; // Scan through the concepts to get the Library of Congress Concept for (int i = 0; i < concepts.getLength(); i++) { Node curConcept = concepts.item(i); boolean isPreferredConcept = false; String preferredConceptLabel = null; HashSet<String> altLabelsForConcept = new HashSet<String>(); // Get the scheme for the concept NodeList conceptInfoNodes = curConcept.getChildNodes(); for (int j = 0; j < conceptInfoNodes.getLength(); j++) { Element conceptInfoNode = (Element) conceptInfoNodes.item(j); if (conceptInfoNode.getTagName().equals("skos:inScheme")) { String schemeName = conceptInfoNode.getAttribute("rdf:resource"); if (schemeName.equals("http://viaf.org/authorityScheme/LC")) { isPreferredConcept = true; } } else if (conceptInfoNode.getTagName().equals("skos:prefLabel")) { preferredConceptLabel = conceptInfoNode.getTextContent(); } else if (conceptInfoNode.getTagName().equals("skos:altLabel")) { altLabelsForConcept.add(conceptInfoNode.getTextContent()); } else if (conceptInfoNode.getTagName().equals("foaf:isPrimaryTopicOf")) { if (conceptInfoNode.hasAttribute("rdf:resource") && conceptInfoNode .getAttribute("rdf:resource") .startsWith("http://en.wikipedia.org")) { wikipediaLink = conceptInfoNode.getAttribute("rdf:resource"); } } } if (isPreferredConcept) { preferredLabel = preferredConceptLabel; altLabels = altLabelsForConcept; break; } } if (preferredLabel != null) { String normalizedName = AuthorNormalizer.getNormalizedName(preferredLabel); if (normalizedName.length() > 50) { logger.warn("Normalized author longer than 50 characters " + normalizedName); normalizedName = normalizedName.substring(0, 50); } if (preferredLabel.length() > 200) { logger.warn("Author longer than 200 characters " + preferredLabel); preferredLabel = preferredLabel.substring(0, 200); } addPreferredAuthorStmt.setLong(1, viafId); addPreferredAuthorStmt.setString(2, preferredLabel); addPreferredAuthorStmt.setString(3, normalizedName); addPreferredAuthorStmt.setString(4, wikipediaLink); addPreferredAuthorStmt.executeUpdate(); // To make lookups faster, we will want to put alternate labels as first name last name // in addition to // last name, first name for (String curAltName : altLabels) { // Add the normalized author name for improved performance doing lookups String normalizedAltAuthor = AuthorNormalizer.getNormalizedName(curAltName); if (normalizedAltAuthor.length() > 200) { logger.warn( "Normalized alternate author longer than 200 characters " + preferredLabel); normalizedAltAuthor = normalizedAltAuthor.substring(0, 200); } addAlternateAuthorStmt.setLong(1, viafId); addAlternateAuthorStmt.setString(2, normalizedAltAuthor); addAlternateAuthorStmt.executeUpdate(); // See if we need to reverse the author name to first name / last name String reversedName = AuthorNormalizer.getDisplayName(curAltName); if (reversedName != null) { String normalizedReversedName = AuthorNormalizer.getNormalizedName(reversedName); if (normalizedReversedName.length() > 200) { logger.warn( "Normalized reversed alternate author longer than 200 characters " + preferredLabel); normalizedReversedName = normalizedReversedName.substring(0, 200); } addAlternateAuthorStmt.setLong(1, viafId); addAlternateAuthorStmt.setString(2, normalizedReversedName); addAlternateAuthorStmt.executeUpdate(); } } // TODO: optionally load related works from see also? } else { // logger.warn("No preferred Label found for cluster " + viafIdStr); } // Get the next line curLine = reader.readLine(); } } catch (Exception e) { logger.error("Error loading authorities from VIAF", e); } } catch (SQLException e) { logger.error("Unable to connect to database", e); } if (authoritiesConn != null) { return; } }