예제 #1
0
 /**
  * Returns the normalized authority value for an author based on the name passed in.
  *
  * <p>The lookup runs in two steps: the name is first bound, exactly as given, to
  * {@code getPreferredAuthorByOriginalNameStmt}; if that query returns no row, the name is
  * normalized with {@code AuthorNormalizer.getNormalizedName} and bound to
  * {@code getPreferredAuthorByAlternateNameStmt}.
  *
  * <p>Any exception raised during the lookup is logged and reported as "no authority" rather
  * than propagated to the caller.
  *
  * @param author the author to get the authority information for
  * @return the {@code normalizedName} column of the first matching row, or null if the database
  *     connection is unavailable, no row matches, or an error occurs
  */
 public static String getNormalizedAuthorAuthorityFromDatabase(String author) {
   if (!connectToDatabase()) {
     return null;
   }
   try {
     // First check without normalization
     getPreferredAuthorByOriginalNameStmt.setString(1, author);
     // try-with-resources closes the result set on every path (match, no match, exception)
     try (ResultSet originalNameResults = getPreferredAuthorByOriginalNameStmt.executeQuery()) {
       if (originalNameResults.next()) {
         return originalNameResults.getString("normalizedName");
       }
     }

     // No match, check alternate names for the author
     String normalizedAuthor = AuthorNormalizer.getNormalizedName(author);
     getPreferredAuthorByAlternateNameStmt.setString(1, normalizedAuthor);
     try (ResultSet alternateNameResults = getPreferredAuthorByAlternateNameStmt.executeQuery()) {
       if (alternateNameResults.next()) {
         return alternateNameResults.getString("normalizedName");
       }
     }
   } catch (Exception e) {
     logger.error("Error loading authority information from database", e);
   }
   return null;
 }
예제 #2
0
  /**
   * Loads author authority data from a VIAF RDF cluster dump into the authorities database.
   *
   * <p>The dump is read from a hard-coded path, one line at a time, and each line is parsed as a
   * standalone XML document. For every line:
   *
   * <ul>
   *   <li>the VIAF id is taken from the {@code rdf:about} attribute of {@code RDF/Description};
   *   <li>the {@code RDF/Concept} nodes are scanned in order until one in the scheme
   *       {@code http://viaf.org/authorityScheme/LC} is found; its {@code skos:prefLabel} and
   *       {@code skos:altLabel} values are used. An English Wikipedia link seen on any concept
   *       scanned up to and including that one is kept as well;
   *   <li>if a preferred label was found, it is upserted into {@code preferred_authors}
   *       (original name cut to 200 characters, normalized name to 50), and every alternate label
   *       is inserted into {@code alternate_authors} in normalized form, plus a second row for
   *       the normalized {@code AuthorNormalizer.getDisplayName} form when that is non-null.
   * </ul>
   *
   * <p>Lines that contain no {@code <} character or no {@code RDF/Description} node are skipped.
   * Any other failure while processing the file is logged and stops the import; nothing is
   * thrown to the caller. Statements and the file reader are closed on every exit path.
   */
  public static void loadAuthoritiesFromVIAF() {
    if (!connectToDatabase()) {
      return;
    }
    File viafFile = new File("d:/data/vufind-plus/viaf/viaf-20150115-clusters-rdf.xml");

    try (PreparedStatement addPreferredAuthorStmt =
            authoritiesConn.prepareStatement(
                "INSERT INTO preferred_authors (viafId, originalName, normalizedName, wikipediaLink) VALUES (?, ?, ?, ?) ON DUPLICATE KEY UPDATE originalName = VALUES(originalName), normalizedName = VALUES(normalizedName), wikipediaLink = VALUES(wikipediaLink)");
        PreparedStatement addAlternateAuthorStmt =
            authoritiesConn.prepareStatement(
                "INSERT IGNORE alternate_authors (viafId, alternateName) VALUES (?, ?)")) {

      // Read data from the file one line at a time since the whole thing is HUGE.
      // NOTE(review): decodes the dump as UTF-8 (the XML default) instead of the platform
      // charset used by FileReader - confirm the dump really is UTF-8.
      try (BufferedReader reader =
          new BufferedReader(
              new java.io.InputStreamReader(
                  new java.io.FileInputStream(viafFile),
                  java.nio.charset.StandardCharsets.UTF_8))) {

        // Setup the XML processor
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
        XPath xpath = XPathFactory.newInstance().newXPath();
        XPathExpression viafIdExpression = xpath.compile("RDF/Description");
        XPathExpression conceptsExpression = xpath.compile("RDF/Concept");

        String curLine;
        while ((curLine = reader.readLine()) != null) {
          // Drop anything in front of the first tag; a line without any tag is not XML at all
          int xmlStart = curLine.indexOf('<');
          if (xmlStart < 0) {
            continue;
          }
          if (xmlStart > 0) {
            curLine = curLine.substring(xmlStart);
          }
          // Get XML for the line
          Document doc = dBuilder.parse(new InputSource(new StringReader(curLine)));

          // Load VIAF ID, preferred authority, alternate labels and the wikipedia article
          Element viafIdNode = (Element) viafIdExpression.evaluate(doc, XPathConstants.NODE);
          if (viafIdNode == null) {
            logger.warn("No RDF/Description element found, skipping line");
            continue;
          }
          String viafIdStr = viafIdNode.getAttribute("rdf:about");
          viafIdStr = viafIdStr.replace("http://viaf.org/viaf/", "");
          long viafId = Long.parseLong(viafIdStr);
          NodeList concepts = (NodeList) conceptsExpression.evaluate(doc, XPathConstants.NODESET);
          HashSet<String> altLabels = new HashSet<String>();
          String preferredLabel = null;
          String wikipediaLink = null;
          // Scan through the concepts to get the Library of Congress Concept
          for (int i = 0; i < concepts.getLength(); i++) {
            Node curConcept = concepts.item(i);

            boolean isPreferredConcept = false;
            String preferredConceptLabel = null;
            HashSet<String> altLabelsForConcept = new HashSet<String>();
            // Get the scheme and labels for the concept
            NodeList conceptInfoNodes = curConcept.getChildNodes();
            for (int j = 0; j < conceptInfoNodes.getLength(); j++) {
              Node childNode = conceptInfoNodes.item(j);
              // Child lists can also hold text/comment nodes; only elements carry data here
              if (!(childNode instanceof Element)) {
                continue;
              }
              Element conceptInfoNode = (Element) childNode;
              String tagName = conceptInfoNode.getTagName();
              if ("skos:inScheme".equals(tagName)) {
                String schemeName = conceptInfoNode.getAttribute("rdf:resource");
                if (schemeName.equals("http://viaf.org/authorityScheme/LC")) {
                  isPreferredConcept = true;
                }
              } else if ("skos:prefLabel".equals(tagName)) {
                preferredConceptLabel = conceptInfoNode.getTextContent();
              } else if ("skos:altLabel".equals(tagName)) {
                altLabelsForConcept.add(conceptInfoNode.getTextContent());
              } else if ("foaf:isPrimaryTopicOf".equals(tagName)) {
                // getAttribute returns "" when the attribute is absent, so no hasAttribute check
                String resource = conceptInfoNode.getAttribute("rdf:resource");
                if (resource.startsWith("http://en.wikipedia.org")) {
                  wikipediaLink = resource;
                }
              }
            }

            if (isPreferredConcept) {
              preferredLabel = preferredConceptLabel;
              altLabels = altLabelsForConcept;
              break;
            }
          }

          if (preferredLabel == null) {
            // No Library of Congress label for this cluster, nothing to store
            continue;
          }

          String normalizedName = AuthorNormalizer.getNormalizedName(preferredLabel);
          if (normalizedName.length() > 50) {
            logger.warn("Normalized author longer than 50 characters " + normalizedName);
            normalizedName = normalizedName.substring(0, 50);
          }
          if (preferredLabel.length() > 200) {
            logger.warn("Author longer than 200 characters " + preferredLabel);
            preferredLabel = preferredLabel.substring(0, 200);
          }

          addPreferredAuthorStmt.setLong(1, viafId);
          addPreferredAuthorStmt.setString(2, preferredLabel);
          addPreferredAuthorStmt.setString(3, normalizedName);
          addPreferredAuthorStmt.setString(4, wikipediaLink);
          addPreferredAuthorStmt.executeUpdate();

          // To make lookups faster, store alternate labels as "first name last name" in
          // addition to "last name, first name"
          for (String curAltName : altLabels) {
            // Add the normalized author name for improved performance doing lookups
            String normalizedAltAuthor = AuthorNormalizer.getNormalizedName(curAltName);
            if (normalizedAltAuthor.length() > 200) {
              logger.warn(
                  "Normalized alternate author longer than 200 characters " + preferredLabel);
              normalizedAltAuthor = normalizedAltAuthor.substring(0, 200);
            }
            addAlternateAuthorStmt.setLong(1, viafId);
            addAlternateAuthorStmt.setString(2, normalizedAltAuthor);
            addAlternateAuthorStmt.executeUpdate();

            // See if we need to reverse the author name to first name / last name
            String reversedName = AuthorNormalizer.getDisplayName(curAltName);
            if (reversedName != null) {
              String normalizedReversedName = AuthorNormalizer.getNormalizedName(reversedName);
              if (normalizedReversedName.length() > 200) {
                logger.warn(
                    "Normalized reversed alternate author longer than 200 characters "
                        + preferredLabel);
                normalizedReversedName = normalizedReversedName.substring(0, 200);
              }
              addAlternateAuthorStmt.setLong(1, viafId);
              addAlternateAuthorStmt.setString(2, normalizedReversedName);
              addAlternateAuthorStmt.executeUpdate();
            }
          }

          // TODO: optionally load related works from see also?
        }
      } catch (Exception e) {
        logger.error("Error loading authorities from VIAF", e);
      }
    } catch (SQLException e) {
      logger.error("Unable to connect to database", e);
    }
  }