예제 #1
0
  /**
   * Check for metadata in the catalog having the same resource identifier as the harvested record.
   *
   * <p>If one dataset (same MD_metadata/../identificationInfo/../identifier/../code) (eg. a NMA
   * layer for roads) is described in 2 or more catalogs with different metadata uuids. The metadata
   * may be slightly different depending on the author, but the resource is the same. When
   * harvesting, some users would like to have the capability to exclude "duplicate" description of
   * the same dataset.
   *
   * <p>The check is made searching the identifier field in the index using {@link
   * LuceneSearcher#getAllMetadataFromIndexFor(String, String, String, Set, boolean)}
   *
   * @param uuid the metadata unique identifier
   * @param response the XML document to check
   * @return true if a record with same resource identifier is found. false otherwise.
   */
  private boolean foundDuplicateForResource(String uuid, Element response) {
    String schema = dataMan.autodetectSchema(response);

    if (schema.startsWith("iso19139")) {
      String resourceIdentifierXPath =
          "gmd:identificationInfo/*/gmd:citation/gmd:CI_Citation/gmd:identifier/*/gmd:code/gco:CharacterString";
      String resourceIdentifierLuceneIndexField = "identifier";
      String defaultLanguage = "eng";

      try {
        // Extract resource identifier
        XPath xp = XPath.newInstance(resourceIdentifierXPath);
        xp.addNamespace("gmd", "http://www.isotc211.org/2005/gmd");
        xp.addNamespace("gco", "http://www.isotc211.org/2005/gco");
        @SuppressWarnings("unchecked")
        List<Element> resourceIdentifiers = xp.selectNodes(response);
        if (resourceIdentifiers.size() > 0) {
          // Check if the metadata to import has a resource identifier
          // existing in current catalog for a record with a different UUID

          log.debug("  - Resource identifiers found : " + resourceIdentifiers.size());

          for (Element identifierNode : resourceIdentifiers) {
            String identifier = identifierNode.getTextTrim();
            log.debug("    - Searching for duplicates for resource identifier: " + identifier);

            Map<String, Map<String, String>> values =
                LuceneSearcher.getAllMetadataFromIndexFor(
                    defaultLanguage,
                    resourceIdentifierLuceneIndexField,
                    identifier,
                    Collections.singleton("_uuid"),
                    true);
            log.debug("    - Number of resources with same identifier: " + values.size());
            for (Map<String, String> recordFieldValues : values.values()) {
              String indexRecordUuid = recordFieldValues.get("_uuid");
              if (!indexRecordUuid.equals(uuid)) {
                log.debug(
                    "      - UUID "
                        + indexRecordUuid
                        + " in index does not match harvested record UUID "
                        + uuid);
                log.warning(
                    "      - Duplicates found. Skipping record with UUID "
                        + uuid
                        + " and resource identifier "
                        + identifier);

                result.duplicatedResource++;
                return true;
              }
            }
          }
        }
      } catch (Exception e) {
        log.warning(
            "      - Error when searching for resource duplicate "
                + uuid
                + ". Error is: "
                + e.getMessage());
        e.printStackTrace();
      }
    }
    return false;
  }