Beispiel #1
0
  /**
   * Builds one CSW {@code DomainValues} element per requested property name, filled with the
   * values found for that property in the search index.
   *
   * <p>For every property name the method:
   *
   * <ol>
   *   <li>creates a {@code DomainValues} element (type attribute {@code csw:Record}) containing a
   *       {@code PropertyName} child with the trimmed name;
   *   <li>runs a search restricted by the groups query of the current context and, when given, by
   *       the CSW service specific constraint, asking for at most {@code maxRecords} hits;
   *   <li>maps the property to its index field through the CSW field mapping (falling back to
   *       the name itself) and, if that field exists in the index, collects its values from
   *       every hit that is not a template;
   *   <li>attaches the collected values as {@code ListOfValues}, or {@code RangeOfValues} when
   *       the field is configured as a range field.
   * </ol>
   *
   * <p>When {@code freq} is {@code true} the method returns early: for the first property whose
   * field exists in the index it returns the result of {@code createValuesByFrequency} instead of
   * the list of {@code DomainValues} elements, and later property names are not processed.
   *
   * @param propertyNames names of the properties to describe; must not be {@code null}
   * @param context service context used to obtain the search manager, groups query and language
   * @param freq if {@code true}, return the values ordered by frequency (see above)
   * @param maxRecords maximum number of index hits inspected per property
   * @param cswServiceSpecificConstraint optional extra constraint; ignored when empty
   * @param luceneConfig configuration passed on when building the constraint query
   * @return the {@code DomainValues} elements in request order, the frequency elements when
   *     {@code freq} is set, or {@code null} when {@code propertyNames} is empty
   * @throws Exception if the search or the index access fails
   */
  public static List<Element> handlePropertyName(
      String[] propertyNames,
      ServiceContext context,
      boolean freq,
      int maxRecords,
      String cswServiceSpecificConstraint,
      LuceneConfig luceneConfig)
      throws Exception {

    // Stays null when no property name is supplied, so callers receive null, not an empty list.
    List<Element> domainValuesList = null;

    if (Log.isDebugEnabled(Geonet.CSW)) {
      Log.debug(
          Geonet.CSW,
          "Handling property names '"
              + Arrays.toString(propertyNames)
              + "' with max records of "
              + maxRecords);
    }

    for (int i = 0; i < propertyNames.length; i++) {

      if (i == 0) {
        domainValuesList = new ArrayList<Element>();
      }

      // Values container; only created once the property is known to exist in the index.
      Element listOfValues = null;

      // Generate DomainValues element
      Element domainValues = new Element("DomainValues", Csw.NAMESPACE_CSW);

      // FIXME what should be the type ???
      domainValues.setAttribute("type", "csw:Record");

      String property = propertyNames[i].trim();

      // Set propertyName in any case.
      Element pn = new Element("PropertyName", Csw.NAMESPACE_CSW);
      domainValues.addContent(pn.setText(property));

      GeonetContext gc = (GeonetContext) context.getHandlerContext(Geonet.CONTEXT_NAME);
      SearchManager sm = gc.getSearchmanager();

      IndexAndTaxonomy indexAndTaxonomy = sm.getNewIndexReader(null);
      try {
        GeonetworkMultiReader reader = indexAndTaxonomy.indexReader;
        BooleanQuery groupsQuery = (BooleanQuery) CatalogSearcher.getGroupsQuery(context);
        BooleanQuery query = null;

        // Apply CSW service specific constraint
        if (StringUtils.isNotEmpty(cswServiceSpecificConstraint)) {
          Query constraintQuery =
              CatalogSearcher.getCswServiceSpecificConstraintQuery(
                  cswServiceSpecificConstraint, luceneConfig);

          query = new BooleanQuery();

          // Both the groups restriction and the service constraint are added with this occur.
          BooleanClause.Occur occur = LuceneUtils.convertRequiredAndProhibitedToOccur(true, false);

          query.add(groupsQuery, occur);
          query.add(constraintQuery, occur);

        } else {
          query = groupsQuery;
        }

        List<Pair<String, Boolean>> sortFields =
            Collections.singletonList(Pair.read(Geonet.SearchResult.SortBy.RELEVANCE, true));
        Sort sort = LuceneSearcher.makeSort(sortFields, context.getLanguage(), false);
        CachingWrapperFilter filter = null;

        Pair<TopDocs, Element> searchResults =
            LuceneSearcher.doSearchAndMakeSummary(
                maxRecords,
                0,
                maxRecords,
                context.getLanguage(),
                null,
                reader,
                query,
                filter,
                sort,
                null,
                false,
                false,
                false,
                false // Scoring is useless for GetDomain operation
                );
        TopDocs hits = searchResults.one();

        try {
          // Get mapped lucene field in CSW configuration
          String indexField = CatalogConfiguration.getFieldMapping().get(property.toLowerCase());
          if (indexField != null) {
            property = indexField;
          }

          // Unknown field: skip the value lookup. The inner finally still records the
          // DomainValues element (without any values) for this property.
          FieldInfos fi = new SlowCompositeReaderWrapper(reader).getFieldInfos();
          if (fi.fieldInfo(property) == null) {
            continue;
          }

          boolean isRange = CatalogConfiguration.getGetRecordsRangeFields().contains(property);

          if (isRange) {
            listOfValues = new Element("RangeOfValues", Csw.NAMESPACE_CSW);
          } else {
            listOfValues = new Element("ListOfValues", Csw.NAMESPACE_CSW);
          }

          // Only these two stored fields are loaded for each hit.
          Set<String> fields = new HashSet<String>();
          fields.add(property);
          fields.add("_isTemplate");

          // parse each document in the index
          String[] fieldValues;
          SortedSet<String> sortedValues = new TreeSet<String>();
          HashMap<String, Integer> duplicateValues = new HashMap<String, Integer>();
          for (int j = 0; j < hits.scoreDocs.length; j++) {
            DocumentStoredFieldVisitor selector = new DocumentStoredFieldVisitor(fields);
            reader.document(hits.scoreDocs[j].doc, selector);
            Document doc = selector.getDocument();

            // Skip templates and subTemplates. getValues() yields an empty array when the
            // document has no such field, so the length must be checked before reading [0];
            // a document without the flag is kept, like one whose first value is null.
            String[] isTemplate = doc.getValues("_isTemplate");
            if (isTemplate != null
                && isTemplate.length > 0
                && isTemplate[0] != null
                && !isTemplate[0].equals("n")) {
              continue;
            }

            // Get doc values for specified property
            fieldValues = doc.getValues(property);
            if (fieldValues == null) {
              continue;
            }

            addtoSortedSet(sortedValues, fieldValues, duplicateValues);
          }

          SummaryComparator valuesComparator =
              new SummaryComparator(SortOption.FREQUENCY, Type.STRING, context.getLanguage(), null);
          TreeSet<Map.Entry<String, Integer>> sortedValuesFrequency =
              new TreeSet<Map.Entry<String, Integer>>(valuesComparator);
          sortedValuesFrequency.addAll(duplicateValues.entrySet());

          if (freq) {
            // Early exit: the frequency elements replace the DomainValues list as the result.
            return createValuesByFrequency(sortedValuesFrequency);
          } else {
            listOfValues.addContent(createValuesElement(sortedValues, isRange));
          }

        } finally {
          // The values element is attached only when it has children; without it the
          // catalog reports nothing about the requested parameter.
          if (listOfValues != null && listOfValues.getChildren().size() != 0) {
            domainValues.addContent(listOfValues);
          }

          // Add current DomainValues to the list
          domainValuesList.add(domainValues);
        }
      } finally {
        // Always hand the reader back, including on the early return and on exceptions.
        sm.releaseIndexReader(indexAndTaxonomy);
      }
    }
    return domainValuesList;
  }
Beispiel #2
0
  /**
   * Check for metadata in the catalog having the same resource identifier as the harvested record.
   *
   * <p>If one dataset (same MD_metadata/../identificationInfo/../identifier/../code) (eg. a NMA
   * layer for roads) is described in 2 or more catalogs with different metadata uuids. The metadata
   * may be slightly different depending on the author, but the resource is the same. When
   * harvesting, some users would like to have the capability to exclude "duplicate" description of
   * the same dataset.
   *
   * <p>The check is made searching the identifier field in the index using {@link
   * LuceneSearcher#getAllMetadataFromIndexFor(String, String, String, Set, boolean)}
   *
   * <p>Only records whose detected schema name starts with {@code iso19139} are checked; for any
   * other (or undetected) schema the method returns {@code false}. On the first duplicate found
   * the {@code duplicatedResource} counter of the harvest result is incremented. Any exception
   * raised during the lookup is logged and treated as "no duplicate".
   *
   * @param uuid the metadata unique identifier
   * @param response the XML document to check
   * @return true if a record with same resource identifier is found. false otherwise.
   */
  private boolean foundDuplicateForResource(String uuid, Element response) {
    String schema = dataMan.autodetectSchema(response);

    // The citation identifier XPath below is only applied to iso19139-based schemas.
    // A null schema is treated as "not applicable" instead of failing with an NPE.
    if (schema == null || !schema.startsWith("iso19139")) {
      return false;
    }

    String resourceIdentifierXPath =
        "gmd:identificationInfo/*/gmd:citation/gmd:CI_Citation/gmd:identifier/*/gmd:code/gco:CharacterString";
    String resourceIdentifierLuceneIndexField = "identifier";
    String defaultLanguage = "eng";

    try {
      // Extract resource identifier
      XPath xp = XPath.newInstance(resourceIdentifierXPath);
      xp.addNamespace("gmd", "http://www.isotc211.org/2005/gmd");
      xp.addNamespace("gco", "http://www.isotc211.org/2005/gco");
      @SuppressWarnings("unchecked")
      List<Element> resourceIdentifiers = xp.selectNodes(response);
      if (!resourceIdentifiers.isEmpty()) {
        // Check if the metadata to import has a resource identifier
        // existing in current catalog for a record with a different UUID

        log.debug("  - Resource identifiers found : " + resourceIdentifiers.size());

        for (Element identifierNode : resourceIdentifiers) {
          String identifier = identifierNode.getTextTrim();
          log.debug("    - Searching for duplicates for resource identifier: " + identifier);

          Map<String, Map<String, String>> values =
              LuceneSearcher.getAllMetadataFromIndexFor(
                  defaultLanguage,
                  resourceIdentifierLuceneIndexField,
                  identifier,
                  Collections.singleton("_uuid"),
                  true);
          log.debug("    - Number of resources with same identifier: " + values.size());
          for (Map<String, String> recordFieldValues : values.values()) {
            String indexRecordUuid = recordFieldValues.get("_uuid");

            // An index hit without a _uuid cannot be compared; skip it and keep scanning
            // instead of aborting the whole check with a NullPointerException.
            if (indexRecordUuid == null || indexRecordUuid.equals(uuid)) {
              continue;
            }

            log.debug(
                "      - UUID "
                    + indexRecordUuid
                    + " in index does not match harvested record UUID "
                    + uuid);
            log.warning(
                "      - Duplicates found. Skipping record with UUID "
                    + uuid
                    + " and resource identifier "
                    + identifier);

            result.duplicatedResource++;
            return true;
          }
        }
      }
    } catch (Exception e) {
      // Best effort: a failed lookup must not stop the harvest, so the record is
      // reported as not duplicated.
      log.warning(
          "      - Error when searching for resource duplicate "
              + uuid
              + ". Error is: "
              + e.getMessage());
      e.printStackTrace();
    }
    return false;
  }