Ejemplo n.º 1
0
  /**
   * Prints simple hit statistics comparing user-assigned semantic types with the learned
   * suggestions of each column.
   *
   * <p>For every column the top four learned suggestions are scanned in order. A column counts as
   * a hit when any suggestion's model label equals (ignoring case) the model label of one of the
   * column's user semantic types; it additionally counts as a "first" hit when that suggestion is
   * at index 0. At most one hit is counted per column.
   *
   * @param columnNodes the columns to inspect; nothing is printed when this is {@code null}
   */
  private static void getStatistics2(List<ColumnNode> columnNodes) {

    if (columnNodes == null) {
      return;
    }

    int firstSuggestionHits = 0;
    int anySuggestionHits = 0;

    for (ColumnNode column : columnNodes) {
      List<SemanticType> assignedTypes = column.getUserSemanticTypes();
      List<SemanticType> suggestions = column.getTopKLearnedSemanticTypes(4);

      search:
      for (int rank = 0; rank < suggestions.size(); rank++) {
        if (assignedTypes == null) {
          // Nothing to compare against for this column.
          break;
        }
        SemanticType suggestion = suggestions.get(rank);
        for (SemanticType assigned : assignedTypes) {
          boolean sameLabel =
              suggestion.getModelLabelString().equalsIgnoreCase(assigned.getModelLabelString());
          if (!sameLabel) {
            continue;
          }
          if (rank == 0) {
            firstSuggestionHits++;
          }
          anySuggestionHits++;
          // One hit per column is enough; stop scanning the remaining suggestions.
          break search;
        }
      }
    }

    System.out.println("totalNumberOfAttributes: " + columnNodes.size());
    System.out.println("numberOfAttributesWhoseTypeIsInCRFTypes: " + anySuggestionHits);
    System.out.println("numberOfAttributesWhoseTypeIsFirstCRFType:" + firstSuggestionHits);
  }
  /**
   * Builds the JSON parameter array describing the given semantic types and stores its
   * pretty-printed form (indent 4) through {@code setInputParameterJson}.
   *
   * <p>The array starts with a vWorksheetId entry and a checkHistory entry (always {@code false}),
   * followed by two entries per semantic type: its hNodeId and its JSON array representation.
   *
   * @param worksheet not read by this method
   * @param vWorkspace not read by this method
   * @param semanticTypes the semantic types to serialize
   * @throws JSONException if any JSON value cannot be written
   */
  private void saveSemanticTypesInformation(
      Worksheet worksheet, VWorkspace vWorkspace, Collection<SemanticType> semanticTypes)
      throws JSONException {
    final String nameKey = ClientJsonKeys.name.name();
    final String typeKey = ClientJsonKeys.type.name();
    final String valueKey = ClientJsonKeys.value.name();
    final String otherParam = ParameterType.other.name();
    final String hNodeIdParam = ParameterType.hNodeId.name();
    final String vWorksheetIdParam = ParameterType.vWorksheetId.name();

    JSONArray parameters = new JSONArray();

    // Entry identifying the vworksheet.
    JSONObject worksheetEntry = new JSONObject();
    worksheetEntry.put(nameKey, vWorksheetIdParam);
    worksheetEntry.put(typeKey, vWorksheetIdParam);
    worksheetEntry.put(valueKey, vWorksheetId);
    parameters.put(worksheetEntry);

    // Entry carrying the check-history flag.
    JSONObject historyEntry = new JSONObject();
    historyEntry.put(nameKey, ParameterType.checkHistory.name());
    historyEntry.put(typeKey, otherParam);
    historyEntry.put(valueKey, false);
    parameters.put(historyEntry);

    for (SemanticType semanticType : semanticTypes) {
      // Entry naming the hNode the semantic type belongs to.
      JSONObject nodeEntry = new JSONObject();
      nodeEntry.put(nameKey, hNodeIdParam);
      nodeEntry.put(typeKey, hNodeIdParam);
      nodeEntry.put(valueKey, semanticType.getHNodeId());
      parameters.put(nodeEntry);

      // Entry carrying the semantic type itself.
      JSONObject semanticTypeEntry = new JSONObject();
      semanticTypeEntry.put(nameKey, ClientJsonKeys.SemanticType.name());
      semanticTypeEntry.put(typeKey, otherParam);
      semanticTypeEntry.put(valueKey, semanticType.getJSONArrayRepresentation());
      parameters.put(semanticTypeEntry);
    }

    setInputParameterJson(parameters.toString(4));
  }
Ejemplo n.º 3
0
  /**
   * Writes the "SemanticTypesUpdate" JSON message for this worksheet to {@code pw}.
   *
   * <p>The message contains one object per header path of the worksheet. Each object carries the
   * HNode id and a semantic-types array. The array holds the primary semantic type followed by
   * its synonym types when the column has a semantic type whose confidence level is not
   * {@code Low}; otherwise the array is empty. The array is also left empty when the column node
   * or domain node for the type cannot be found in the alignment.
   *
   * <p>A {@link JSONException} raised while writing is logged and nothing is printed.
   *
   * @param prefix not read by this method
   * @param pw destination for the generated JSON text
   * @param vWorkspace used to resolve the workspace whose alignment is looked up
   */
  @Override
  public void generateJson(String prefix, PrintWriter pw, VWorkspace vWorkspace) {
    Workspace workspace = vWorkspace.getWorkspace();
    alignment = AlignmentManager.Instance().getAlignment(workspace.getId(), worksheetId);
    SemanticTypes types = worksheet.getSemanticTypes();
    Map<String, ColumnNode> hNodeIdTocolumnNodeMap = createColumnNodeMap();
    Map<String, SemanticTypeNode> hNodeIdToDomainNodeMap = createDomainNodeMap();

    JSONStringer jsonStr = new JSONStringer();
    try {
      JSONWriter writer = jsonStr.object();
      writer.key("worksheetId").value(worksheetId).key("updateType").value("SemanticTypesUpdate");

      writer.key(JsonKeys.Types.name());
      writer.array();
      // Iterate through all the columns
      for (HNodePath path : worksheet.getHeaders().getAllPaths()) {
        HNode node = path.getLeaf();
        String nodeId = node.getId();

        writer.object();

        // Check if a semantic type exists for the HNode
        SemanticType type = types.getSemanticTypeForHNodeId(nodeId);
        if (type != null && type.getConfidenceLevel() != SemanticType.ConfidenceLevel.Low) {
          writer
              .key(JsonKeys.HNodeId.name())
              .value(type.getHNodeId())
              .key(JsonKeys.SemanticTypesArray.name())
              .array();

          ColumnNode alignmentColumnNode = hNodeIdTocolumnNodeMap.get(type.getHNodeId());
          SemanticTypeNode domainNode = hNodeIdToDomainNodeMap.get(type.getHNodeId());

          if (alignmentColumnNode == null || domainNode == null) {
            logger.error(
                "Column node or domain node not found in alignment."
                    + " (This should not happen conceptually!):"
                    + type);
            // The per-column object and its types array are already open at this point.
            // Close both (yielding an empty types array) before skipping the column, otherwise
            // the next column would be nested inside the dangling array and the final
            // endArray()/endObject() calls would no longer match.
            writer.endArray();
            writer.endObject();
            continue;
          }

          // Add the primary semantic type
          writer
              .object()
              .key(JsonKeys.Origin.name())
              .value(type.getOrigin().name())
              .key(JsonKeys.ConfidenceLevel.name())
              .value(type.getConfidenceLevel().name())
              .key(JsonKeys.isPrimary.name())
              .value(true);

          // Add the RDF literal type to show in the text box
          String rdfLiteralType =
              alignmentColumnNode.getRdfLiteralType() == null
                  ? ""
                  : alignmentColumnNode.getRdfLiteralType().getDisplayName();
          String language =
              alignmentColumnNode.getLanguage() == null ? "" : alignmentColumnNode.getLanguage();
          writer.key(JsonKeys.rdfLiteralType.name()).value(rdfLiteralType);
          writer.key(JsonKeys.language.name()).value(language);

          if (!type.isClass()) {
            // Property type: describe the property itself and the domain node it hangs off.
            writer
                .key(JsonKeys.FullType.name())
                .value(type.getType().getUri())
                .key(JsonKeys.DisplayLabel.name())
                .value(type.getType().getDisplayName())
                .key(JsonKeys.DisplayRDFSLabel.name())
                .value(type.getType().getRdfsLabel())
                .key(JsonKeys.DisplayRDFSComment.name())
                .value(type.getType().getRdfsComment())
                .key(JsonKeys.DomainId.name())
                .value(domainNode.getId())
                .key(JsonKeys.DomainUri.name())
                .value(domainNode.getUri())
                .key(JsonKeys.DisplayDomainLabel.name())
                .value(domainNode.getDisplayId())
                .key(JsonKeys.DomainRDFSLabel.name())
                .value(domainNode.getRdfsLabel())
                .key(JsonKeys.DomainRDFSComment.name())
                .value(domainNode.getRdfsComment());
          } else {
            // Class type: the domain node is reported as the type, domain fields stay empty.
            writer
                .key(JsonKeys.FullType.name())
                .value(domainNode.getId())
                .key(JsonKeys.DisplayLabel.name())
                .value(domainNode.getDisplayId())
                .key(JsonKeys.DisplayRDFSLabel.name())
                .value(domainNode.getRdfsLabel())
                .key(JsonKeys.DisplayRDFSComment.name())
                .value(domainNode.getRdfsComment())
                .key(JsonKeys.DomainId.name())
                .value("")
                .key(JsonKeys.DomainUri.name())
                .value("")
                .key(JsonKeys.DisplayDomainLabel.name())
                .value("")
                .key(JsonKeys.DomainRDFSLabel.name())
                .value("")
                .key(JsonKeys.DomainRDFSComment.name())
                .value("");
          }

          // Mark the special properties
          writer
              .key(JsonKeys.isMetaProperty.name())
              .value(isMetaProperty(type.getType(), alignmentColumnNode));

          writer.endObject();

          // Iterate through the synonym semantic types
          SynonymSemanticTypes synTypes = types.getSynonymTypesForHNodeId(nodeId);

          if (synTypes != null) {
            for (SemanticType synType : synTypes.getSynonyms()) {
              writer
                  .object()
                  .key(JsonKeys.HNodeId.name())
                  .value(synType.getHNodeId())
                  .key(JsonKeys.FullType.name())
                  .value(synType.getType().getUri())
                  .key(JsonKeys.Origin.name())
                  .value(synType.getOrigin().name())
                  .key(JsonKeys.ConfidenceLevel.name())
                  .value(synType.getConfidenceLevel().name())
                  .key(JsonKeys.DisplayLabel.name())
                  .value(synType.getType().getDisplayName())
                  .key(JsonKeys.DisplayRDFSLabel.name())
                  .value(synType.getType().getRdfsLabel())
                  .key(JsonKeys.DisplayRDFSComment.name())
                  .value(synType.getType().getRdfsComment())
                  .key(JsonKeys.isPrimary.name())
                  .value(false);
              if (!synType.isClass()) {
                writer
                    .key(JsonKeys.DomainUri.name())
                    .value(synType.getDomain().getUri())
                    .key(JsonKeys.DomainId.name())
                    .value("")
                    .key(JsonKeys.DisplayDomainLabel.name())
                    .value(synType.getDomain().getDisplayName())
                    .key(JsonKeys.DomainRDFSLabel.name())
                    .value(synType.getDomain().getRdfsLabel())
                    .key(JsonKeys.DomainRDFSComment.name())
                    .value(synType.getDomain().getRdfsComment());
              } else {
                writer
                    .key(JsonKeys.DomainId.name())
                    .value("")
                    .key(JsonKeys.DomainUri.name())
                    .value("")
                    .key(JsonKeys.DisplayDomainLabel.name())
                    .value("")
                    .key(JsonKeys.DomainRDFSLabel.name())
                    .value("")
                    .key(JsonKeys.DomainRDFSComment.name())
                    .value("");
              }
              writer.endObject();
            }
          }
          writer.endArray();
        } else {
          // No usable semantic type for this column: emit an empty types array.
          writer.key(JsonKeys.HNodeId.name()).value(nodeId);
          writer.key(JsonKeys.SemanticTypesArray.name()).array().endArray();
        }

        writer.endObject();
      }
      writer.endArray();
      writer.endObject();

      pw.print(writer.toString());
    } catch (JSONException e) {
      logger.error("Error occured while writing to JSON!", e);
    }
  }
Ejemplo n.º 4
0
  /**
   * Adds a new domain node, column node and connecting link for the given semantic type to the
   * graph and returns the resulting mapping.
   *
   * <p>The link is a {@code ClassInstanceLink} when the property URI equals (ignoring case) the
   * fixed class-instance label URI, otherwise a {@code DataPropertyLink} labelled through the
   * ontology manager.
   *
   * @param sourceColumn the column the semantic type belongs to; its column name is reused for
   *     the new target column node
   * @param semanticType the semantic type to materialize in the graph
   * @param addedNodes collects the nodes added to the graph; a throwaway set is used when
   *     {@code null}
   * @return the new mapping, or {@code null} when the semantic type is incomplete or any graph
   *     insertion is rejected
   */
  private SemanticTypeMapping addSemanticTypeStruct(
      ColumnNode sourceColumn, SemanticType semanticType, Set<Node> addedNodes) {

    logger.debug("adding semantic type to the graph ... ");

    Set<Node> trackedNodes = (addedNodes != null) ? addedNodes : new HashSet<Node>();

    if (semanticType == null) {
      logger.error("semantic type is null.");
      return null;
    }
    if (semanticType.getDomain() == null) {
      logger.error("semantic type does not have any domain");
      return null;
    }
    if (semanticType.getType() == null) {
      logger.error("semantic type does not have any link");
      return null;
    }

    String domainUri = semanticType.getDomain().getUri();
    String propertyUri = semanticType.getType().getUri();
    Double confidence = semanticType.getConfidenceScore();
    Origin origin = semanticType.getOrigin();

    if (domainUri == null || domainUri.isEmpty()) {
      logger.error("semantic type does not have any domain");
      return null;
    }
    if (propertyUri == null || propertyUri.isEmpty()) {
      logger.error("semantic type does not have any link");
      return null;
    }

    logger.debug(
        "semantic type: " + domainUri + "|" + propertyUri + "|" + confidence + "|" + origin);

    // Domain (class) node of the new structure.
    InternalNode domainNode =
        new InternalNode(nodeIdFactory.getNodeId(domainUri), new Label(domainUri));
    if (!this.graphBuilder.addNodeAndUpdate(domainNode, trackedNodes)) {
      return null;
    }

    // Fresh column node; the same random id is passed for both id arguments.
    String columnNodeId = new RandomGUID().toString();
    ColumnNode columnNode =
        new ColumnNode(columnNodeId, columnNodeId, sourceColumn.getColumnName(), null);
    if (!this.graphBuilder.addNode(columnNode)) {
      return null;
    }
    trackedNodes.add(columnNode);

    String linkId = LinkIdFactory.getLinkId(propertyUri, domainNode.getId(), columnNode.getId());
    boolean isClassInstance =
        propertyUri.equalsIgnoreCase(ClassInstanceLink.getFixedLabel().getUri());
    LabeledLink link =
        isClassInstance
            ? new ClassInstanceLink(linkId)
            : new DataPropertyLink(linkId, this.ontologyManager.getUriLabel(propertyUri));
    if (!this.graphBuilder.addLink(domainNode, columnNode, link)) {
      return null;
    }

    return new SemanticTypeMapping(sourceColumn, semanticType, domainNode, link, columnNode);
  }
Ejemplo n.º 5
0
  /**
   * Collects the mappings of a semantic type onto structures of the graph.
   *
   * <p>Two sources contribute to the result: the semantic-type matches the graph builder already
   * holds for the domain/property pair, and new data-property links attached to every existing
   * node that shares the domain URI and has not yet reached its allowed number of links for this
   * property.
   *
   * @param sourceColumn the column the semantic type belongs to
   * @param semanticType the semantic type to look up
   * @param semanticTypesCount occurrence count per {@code domainUri + propertyUri} key; caps the
   *     links per node when multiple same properties per node are allowed
   * @param addedNodes collects the column nodes added to the graph; a throwaway set is used when
   *     {@code null}
   * @return the mappings found or created; empty (never {@code null}) when the semantic type is
   *     incomplete or has no count entry
   */
  private Set<SemanticTypeMapping> findSemanticTypeInGraph(
      ColumnNode sourceColumn,
      SemanticType semanticType,
      HashMap<String, Integer> semanticTypesCount,
      Set<Node> addedNodes) {

    logger.debug("finding matches for semantic type in the graph ... ");

    Set<Node> trackedNodes = (addedNodes != null) ? addedNodes : new HashSet<Node>();
    Set<SemanticTypeMapping> result = new HashSet<SemanticTypeMapping>();

    if (semanticType == null) {
      logger.error("semantic type is null.");
      return result;
    }
    if (semanticType.getDomain() == null) {
      logger.error("semantic type does not have any domain");
      return result;
    }
    if (semanticType.getType() == null) {
      logger.error("semantic type does not have any link");
      return result;
    }

    String domainUri = semanticType.getDomain().getUri();
    String propertyUri = semanticType.getType().getUri();
    Double confidence = semanticType.getConfidenceScore();
    Origin origin = semanticType.getOrigin();
    String typeKey = domainUri + propertyUri;

    Integer expectedCount = semanticTypesCount.get(typeKey);
    if (expectedCount == null) {
      logger.error("count of semantic type should not be null or zero");
      return result;
    }
    if (domainUri == null || domainUri.isEmpty()) {
      logger.error("semantic type does not have any domain");
      return result;
    }
    if (propertyUri == null || propertyUri.isEmpty()) {
      logger.error("semantic type does not have any link");
      return result;
    }

    logger.debug(
        "semantic type: " + domainUri + "|" + propertyUri + "|" + confidence + "|" + origin);

    // Reuse the matches already recorded for this domain/property pair, rebound to this column.
    Set<SemanticTypeMapping> knownMatches =
        this.graphBuilder.getSemanticTypeMatches().get(typeKey);
    if (knownMatches != null) {
      for (SemanticTypeMapping known : knownMatches) {
        result.add(
            new SemanticTypeMapping(
                sourceColumn, semanticType, known.getSource(), known.getLink(), known.getTarget()));
      }
    }

    logger.debug("adding data property to the found internal nodes ...");

    boolean allowMultipleSamePropertiesPerNode =
        ModelingConfigurationRegistry.getInstance()
            .getModelingConfiguration(
                ContextParametersRegistry.getInstance()
                    .getContextParameters(ontologyManager.getContextId())
                    .getKarmaHome())
            .isMultipleSamePropertyPerNode();

    Set<Node> domainNodes = this.graphBuilder.getUriToNodesMap().get(domainUri);
    if (domainNodes == null) {
      return result;
    }

    // A node is skipped once it already carries this many links for the property.
    int linkLimit = allowMultipleSamePropertiesPerNode ? expectedCount.intValue() : 1;

    for (Node domainNode : domainNodes) {
      Integer existingLinks =
          this.graphBuilder.getNodeDataPropertyCount().get(domainNode.getId() + propertyUri);
      if (existingLinks != null && existingLinks >= linkLimit) {
        continue;
      }

      String columnNodeId = new RandomGUID().toString();
      ColumnNode columnNode =
          new ColumnNode(columnNodeId, columnNodeId, sourceColumn.getColumnName(), null);
      if (!this.graphBuilder.addNode(columnNode)) {
        continue;
      }
      trackedNodes.add(columnNode);

      String linkId = LinkIdFactory.getLinkId(propertyUri, domainNode.getId(), columnNode.getId());
      LabeledLink link = new DataPropertyLink(linkId, new Label(propertyUri));
      if (!this.graphBuilder.addLink(domainNode, columnNode, link)) {
        continue;
      }

      result.add(
          new SemanticTypeMapping(
              sourceColumn, semanticType, (InternalNode) domainNode, link, columnNode));
    }

    return result;
  }
Ejemplo n.º 6
0
  /**
   * Builds the candidate Steiner sets for the given columns from their top learned semantic
   * types.
   *
   * <p>A first pass counts how often each {@code domainUri + propertyUri} pair occurs across all
   * columns. A second pass, per column, gathers the mappings of every candidate semantic type
   * found in the graph and, when fewer were found than the pair's count, adds new structures to
   * the graph to make up the difference. The per-column mapping set is then fed into the
   * candidate Steiner sets.
   *
   * @param columnNodes the columns to process
   * @param useCorrectTypes not read by this method
   * @param numberOfCRFCandidates how many learned semantic types to request per column
   * @param addedNodes collects the nodes added to the graph; a throwaway set is used when
   *     {@code null}
   * @return the candidate Steiner sets, or {@code null} when {@code columnNodes} is {@code null}
   *     or empty
   */
  private CandidateSteinerSets getCandidateSteinerSets(
      List<ColumnNode> columnNodes,
      boolean useCorrectTypes,
      int numberOfCRFCandidates,
      Set<Node> addedNodes) {

    if (columnNodes == null || columnNodes.isEmpty()) return null;

    int maxNumberOfSteinerNodes = columnNodes.size() * 2;
    CandidateSteinerSets candidateSteinerSets =
        new CandidateSteinerSets(maxNumberOfSteinerNodes, ontologyManager.getContextId());

    if (addedNodes == null) addedNodes = new HashSet<Node>();

    Set<SemanticTypeMapping> tempSemanticTypeMappings;
    HashMap<ColumnNode, List<SemanticType>> columnSemanticTypes =
        new HashMap<ColumnNode, List<SemanticType>>();
    HashMap<String, Integer> semanticTypesCount = new HashMap<String, Integer>();
    List<SemanticType> candidateSemanticTypes;
    String domainUri = "", propertyUri = "";

    // Pass 1: remember each column's candidates and count every domain/property pair.
    for (ColumnNode n : columnNodes) {

      candidateSemanticTypes = n.getTopKLearnedSemanticTypes(numberOfCRFCandidates);
      columnSemanticTypes.put(n, candidateSemanticTypes);

      // Same guard the second pass applies; a column without candidates contributes no counts.
      if (candidateSemanticTypes == null) continue;

      for (SemanticType semanticType : candidateSemanticTypes) {

        if (semanticType == null
            || semanticType.getDomain() == null
            || semanticType.getType() == null) continue;

        domainUri = semanticType.getDomain().getUri();
        propertyUri = semanticType.getType().getUri();

        Integer count = semanticTypesCount.get(domainUri + propertyUri);
        if (count == null) semanticTypesCount.put(domainUri + propertyUri, 1);
        else semanticTypesCount.put(domainUri + propertyUri, count.intValue() + 1);
      }
    }

    // Pass 2: map every candidate onto the graph, adding structures where matches are missing.
    // long rather than int: the product of per-column sizes overflows int quickly.
    long numOfMappings = 1;
    for (ColumnNode n : columnNodes) {

      candidateSemanticTypes = columnSemanticTypes.get(n);
      if (candidateSemanticTypes == null) continue;

      logger.info("===== Column: " + n.getColumnName());

      Set<SemanticTypeMapping> semanticTypeMappings = new HashSet<SemanticTypeMapping>();
      for (SemanticType semanticType : candidateSemanticTypes) {

        // Null must be ruled out before the log statement dereferences the semantic type.
        if (semanticType == null) continue;

        logger.info("\t===== Semantic Type: " + semanticType.getModelLabelString());

        if (semanticType.getDomain() == null || semanticType.getType() == null) continue;

        domainUri = semanticType.getDomain().getUri();
        propertyUri = semanticType.getType().getUri();
        // Pass 1 counted every semantic type that reaches this point, so the entry exists.
        Integer countOfSemanticType = semanticTypesCount.get(domainUri + propertyUri);

        tempSemanticTypeMappings =
            findSemanticTypeInGraph(n, semanticType, semanticTypesCount, addedNodes);

        if (tempSemanticTypeMappings != null) semanticTypeMappings.addAll(tempSemanticTypeMappings);

        int countOfMatches = tempSemanticTypeMappings == null ? 0 : tempSemanticTypeMappings.size();
        // Fewer structures matched than the pair occurs: add new structures for the remainder.
        if (countOfMatches < countOfSemanticType) {
          for (int i = 0; i < countOfSemanticType - countOfMatches; i++) {
            SemanticTypeMapping mp = addSemanticTypeStruct(n, semanticType, addedNodes);
            if (mp != null) semanticTypeMappings.add(mp);
          }
        }
      }

      logger.info(
          "number of matches for column " + n.getColumnName() + ": " + semanticTypeMappings.size());
      // Columns without any mapping leave the product unchanged.
      numOfMappings *= semanticTypeMappings.isEmpty() ? 1 : semanticTypeMappings.size();

      candidateSteinerSets.updateSteinerSets(semanticTypeMappings);
    }

    logger.info("number of possible mappings: " + numOfMappings);

    return candidateSteinerSets;
  }
  /**
   * Trains the CRF model handler with the examples of the column that {@code newType} refers to,
   * then stores a fresh column model with the top four predicted labels for that column in the
   * worksheet's CRF model.
   *
   * <p>The column is located by matching the leaf HNode id of each header path against
   * {@code newType.getHNodeId()}. When no path matches, an error is logged and nothing is
   * trained. Failures reported by the model handler are logged and do not stop the method.
   */
  public void run() {
    long start = System.currentTimeMillis();
    // Find the corresponding hNodePath. Used to find examples for training the CRF Model.
    HNodePath currentColumnPath = null;
    List<HNodePath> paths = worksheet.getHeaders().getAllPaths();
    for (HNodePath path : paths) {
      if (path.getLeaf().getId().equals(newType.getHNodeId())) {
        currentColumnPath = path;
        break;
      }
    }

    // Without a matching path there is no column to take a name or examples from.
    if (currentColumnPath == null) {
      logger.error(
          "No column path found for hNode " + newType.getHNodeId() + "; skipping CRF training.");
      return;
    }

    Map<ColumnFeature, Collection<String>> columnFeatures =
        new HashMap<ColumnFeature, Collection<String>>();

    // Prepare the column name for training
    String columnName = currentColumnPath.getLeaf().getColumnName();
    Collection<String> columnNameList = new ArrayList<String>();
    columnNameList.add(columnName);
    columnFeatures.put(ColumnFeature.ColumnHeaderName, columnNameList);

    // Train the model with the new type
    ArrayList<String> trainingExamples =
        SemanticTypeUtil.getTrainingExamples(worksheet, currentColumnPath);
    boolean trainingResult = false;
    // Label is "domainUri|typeUri", or just the type URI when the type has no domain.
    String newTypeString =
        (newType.getDomain() == null)
            ? newType.getType().getUri()
            : newType.getDomain().getUri() + "|" + newType.getType().getUri();

    trainingResult =
        crfModelHandler.addOrUpdateLabel(newTypeString, trainingExamples, columnFeatures);

    if (!trainingResult) {
      logger.error("Error occured while training CRF Model.");
    }

    // Add the new CRF column model for this column
    ArrayList<String> labels = new ArrayList<String>();
    ArrayList<Double> scores = new ArrayList<Double>();
    trainingResult =
        crfModelHandler.predictLabelForExamples(
            trainingExamples, 4, labels, scores, null, columnFeatures);
    if (!trainingResult) {
      logger.error("Error occured while predicting labels");
    }
    CRFColumnModel newModel = new CRFColumnModel(labels, scores);
    worksheet.getCrfModel().addColumnModel(newType.getHNodeId(), newModel);

    long elapsedTimeMillis = System.currentTimeMillis() - start;
    float elapsedTimeSec = elapsedTimeMillis / 1000F;
    logger.info("Time required for training the semantic type: " + elapsedTimeSec);
  }