private static void getStatistics2(List<ColumnNode> columnNodes) { if (columnNodes == null) return; int numberOfAttributesWhoseTypeIsFirstCRFType = 0; int numberOfAttributesWhoseTypeIsInCRFTypes = 0; for (ColumnNode cn : columnNodes) { List<SemanticType> userSemanticTypes = cn.getUserSemanticTypes(); List<SemanticType> top4Suggestions = cn.getTopKLearnedSemanticTypes(4); for (int i = 0; i < top4Suggestions.size(); i++) { SemanticType st = top4Suggestions.get(i); if (userSemanticTypes != null) { for (SemanticType t : userSemanticTypes) { if (st.getModelLabelString().equalsIgnoreCase(t.getModelLabelString())) { if (i == 0) numberOfAttributesWhoseTypeIsFirstCRFType++; numberOfAttributesWhoseTypeIsInCRFTypes++; i = top4Suggestions.size(); break; } } } } } // System.out.println(columnNodes.size() + "\t" + numberOfAttributesWhoseTypeIsInCRFTypes + // "\t" + numberOfAttributesWhoseTypeIsFirstCRFType); System.out.println("totalNumberOfAttributes: " + columnNodes.size()); System.out.println( "numberOfAttributesWhoseTypeIsInCRFTypes: " + numberOfAttributesWhoseTypeIsInCRFTypes); System.out.println( "numberOfAttributesWhoseTypeIsFirstCRFType:" + numberOfAttributesWhoseTypeIsFirstCRFType); }
private void saveSemanticTypesInformation( Worksheet worksheet, VWorkspace vWorkspace, Collection<SemanticType> semanticTypes) throws JSONException { JSONArray typesArray = new JSONArray(); // Add the vworksheet information JSONObject vwIDJObj = new JSONObject(); vwIDJObj.put(ClientJsonKeys.name.name(), ParameterType.vWorksheetId.name()); vwIDJObj.put(ClientJsonKeys.type.name(), ParameterType.vWorksheetId.name()); vwIDJObj.put(ClientJsonKeys.value.name(), vWorksheetId); typesArray.put(vwIDJObj); // Add the check history information JSONObject chIDJObj = new JSONObject(); chIDJObj.put(ClientJsonKeys.name.name(), ParameterType.checkHistory.name()); chIDJObj.put(ClientJsonKeys.type.name(), ParameterType.other.name()); chIDJObj.put(ClientJsonKeys.value.name(), false); typesArray.put(chIDJObj); for (SemanticType type : semanticTypes) { // Add the hNode information JSONObject hNodeJObj = new JSONObject(); hNodeJObj.put(ClientJsonKeys.name.name(), ParameterType.hNodeId.name()); hNodeJObj.put(ClientJsonKeys.type.name(), ParameterType.hNodeId.name()); hNodeJObj.put(ClientJsonKeys.value.name(), type.getHNodeId()); typesArray.put(hNodeJObj); // Add the semantic type information JSONObject typeJObj = new JSONObject(); typeJObj.put(ClientJsonKeys.name.name(), ClientJsonKeys.SemanticType.name()); typeJObj.put(ClientJsonKeys.type.name(), ParameterType.other.name()); typeJObj.put(ClientJsonKeys.value.name(), type.getJSONArrayRepresentation()); typesArray.put(typeJObj); } setInputParameterJson(typesArray.toString(4)); }
@Override public void generateJson(String prefix, PrintWriter pw, VWorkspace vWorkspace) { Workspace workspace = vWorkspace.getWorkspace(); alignment = AlignmentManager.Instance().getAlignment(workspace.getId(), worksheetId); SemanticTypes types = worksheet.getSemanticTypes(); Map<String, ColumnNode> hNodeIdTocolumnNodeMap = createColumnNodeMap(); Map<String, SemanticTypeNode> hNodeIdToDomainNodeMap = createDomainNodeMap(); JSONStringer jsonStr = new JSONStringer(); try { JSONWriter writer = jsonStr.object(); writer.key("worksheetId").value(worksheetId).key("updateType").value("SemanticTypesUpdate"); writer.key(JsonKeys.Types.name()); writer.array(); // Iterate through all the columns for (HNodePath path : worksheet.getHeaders().getAllPaths()) { HNode node = path.getLeaf(); String nodeId = node.getId(); writer.object(); // Check if a semantic type exists for the HNode SemanticType type = types.getSemanticTypeForHNodeId(nodeId); if (type != null && type.getConfidenceLevel() != SemanticType.ConfidenceLevel.Low) { writer .key(JsonKeys.HNodeId.name()) .value(type.getHNodeId()) .key(JsonKeys.SemanticTypesArray.name()) .array(); ColumnNode alignmentColumnNode = hNodeIdTocolumnNodeMap.get(type.getHNodeId()); SemanticTypeNode domainNode = hNodeIdToDomainNodeMap.get(type.getHNodeId()); if (alignmentColumnNode == null || domainNode == null) { logger.error( "Column node or domain node not found in alignment." + " (This should not happen conceptually!):" + type); continue; } // Add the primary semantic type writer .object() .key(JsonKeys.Origin.name()) .value(type.getOrigin().name()) .key(JsonKeys.ConfidenceLevel.name()) .value(type.getConfidenceLevel().name()) .key(JsonKeys.isPrimary.name()) .value(true); // Add the RDF literal type to show in the text box String rdfLiteralType = alignmentColumnNode.getRdfLiteralType() == null ? "" : alignmentColumnNode.getRdfLiteralType().getDisplayName(); String language = alignmentColumnNode.getLanguage() == null ? 
"" : alignmentColumnNode.getLanguage(); writer.key(JsonKeys.rdfLiteralType.name()).value(rdfLiteralType); writer.key(JsonKeys.language.name()).value(language); // String domainDisplayLabel = (domainNode.getLabel().getPrefix() != null && // (!domainNode.getLabel().getPrefix().equals(""))) ? // (domainNode.getLabel().getPrefix() + ":" + domainNode.getLocalId()) : // domainNode.getLocalId(); if (!type.isClass()) { writer .key(JsonKeys.FullType.name()) .value(type.getType().getUri()) .key(JsonKeys.DisplayLabel.name()) .value(type.getType().getDisplayName()) .key(JsonKeys.DisplayRDFSLabel.name()) .value(type.getType().getRdfsLabel()) .key(JsonKeys.DisplayRDFSComment.name()) .value(type.getType().getRdfsComment()) .key(JsonKeys.DomainId.name()) .value(domainNode.getId()) .key(JsonKeys.DomainUri.name()) .value(domainNode.getUri()) .key(JsonKeys.DisplayDomainLabel.name()) .value(domainNode.getDisplayId()) .key(JsonKeys.DomainRDFSLabel.name()) .value(domainNode.getRdfsLabel()) .key(JsonKeys.DomainRDFSComment.name()) .value(domainNode.getRdfsComment()); } else { writer .key(JsonKeys.FullType.name()) .value(domainNode.getId()) .key(JsonKeys.DisplayLabel.name()) .value(domainNode.getDisplayId()) .key(JsonKeys.DisplayRDFSLabel.name()) .value(domainNode.getRdfsLabel()) .key(JsonKeys.DisplayRDFSComment.name()) .value(domainNode.getRdfsComment()) .key(JsonKeys.DomainId.name()) .value("") .key(JsonKeys.DomainUri.name()) .value("") .key(JsonKeys.DisplayDomainLabel.name()) .value("") .key(JsonKeys.DomainRDFSLabel.name()) .value("") .key(JsonKeys.DomainRDFSComment.name()) .value(""); } // Mark the special properties writer .key(JsonKeys.isMetaProperty.name()) .value(isMetaProperty(type.getType(), alignmentColumnNode)); writer.endObject(); // Iterate through the synonym semantic types SynonymSemanticTypes synTypes = types.getSynonymTypesForHNodeId(nodeId); if (synTypes != null) { for (SemanticType synType : synTypes.getSynonyms()) { writer .object() .key(JsonKeys.HNodeId.name()) 
.value(synType.getHNodeId()) .key(JsonKeys.FullType.name()) .value(synType.getType().getUri()) .key(JsonKeys.Origin.name()) .value(synType.getOrigin().name()) .key(JsonKeys.ConfidenceLevel.name()) .value(synType.getConfidenceLevel().name()) .key(JsonKeys.DisplayLabel.name()) .value(synType.getType().getDisplayName()) .key(JsonKeys.DisplayRDFSLabel.name()) .value(synType.getType().getRdfsLabel()) .key(JsonKeys.DisplayRDFSComment.name()) .value(synType.getType().getRdfsComment()) .key(JsonKeys.isPrimary.name()) .value(false); if (!synType.isClass()) { writer .key(JsonKeys.DomainUri.name()) .value(synType.getDomain().getUri()) .key(JsonKeys.DomainId.name()) .value("") .key(JsonKeys.DisplayDomainLabel.name()) .value(synType.getDomain().getDisplayName()) .key(JsonKeys.DomainRDFSLabel.name()) .value(synType.getDomain().getRdfsLabel()) .key(JsonKeys.DomainRDFSComment.name()) .value(synType.getDomain().getRdfsComment()); } else { writer .key(JsonKeys.DomainId.name()) .value("") .key(JsonKeys.DomainUri.name()) .value("") .key(JsonKeys.DisplayDomainLabel.name()) .value("") .key(JsonKeys.DomainRDFSLabel.name()) .value("") .key(JsonKeys.DomainRDFSComment.name()) .value(""); } writer.endObject(); } } writer.endArray(); } else { writer.key(JsonKeys.HNodeId.name()).value(nodeId); writer.key(JsonKeys.SemanticTypesArray.name()).array().endArray(); } writer.endObject(); } writer.endArray(); writer.endObject(); pw.print(writer.toString()); } catch (JSONException e) { logger.error("Error occured while writing to JSON!", e); } }
/**
 * Adds a new domain node, a new column node and a link between them to the graph for the given
 * semantic type.
 *
 * <p>The link is a {@code ClassInstanceLink} when the type's URI equals (ignoring case) the
 * fixed class-instance label URI, and a {@code DataPropertyLink} otherwise.
 *
 * @param sourceColumn the column the semantic type belongs to; its column name is used for the
 *     new column node
 * @param semanticType the semantic type to add; must have a domain and a type with non-empty
 *     URIs
 * @param addedNodes receives the nodes added to the graph; when {@code null} a throw-away set
 *     is used instead
 * @return the mapping describing the added structure, or {@code null} when the semantic type is
 *     incomplete or the graph builder rejects a node or the link
 */
private SemanticTypeMapping addSemanticTypeStruct(
    ColumnNode sourceColumn, SemanticType semanticType, Set<Node> addedNodes) {

  logger.debug("adding semantic type to the graph ... ");

  final Set<Node> newNodes = (addedNodes != null) ? addedNodes : new HashSet<Node>();

  if (semanticType == null) {
    logger.error("semantic type is null.");
    return null;
  }
  if (semanticType.getDomain() == null) {
    logger.error("semantic type does not have any domain");
    return null;
  }
  if (semanticType.getType() == null) {
    logger.error("semantic type does not have any link");
    return null;
  }

  final String domainUri = semanticType.getDomain().getUri();
  final String propertyUri = semanticType.getType().getUri();
  final Double score = semanticType.getConfidenceScore();
  final Origin typeOrigin = semanticType.getOrigin();

  if (domainUri == null || domainUri.isEmpty()) {
    logger.error("semantic type does not have any domain");
    return null;
  }
  if (propertyUri == null || propertyUri.isEmpty()) {
    logger.error("semantic type does not have any link");
    return null;
  }

  logger.debug("semantic type: " + domainUri + "|" + propertyUri + "|" + score + "|" + typeOrigin);

  // Domain (class) node
  InternalNode domainNode =
      new InternalNode(nodeIdFactory.getNodeId(domainUri), new Label(domainUri));
  if (!this.graphBuilder.addNodeAndUpdate(domainNode, newNodes)) {
    return null;
  }

  // Column node, identified by a fresh GUID
  String columnNodeId = new RandomGUID().toString();
  ColumnNode columnNode =
      new ColumnNode(columnNodeId, columnNodeId, sourceColumn.getColumnName(), null);
  if (!this.graphBuilder.addNode(columnNode)) {
    return null;
  }
  newNodes.add(columnNode);

  // Link from the domain node to the column node
  String linkId = LinkIdFactory.getLinkId(propertyUri, domainNode.getId(), columnNode.getId());
  boolean isClassInstance =
      propertyUri.equalsIgnoreCase(ClassInstanceLink.getFixedLabel().getUri());
  LabeledLink link =
      isClassInstance
          ? new ClassInstanceLink(linkId)
          : new DataPropertyLink(linkId, this.ontologyManager.getUriLabel(propertyUri));
  if (!this.graphBuilder.addLink(domainNode, columnNode, link)) {
    return null;
  }

  return new SemanticTypeMapping(sourceColumn, semanticType, domainNode, link, columnNode);
}
private Set<SemanticTypeMapping> findSemanticTypeInGraph( ColumnNode sourceColumn, SemanticType semanticType, HashMap<String, Integer> semanticTypesCount, Set<Node> addedNodes) { logger.debug("finding matches for semantic type in the graph ... "); if (addedNodes == null) addedNodes = new HashSet<Node>(); Set<SemanticTypeMapping> mappings = new HashSet<SemanticTypeMapping>(); if (semanticType == null) { logger.error("semantic type is null."); return mappings; } if (semanticType.getDomain() == null) { logger.error("semantic type does not have any domain"); return mappings; } if (semanticType.getType() == null) { logger.error("semantic type does not have any link"); return mappings; } String domainUri = semanticType.getDomain().getUri(); String propertyUri = semanticType.getType().getUri(); Double confidence = semanticType.getConfidenceScore(); Origin origin = semanticType.getOrigin(); Integer countOfSemanticType = semanticTypesCount.get(domainUri + propertyUri); if (countOfSemanticType == null) { logger.error("count of semantic type should not be null or zero"); return mappings; } if (domainUri == null || domainUri.isEmpty()) { logger.error("semantic type does not have any domain"); return mappings; } if (propertyUri == null || propertyUri.isEmpty()) { logger.error("semantic type does not have any link"); return mappings; } logger.debug( "semantic type: " + domainUri + "|" + propertyUri + "|" + confidence + "|" + origin); // add dataproperty to existing classes if sl is a data node mapping // Set<Node> foundInternalNodes = new HashSet<Node>(); Set<SemanticTypeMapping> semanticTypeMatches = this.graphBuilder.getSemanticTypeMatches().get(domainUri + propertyUri); if (semanticTypeMatches != null) { for (SemanticTypeMapping stm : semanticTypeMatches) { SemanticTypeMapping mp = new SemanticTypeMapping( sourceColumn, semanticType, stm.getSource(), stm.getLink(), stm.getTarget()); mappings.add(mp); // foundInternalNodes.add(stm.getSource()); } } logger.debug("adding data 
property to the found internal nodes ..."); Integer count; boolean allowMultipleSamePropertiesPerNode = ModelingConfigurationRegistry.getInstance() .getModelingConfiguration( ContextParametersRegistry.getInstance() .getContextParameters(ontologyManager.getContextId()) .getKarmaHome()) .isMultipleSamePropertyPerNode(); Set<Node> nodesWithSameUriOfDomain = this.graphBuilder.getUriToNodesMap().get(domainUri); if (nodesWithSameUriOfDomain != null) { for (Node source : nodesWithSameUriOfDomain) { count = this.graphBuilder.getNodeDataPropertyCount().get(source.getId() + propertyUri); if (count != null) { if (allowMultipleSamePropertiesPerNode) { if (count >= countOfSemanticType.intValue()) continue; } else { if (count >= 1) continue; } } String nodeId = new RandomGUID().toString(); ColumnNode target = new ColumnNode(nodeId, nodeId, sourceColumn.getColumnName(), null); if (!this.graphBuilder.addNode(target)) continue; ; addedNodes.add(target); String linkId = LinkIdFactory.getLinkId(propertyUri, source.getId(), target.getId()); LabeledLink link = new DataPropertyLink(linkId, new Label(propertyUri)); if (!this.graphBuilder.addLink(source, target, link)) continue; ; SemanticTypeMapping mp = new SemanticTypeMapping( sourceColumn, semanticType, (InternalNode) source, link, target); mappings.add(mp); } } return mappings; }
private CandidateSteinerSets getCandidateSteinerSets( List<ColumnNode> columnNodes, boolean useCorrectTypes, int numberOfCRFCandidates, Set<Node> addedNodes) { if (columnNodes == null || columnNodes.isEmpty()) return null; int maxNumberOfSteinerNodes = columnNodes.size() * 2; CandidateSteinerSets candidateSteinerSets = new CandidateSteinerSets(maxNumberOfSteinerNodes, ontologyManager.getContextId()); if (addedNodes == null) addedNodes = new HashSet<Node>(); Set<SemanticTypeMapping> tempSemanticTypeMappings; HashMap<ColumnNode, List<SemanticType>> columnSemanticTypes = new HashMap<ColumnNode, List<SemanticType>>(); HashMap<String, Integer> semanticTypesCount = new HashMap<String, Integer>(); List<SemanticType> candidateSemanticTypes; String domainUri = "", propertyUri = ""; for (ColumnNode n : columnNodes) { candidateSemanticTypes = n.getTopKLearnedSemanticTypes(numberOfCRFCandidates); columnSemanticTypes.put(n, candidateSemanticTypes); for (SemanticType semanticType : candidateSemanticTypes) { if (semanticType == null || semanticType.getDomain() == null || semanticType.getType() == null) continue; domainUri = semanticType.getDomain().getUri(); propertyUri = semanticType.getType().getUri(); Integer count = semanticTypesCount.get(domainUri + propertyUri); if (count == null) semanticTypesCount.put(domainUri + propertyUri, 1); else semanticTypesCount.put(domainUri + propertyUri, count.intValue() + 1); } } int numOfMappings = 1; for (ColumnNode n : columnNodes) { candidateSemanticTypes = columnSemanticTypes.get(n); if (candidateSemanticTypes == null) continue; logger.info("===== Column: " + n.getColumnName()); Set<SemanticTypeMapping> semanticTypeMappings = new HashSet<SemanticTypeMapping>(); for (SemanticType semanticType : candidateSemanticTypes) { logger.info("\t===== Semantic Type: " + semanticType.getModelLabelString()); if (semanticType == null || semanticType.getDomain() == null || semanticType.getType() == null) continue; domainUri = 
semanticType.getDomain().getUri(); propertyUri = semanticType.getType().getUri(); Integer countOfSemanticType = semanticTypesCount.get(domainUri + propertyUri); // logger.info("count of semantic type: " + countOfSemanticType); tempSemanticTypeMappings = findSemanticTypeInGraph(n, semanticType, semanticTypesCount, addedNodes); // logger.info("number of matches for semantic type: " + // + (tempSemanticTypeMappings == null ? 0 : tempSemanticTypeMappings.size())); if (tempSemanticTypeMappings != null) semanticTypeMappings.addAll(tempSemanticTypeMappings); int countOfMatches = tempSemanticTypeMappings == null ? 0 : tempSemanticTypeMappings.size(); if (countOfMatches < countOfSemanticType) // No struct in graph is matched with the semantic type, we add a // new struct to the graph { for (int i = 0; i < countOfSemanticType - countOfMatches; i++) { SemanticTypeMapping mp = addSemanticTypeStruct(n, semanticType, addedNodes); if (mp != null) semanticTypeMappings.add(mp); } } } // System.out.println("number of matches for column " + n.getColumnName() + // ": " + (semanticTypeMappings == null ? 0 : semanticTypeMappings.size())); logger.info( "number of matches for column " + n.getColumnName() + ": " + (semanticTypeMappings == null ? 0 : semanticTypeMappings.size())); numOfMappings *= semanticTypeMappings == null || semanticTypeMappings.isEmpty() ? 1 : semanticTypeMappings.size(); candidateSteinerSets.updateSteinerSets(semanticTypeMappings); } // System.out.println("number of possible mappings: " + numOfMappings); logger.info("number of possible mappings: " + numOfMappings); return candidateSteinerSets; }
public void run() { long start = System.currentTimeMillis(); // Find the corresponding hNodePath. Used to find examples for training the CRF Model. HNodePath currentColumnPath = null; List<HNodePath> paths = worksheet.getHeaders().getAllPaths(); for (HNodePath path : paths) { if (path.getLeaf().getId().equals(newType.getHNodeId())) { currentColumnPath = path; break; } } Map<ColumnFeature, Collection<String>> columnFeatures = new HashMap<ColumnFeature, Collection<String>>(); // Prepare the column name for training String columnName = currentColumnPath.getLeaf().getColumnName(); Collection<String> columnNameList = new ArrayList<String>(); columnNameList.add(columnName); columnFeatures.put(ColumnFeature.ColumnHeaderName, columnNameList); // Train the model with the new type ArrayList<String> trainingExamples = SemanticTypeUtil.getTrainingExamples(worksheet, currentColumnPath); boolean trainingResult = false; String newTypeString = (newType.getDomain() == null) ? newType.getType().getUri() : newType.getDomain().getUri() + "|" + newType.getType().getUri(); trainingResult = crfModelHandler.addOrUpdateLabel(newTypeString, trainingExamples, columnFeatures); if (!trainingResult) { logger.error("Error occured while training CRF Model."); } // logger.debug("Using type:" + newType.getDomain().getUri() + "|" + // newType.getType().getUri()); // Add the new CRF column model for this column ArrayList<String> labels = new ArrayList<String>(); ArrayList<Double> scores = new ArrayList<Double>(); trainingResult = crfModelHandler.predictLabelForExamples( trainingExamples, 4, labels, scores, null, columnFeatures); if (!trainingResult) { logger.error("Error occured while predicting labels"); } CRFColumnModel newModel = new CRFColumnModel(labels, scores); worksheet.getCrfModel().addColumnModel(newType.getHNodeId(), newModel); long elapsedTimeMillis = System.currentTimeMillis() - start; float elapsedTimeSec = elapsedTimeMillis / 1000F; logger.info("Time required for training the semantic 
type: " + elapsedTimeSec); // long t2 = System.currentTimeMillis(); // Identify the outliers for the column // SemanticTypeUtil.identifyOutliers(worksheet, newTypeString,currentColumnPath, // vWorkspace.getWorkspace().getTagsContainer() // .getTag(TagName.Outlier), columnFeatures, crfModelHandler); // long t3 = System.currentTimeMillis(); // logger.info("Identify outliers: "+ (t3-t2)); }