/** * Gets values for the mapped attributes in src mapped from the target * * @param source * @param context */ private void mapFields(String source, Context context) { mappedFields.clear(); String[] srcItems = source.split(fieldDelimRegex); if (prntDetail) { System.out.println("src record: " + srcItems[0]); } for (Field field : fields) { List<FieldMapping> mappings = field.getMappings(); if (null != mappings) { for (FieldMapping fldMapping : mappings) { int matchingOrdinal = fldMapping.getMatchingOrdinal(); if (-1 == matchingOrdinal) { continue; } MappedValue mappedValue = mappedFields.get(matchingOrdinal); if (null == mappedValue) { mappedValue = new MappedValue(); mappedValue.setField(field); mappedFields.put(matchingOrdinal, mappedValue); } List<String> mappedValues = mappedValue.getValues(); String value = srcItems[field.getOrdinal()]; if (prntDetail) { System.out.println("src value: " + value); } List<FieldMapping.ValueMapping> valueMappings = fldMapping.getValueMappings(); if (null != valueMappings) { for (FieldMapping.ValueMapping valMapping : valueMappings) { if (field.getDataType().equals("categorical")) { // store mapped values if (valMapping.getThisValue().equals(value)) { mappedValues.add(valMapping.getThatValue()); context.getCounter("Data", "Mapped Value").increment(1); if (prntDetail) { System.out.println( "mapped: " + value + " " + valMapping.getThatValue() + " matching ordinal:" + matchingOrdinal); } break; } } else if (field.getDataType().equals("int")) { int valueInt = Integer.parseInt(value); int[] range = valMapping.getThisValueRange(); if (null != range) { if (valueInt >= range[0] && valueInt <= range[1]) { mappedValues.add(valMapping.getThatValue()); break; } } } } } else { if (prntDetail) { System.out.println("non mapped: " + value + " matching ordinal:" + matchingOrdinal); } if (!value.isEmpty()) { mappedValues.add(value); } } } } } }
/** * Gets distance between two entities * * @param source * @param target * @param context * @return * @throws IOException */ private int findSimilarity(String source, String target, Context context) throws IOException { int sim = 0; mapFields(source, context); String[] trgItems = target.split(fieldDelimRegex); double dist = 0; context.getCounter("Data", "Target Field Count").increment(targetFields.size()); if (prntDetail) { System.out.println("target record: " + trgItems[0]); } distStrategy.initialize(); for (Field field : targetFields) { dist = 0; Integer ordinal = field.getOrdinal(); String trgItem = trgItems[ordinal]; boolean skipAttr = false; if (prntDetail) { System.out.println("ordinal: " + ordinal + " target:" + trgItem); } MappedValue mappedValueObj = mappedFields.get(ordinal); if (null == mappedValueObj) { // non mapped passive attributes continue; } List<String> mappedValues = mappedValueObj.getValues(); Field srcField = mappedValueObj.getField(); if (!trgItem.isEmpty()) { if (field.getDataType().equals("categorical")) { if (!mappedValues.isEmpty()) { double thisDist; dist = 1.0; for (String mappedValue : mappedValues) { thisDist = schema.findCattegoricalDistance(mappedValue, trgItem, ordinal); if (thisDist < dist) { dist = thisDist; } if (prntDetail) { System.out.println( "dist calculation: ordinal: " + ordinal + " src:" + mappedValue + " target:" + trgItem + " dist:" + dist); } } context.getCounter("Data", "Dist Calculated").increment(1); } else { // missing source if (schema.getMissingValueHandler().equals("default")) { dist = getDistForMissingSrc(field, trgItem); } else { skipAttr = true; } context.getCounter("Data", "Missing Source").increment(1); } } else if (field.getDataType().equals("int")) { if (!mappedValues.isEmpty()) { int trgItemInt = Integer.parseInt(trgItem); int srcItemInt = getAverageMappedValue(mappedValues); dist = getDistForNumeric(srcField, srcItemInt, field, trgItemInt); } else { // missing source if (schema.getMissingValueHandler().equals("default")) { dist = getDistForMissingSrc(field, trgItem); } else { skipAttr = true; } } } else if (field.getDataType().equals("text")) { if (!mappedValues.isEmpty()) { String trgItemTxt = trgItem; String srcItemTxt = mappedValues.get(0); dist = textSimStrategy.findDistance(trgItemTxt, srcItemTxt); } else { // missing source if (schema.getMissingValueHandler().equals("default")) { dist = getDistForMissingSrc(field, trgItem); } else { skipAttr = true; } } } else if (field.getDataType().equals("location")) { if (!mappedValues.isEmpty()) { String trgItemTxt = trgItem; String srcItemTxt = mappedValues.get(0); dist = getDistForLocation(trgItemTxt, srcItemTxt, field); } else { // missing source skipAttr = true; } } } else { // missing target value if (schema.getMissingValueHandler().equals("default")) { context.getCounter("Data", "Missing Target").increment(1); dist = getDistForMissingTrg(field, mappedValues); } else { skipAttr = true; } } if (!skipAttr) { distStrategy.accumulate(dist, field); } } sim = distStrategy.getSimilarity(); return sim; }