/** * Gets distance between two entities * * @param source * @param target * @param context * @return * @throws IOException */ private int findSimilarity(String source, String target, Context context) throws IOException { int sim = 0; mapFields(source, context); String[] trgItems = target.split(fieldDelimRegex); double dist = 0; context.getCounter("Data", "Target Field Count").increment(targetFields.size()); if (prntDetail) { System.out.println("target record: " + trgItems[0]); } distStrategy.initialize(); for (Field field : targetFields) { dist = 0; Integer ordinal = field.getOrdinal(); String trgItem = trgItems[ordinal]; boolean skipAttr = false; if (prntDetail) { System.out.println("ordinal: " + ordinal + " target:" + trgItem); } MappedValue mappedValueObj = mappedFields.get(ordinal); if (null == mappedValueObj) { // non mapped passive attributes continue; } List<String> mappedValues = mappedValueObj.getValues(); Field srcField = mappedValueObj.getField(); if (!trgItem.isEmpty()) { if (field.getDataType().equals("categorical")) { if (!mappedValues.isEmpty()) { double thisDist; dist = 1.0; for (String mappedValue : mappedValues) { thisDist = schema.findCattegoricalDistance(mappedValue, trgItem, ordinal); if (thisDist < dist) { dist = thisDist; } if (prntDetail) { System.out.println( "dist calculation: ordinal: " + ordinal + " src:" + mappedValue + " target:" + trgItem + " dist:" + dist); } } context.getCounter("Data", "Dist Calculated").increment(1); } else { // missing source if (schema.getMissingValueHandler().equals("default")) { dist = getDistForMissingSrc(field, trgItem); } else { skipAttr = true; } context.getCounter("Data", "Missing Source").increment(1); } } else if (field.getDataType().equals("int")) { if (!mappedValues.isEmpty()) { int trgItemInt = Integer.parseInt(trgItem); int srcItemInt = getAverageMappedValue(mappedValues); dist = getDistForNumeric(srcField, srcItemInt, field, trgItemInt); } else { // missing source if (schema.getMissingValueHandler().equals("default")) { dist = getDistForMissingSrc(field, trgItem); } else { skipAttr = true; } } } else if (field.getDataType().equals("text")) { if (!mappedValues.isEmpty()) { String trgItemTxt = trgItem; String srcItemTxt = mappedValues.get(0); dist = textSimStrategy.findDistance(trgItemTxt, srcItemTxt); } else { // missing source if (schema.getMissingValueHandler().equals("default")) { dist = getDistForMissingSrc(field, trgItem); } else { skipAttr = true; } } } else if (field.getDataType().equals("location")) { if (!mappedValues.isEmpty()) { String trgItemTxt = trgItem; String srcItemTxt = mappedValues.get(0); dist = getDistForLocation(trgItemTxt, srcItemTxt, field); } else { // missing source skipAttr = true; } } } else { // missing target value if (schema.getMissingValueHandler().equals("default")) { context.getCounter("Data", "Missing Target").increment(1); dist = getDistForMissingTrg(field, mappedValues); } else { skipAttr = true; } } if (!skipAttr) { distStrategy.accumulate(dist, field); } } sim = distStrategy.getSimilarity(); return sim; }