/*
 * (non-Javadoc)
 * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
 *
 * Replicates each input record across bucketCount keys.
 *
 * The entity is resolved once (from the input file name prefix, or from the
 * number of fields in the record) and cached in the entity field. The record's
 * ID field is hashed into [0, bucketCount). Records of entity type 0 are written
 * under keys (hash * bucketCount + i) * 10, records of any other type under
 * ((i * bucketCount + hash) * 10) + 1, for i in [0, bucketCount). For a type 0
 * record with hash h and another record with hash g, key / 10 therefore coincides
 * at h * bucketCount + g; the last digit (0 or 1) tells the two types apart.
 *
 * Records for which no entity can be resolved are skipped.
 */
@Override
protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
    String[] items = value.toString().split(fieldDelimRegex);

    if (null == entity) {
        if (identifyWithFilePrefix) {
            FileSplit fileInpSplit = (FileSplit) context.getInputSplit();
            String filePrefix = fileInpSplit.getPath().getName().substring(0, filePrefixLength);
            entity = schema.getEntityByFilePrefix(filePrefix);
        } else {
            entity = schema.getEntityBySize(items.length);
        }
        if (null == entity) {
            // No entity matches this record: skip it instead of dereferencing null.
            // The lookup is retried for the next record because entity stays null.
            return;
        }
        idOrdinal = entity.getIdField().getOrdinal();
    }

    // After the modulo the magnitude is below bucketCount, so negating is overflow safe.
    hash = items[idOrdinal].hashCode() % bucketCount;
    hash = hash < 0 ? -hash : hash;

    boolean firstType = entity.getType() == 0;

    // When entities are identified by file prefix, the emitted value is tagged
    // with "0," or "1," according to the entity type.
    if (identifyWithFilePrefix) {
        valueHolder.set((firstType ? "0," : "1,") + value.toString());
    } else {
        valueHolder.set(value);
    }

    for (int i = 0; i < bucketCount; ++i) {
        if (firstType) {
            keyHolder.set((hash * bucketCount + i) * 10);
        } else {
            keyHolder.set(((i * bucketCount + hash) * 10) + 1);
        }
        context.write(keyHolder, valueHolder);
    }
}
/* (non-Javadoc) * @see org.apache.hadoop.mapreduce.Reducer#setup(org.apache.hadoop.mapreduce.Reducer.Context) */ protected void setup(Context context) throws IOException, InterruptedException { // load schema Configuration conf = context.getConfiguration(); String filePath = conf.get("schema.file.path"); FileSystem dfs = FileSystem.get(conf); Path src = new Path(filePath); FSDataInputStream fs = dfs.open(src); ObjectMapper mapper = new ObjectMapper(); schema = mapper.readValue(fs, MixedTypeSchema.class); firstTypeSize = schema.getEntityByType(0).getFieldCount(); firstIdOrdinal = schema.getEntityByType(0).getIdField().getOrdinal(); secondIdOrdinal = schema.getEntityByType(1).getIdField().getOrdinal(); Field field = schema.getEntityByType(0).getClassAttributeField(); if (null != field) { firstClassAttrOrdinal = field.getOrdinal(); secondClassAttrOrdinal = schema.getEntityByType(0).getClassAttributeField().getOrdinal(); } fields = schema.getEntityByType(0).getFields(); targetFields = schema.getEntityByType(1).getFields(); scale = context.getConfiguration().getInt("distance.scale", 1000); distStrategy = schema.createDistanceStrategy(scale); fieldDelimRegex = context.getConfiguration().get("field.delim.regex", "\\[\\]"); fieldDelim = context.getConfiguration().get("field.delim", ","); textSimStrategy = schema.createTextSimilarityStrategy(); outputVerbose = context.getConfiguration().getBoolean("sim.output.verbose", true); identifyWithFilePrefix = context.getConfiguration().getBoolean("identify.with.file.prefix", false); System.out.println( "firstTypeSize: " + firstTypeSize + " firstIdOrdinal:" + firstIdOrdinal + " secondIdOrdinal:" + secondIdOrdinal + " Source field count:" + fields.size() + " Target field count:" + targetFields.size()); }
/**
 * Gets the distance between two numerical values.
 *
 * The distance function name is read from the source field:
 * "equalHard", "minHard" and "maxHard" yield 0 or 1 from a direct comparison;
 * "equalSoft" always uses the scaled difference; "minSoft" and "maxSoft" yield 0
 * when the target is respectively not below / not above the source, and the
 * scaled difference otherwise. Any other function name yields 0.
 *
 * The scaled difference is the absolute difference divided by the target field
 * range when max is greater than min. Otherwise the difference relative to the
 * larger of the two values is compared with the schema numeric difference
 * threshold, giving 1.0 when it exceeds the threshold and 0.0 when it does not.
 *
 * @param srcField source field, supplies the distance function name
 * @param srcVal source value
 * @param trgField target field, supplies the min and max of the value range
 * @param trgVal target value
 * @return non-negative distance
 */
private double getDistForNumeric(Field srcField, int srcVal, Field trgField, int trgVal) {
    String fn = srcField.getNumDistFunction();

    // Hard variants: plain 0 / 1 outcome.
    if (fn.equals("equalHard")) {
        return srcVal == trgVal ? 0 : 1;
    }
    if (fn.equals("minHard")) {
        return trgVal >= srcVal ? 0 : 1;
    }
    if (fn.equals("maxHard")) {
        return trgVal <= srcVal ? 0 : 1;
    }

    // Soft variants: scaled difference only when the constraint is violated.
    boolean useScaledDiff = fn.equals("equalSoft")
            || (fn.equals("minSoft") && trgVal < srcVal)
            || (fn.equals("maxSoft") && trgVal > srcVal);
    if (!useScaledDiff) {
        return 0;
    }

    double scaled;
    if (trgField.getMax() > trgField.getMin()) {
        scaled = ((double) (srcVal - trgVal)) / (trgField.getMax() - trgField.getMin());
    } else {
        // No usable range on the target field: threshold the relative difference.
        int larger = Math.max(srcVal, trgVal);
        double relDiff = Math.abs(((double) (srcVal - trgVal)) / larger);
        scaled = relDiff > schema.getNumericDiffThreshold() ? 1.0 : 0.0;
    }
    return Math.abs(scaled);
}
/** * Gets distance between two entities * * @param source * @param target * @param context * @return * @throws IOException */ private int findSimilarity(String source, String target, Context context) throws IOException { int sim = 0; mapFields(source, context); String[] trgItems = target.split(fieldDelimRegex); double dist = 0; context.getCounter("Data", "Target Field Count").increment(targetFields.size()); if (prntDetail) { System.out.println("target record: " + trgItems[0]); } distStrategy.initialize(); for (Field field : targetFields) { dist = 0; Integer ordinal = field.getOrdinal(); String trgItem = trgItems[ordinal]; boolean skipAttr = false; if (prntDetail) { System.out.println("ordinal: " + ordinal + " target:" + trgItem); } MappedValue mappedValueObj = mappedFields.get(ordinal); if (null == mappedValueObj) { // non mapped passive attributes continue; } List<String> mappedValues = mappedValueObj.getValues(); Field srcField = mappedValueObj.getField(); if (!trgItem.isEmpty()) { if (field.getDataType().equals("categorical")) { if (!mappedValues.isEmpty()) { double thisDist; dist = 1.0; for (String mappedValue : mappedValues) { thisDist = schema.findCattegoricalDistance(mappedValue, trgItem, ordinal); if (thisDist < dist) { dist = thisDist; } if (prntDetail) { System.out.println( "dist calculation: ordinal: " + ordinal + " src:" + mappedValue + " target:" + trgItem + " dist:" + dist); } } context.getCounter("Data", "Dist Calculated").increment(1); } else { // missing source if (schema.getMissingValueHandler().equals("default")) { dist = getDistForMissingSrc(field, trgItem); } else { skipAttr = true; } context.getCounter("Data", "Missing Source").increment(1); } } else if (field.getDataType().equals("int")) { if (!mappedValues.isEmpty()) { int trgItemInt = Integer.parseInt(trgItem); int srcItemInt = getAverageMappedValue(mappedValues); dist = getDistForNumeric(srcField, srcItemInt, field, trgItemInt); } else { // missing source if 
(schema.getMissingValueHandler().equals("default")) { dist = getDistForMissingSrc(field, trgItem); } else { skipAttr = true; } } } else if (field.getDataType().equals("text")) { if (!mappedValues.isEmpty()) { String trgItemTxt = trgItem; String srcItemTxt = mappedValues.get(0); dist = textSimStrategy.findDistance(trgItemTxt, srcItemTxt); } else { // missing source if (schema.getMissingValueHandler().equals("default")) { dist = getDistForMissingSrc(field, trgItem); } else { skipAttr = true; } } } else if (field.getDataType().equals("location")) { if (!mappedValues.isEmpty()) { String trgItemTxt = trgItem; String srcItemTxt = mappedValues.get(0); dist = getDistForLocation(trgItemTxt, srcItemTxt, field); } else { // missing source skipAttr = true; } } } else { // missing target value if (schema.getMissingValueHandler().equals("default")) { context.getCounter("Data", "Missing Target").increment(1); dist = getDistForMissingTrg(field, mappedValues); } else { skipAttr = true; } } if (!skipAttr) { distStrategy.accumulate(dist, field); } } sim = distStrategy.getSimilarity(); return sim; }