Exemplo n.º 1
0
    /**
     * Replicates one input record across {@code bucketCount} output keys.
     *
     * <p>The record is split with {@code fieldDelimRegex}. On the first call the entity is
     * resolved (by file-name prefix or by field count) and cached in {@code entity} together
     * with the ordinal of its id field. The id field's hash, reduced modulo {@code bucketCount}
     * and made non-negative, is then combined with every bucket index {@code i}:
     *
     * <ul>
     *   <li>entity type 0: key {@code (hash * bucketCount + i) * 10}
     *   <li>any other type: key {@code ((i * bucketCount + hash) * 10) + 1}
     * </ul>
     *
     * <p>When {@code identifyWithFilePrefix} is set, the emitted value is the record prefixed
     * with {@code "0,"} (type 0) or {@code "1,"} (other types); otherwise the record is emitted
     * unchanged.
     *
     * @param key input key; not used
     * @param value one delimited input record
     * @param context task context used to read the input split and to emit output
     * @throws IOException if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String[] items = value.toString().split(fieldDelimRegex);

      if (null == entity) {
        if (identifyWithFilePrefix) {
          FileSplit fileInpSplit = (FileSplit) context.getInputSplit();
          String filePrefix = fileInpSplit.getPath().getName().substring(0, filePrefixLength);
          entity = schema.getEntityByFilePrefix(filePrefix);
        } else {
          entity = schema.getEntityBySize(items.length);
        }
        if (null == entity) {
          // The lookup found no matching entity. Dereferencing it for the id field used to
          // throw a NullPointerException here; skip the record and make the skip visible.
          context.getCounter("Data", "Unidentified Entity").increment(1);
          return;
        }
        idOrdinal = entity.getIdField().getOrdinal();
      }

      // The remainder lies strictly between -bucketCount and bucketCount, so negating a
      // negative value cannot overflow.
      hash = items[idOrdinal].hashCode() % bucketCount;
      hash = hash < 0 ? -hash : hash;

      boolean firstType = entity.getType() == 0;
      if (identifyWithFilePrefix) {
        valueHolder.set((firstType ? "0," : "1,") + value.toString());
      } else {
        valueHolder.set(value);
      }

      for (int i = 0; i < bucketCount; ++i) {
        // Type 0 varies the low bucket index, other types vary the high one, so the pair
        // (h, g) of a type-0 hash h and an other-type hash g shares the key prefix
        // h * bucketCount + g. The last decimal digit carries the type tag (0 or 1).
        keyHolder.set(
            firstType ? (hash * bucketCount + i) * 10 : ((i * bucketCount + hash) * 10) + 1);
        context.write(keyHolder, valueHolder);
      }
    }
Exemplo n.º 2
0
    /**
     * Loads the schema and caches the configuration-driven settings used by this task.
     *
     * <p>The schema is read as JSON from the file named by the {@code schema.file.path}
     * configuration property. Id and class-attribute ordinals, field lists, distance and text
     * similarity strategies, delimiters and output flags are then stored in instance fields,
     * and a summary line is printed to standard output.
     *
     * @param context task context supplying the job configuration
     * @throws IOException if the schema file cannot be opened or parsed
     * @throws InterruptedException declared by the overridden method; not thrown here directly
     * @see org.apache.hadoop.mapreduce.Reducer#setup(org.apache.hadoop.mapreduce.Reducer.Context)
     */
    protected void setup(Context context) throws IOException, InterruptedException {
      Configuration conf = context.getConfiguration();

      // load schema
      String filePath = conf.get("schema.file.path");
      FileSystem dfs = FileSystem.get(conf);
      Path src = new Path(filePath);
      FSDataInputStream fs = dfs.open(src);
      try {
        ObjectMapper mapper = new ObjectMapper();
        schema = mapper.readValue(fs, MixedTypeSchema.class);
      } finally {
        // Close explicitly instead of depending on the JSON parser's auto-close setting.
        fs.close();
      }

      firstTypeSize = schema.getEntityByType(0).getFieldCount();
      firstIdOrdinal = schema.getEntityByType(0).getIdField().getOrdinal();
      secondIdOrdinal = schema.getEntityByType(1).getIdField().getOrdinal();

      Field firstClassAttr = schema.getEntityByType(0).getClassAttributeField();
      if (null != firstClassAttr) {
        firstClassAttrOrdinal = firstClassAttr.getOrdinal();
        // The second ordinal used to be read from entity type 0 as well (copy-paste slip).
        // Read it from entity type 1; when that entity declares no class attribute, keep the
        // previous value so existing schemas behave as before.
        Field secondClassAttr = schema.getEntityByType(1).getClassAttributeField();
        secondClassAttrOrdinal =
            (null != secondClassAttr ? secondClassAttr : firstClassAttr).getOrdinal();
      }

      fields = schema.getEntityByType(0).getFields();
      targetFields = schema.getEntityByType(1).getFields();
      scale = conf.getInt("distance.scale", 1000);
      distStrategy = schema.createDistanceStrategy(scale);
      fieldDelimRegex = conf.get("field.delim.regex", "\\[\\]");
      fieldDelim = conf.get("field.delim", ",");
      textSimStrategy = schema.createTextSimilarityStrategy();
      outputVerbose = conf.getBoolean("sim.output.verbose", true);
      identifyWithFilePrefix = conf.getBoolean("identify.with.file.prefix", false);

      System.out.println(
          "firstTypeSize: "
              + firstTypeSize
              + " firstIdOrdinal:"
              + firstIdOrdinal
              + " secondIdOrdinal:"
              + secondIdOrdinal
              + " Source field count:"
              + fields.size()
              + " Target field count:"
              + targetFields.size());
    }
Exemplo n.º 3
0
    /**
     * Gets the distance between two numeric values.
     *
     * <p>The distance function named by {@code srcField.getNumDistFunction()} selects the rule:
     *
     * <ul>
     *   <li>{@code equalHard}: 0 when the values are equal, otherwise 1
     *   <li>{@code minHard}: 0 when {@code trgVal >= srcVal}, otherwise 1
     *   <li>{@code maxHard}: 0 when {@code trgVal <= srcVal}, otherwise 1
     *   <li>{@code equalSoft}: always the linear distance described below
     *   <li>{@code minSoft} / {@code maxSoft}: 0 when the corresponding hard condition holds,
     *       otherwise the linear distance
     *   <li>any other name: 0
     * </ul>
     *
     * <p>The linear distance is {@code |srcVal - trgVal|} divided by the target field's range
     * ({@code max - min}) when that range is positive. Without a positive range the relative
     * difference {@code |srcVal - trgVal| / max(srcVal, trgVal)} is compared against
     * {@code schema.getNumericDiffThreshold()}, giving 1 when it is exceeded and 0 otherwise.
     *
     * @param srcField source field; supplies the distance function name
     * @param srcVal source value
     * @param trgField target field; supplies the min/max range
     * @param trgVal target value
     * @return the non-negative distance
     */
    private double getDistForNumeric(Field srcField, int srcVal, Field trgField, int trgVal) {
      double dist = 0;
      boolean linear = false;
      String distFun = srcField.getNumDistFunction();

      if (distFun.equals("equalSoft")) {
        linear = true;
      } else if (distFun.equals("equalHard")) {
        dist = srcVal == trgVal ? 0 : 1;
      } else if (distFun.equals("minSoft")) {
        linear = trgVal < srcVal;
      } else if (distFun.equals("minHard")) {
        dist = trgVal >= srcVal ? 0 : 1;
      } else if (distFun.equals("maxSoft")) {
        linear = trgVal > srcVal;
      } else if (distFun.equals("maxHard")) {
        dist = trgVal <= srcVal ? 0 : 1;
      }

      if (linear) {
        // Subtract in double: an int subtraction can overflow for values far apart and would
        // silently yield a wrong difference. Results are unchanged for all other inputs.
        double delta = (double) srcVal - trgVal;
        if (trgField.getMax() > trgField.getMin()) {
          dist = Math.abs(delta / (trgField.getMax() - trgField.getMin()));
        } else {
          int max = Math.max(srcVal, trgVal);
          // When both values are 0 this is 0.0 / 0 = NaN, which never exceeds the threshold,
          // so identical zero values yield distance 0.
          double diff = Math.abs(delta / max);
          dist = diff > schema.getNumericDiffThreshold() ? 1.0 : 0.0;
        }
      }

      return dist;
    }
Exemplo n.º 4
0
    /**
     * Computes the similarity between a source record and a target record.
     *
     * <p>The source record is passed to {@code mapFields}; the per-ordinal entries of
     * {@code mappedFields} are then compared with the target fields (presumably populated by
     * that call; confirm against {@code mapFields}). For every target field that has a
     * mapped entry, a distance is computed according to the field's data type
     * ({@code categorical}, {@code int}, {@code text} or {@code location}) and accumulated in
     * {@code distStrategy}. Missing source or target values either get a default distance or
     * cause the attribute to be skipped, depending on {@code schema.getMissingValueHandler()}.
     *
     * @param source delimited source record
     * @param target delimited target record
     * @param context task context used for counters
     * @return the similarity reported by {@code distStrategy} after all fields are accumulated
     * @throws IOException propagated from {@code mapFields} or the distance helpers
     */
    private int findSimilarity(String source, String target, Context context) throws IOException {
      mapFields(source, context);
      // A limit of -1 keeps trailing empty fields. The default split drops them, so a record
      // ending in empty values failed with ArrayIndexOutOfBoundsException below instead of
      // being handled as a missing target value.
      String[] trgItems = target.split(fieldDelimRegex, -1);

      context.getCounter("Data", "Target Field Count").increment(targetFields.size());
      if (prntDetail) {
        System.out.println("target record: " + trgItems[0]);
      }

      distStrategy.initialize();
      for (Field field : targetFields) {
        double dist = 0;
        boolean skipAttr = false;
        Integer ordinal = field.getOrdinal();
        String trgItem = trgItems[ordinal];
        if (prntDetail) {
          System.out.println("ordinal: " + ordinal + " target:" + trgItem);
        }

        MappedValue mappedValueObj = mappedFields.get(ordinal);
        if (null == mappedValueObj) {
          // non mapped passive attributes
          continue;
        }

        List<String> mappedValues = mappedValueObj.getValues();
        Field srcField = mappedValueObj.getField();
        if (!trgItem.isEmpty()) {
          if (field.getDataType().equals("categorical")) {
            if (!mappedValues.isEmpty()) {
              // the closest of the mapped source values wins, capped at 1.0
              dist = 1.0;
              for (String mappedValue : mappedValues) {
                double thisDist = schema.findCattegoricalDistance(mappedValue, trgItem, ordinal);
                if (thisDist < dist) {
                  dist = thisDist;
                }
                if (prntDetail) {
                  System.out.println(
                      "dist calculation: ordinal: "
                          + ordinal
                          + " src:"
                          + mappedValue
                          + " target:"
                          + trgItem
                          + " dist:"
                          + dist);
                }
              }

              context.getCounter("Data", "Dist Calculated").increment(1);
            } else {
              // missing source
              if (schema.getMissingValueHandler().equals("default")) {
                dist = getDistForMissingSrc(field, trgItem);
              } else {
                skipAttr = true;
              }
              context.getCounter("Data", "Missing Source").increment(1);
            }
          } else if (field.getDataType().equals("int")) {
            if (!mappedValues.isEmpty()) {
              int trgItemInt = Integer.parseInt(trgItem);
              int srcItemInt = getAverageMappedValue(mappedValues);
              dist = getDistForNumeric(srcField, srcItemInt, field, trgItemInt);
            } else {
              // missing source
              if (schema.getMissingValueHandler().equals("default")) {
                dist = getDistForMissingSrc(field, trgItem);
              } else {
                skipAttr = true;
              }
            }
          } else if (field.getDataType().equals("text")) {
            if (!mappedValues.isEmpty()) {
              // only the first mapped source value is compared
              dist = textSimStrategy.findDistance(trgItem, mappedValues.get(0));
            } else {
              // missing source
              if (schema.getMissingValueHandler().equals("default")) {
                dist = getDistForMissingSrc(field, trgItem);
              } else {
                skipAttr = true;
              }
            }
          } else if (field.getDataType().equals("location")) {
            if (!mappedValues.isEmpty()) {
              // only the first mapped source value is compared
              dist = getDistForLocation(trgItem, mappedValues.get(0), field);
            } else {
              // missing source: location has no default distance, always skipped
              skipAttr = true;
            }
          }
        } else {
          // missing target value
          if (schema.getMissingValueHandler().equals("default")) {
            context.getCounter("Data", "Missing Target").increment(1);
            dist = getDistForMissingTrg(field, mappedValues);
          } else {
            skipAttr = true;
          }
        }

        if (!skipAttr) {
          distStrategy.accumulate(dist, field);
        }
      }
      return distStrategy.getSimilarity();
    }