Пример #1
0
    /**
     * Computes the similarity between a source record and a target record.
     *
     * <p>The source record is handed to {@code mapFields}; the target record is split into fields
     * with {@code fieldDelimRegex}. For every entry of {@code targetFields} that has a mapping in
     * {@code mappedFields}, a per-attribute distance is computed according to the field's data type
     * ({@code categorical}, {@code int}, {@code text} or {@code location}) and fed to
     * {@code distStrategy}. Missing source or target values either get a default distance or cause
     * the attribute to be skipped, depending on {@code schema.getMissingValueHandler()}.
     *
     * @param source source record, passed unchanged to {@code mapFields}
     * @param target delimited target record; trailing empty fields are preserved
     * @param context job context, used for counters and passed to {@code mapFields}
     * @return the value of {@code distStrategy.getSimilarity()} after all attributes are accumulated
     * @throws IOException declared by this method; presumably raised by one of the helpers it calls
     *     (not visible from here)
     * @throws NumberFormatException if a non-empty {@code int} target field is not a valid integer
     * @throws ArrayIndexOutOfBoundsException if the target record has fewer fields than a target
     *     field's ordinal requires
     */
    private int findSimilarity(String source, String target, Context context) throws IOException {
      mapFields(source, context);

      // A limit of -1 keeps trailing empty strings. Without it, a record whose last fields are
      // empty produces a shorter array and the lookup by ordinal below fails instead of reaching
      // the "missing target value" handling.
      String[] trgItems = target.split(fieldDelimRegex, -1);

      context.getCounter("Data", "Target Field Count").increment(targetFields.size());
      if (prntDetail) {
        System.out.println("target record: " + trgItems[0]);
      }

      distStrategy.initialize();
      for (Field field : targetFields) {
        Integer ordinal = field.getOrdinal();
        String trgItem = trgItems[ordinal];
        if (prntDetail) {
          System.out.println("ordinal: " + ordinal + " target:" + trgItem);
        }

        MappedValue mappedValueObj = mappedFields.get(ordinal);
        if (null == mappedValueObj) {
          // non mapped passive attributes
          continue;
        }

        List<String> mappedValues = mappedValueObj.getValues();
        Field srcField = mappedValueObj.getField();

        // Distance stays 0 (and is still accumulated) when the data type matches none of the
        // branches below.
        double dist = 0;
        boolean skipAttr = false;

        if (!trgItem.isEmpty()) {
          String dataType = field.getDataType();
          if (dataType.equals("categorical")) {
            if (!mappedValues.isEmpty()) {
              // Use the smallest distance over all mapped source values, capped at 1.0.
              dist = 1.0;
              for (String mappedValue : mappedValues) {
                double thisDist = schema.findCattegoricalDistance(mappedValue, trgItem, ordinal);
                if (thisDist < dist) {
                  dist = thisDist;
                }
                if (prntDetail) {
                  // Prints the running minimum, not the distance of this particular value.
                  System.out.println(
                      "dist calculation: ordinal: "
                          + ordinal
                          + " src:"
                          + mappedValue
                          + " target:"
                          + trgItem
                          + " dist:"
                          + dist);
                }
              }

              context.getCounter("Data", "Dist Calculated").increment(1);
            } else {
              // missing source
              if (schema.getMissingValueHandler().equals("default")) {
                dist = getDistForMissingSrc(field, trgItem);
              } else {
                skipAttr = true;
              }
              // NOTE(review): only this branch bumps "Missing Source"; the int and text branches
              // do not. Confirm whether that is intentional.
              context.getCounter("Data", "Missing Source").increment(1);
            }
          } else if (dataType.equals("int")) {
            if (!mappedValues.isEmpty()) {
              int trgItemInt = Integer.parseInt(trgItem);
              // All mapped source values are collapsed into one int before comparing.
              int srcItemInt = getAverageMappedValue(mappedValues);
              dist = getDistForNumeric(srcField, srcItemInt, field, trgItemInt);
            } else {
              // missing source
              if (schema.getMissingValueHandler().equals("default")) {
                dist = getDistForMissingSrc(field, trgItem);
              } else {
                skipAttr = true;
              }
            }
          } else if (dataType.equals("text")) {
            if (!mappedValues.isEmpty()) {
              // Only the first mapped source value is compared.
              String srcItemTxt = mappedValues.get(0);
              dist = textSimStrategy.findDistance(trgItem, srcItemTxt);
            } else {
              // missing source
              if (schema.getMissingValueHandler().equals("default")) {
                dist = getDistForMissingSrc(field, trgItem);
              } else {
                skipAttr = true;
              }
            }
          } else if (dataType.equals("location")) {
            if (!mappedValues.isEmpty()) {
              // Only the first mapped source value is compared.
              String srcItemTxt = mappedValues.get(0);
              dist = getDistForLocation(trgItem, srcItemTxt, field);
            } else {
              // missing source: location attributes are always skipped, whatever the handler is
              skipAttr = true;
            }
          }
        } else {
          // missing target value
          if (schema.getMissingValueHandler().equals("default")) {
            context.getCounter("Data", "Missing Target").increment(1);
            dist = getDistForMissingTrg(field, mappedValues);
          } else {
            skipAttr = true;
          }
        }

        if (!skipAttr) {
          distStrategy.accumulate(dist, field);
        }
      }
      return distStrategy.getSimilarity();
    }