Exemple #1
0
    /* (non-Javadoc)
     * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String[] items = value.toString().split(fieldDelimRegex);

      srcEntityId = items[0];
      trgEntityId = items[1];
      rank = Integer.parseInt(items[items.length - 1]);

      outKey.initialize();
      if (recordInOutput) {
        // include source and taraget record
        if (recLength == -1) {
          recLength = (items.length - 3) / 2;
          srcRecBeg = 2;
          srcRecEnd = trgRecBeg = 2 + recLength;
          trgRecEnd = trgRecBeg + recLength;
        }
        srcRec = org.chombo.util.Utility.join(items, srcRecBeg, srcRecEnd, fieldDelim);
        trgRec = org.chombo.util.Utility.join(items, trgRecBeg, trgRecEnd, fieldDelim);
        outKey.add(srcEntityId, srcRec, rank);
        outVal.set(trgEntityId + fieldDelim + trgRec + fieldDelim + items[items.length - 1]);
      } else {
        // only target entity id and distance
        outKey.add(srcEntityId, rank);
        outVal.set(trgEntityId + fieldDelim + items[items.length - 1]);
      }
      context.write(outKey, outVal);
    }
    /* (non-Javadoc)
     * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String[] items = value.toString().split(fieldDelimRegex);
      itemList.clear();

      for (int i = 0; i < items.length; ++i) {
        String item = items[i];
        Field field = schema.getEntity().getFieldByOrdinal(i);

        if (null != field && field.getDataType().equals(Field.DATA_TYPE_TEXT)) {
          String format = field.getTextDataSubTypeFormat();
          if (field.getDataSubType().equals(Field.TEXT_TYPE_PERSON_NAME)) {
            item = countryFormat.personNameFormat(item);
          } else if (field.getDataSubType().equals(Field.TEXT_TYPE_STREET_ADDRESS)) {
            item = countryFormat.caseFormat(item, format);
            item = countryFormat.streetAddressFormat(item);
          } else if (field.getDataSubType().equals(Field.TEXT_TYPE_STREET_ADDRESS_ONE)) {
            item = countryFormat.caseFormat(item, format);
            item = countryFormat.streetAddressOneFormat(item);
          } else if (field.getDataSubType().equals(Field.TEXT_TYPE_STREET_ADDRESS_TWO)) {
            item = countryFormat.caseFormat(item, format);
            item = countryFormat.streetAddressTwoFormat(item);
          } else if (field.getDataSubType().equals(Field.TEXT_TYPE_CITY)) {
            item = countryFormat.caseFormat(item, format);
          } else if (field.getDataSubType().equals(Field.TEXT_TYPE_STATE)) {
            item = countryFormat.stateFormat(item);
          } else if (field.getDataSubType().equals(Field.TEXT_TYPE_ZIP)) {
            item = countryFormat.caseFormat(item, format);
          } else if (field.getDataSubType().equals(Field.TEXT_TYPE_COUNTRY)) {
            item = countryFormat.caseFormat(item, format);
          } else if (field.getDataSubType().equals(Field.TEXT_TYPE_EMAIL_ADDR)) {
            item = countryFormat.emailFormat(item, format);
          } else if (field.getDataSubType().equals(Field.TEXT_TYPE_PHONE_NUM)) {
            item = countryFormat.phoneNumFormat(item, format);
          } else {
            // if text field analyze
            item = tokenize(item);
          }
        }
        itemList.add(item);
      }

      // build value string
      valueHolder.set(org.chombo.util.Utility.join(itemList, fieldDelim));
      context.write(NullWritable.get(), valueHolder);
    }
Exemple #3
0
    /* (non-Javadoc)
     * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
     */
    protected void reduce(Tuple key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
      srcEntityId = key.getString(0);
      count = 0;
      boolean doEmitNeighbor = false;
      valueList.clear();
      for (Text value : values) {
        doEmitNeighbor = false;
        // count based neighbor
        if (nearestByCount) {
          doEmitNeighbor = true;
          if (++count >= topMatchCount) {
            doEmitNeighbor = false;
          }
        }

        // distance based neighbors
        if (nearestByDistance) {
          // distance based neighbor
          String[] items = value.toString().split(fieldDelim);
          distance = Integer.parseInt(items[items.length - 1]);
          if (distance <= topMatchDistance) {
            if (!nearestByCount) {
              doEmitNeighbor = true;
            }
          } else {
            doEmitNeighbor = false;
          }
        }

        if (doEmitNeighbor) {
          // along with neighbors
          if (compactOutput) {
            if (recordInOutput) {
              // contains id,record,rank - strip out entity ID and rank
              String[] valueItems = value.toString().split(fieldDelim);
              valueList.add(org.chombo.util.Utility.join(valueItems, 1, valueItems.length - 1));
            } else {
              // contains id, rank
              valueList.add(value.toString());
            }
          } else {
            outVal.set(srcEntityId + fieldDelim + value.toString());
            context.write(NullWritable.get(), outVal);
          }
        } else {
          // only source entity if neighborhood condition not met
          if (outputWithNoNeighbor && !compactOutput) {
            outVal.set(srcEntityId);
            context.write(NullWritable.get(), outVal);
          }
        }
      }

      // emit in compact format
      if (compactOutput) {
        boolean doEmit = true;
        String srcRec = recordInOutput ? key.getString(1) : "";
        int numNeighbor = valueList.size();
        if (0 == numNeighbor) {
          // only source entity if neighborhood condition not met
          if (outputWithNoNeighbor) {
            outVal.set(
                recordInOutput
                    ? srcEntityId + fieldDelim + srcRec + fieldDelim + numNeighbor
                    : srcEntityId);
          } else {
            doEmit = false;
          }
        } else {
          String targetValues = org.chombo.util.Utility.join(valueList, fieldDelim);
          outVal.set(
              recordInOutput
                  ? srcEntityId + fieldDelim + srcRec + fieldDelim + numNeighbor + targetValues
                  : srcEntityId + fieldDelim + targetValues);
        }
        if (doEmit) {
          context.write(NullWritable.get(), outVal);
        }
      }
    }