Example #1
    /* (non-Javadoc)
     * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String[] items = value.toString().split(fieldDelimRegex);

      srcEntityId = items[0];
      trgEntityId = items[1];
      rank = Integer.parseInt(items[items.length - 1]);

      outKey.initialize();
      if (recordInOutput) {
        // include source and target record
        if (recLength == -1) {
          recLength = (items.length - 3) / 2;
          srcRecBeg = 2;
          srcRecEnd = trgRecBeg = 2 + recLength;
          trgRecEnd = trgRecBeg + recLength;
        }
        srcRec = org.chombo.util.Utility.join(items, srcRecBeg, srcRecEnd, fieldDelim);
        trgRec = org.chombo.util.Utility.join(items, trgRecBeg, trgRecEnd, fieldDelim);
        outKey.add(srcEntityId, srcRec, rank);
        outVal.set(trgEntityId + fieldDelim + trgRec + fieldDelim + items[items.length - 1]);
      } else {
        // only target entity id and distance
        outKey.add(srcEntityId, rank);
        outVal.set(trgEntityId + fieldDelim + items[items.length - 1]);
      }
      context.write(outKey, outVal);
    }
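Traced on a hypothetical input line (all field values invented), assuming Utility.join treats its end index as exclusive, which the shared boundary srcRecEnd == trgRecBeg suggests:

    // Hypothetical input (fieldDelim = ","), with recordInOutput = true:
    //   "s1,t1,sA,sB,tA,tB,7"
    // items.length = 7, so recLength = (7 - 3) / 2 = 2
    //   srcRec = join(items, 2, 4) -> "sA,sB"
    //   trgRec = join(items, 4, 6) -> "tA,tB"
    //   rank   = 7
    // Emitted: key = (s1, "sA,sB", 7), value = "t1,tA,tB,7"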
Example #2
  @Override
  public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Running aggregates  for numerical attributes";
    job.setJobName(jobName);

    job.setJarByClass(RunningAggregator.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    Utility.setConfiguration(job.getConfiguration(), "chombo");
    job.setMapperClass(RunningAggregator.AggrMapper.class);
    job.setReducerClass(RunningAggregator.AggrReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
  }
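Drivers like this one override run(String[]) in the Tool style and are normally launched through org.apache.hadoop.util.ToolRunner, which strips generic Hadoop options (-D, -files, ...) before delegating. A minimal sketch of the matching entry point, assuming RunningAggregator implements Tool, as the @Override on run suggests:

  // Hypothetical main method; ToolRunner parses generic options, then calls run()
  public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new RunningAggregator(), args);
    System.exit(exitCode);
  }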
Example #3
  @Override
  public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Rating predictor  MR";
    job.setJobName(jobName);

    job.setJarByClass(UtilityPredictor.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(UtilityPredictor.PredictionMapper.class);
    job.setReducerClass(UtilityPredictor.PredictorReducer.class);

    job.setMapOutputKeyClass(TextInt.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(ItemIdGroupComprator.class);
    job.setPartitionerClass(ItemIdPartitioner.class);

    Utility.setConfiguration(job.getConfiguration());
    int numReducer = job.getConfiguration().getInt("utp.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
  }
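Several of these drivers resolve the reducer count the same way: read a job-specific key, then fall back to the global num.reducer. A hypothetical helper (not in the original code) capturing that pattern:

  // Hypothetical utility: job-specific reducer count with a global fallback;
  // Configuration is org.apache.hadoop.conf.Configuration
  private static int resolveNumReducers(Configuration conf, String jobKey) {
    int num = conf.getInt(jobKey, -1);
    return -1 == num ? conf.getInt("num.reducer", 1) : num;
  }

With it, the two lines above collapse to job.setNumReduceTasks(resolveNumReducers(job.getConfiguration(), "utp.num.reducer")).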
Example #4
  @Override
  public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Implicit rating estimator MR";
    job.setJobName(jobName);

    job.setJarByClass(ImplicitRatingEstimator.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(ImplicitRatingEstimator.RatingEstimatorMapper.class);
    job.setReducerClass(ImplicitRatingEstimator.RatingEstimatorReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    Utility.setConfiguration(job.getConfiguration());
    job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));
    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
  }
Example #5
  @Override
  public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Top n matches MR";
    job.setJobName(jobName);

    job.setJarByClass(TopMatches.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(TopMatches.TopMatchesMapper.class);
    job.setReducerClass(TopMatches.TopMatchesReducer.class);
    job.setCombinerClass(TopMatches.TopMatchesCombiner.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TupleTextPartitioner.class);

    Utility.setConfiguration(job.getConfiguration());
    int numReducer = job.getConfiguration().getInt("tom.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
  }
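Examples #3 through #5 wire in a custom partitioner and grouping comparator, the standard Hadoop secondary-sort arrangement: the partitioner routes all records with the same natural key to one reducer, and the grouping comparator makes the reducer treat every composite key sharing that natural key as one reduce group, so values arrive sorted by the secondary field. A minimal sketch of such a comparator; CompositeKey is a hypothetical stand-in for Tuple/TextInt, not the actual implementation:

  import org.apache.hadoop.io.WritableComparable;
  import org.apache.hadoop.io.WritableComparator;

  // Hypothetical grouping comparator: compares only the natural (grouping)
  // component of the composite key, ignoring the secondary sort field
  public static class NaturalKeyGroupComparator extends WritableComparator {
    protected NaturalKeyGroupComparator() {
      super(CompositeKey.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
      return ((CompositeKey) a).getNaturalKey()
          .compareTo(((CompositeKey) b).getNaturalKey());
    }
  }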
Example #6
  @Override
  public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Rating difference MR";
    job.setJobName(jobName);

    job.setJarByClass(RatingDifference.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(RatingDifference.DiffMapper.class);
    job.setReducerClass(RatingDifference.DiffReducer.class);

    job.setMapOutputKeyClass(TextPair.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    Utility.setConfiguration(job.getConfiguration());
    job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));
    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
  }
Example #7
  @Override
  public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Histogram MR";
    job.setJobName(jobName);

    job.setJarByClass(Histogram.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(Histogram.HistogramMapper.class);
    job.setReducerClass(Histogram.HistogramReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    Utility.setConfiguration(job.getConfiguration());
    int numReducer = job.getConfiguration().getInt("his.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
  }
Example #8
 /* (non-Javadoc)
  * @see org.apache.hadoop.mapreduce.Reducer#setup(org.apache.hadoop.mapreduce.Reducer.Context)
  */
 @Override
 protected void setup(Context context) throws IOException, InterruptedException {
   Configuration config = context.getConfiguration();
   fieldDelim = config.get("field.delim", ",");
   InputStream fs = Utility.getFileStream(config, "rating.mapper.config.path");
   ObjectMapper mapper = new ObjectMapper();
   ratingMapper = mapper.readValue(fs, EngagementToPreferenceMapper.class);
   outputDetail = config.getBoolean("rating.estimator.output.detail", false);
 }
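This setup() loads a JSON mapping file from a configured path and binds it with Jackson. A self-contained illustration of the readValue pattern, assuming the org.codehaus.jackson ObjectMapper of that era and a hypothetical POJO in place of EngagementToPreferenceMapper:

  import java.io.ByteArrayInputStream;
  import java.io.InputStream;
  import org.codehaus.jackson.map.ObjectMapper;

  public class JacksonReadDemo {
    // Hypothetical config POJO; Jackson binds JSON fields to its setters
    public static class RatingConfig {
      private boolean outputDetail;
      public boolean isOutputDetail() { return outputDetail; }
      public void setOutputDetail(boolean b) { outputDetail = b; }
    }

    public static void main(String[] args) throws Exception {
      InputStream fs = new ByteArrayInputStream(
          "{\"outputDetail\": true}".getBytes("UTF-8"));
      ObjectMapper mapper = new ObjectMapper();
      RatingConfig cfg = mapper.readValue(fs, RatingConfig.class);
      System.out.println(cfg.isOutputDetail());  // prints true
    }
  }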
Example #9
    /* (non-Javadoc)
     * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String[] items = value.toString().split(fieldDelimRegex);
      itemList.clear();

      for (int i = 0; i < items.length; ++i) {
        String item = items[i];
        Field field = schema.getEntity().getFieldByOrdinal(i);

        if (null != field && field.getDataType().equals(Field.DATA_TYPE_TEXT)) {
          String format = field.getTextDataSubTypeFormat();
          if (field.getDataSubType().equals(Field.TEXT_TYPE_PERSON_NAME)) {
            item = countryFormat.personNameFormat(item);
          } else if (field.getDataSubType().equals(Field.TEXT_TYPE_STREET_ADDRESS)) {
            item = countryFormat.caseFormat(item, format);
            item = countryFormat.streetAddressFormat(item);
          } else if (field.getDataSubType().equals(Field.TEXT_TYPE_STREET_ADDRESS_ONE)) {
            item = countryFormat.caseFormat(item, format);
            item = countryFormat.streetAddressOneFormat(item);
          } else if (field.getDataSubType().equals(Field.TEXT_TYPE_STREET_ADDRESS_TWO)) {
            item = countryFormat.caseFormat(item, format);
            item = countryFormat.streetAddressTwoFormat(item);
          } else if (field.getDataSubType().equals(Field.TEXT_TYPE_CITY)) {
            item = countryFormat.caseFormat(item, format);
          } else if (field.getDataSubType().equals(Field.TEXT_TYPE_STATE)) {
            item = countryFormat.stateFormat(item);
          } else if (field.getDataSubType().equals(Field.TEXT_TYPE_ZIP)) {
            item = countryFormat.caseFormat(item, format);
          } else if (field.getDataSubType().equals(Field.TEXT_TYPE_COUNTRY)) {
            item = countryFormat.caseFormat(item, format);
          } else if (field.getDataSubType().equals(Field.TEXT_TYPE_EMAIL_ADDR)) {
            item = countryFormat.emailFormat(item, format);
          } else if (field.getDataSubType().equals(Field.TEXT_TYPE_PHONE_NUM)) {
            item = countryFormat.phoneNumFormat(item, format);
          } else {
            // generic text field: analyze and tokenize
            item = tokenize(item);
          }
        }
        itemList.add(item);
      }

      // build value string
      valueHolder.set(org.chombo.util.Utility.join(itemList, fieldDelim));
      context.write(NullWritable.get(), valueHolder);
    }
Example #10
    /* (non-Javadoc)
     * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
     */
    @Override
    protected void reduce(Tuple key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
      srcEntityId = key.getString(0);
      count = 0;
      boolean doEmitNeighbor = false;
      valueList.clear();
      for (Text value : values) {
        doEmitNeighbor = false;
        // count based neighbor
        if (nearestByCount) {
          doEmitNeighbor = true;
          if (++count >= topMatchCount) {
            doEmitNeighbor = false;
          }
        }

        // distance based neighbors
        if (nearestByDistance) {
          String[] items = value.toString().split(fieldDelim);
          distance = Integer.parseInt(items[items.length - 1]);
          if (distance <= topMatchDistance) {
            if (!nearestByCount) {
              doEmitNeighbor = true;
            }
          } else {
            doEmitNeighbor = false;
          }
        }

        if (doEmitNeighbor) {
          // neighbor output: collect for compact format or write immediately
          if (compactOutput) {
            if (recordInOutput) {
              // contains id,record,rank - strip out entity ID and rank
              String[] valueItems = value.toString().split(fieldDelim);
              valueList.add(org.chombo.util.Utility.join(valueItems, 1, valueItems.length - 1));
            } else {
              // contains id, rank
              valueList.add(value.toString());
            }
          } else {
            outVal.set(srcEntityId + fieldDelim + value.toString());
            context.write(NullWritable.get(), outVal);
          }
        } else {
          // only source entity if neighborhood condition not met
          if (outputWithNoNeighbor && !compactOutput) {
            outVal.set(srcEntityId);
            context.write(NullWritable.get(), outVal);
          }
        }
      }

      // emit in compact format
      if (compactOutput) {
        boolean doEmit = true;
        String srcRec = recordInOutput ? key.getString(1) : "";
        int numNeighbor = valueList.size();
        if (0 == numNeighbor) {
          // only source entity if neighborhood condition not met
          if (outputWithNoNeighbor) {
            outVal.set(
                recordInOutput
                    ? srcEntityId + fieldDelim + srcRec + fieldDelim + numNeighbor
                    : srcEntityId);
          } else {
            doEmit = false;
          }
        } else {
          String targetValues = org.chombo.util.Utility.join(valueList, fieldDelim);
          outVal.set(
              recordInOutput
                  ? srcEntityId + fieldDelim + srcRec + fieldDelim + numNeighbor + fieldDelim + targetValues
                  : srcEntityId + fieldDelim + targetValues);
        }
        if (doEmit) {
          context.write(NullWritable.get(), outVal);
        }
      }
    }
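The reducer therefore produces one of two output shapes. A hypothetical trace with fieldDelim = ",", recordInOutput = false, and invented ids and distances, consistent with the mapper output of Example #1:

    // per-neighbor output (compactOutput = false), one line per match:
    //   s1,t1,3
    //   s1,t2,5
    // compact output (compactOutput = true), one line per source entity:
    //   s1,t1,3,t2,5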
Example #11
    /* (non-Javadoc)
     * @see org.apache.hadoop.mapreduce.Reducer#setup(org.apache.hadoop.mapreduce.Reducer.Context)
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
      Configuration config = context.getConfiguration();
      if (config.getBoolean("debug.on", false)) {
        LOG.setLevel(Level.DEBUG);
        System.out.println("in debug mode");
      }

      fieldDelim = config.get("field.delim", ",");
      topMatchCount = config.getInt("top.match.count", 10);
      isValidationMode = config.getBoolean("validation.mode", true);
      kernelFunction = config.get("kernel.function", "none");
      kernelParam = config.getInt("kernel.param", -1);
      classCondtionWeighted = config.getBoolean("class.condtion.weighted", false);
      neighborhood = new Neighborhood(kernelFunction, kernelParam, classCondtionWeighted);
      outputClassDistr = config.getBoolean("output.class.distr", false);
      inverseDistanceWeighted = config.getBoolean("inverse.distance.weighted", false);

      // regression
      String predictionMode = config.get("prediction.mode", "classification");
      if (predictionMode.equals("regression")) {
        neighborhood.withPredictionMode(PredictionMode.Regression);
        String regressionMethod = config.get("regression.method", "average");
        regressionMethod = WordUtils.capitalize(regressionMethod);
        neighborhood.withRegressionMethod(RegressionMethod.valueOf(regressionMethod));
      }

      // decision threshold for classification
      decisionThreshold = Double.parseDouble(config.get("decision.threshold", "-1.0"));
      if (decisionThreshold > 0 && neighborhood.IsInClassificationMode()) {
        String[] classAttrValues = config.get("class.attribute.values").split(",");
        posClassAttrValue = classAttrValues[0];
        negClassAttrValue = classAttrValues[1];
        neighborhood.withDecisionThreshold(decisionThreshold).withPositiveClass(posClassAttrValue);
      }

      // using cost based arbitrator for classification
      useCostBasedClassifier = config.getBoolean("use.cost.based.classifier", false);
      if (useCostBasedClassifier && neighborhood.IsInClassificationMode()) {
        if (null == posClassAttrValue) {
          String[] classAttrValues = config.get("class.attribute.values").split(",");
          posClassAttrValue = classAttrValues[0];
          negClassAttrValue = classAttrValues[1];
        }

        int[] missclassificationCost =
            Utility.intArrayFromString(config.get("misclassification.cost"));
        falsePosCost = missclassificationCost[0];
        falseNegCost = missclassificationCost[1];
        costBasedArbitrator =
            new CostBasedArbitrator(
                negClassAttrValue, posClassAttrValue, falseNegCost, falsePosCost);
      }

      // confusion matrix for classification validation
      if (isValidationMode) {
        if (neighborhood.IsInClassificationMode()) {
          InputStream fs =
              Utility.getFileStream(context.getConfiguration(), "feature.schema.file.path");
          ObjectMapper mapper = new ObjectMapper();
          schema = mapper.readValue(fs, FeatureSchema.class);
          classAttrField = schema.findClassAttrField();
          List<String> cardinality = classAttrField.getCardinality();
          predictingClasses = new String[2];
          predictingClasses[0] = cardinality.get(0);
          predictingClasses[1] = cardinality.get(1);
          confMatrix = new ConfusionMatrix(predictingClasses[0], predictingClasses[1]);
        }
      }
      LOG.debug(
          "classCondtionWeighted:"
              + classCondtionWeighted
              + "outputClassDistr:"
              + outputClassDistr);
    }
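All of the options this setup() reads come from the job configuration. A sketch of how the classification-related keys fit together; every value here is invented for illustration:

  import org.apache.hadoop.conf.Configuration;

  public class KnnSetupConfigDemo {
    public static void main(String[] args) {
      // Hypothetical values exercising the classification path in setup();
      // "class.attribute.values" lists the positive class first, and
      // "misclassification.cost" is falsePos,falseNeg per the code above
      Configuration conf = new Configuration();
      conf.set("prediction.mode", "classification");
      conf.set("decision.threshold", "0.6");
      conf.set("class.attribute.values", "1,0");
      conf.setBoolean("use.cost.based.classifier", true);
      conf.set("misclassification.cost", "2,5");
      conf.setBoolean("validation.mode", true);
      conf.set("feature.schema.file.path", "/path/to/feature/schema.json");
      System.out.println(conf.get("decision.threshold"));
    }
  }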