/**
 * Prepares this mapper before any records are mapped.
 *
 * <p>Stores the context, obtains the {@code ClassIndex} singleton (initialising and populating
 * it if it is not populated yet), reads the dataset header from the path configured under
 * {@code Util.CONF_HEADER_PATH}, optionally loads the hosts that satisfy the minimum document
 * frequency configured under {@code Util.CONF_MINIMUM_DF_OF_HOSTS}, and finally loads the
 * training instances for {@code Util.CONF_TRAINING_DATE}.
 *
 * <p>This method never throws: any exception raised during initialisation is logged and kept in
 * {@code setupFailedException}, which {@code map} checks before processing a record.
 *
 * @param context the task context that supplies the job configuration
 */
@Override
public void setup(Context context) {
  this.context = context;
  classIndex = ClassIndex.getInstance();

  Configuration conf = context.getConfiguration();
  // A threshold of 0 (the default) disables the document-frequency filter.
  float threshold = conf.getFloat(Util.CONF_MINIMUM_DF_OF_HOSTS, 0);

  try {
    Path headerPath = new Path(conf.get(Util.CONF_HEADER_PATH));
    LOG.info("Reading dataset header..." + headerPath.toString());
    header = new DataSetHeader(conf, headerPath);

    if (!classIndex.isPopulated()) {
      classIndex.init(conf);
      classIndex.populateIndex();
    }

    if (threshold > 0.0) {
      LOG.info("loading DF values");
      hostsWithMinimumDF = Util.getHostsWithDocumentFrequencies(conf, threshold);
    }

    LOG.info("loading training data...");
    loadTrainingInstances(conf.get(Util.CONF_TRAINING_DATE));
  } catch (Exception e) {
    // Log the cause together with the message instead of printing the stack trace to stderr,
    // then remember the failure so that map() can surface it.
    LOG.error("setup failed with an exception!", e);
    setupFailedException = e;
  }
}
@Override public void map(LongWritable key, ObjectWritable obj, Context context) throws IOException, InterruptedException { if (setupFailedException != null) { throw new IOException(setupFailedException); } DoubleWritable classI = new DoubleWritable(); DoubleWritable value = new DoubleWritable(); lastTime = System.currentTimeMillis(); Instance instance; instance = (Instance) obj.get(); // remove all hosts whose DF is below the threshold if (hostsWithMinimumDF != null) { instance.setAcceptableIndices(hostsWithMinimumDF.keySet()); } // loop through training instances for (Instance trainingInstance : trainingInstances) { try { float jaccardValue = jaccardValue(trainingInstance, instance); int trainingClassId = classIndex.getIndexPosition(trainingInstance.getClassLabel()); classI.set(trainingClassId); value.set(jaccardValue); // store it in an array with the classIndex array.set(new DoubleWritable[] {classI, value}); // and hand it to the reducer context.write(new Text(instance.getId()), array); } catch (Exception e) { e.printStackTrace(); LOG.error("map failed with exception"); throw new IOException(e); } } // count the number of instances per class // context.write(new Text(Util.INSTANCES_PER_CLASS_PATH + " " + // instance.getClassLabel()), ONE); counter++; long timeTaken = System.currentTimeMillis() - lastTime; totalTime += timeTaken; if ((counter % 10) == 0) { // print out some performance stuff LOG.info( "instance " + counter + " duration: " + ((double) timeTaken / 1000) + " s - avg : " + ((double) (totalTime / counter) / 1000) + " s" + " num_values: " + instance.getNumValues()); } double duration = ((double) timeTaken / 1000); if (duration > REPORT_SLOW_INSTANCE_THRESHOLD) { LOG.info( "Mapped a particularly SLOW INSTANCE. classLabel: " + instance.getClassLabel() + ", " + "duration: " + duration + " s (" + duration / 60 + " min)," + " num_values: " + instance.getNumValues()); } }