/**
 * Computes a Jaccard-style similarity between the index lists of two instances.
 *
 * <p>The indices of {@code instA} are collected into a set. Each index position of
 * {@code instB} is then classified as either present in that set (counted towards the
 * intersection) or absent (counted towards the union on top of A's distinct indices).
 * Positions of {@code instB} are counted individually; they are not de-duplicated here.
 *
 * @param instA instance whose indices form the reference set
 * @param instB instance whose indices are probed against that set
 * @return intersection size divided by union size; {@code NaN} when neither instance
 *         contributes any index (float 0/0)
 */
private float jaccardValue(Instance instA, Instance instB) {
    final int countA = instA.getNumIndices();
    final TIntSet seenInA = new TIntHashSet(countA);
    for (int pos = 0; pos < countA; pos++) {
        seenInA.add(instA.getIndex(pos));
    }

    int shared = 0;
    int onlyInB = 0;
    final int countB = instB.getNumIndices();
    for (int pos = 0; pos < countB; pos++) {
        if (seenInA.contains(instB.getIndex(pos))) {
            shared++;
        } else {
            onlyInB++;
        }
    }

    // union = distinct indices of A plus every B position that missed the set
    final int unionSize = seenInA.size() + onlyInB;
    return (float) shared / (float) unionSize;
}
public void loadTrainingInstances(String date) throws IOException, ClassNotFoundException { String line = null; // TIntByteHashMap map = null; Path file = null; Configuration conf = context.getConfiguration(); Path trainingPath = new Path( conf.get(Util.CONF_DATASET_PATH) + "/" + conf.get(Util.CONF_SESSION_DURATION) + "/" + conf.get(Util.CONF_OPTIONS) + "/" + date); FileSystem fs = FileSystem.get(conf); trainingInstances = new TLinkedHashSet<Instance>(); for (FileStatus fileStatus : fs.listStatus(trainingPath)) { if (fileStatus.isDir()) { continue; } file = fileStatus.getPath(); LOG.info("reading from " + file + "..."); FSDataInputStream in = fs.open(file); BufferedReader reader = new BufferedReader(new InputStreamReader(in)); int i = 0; while (true) { line = reader.readLine(); if (i++ % 10000 == 0) { Util.ping(context, JaccardTestMapper.class); i = 0; } if (line == null) { break; } Instance instance; try { instance = Instance.fromString(header, line, context); } catch (Exception e) { LOG.warn("Skipping invalid instance: " + line); continue; } trainingInstances.add(instance); } } line = null; LOG.info("training day has " + trainingInstances.size() + " classes/instances"); Util.getMemoryInfo(JaccardTestMapper.class); }
@Override public void map(LongWritable key, ObjectWritable obj, Context context) throws IOException, InterruptedException { if (setupFailedException != null) { throw new IOException(setupFailedException); } DoubleWritable classI = new DoubleWritable(); DoubleWritable value = new DoubleWritable(); lastTime = System.currentTimeMillis(); Instance instance; instance = (Instance) obj.get(); // remove all hosts whose DF is below the threshold if (hostsWithMinimumDF != null) { instance.setAcceptableIndices(hostsWithMinimumDF.keySet()); } // loop through training instances for (Instance trainingInstance : trainingInstances) { try { float jaccardValue = jaccardValue(trainingInstance, instance); int trainingClassId = classIndex.getIndexPosition(trainingInstance.getClassLabel()); classI.set(trainingClassId); value.set(jaccardValue); // store it in an array with the classIndex array.set(new DoubleWritable[] {classI, value}); // and hand it to the reducer context.write(new Text(instance.getId()), array); } catch (Exception e) { e.printStackTrace(); LOG.error("map failed with exception"); throw new IOException(e); } } // count the number of instances per class // context.write(new Text(Util.INSTANCES_PER_CLASS_PATH + " " + // instance.getClassLabel()), ONE); counter++; long timeTaken = System.currentTimeMillis() - lastTime; totalTime += timeTaken; if ((counter % 10) == 0) { // print out some performance stuff LOG.info( "instance " + counter + " duration: " + ((double) timeTaken / 1000) + " s - avg : " + ((double) (totalTime / counter) / 1000) + " s" + " num_values: " + instance.getNumValues()); } double duration = ((double) timeTaken / 1000); if (duration > REPORT_SLOW_INSTANCE_THRESHOLD) { LOG.info( "Mapped a particularly SLOW INSTANCE. classLabel: " + instance.getClassLabel() + ", " + "duration: " + duration + " s (" + duration / 60 + " min)," + " num_values: " + instance.getNumValues()); } }