/**
   * Computes the Jaccard coefficient of two instances, treating each instance as the set of
   * indices it exposes through {@code getIndex(int)}: the number of indices present in both
   * divided by the number of indices present in at least one of them.
   *
   * <p>Every index of {@code instB} is counted exactly once, either as shared or as new, so the
   * result is only a true set ratio if {@code instB} does not list the same index twice.
   *
   * @param instA the instance whose indices are collected into the lookup set
   * @param instB the instance whose indices are probed against that set
   * @return intersection size divided by union size; {@code NaN} if neither instance has any
   *     index (the division is then 0f / 0f)
   */
  private float jaccardValue(Instance instA, Instance instB) {
    final int countA = instA.getNumIndices();
    TIntSet seenInA = new TIntHashSet(countA);
    for (int pos = 0; pos < countA; pos++) {
      seenInA.add(instA.getIndex(pos));
    }

    // Classify each index of B: already known from A (shared) or new to the union.
    int shared = 0;
    int onlyInB = 0;
    final int countB = instB.getNumIndices();
    for (int pos = 0; pos < countB; pos++) {
      if (seenInA.contains(instB.getIndex(pos))) {
        shared++;
      } else {
        onlyInB++;
      }
    }

    // The union is everything distinct in A plus whatever B contributed on top of it.
    int unionSize = seenInA.size() + onlyInB;
    return (float) shared / (float) unionSize;
  }
  /**
   * Loads the training instances of one day into {@code trainingInstances}.
   *
   * <p>The directory read is built from the job configuration as {@code <dataset
   * path>/<session duration>/<options>/<date>}. Every non-directory entry in it is read line by
   * line; each line is parsed with {@code Instance.fromString(header, line, context)}. Lines
   * that fail to parse are logged and skipped. Any previously loaded set is replaced.
   *
   * <p>{@code Util.ping} is invoked at the start of each file and then once every 10000 lines
   * read (presumably to report liveness to the framework during long loads).
   *
   * @param date last path component of the training directory
   * @throws IOException if listing, opening, reading or closing a training file fails
   * @throws ClassNotFoundException declared by the original signature and kept for callers;
   *     NOTE(review): nothing visible in this method raises it directly, confirm which callee
   *     does
   */
  public void loadTrainingInstances(String date) throws IOException, ClassNotFoundException {
    Configuration conf = context.getConfiguration();

    Path trainingPath =
        new Path(
            conf.get(Util.CONF_DATASET_PATH)
                + "/"
                + conf.get(Util.CONF_SESSION_DURATION)
                + "/"
                + conf.get(Util.CONF_OPTIONS)
                + "/"
                + date);

    FileSystem fs = FileSystem.get(conf);

    trainingInstances = new TLinkedHashSet<Instance>();

    // Number of lines read between two consecutive pings.
    final int pingInterval = 10000;

    for (FileStatus fileStatus : fs.listStatus(trainingPath)) {
      if (fileStatus.isDir()) {
        continue;
      }

      Path file = fileStatus.getPath();

      LOG.info("reading from " + file + "...");

      // NOTE(review): the reader decodes with the platform default charset, as before.
      BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(file)));
      try {
        int linesSincePing = 0;
        while (true) {
          String line = reader.readLine();

          // Ping on the first read of the file and then every pingInterval reads. The counter
          // wraps via modulo, so it neither overflows nor re-fires on every line.
          if (linesSincePing == 0) {
            Util.ping(context, JaccardTestMapper.class);
          }
          linesSincePing = (linesSincePing + 1) % pingInterval;

          if (line == null) {
            break;
          }

          Instance instance;
          try {
            instance = Instance.fromString(header, line, context);
          } catch (Exception e) {
            // Best effort: one malformed line must not abort the whole load, but keep the cause.
            LOG.warn("Skipping invalid instance: " + line, e);
            continue;
          }
          trainingInstances.add(instance);
        }
      } finally {
        // Closing the reader also closes the underlying file system stream.
        reader.close();
      }
    }

    LOG.info("training day has " + trainingInstances.size() + " classes/instances");
    Util.getMemoryInfo(JaccardTestMapper.class);
  }
  /**
   * Scores one test instance against every loaded training instance and emits the results.
   *
   * <p>For each training instance, one record is written with the test instance's id as key
   * and a two-element array as value: the position of the training instance's class label in
   * {@code classIndex}, followed by the Jaccard value of the pair. If {@code
   * hostsWithMinimumDF} is set, the test instance is first restricted to those indices.
   *
   * <p>Timing is accumulated in {@code totalTime}/{@code counter}; a summary is logged every
   * tenth instance, and any instance taking longer than {@code REPORT_SLOW_INSTANCE_THRESHOLD}
   * seconds is logged individually.
   *
   * @param key input key; not used by this method
   * @param obj wrapper whose payload is cast to {@code Instance}
   * @param context task context the records are written to
   * @throws IOException if setup previously failed, or wrapping any exception raised while
   *     scoring or writing
   * @throws InterruptedException declared by the overridden method
   */
  @Override
  public void map(LongWritable key, ObjectWritable obj, Context context)
      throws IOException, InterruptedException {

    if (setupFailedException != null) {
      throw new IOException(setupFailedException);
    }

    DoubleWritable classI = new DoubleWritable();
    DoubleWritable value = new DoubleWritable();

    lastTime = System.currentTimeMillis();

    Instance instance = (Instance) obj.get();

    // remove all hosts whose DF is below the threshold
    if (hostsWithMinimumDF != null) {
      instance.setAcceptableIndices(hostsWithMinimumDF.keySet());
    }

    try {
      // The output key is the same for every record of this instance, so build it once. The
      // loop already reuses the value writables across writes in the same way.
      Text instanceId = new Text(instance.getId());

      // loop through training instances
      for (Instance trainingInstance : trainingInstances) {
        float jaccardValue = jaccardValue(trainingInstance, instance);

        int trainingClassId = classIndex.getIndexPosition(trainingInstance.getClassLabel());

        classI.set(trainingClassId);
        value.set(jaccardValue);
        // store it in an array with the classIndex
        array.set(new DoubleWritable[] {classI, value});

        // and hand it to the reducer
        context.write(instanceId, array);
      }
    } catch (Exception e) {
      // Log through the logger with the cause attached so the stack trace reaches the task log.
      LOG.error("map failed with exception", e);
      throw new IOException(e);
    }

    counter++;

    long timeTaken = System.currentTimeMillis() - lastTime;
    totalTime += timeTaken;

    if ((counter % 10) == 0) {
      // print out some performance stuff
      LOG.info(
          "instance "
              + counter
              + " duration: "
              + ((double) timeTaken / 1000)
              + " s - avg : "
              + ((double) (totalTime / counter) / 1000)
              + " s"
              + " num_values: "
              + instance.getNumValues());
    }

    // Milliseconds to seconds; the threshold constant is compared in seconds.
    double duration = ((double) timeTaken / 1000);
    if (duration > REPORT_SLOW_INSTANCE_THRESHOLD) {
      LOG.info(
          "Mapped a particularly SLOW INSTANCE. classLabel: "
              + instance.getClassLabel()
              + ", "
              + "duration: "
              + duration
              + " s ("
              + duration / 60
              + " min),"
              + " num_values: "
              + instance.getNumValues());
    }
  }