  @Override
  public void setup(Context context) {
    this.context = context;
    classIndex = ClassIndex.getInstance();

    Configuration conf = context.getConfiguration();
    float threshold = conf.getFloat(Util.CONF_MINIMUM_DF_OF_HOSTS, 0f);

    try {
      Path headerPath = new Path(conf.get(Util.CONF_HEADER_PATH));
      LOG.info("Reading dataset header: " + headerPath);

      header = new DataSetHeader(conf, headerPath);
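      // build the shared class index only if it has not been populated yet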
      if (!classIndex.isPopulated()) {
        classIndex.init(conf);
        classIndex.populateIndex();
      }
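      // the document-frequency filter is optional; a threshold of 0 (the default) disables it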
      if (threshold > 0.0) {
        LOG.info("loading DF values");
        hostsWithMinimumDF = Util.getHostsWithDocumentFrequencies(conf, threshold);
      }

      LOG.info("loading training data...");

      loadTrainingInstances(conf.get(Util.CONF_TRAINING_DATE));

    } catch (Exception e) {
      // remember the failure so the mapper can report it later instead of silently continuing
      LOG.error("setup failed with an exception!", e);
      setupFailedException = e;
    }
  }

  public void loadTrainingInstances(String date) throws IOException, ClassNotFoundException {
    Configuration conf = context.getConfiguration();

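    // training data for one day lives under <dataset>/<sessionDuration>/<options>/<date>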
    Path trainingPath =
        new Path(
            conf.get(Util.CONF_DATASET_PATH)
                + "/"
                + conf.get(Util.CONF_SESSION_DURATION)
                + "/"
                + conf.get(Util.CONF_OPTIONS)
                + "/"
                + date);

    FileSystem fs = FileSystem.get(conf);

    trainingInstances = new TLinkedHashSet<Instance>();

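    // read all part files of the training day; each line is parsed into one Instance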
    for (FileStatus fileStatus : fs.listStatus(trainingPath)) {
      if (fileStatus.isDir()) {
        continue;
      }

      Path file = fileStatus.getPath();
      LOG.info("reading from " + file + "...");

      FSDataInputStream in = fs.open(file);
      BufferedReader reader = new BufferedReader(new InputStreamReader(in));
      try {
        int i = 0;
        String line;
        while ((line = reader.readLine()) != null) {
          // report progress every 10000 lines so the framework does not kill the task as stalled
          if (i++ % 10000 == 0) {
            Util.ping(context, JaccardTestMapper.class);
          }

          Instance instance;
          try {
            instance = Instance.fromString(header, line, context);
          } catch (Exception e) {
            LOG.warn("Skipping invalid instance: " + line);
            continue;
          }
          trainingInstances.add(instance);
        }
      } finally {
        // close the reader (and the underlying stream) even if parsing fails
        reader.close();
      }
    }

    LOG.info("training day has " + trainingInstances.size() + " classes/instances");
    Util.getMemoryInfo(JaccardTestMapper.class);
  }

  /** The main entry point if this class is called as a {@link Tool}. */
  @Override
  public int run(String[] args) throws Exception {
    Configuration conf = getConf();

    // retrieve our paths from the configuration
    Path inputPath = new Path(conf.get(Util.CONF_LOGDATA_PATH));
    Path outputPath = new Path(conf.get(Util.CONF_CACHING_SIMULATOR_PATH));

    final int numCores = conf.getInt(Util.CONF_NUM_CORES, Util.DEFAULT_NUM_CORES);
    final int numNodes = conf.getInt(Util.CONF_NUM_NODES, Util.DEFAULT_NUM_NODES);

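    // run one reduce task per core available across the cluster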
    NUM_OF_REDUCE_TASKS = numCores * numNodes;

    // set the jobname
    String jobName =
        Util.JOB_NAME
            + " ["
            + CachingTool.ACTION
            + "] {logdata="
            + inputPath.getName()
            + ", session="
            + conf.get(Util.CONF_SESSION_DURATION)
            + "}";

    Util.showStatus("Running " + jobName);

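    // job tuning: bigger child heap with incremental CMS, a 30-minute task timeout,
    // and speculative execution disabled for both phases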
    conf.set("hadoop.job.ugi", Util.HADOOP_USER);
    conf.set("mapred.child.java.opts", "-Xmx1500M -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode");
    conf.set("mapred.task.timeout", "1800000");
    conf.set("mapred.map.tasks.speculative.execution", "false");
    conf.set("mapred.reduce.tasks.speculative.execution", "false");

    FileSystem fs = FileSystem.get(conf);

    Job job = new Job(conf, jobName);

    // set number of reduce tasks
    job.setNumReduceTasks(NUM_OF_REDUCE_TASKS);

    // set mapper, reducer, partitioner and grouping comparator
    job.setJarByClass(CachingTool.class);
    job.setMapperClass(CachingMapper.class);
    job.setReducerClass(CachingReducer.class);
    // grouping comparator used for the secondary sort
    job.setGroupingComparatorClass(TextPair.FirstComparator.class);
    job.setPartitionerClass(TextPair.FirstPartitioner.class);
    job.setOutputKeyClass(TextPair.class);
    job.setOutputValueClass(Text.class);

    // set input and output format
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

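    // pin min and max split size to the same value so every split has a fixed size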
    FileInputFormat.setMaxInputSplitSize(job, Util.DATASET_MB_SPLIT * 25);
    FileInputFormat.setMinInputSplitSize(job, Util.DATASET_MB_SPLIT * 25);

    // add input path subdirectories if there are any
    ArrayList<Path> inputPaths = Util.getInputDirectories(fs, inputPath);
    int pathsAdded = 0;
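    // skip directories whose names contain '.' or '_' (dot files and Hadoop metadata)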
    for (Path p : inputPaths) {
      if (!p.getName().contains(".") && !p.getName().contains("_")) {
        Util.showStatus("Adding input path " + p);
        FileInputFormat.addInputPath(job, p);
        pathsAdded++;
      }
    }

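    // if no usable subdirectories were found, read the top-level input path directly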
    if (pathsAdded == 0) {
      Util.showStatus("Adding input path " + inputPath);
      FileInputFormat.addInputPath(job, inputPath);
    }

    Path jobOutputPath = outputPath.suffix("/" + conf.get(Util.CONF_SESSION_DURATION));

    // clear the output dir before the job runs
    fs.delete(jobOutputPath, true);
    FileOutputFormat.setOutputPath(job, jobOutputPath);

    // run the job and wait for it to be completed
    boolean b = job.waitForCompletion(true);

    // the counters are only valid once the job has completed
    Counter numNewInCache = job.getCounters().findCounter(CachingReducer.CacheCounter.NEW_TO_CACHE);
    Counter numRenewCache = job.getCounters().findCounter(CachingReducer.CacheCounter.RENEW_CACHE);
    Counter numUsedFromCache =
        job.getCounters().findCounter(CachingReducer.CacheCounter.USED_FROM_CACHE);

    // write the counters to the metadata file
    Path metadataPath = new Path(conf.get(Util.CONF_CACHING_SIMULATOR_PATH));
    FSDataOutputStream out =
        fs.create(metadataPath.suffix("/" + DataSetHeader.SIMULATE_CACHING_METADATA_FILE));
    PrintWriter w = new PrintWriter(out);

    // the sum of all counters equals the sum of all queries in the log file
    w.println("hostnametypeAddedToCache=" + numNewInCache.getValue());
    w.println("queriesAddedAgainToCache=" + numRenewCache.getValue());
    w.println("queriesAnsweredFromCache=" + numUsedFromCache.getValue());

    // closing the writer also flushes and closes the underlying stream
    w.close();

    // Delete all empty output files
    Util.deleteEmptyFiles(fs, jobOutputPath);

    // Tool convention: return 0 on success, non-zero on failure
    return b ? 0 : 1;
  }