@Override
public void setup(Context context) {
    this.context = context;
    classIndex = ClassIndex.getInstance();

    Configuration conf = context.getConfiguration();
    float threshold = conf.getFloat(Util.CONF_MINIMUM_DF_OF_HOSTS, 0);

    try {
        Path headerPath = new Path(conf.get(Util.CONF_HEADER_PATH));
        LOG.info("Reading dataset header... " + headerPath.toString());
        header = new DataSetHeader(conf, headerPath);

        // populate the class index only once per JVM
        if (!classIndex.isPopulated()) {
            classIndex.init(conf);
            classIndex.populateIndex();
        }

        // only load document frequencies if a minimum-DF threshold is configured
        if (threshold > 0.0) {
            LOG.info("loading DF values");
            hostsWithMinimumDF = Util.getHostsWithDocumentFrequencies(conf, threshold);
        }

        LOG.info("loading training data...");
        loadTrainingInstances(conf.get(Util.CONF_TRAINING_DATE));
    } catch (Exception e) {
        // remember the failure so it can be rethrown later instead of being swallowed here
        LOG.error("setup failed with an exception!", e);
        setupFailedException = e;
    }
}
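// ---------------------------------------------------------------------------
// Illustration only: a minimal, self-contained sketch of the "remember the
// setup failure, fail the task later" pattern used above. The class name,
// key/value types, placeholder method and the IOException wrapper are
// assumptions, not part of this project; the real mapper surfaces
// setupFailedException in its own map() implementation.
// ---------------------------------------------------------------------------
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class DeferredSetupFailureMapper extends Mapper<LongWritable, Text, Text, Text> {

    private Exception setupFailedException;

    @Override
    protected void setup(Context context) {
        try {
            expensiveInitialisation(context.getConfiguration());
        } catch (Exception e) {
            // remember the failure instead of rethrowing it here
            setupFailedException = e;
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // rethrow on the first record so the task fails visibly instead of silently emitting nothing
        if (setupFailedException != null) {
            throw new IOException("setup() failed", setupFailedException);
        }
        // ... regular per-record processing would go here ...
    }

    // hypothetical placeholder for the expensive work done in the real setup()
    private void expensiveInitialisation(Configuration conf) throws Exception {
    }
}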
public void loadTrainingInstances(String date) throws IOException, ClassNotFoundException {
    Configuration conf = context.getConfiguration();
    Path trainingPath = new Path(conf.get(Util.CONF_DATASET_PATH) + "/"
            + conf.get(Util.CONF_SESSION_DURATION) + "/"
            + conf.get(Util.CONF_OPTIONS) + "/" + date);
    FileSystem fs = FileSystem.get(conf);

    trainingInstances = new TLinkedHashSet<Instance>();

    for (FileStatus fileStatus : fs.listStatus(trainingPath)) {
        if (fileStatus.isDir()) {
            continue;
        }
        Path file = fileStatus.getPath();
        LOG.info("reading from " + file + "...");

        FSDataInputStream in = fs.open(file);
        BufferedReader reader = new BufferedReader(new InputStreamReader(in));
        int i = 0;
        String line;
        while ((line = reader.readLine()) != null) {
            // report progress every 10000 lines so the task is not killed while loading
            if (i++ % 10000 == 0) {
                Util.ping(context, JaccardTestMapper.class);
            }
            Instance instance;
            try {
                instance = Instance.fromString(header, line, context);
            } catch (Exception e) {
                LOG.warn("Skipping invalid instance: " + line);
                continue;
            }
            trainingInstances.add(instance);
        }
        reader.close();
    }

    LOG.info("training day has " + trainingInstances.size() + " classes/instances");
    Util.getMemoryInfo(JaccardTestMapper.class);
}
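// ---------------------------------------------------------------------------
// Sketch of a plausible Util.ping(...) as it is used above; the project's real
// helper may differ. Reporting progress resets the task timeout
// (mapred.task.timeout), so long record-free phases such as loading a whole
// day of training instances are not killed by the framework.
// ---------------------------------------------------------------------------
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public final class ProgressPing {

    private ProgressPing() {
    }

    public static void ping(TaskAttemptContext context, Class<?> caller) {
        // tell the framework this task is still alive
        context.progress();
        LogFactory.getLog(caller).debug(caller.getSimpleName() + " is still alive");
    }
}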
/** The main entry point if this class is called as a {@link Tool}. */
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();

    // retrieve our paths from the configuration
    Path inputPath = new Path(conf.get(Util.CONF_LOGDATA_PATH));
    Path outputPath = new Path(conf.get(Util.CONF_CACHING_SIMULATOR_PATH));

    final int numCores = conf.getInt(Util.CONF_NUM_CORES, Util.DEFAULT_NUM_CORES);
    final int numNodes = conf.getInt(Util.CONF_NUM_NODES, Util.DEFAULT_NUM_NODES);
    NUM_OF_REDUCE_TASKS = numCores * numNodes;

    // set the job name
    String jobName = Util.JOB_NAME + " [" + CachingTool.ACTION + "] {logdata=" + inputPath.getName()
            + ", session=" + conf.get(Util.CONF_SESSION_DURATION) + "}";
    Util.showStatus("Running " + jobName);

    conf.set("hadoop.job.ugi", Util.HADOOP_USER);
    conf.set("mapred.child.java.opts", "-Xmx1500M -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode");
    conf.set("mapred.task.timeout", "1800000");
    conf.set("mapred.map.tasks.speculative.execution", "false");
    conf.set("mapred.reduce.tasks.speculative.execution", "false");

    FileSystem fs = FileSystem.get(conf);
    Job job = new Job(conf, jobName);

    // set number of reduce tasks
    job.setNumReduceTasks(NUM_OF_REDUCE_TASKS);

    // set mapper, reducer, partitioner and grouping comparator
    job.setJarByClass(CachingTool.class);
    job.setMapperClass(CachingMapper.class);
    job.setReducerClass(CachingReducer.class);
    // grouping comparator and partitioner are used for the secondary sort
    job.setGroupingComparatorClass(TextPair.FirstComparator.class);
    job.setPartitionerClass(TextPair.FirstPartitioner.class);
    job.setOutputKeyClass(TextPair.class);
    job.setOutputValueClass(Text.class);

    // set input and output format
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileInputFormat.setMaxInputSplitSize(job, Util.DATASET_MB_SPLIT * 25);
    FileInputFormat.setMinInputSplitSize(job, Util.DATASET_MB_SPLIT * 25);

    // add input path subdirectories if there are any
    ArrayList<Path> inputPaths = Util.getInputDirectories(fs, inputPath);
    int pathsAdded = 0;
    for (Path p : inputPaths) {
        if (!p.getName().contains(".") && !p.getName().contains("_")) {
            Util.showStatus("Adding input path " + p);
            FileInputFormat.addInputPath(job, p);
            pathsAdded++;
        }
    }
    if (pathsAdded == 0) {
        Util.showStatus("Adding input path " + inputPath);
        FileInputFormat.addInputPath(job, inputPath);
    }

    // clear the output directory for this session duration
    fs.delete(outputPath.suffix("/" + conf.get(Util.CONF_SESSION_DURATION)), true);
    FileOutputFormat.setOutputPath(job, outputPath.suffix("/" + conf.get(Util.CONF_SESSION_DURATION)));

    // run the job and wait for it to complete
    boolean b = job.waitForCompletion(true);
    // NOTE: the counters are only available here, after the job has completed

    // retrieve the counters
    Counter numNewInCache = job.getCounters().findCounter(CachingReducer.CacheCounter.NEW_TO_CACHE);
    Counter numRenewCache = job.getCounters().findCounter(CachingReducer.CacheCounter.RENEW_CACHE);
    Counter numUsedFromCache = job.getCounters().findCounter(CachingReducer.CacheCounter.USED_FROM_CACHE);

    // write the counters to the metadata file;
    // the sum of all counters equals the number of queries in the log file
    Path headerPath = new Path(conf.get(Util.CONF_CACHING_SIMULATOR_PATH));
    FSDataOutputStream out = fs.create(headerPath.suffix("/" + DataSetHeader.SIMULATE_CACHING_METADATA_FILE));
    PrintWriter w = new PrintWriter(out);
    w.println("hostnametypeAddedToCache=" + numNewInCache.getValue());
    w.println("queriesAddedAgainToCache=" + numRenewCache.getValue());
    w.println("queriesAnsweredFromCache=" + numUsedFromCache.getValue());
    w.close();
    out.close();

    // delete all empty output files
    Util.deleteEmptyFiles(fs, outputPath.suffix("/" + conf.get(Util.CONF_SESSION_DURATION)));

    // exit code 0 signals success, following the Tool convention
    return b ? 0 : 1;
}
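// ---------------------------------------------------------------------------
// Minimal driver sketch for launching the job above, assuming CachingTool
// implements Tool and has a no-argument constructor; the project may already
// ship its own entry point.
// ---------------------------------------------------------------------------
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

public class CachingToolDriver {

    public static void main(String[] args) throws Exception {
        // ToolRunner parses the generic options (-D, -conf, -fs, ...) into the
        // Configuration before calling run(args); the process exits with run()'s
        // return value, so 0 signals success to the calling shell or workflow engine.
        int exitCode = ToolRunner.run(new Configuration(), new CachingTool(), args);
        System.exit(exitCode);
    }
}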