/**
 * Reads the examples file and registers every example it names in
 * {@code mExampleStats} with an initial count of zero.
 *
 * <p>{@code mExampleStats} and {@code mTotalTerms} are reset before reading. Each input
 * line is split on tab characters; the pattern column is used when {@code mPatternTarget}
 * is set, otherwise the context column is used when {@code mContextTarget} is set. An
 * example that is already registered is left untouched.
 *
 * @param examplesPath path passed to {@code MavunoUtils.getBufferedReader}
 * @param conf configuration passed to {@code MavunoUtils.getBufferedReader}
 * @throws IOException if reading from or closing the examples file fails
 */
private void loadExample(String examplesPath, Configuration conf) throws IOException {
  mExampleStats.clear();
  mTotalTerms = 0L;

  // Reused as the lookup key; a fresh copy is made only when an example is stored.
  final Text example = new Text();

  BufferedReader reader = MavunoUtils.getBufferedReader(conf, examplesPath);
  try {
    String input;
    while ((input = reader.readLine()) != null) {
      String[] cols = input.split("\t");

      // NOTE(review): stays null when neither target flag is set, exactly as before;
      // confirm that one of the two flags is always enabled by the time this runs.
      String exampleStr = null;
      if (mPatternTarget) {
        exampleStr = cols[ContextPatternWritable.PATTERN_FIELD];
      } else if (mContextTarget) {
        exampleStr = cols[ContextPatternWritable.CONTEXT_FIELD];
      }

      example.set(exampleStr);
      if (!mExampleStats.containsKey(example)) {
        mExampleStats.put(new Text(example), 0);
      }
    }
  } finally {
    // Release the reader even when a line fails to parse or the read itself throws.
    reader.close();
  }
}
@Override public void map( Writable key, Indexable doc, Mapper<Writable, Indexable, ContextPatternWritable, ContextPatternStatsWritable>.Context context) throws IOException, InterruptedException { // set current document mExtractor.setDocument(doc); // extract example counts while (mExtractor.getNextPair(mPair)) { if (mPatternTarget) { if (mExampleStats.containsKey(mPair.getPattern())) { mExampleStats.increment(mPair.getPattern()); } } else if (mContextTarget) { if (mExampleStats.containsKey(mPair.getContext())) { mExampleStats.increment(mPair.getContext()); } } // increment number of pairs mTotalTerms++; } }
/**
 * Emits the statistics gathered by this mapper.
 *
 * <p>First writes a single record with a cleared key whose global pattern count (or global
 * context count, depending on the target flag) is {@code mTotalTerms}. Then re-reads the
 * examples file named by the {@code Mavuno.ExtractGlobalStats.ExamplesPath} configuration
 * property and, for each line, writes a record carrying the example's id, its weight
 * (parsed from the column at index {@code ContextPatternWritable.TOTAL_FIELDS}) and the
 * count stored in {@code mExampleStats}. Examples whose count is not positive are skipped.
 *
 * @param context mapper context supplying the configuration and receiving the output
 * @throws IOException if reading or closing the examples file, or writing output, fails
 * @throws InterruptedException if writing to the context is interrupted
 */
@Override
public void cleanup(
    Mapper<Writable, Indexable, ContextPatternWritable, ContextPatternStatsWritable>.Context context)
    throws IOException, InterruptedException {
  Configuration conf = context.getConfiguration();
  String examplesPath = conf.get("Mavuno.ExtractGlobalStats.ExamplesPath", null);

  // Record with a cleared key that carries the total number of pairs seen.
  mContext.clear();
  mStats.clear();
  if (mPatternTarget) {
    mStats.globalPatternCount = mTotalTerms;
  } else if (mContextTarget) {
    mStats.globalContextCount = mTotalTerms;
  }
  context.write(mContext, mStats);

  BufferedReader reader = MavunoUtils.getBufferedReader(conf, examplesPath);
  try {
    String input;
    while ((input = reader.readLine()) != null) {
      String[] cols = input.split("\t");

      mContext.setId(cols[ContextPatternWritable.ID_FIELD]);
      mStats.weight = Double.parseDouble(cols[ContextPatternWritable.TOTAL_FIELDS]);

      // Remains 0 when neither target flag is set, so nothing is emitted in that case.
      long count = 0L;
      if (mPatternTarget) {
        mContext.setPattern(cols[ContextPatternWritable.PATTERN_FIELD]);
        mStats.globalPatternCount = mExampleStats.get(mContext.getPattern());
        count = mStats.globalPatternCount;
      } else if (mContextTarget) {
        mContext.setContext(cols[ContextPatternWritable.CONTEXT_FIELD]);
        mStats.globalContextCount = mExampleStats.get(mContext.getContext());
        count = mStats.globalContextCount;
      }

      // Only examples that were actually observed are worth emitting.
      if (count > 0) {
        context.write(mContext, mStats);
      }
    }
  } finally {
    // Release the reader even if parsing a line or writing a record throws.
    reader.close();
  }
}