예제 #1
0
    /**
     * Processes a single input document.
     *
     * <p>The document is handed to {@code mExtractor}, and every pair the extractor yields is
     * examined. In pattern-target mode the pair's pattern is looked up in {@code mExampleStats};
     * otherwise, in context-target mode, the pair's context is looked up. Only entries that are
     * already present in {@code mExampleStats} are incremented; unknown entries are ignored.
     * Every extracted pair, matched or not, adds one to {@code mTotalTerms}.
     *
     * <p>Nothing is written to {@code context} here, and {@code key} is not used.
     *
     * @param key input key (unused)
     * @param doc document to extract pairs from
     * @param context Hadoop mapper context (unused by this method)
     * @throws IOException declared by the overridden method
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    public void map(
        Writable key,
        Indexable doc,
        Mapper<Writable, Indexable, ContextPatternWritable, ContextPatternStatsWritable>.Context
            context)
        throws IOException, InterruptedException {
      // point the extractor at the document being mapped
      mExtractor.setDocument(doc);

      // walk all pairs of the document; presumably getNextPair fills mPair -- confirm in extractor
      while (mExtractor.getNextPair(mPair)) {
        if (mPatternTarget && mExampleStats.containsKey(mPair.getPattern())) {
          // pattern mode: bump the count of a tracked pattern
          mExampleStats.increment(mPair.getPattern());
        } else if (!mPatternTarget
            && mContextTarget
            && mExampleStats.containsKey(mPair.getContext())) {
          // context mode (only consulted when pattern mode is off): bump a tracked context
          mExampleStats.increment(mPair.getContext());
        }

        // running total of pairs seen by this mapper
        mTotalTerms++;
      }
    }
예제 #2
0
    /**
     * Emits the statistics this mapper accumulated, once all of its input has been mapped.
     *
     * <p>Two kinds of records are written to {@code context}:
     *
     * <ol>
     *   <li>A single record holding {@code mTotalTerms} in the global pattern count (pattern-target
     *       mode) or global context count (context-target mode). It is keyed by {@code mContext}
     *       right after {@code clear()}; presumably that cleared key is the wildcard key the
     *       reducer recognizes -- confirm against {@code ContextPatternWritable.clear()}.
     *   <li>One record per line of the examples file whose count in {@code mExampleStats} is
     *       greater than zero.
     * </ol>
     *
     * <p>The examples file location is read from the configuration property {@code
     * Mavuno.ExtractGlobalStats.ExamplesPath}. Each line is split on tabs; the id and the
     * pattern/context are taken from the columns named by {@code ContextPatternWritable}'s field
     * constants, and the weight is parsed from column {@code TOTAL_FIELDS}.
     *
     * <p>The reader is closed in a {@code finally} block so it is released even when parsing a
     * line or writing a record fails.
     *
     * @param context Hadoop mapper context used for configuration lookup and output
     * @throws IOException if reading the examples file or writing output fails
     * @throws InterruptedException if writing output is interrupted
     */
    @Override
    public void cleanup(
        Mapper<Writable, Indexable, ContextPatternWritable, ContextPatternStatsWritable>.Context
            context)
        throws IOException, InterruptedException {
      Configuration conf = context.getConfiguration();

      String examplesPath = conf.get("Mavuno.ExtractGlobalStats.ExamplesPath", null);

      // start from empty key/value objects before emitting the total
      mContext.clear();
      mStats.clear();

      if (mPatternTarget) {
        mStats.globalPatternCount = mTotalTerms;
      } else if (mContextTarget) {
        mStats.globalContextCount = mTotalTerms;
      }

      // record carrying the total number of pairs seen by this mapper
      context.write(mContext, mStats);

      BufferedReader reader = MavunoUtils.getBufferedReader(conf, examplesPath);
      try {
        String input;
        while ((input = reader.readLine()) != null) {
          String[] cols = input.split("\t");

          mContext.setId(cols[ContextPatternWritable.ID_FIELD]);
          // the weight sits in the column indexed by TOTAL_FIELDS
          mStats.weight = Double.parseDouble(cols[ContextPatternWritable.TOTAL_FIELDS]);

          // stays 0 when neither target mode is enabled, so nothing is emitted in that case
          long count = 0L;

          if (mPatternTarget) {
            mContext.setPattern(cols[ContextPatternWritable.PATTERN_FIELD]);

            mStats.globalPatternCount = mExampleStats.get(mContext.getPattern());
            count = mStats.globalPatternCount;
          } else if (mContextTarget) {
            mContext.setContext(cols[ContextPatternWritable.CONTEXT_FIELD]);

            mStats.globalContextCount = mExampleStats.get(mContext.getContext());
            count = mStats.globalContextCount;
          }

          // only examples that were actually observed are emitted
          if (count > 0) {
            context.write(mContext, mStats);
          }
        }
      } finally {
        // release the file handle on both the success and the failure path
        reader.close();
      }
    }
예제 #3
0
    /**
     * Combines all statistics emitted for one key and either records the total or emits the
     * combined result.
     *
     * <p>{@code mResult} is zeroed and then {@code increment}-ed with every incoming value. If the
     * key's id equals {@code ContextPatternWritable.ASTERISK}, the key carries the total count:
     * the global pattern count (pattern-target mode) or global context count (context-target
     * mode) is written as text to {@code mTotalTermsPath}, but only if that path does not exist
     * yet, and nothing is emitted to {@code context}. For every other key the combined result is
     * written to {@code context}.
     *
     * <p>The writer is closed in a {@code finally} block so it is released even if the write
     * fails.
     *
     * @param key key whose values are being combined
     * @param values statistics emitted for {@code key}
     * @param context Hadoop reducer context used for configuration lookup and output
     * @throws IOException if writing the totals file or the output record fails
     * @throws InterruptedException if writing output is interrupted
     */
    @Override
    public void reduce(
        ContextPatternWritable key,
        Iterable<ContextPatternStatsWritable> values,
        Reducer<
                    ContextPatternWritable,
                    ContextPatternStatsWritable,
                    ContextPatternWritable,
                    ContextPatternStatsWritable>
                .Context
            context)
        throws IOException, InterruptedException {
      Configuration conf = context.getConfiguration();

      // reset the reusable accumulator before folding in this key's values
      mResult.zero();

      for (ContextPatternStatsWritable c : values) {
        mResult.increment(c);
      }

      // write total terms
      if (key.getId().equals(ContextPatternWritable.ASTERISK)) {
        // an existing totals file is left untouched
        if (!MavunoUtils.pathExists(conf, mTotalTermsPath)) {
          BufferedWriter writer = MavunoUtils.getBufferedWriter(conf, mTotalTermsPath);
          try {
            if (mPatternTarget) {
              writer.write(Long.toString(mResult.globalPatternCount));
            } else if (mContextTarget) {
              writer.write(Long.toString(mResult.globalContextCount));
            }
          } finally {
            // release the output stream on both the success and the failure path
            writer.close();
          }
        }
        // the totals key is never forwarded as a regular output record
        return;
      }

      context.write(key, mResult);
    }