Example #1
0
    /**
     * Resets the per-task counters and registers every example found in the examples file.
     *
     * <p>{@code mExampleStats} is cleared and {@code mTotalTerms} is reset to zero. Each line read
     * from the examples file is split on tab characters; the pattern column or the context column
     * (depending on which target flag is set) is then added to {@code mExampleStats} with a count of
     * zero unless it is already present.
     *
     * @param examplesPath path passed to {@code MavunoUtils.getBufferedReader}
     * @param conf configuration passed to {@code MavunoUtils.getBufferedReader}
     * @throws IOException if the examples file cannot be opened, read, or closed
     */
    private void loadExample(String examplesPath, Configuration conf) throws IOException {
      mExampleStats.clear();
      mTotalTerms = 0L;

      // scratch key reused for the lookup; a fresh copy is stored when an entry is inserted
      final Text example = new Text();

      BufferedReader reader = MavunoUtils.getBufferedReader(conf, examplesPath);
      try {
        String input;
        while ((input = reader.readLine()) != null) {
          String[] cols = input.split("\t");

          // NOTE(review): stays null when neither target flag is set -- confirm that the
          // caller guarantees one of them is enabled before this method runs
          String exampleStr = null;
          if (mPatternTarget) {
            exampleStr = cols[ContextPatternWritable.PATTERN_FIELD];
          } else if (mContextTarget) {
            exampleStr = cols[ContextPatternWritable.CONTEXT_FIELD];
          }

          example.set(exampleStr);
          if (!mExampleStats.containsKey(example)) {
            mExampleStats.put(new Text(example), 0);
          }
        }
      } finally {
        // release the underlying stream even if reading or parsing a line failed
        reader.close();
      }
    }
Example #2
0
    /**
     * Counts, for one document, how often each registered example occurs.
     *
     * <p>The document is handed to {@code mExtractor}, and every pair it yields is inspected: when
     * the pattern target is active the pair's pattern is looked up in {@code mExampleStats},
     * otherwise (when the context target is active) the pair's context is looked up. A hit
     * increments that example's count. {@code mTotalTerms} is incremented once per pair regardless
     * of whether the pair matched an example.
     *
     * @param key input key; not read by this method
     * @param doc document handed to the extractor
     * @param context mapper context; not read by this method
     * @throws IOException declared by the overridden method
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    public void map(
        Writable key,
        Indexable doc,
        Mapper<Writable, Indexable, ContextPatternWritable, ContextPatternStatsWritable>.Context
            context)
        throws IOException, InterruptedException {
      // point the extractor at the document being processed
      mExtractor.setDocument(doc);

      // walk every pair the extractor produces for this document
      while (mExtractor.getNextPair(mPair)) {
        // the pattern target takes precedence; the context branch is only
        // considered when the pattern target is switched off
        if (mPatternTarget && mExampleStats.containsKey(mPair.getPattern())) {
          mExampleStats.increment(mPair.getPattern());
        } else if (!mPatternTarget
            && mContextTarget
            && mExampleStats.containsKey(mPair.getContext())) {
          mExampleStats.increment(mPair.getContext());
        }

        // every pair contributes to the running total, matched or not
        mTotalTerms++;
      }
    }
Example #3
0
    /**
     * Emits the statistics accumulated by this mapper.
     *
     * <p>First a record with a cleared key is written whose pattern or context count (depending on
     * the active target) holds {@code mTotalTerms}. Then the examples file named by the
     * {@code Mavuno.ExtractGlobalStats.ExamplesPath} configuration entry is re-read line by line;
     * for each example the id, weight and pattern/context are copied into the output key/value,
     * the count is taken from {@code mExampleStats}, and the record is written only when that
     * count is greater than zero.
     *
     * @param context mapper context supplying the configuration and receiving the output records
     * @throws IOException if the examples file cannot be opened, read, or closed, or a write fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    public void cleanup(
        Mapper<Writable, Indexable, ContextPatternWritable, ContextPatternStatsWritable>.Context
            context)
        throws IOException, InterruptedException {
      Configuration conf = context.getConfiguration();

      String examplesPath = conf.get("Mavuno.ExtractGlobalStats.ExamplesPath", null);

      mContext.clear();
      mStats.clear();

      // the record with the cleared key carries the total number of pairs seen by this mapper
      if (mPatternTarget) {
        mStats.globalPatternCount = mTotalTerms;
      } else if (mContextTarget) {
        mStats.globalContextCount = mTotalTerms;
      }

      context.write(mContext, mStats);

      BufferedReader reader = MavunoUtils.getBufferedReader(conf, examplesPath);
      try {
        String input;
        while ((input = reader.readLine()) != null) {
          String[] cols = input.split("\t");

          mContext.setId(cols[ContextPatternWritable.ID_FIELD]);
          // NOTE(review): the weight is read from index TOTAL_FIELDS, presumably the column
          // that follows the regular fields -- confirm against the examples file format
          mStats.weight = Double.parseDouble(cols[ContextPatternWritable.TOTAL_FIELDS]);

          // remains zero (and the record is skipped) when neither target flag is set
          long count = 0L;
          if (mPatternTarget) {
            mContext.setPattern(cols[ContextPatternWritable.PATTERN_FIELD]);

            mStats.globalPatternCount = mExampleStats.get(mContext.getPattern());
            count = mStats.globalPatternCount;
          } else if (mContextTarget) {
            mContext.setContext(cols[ContextPatternWritable.CONTEXT_FIELD]);

            mStats.globalContextCount = mExampleStats.get(mContext.getContext());
            count = mStats.globalContextCount;
          }

          // examples that never occurred in this mapper's input are not emitted
          if (count > 0) {
            context.write(mContext, mStats);
          }
        }
      } finally {
        // release the underlying stream even if parsing or writing a record failed
        reader.close();
      }
    }