Example #1
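 // Combiner-style reduce: vector records are passed through unchanged, while HalfPair addends are
 // summed into a single similarity; reporter.progress() is called periodically to keep the task alive.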
 @Override
 public void reduce(
     GenericKey key,
     Iterator<GenericValue> values,
     OutputCollector<GenericKey, GenericValue> output,
     Reporter reporter)
     throws IOException {
   if (key.getSecondary() < Preprocesser.MINIMUM_ID) { // vector
     output.collect(key, values.next());
     if (values.hasNext()) assert false : "Vectors should not get grouped by combiner: " + key;
   } else { // addend
     reporter.progress();
     int counter = 0;
     float sim = 0;
     HalfPair hp = null;
     while (values.hasNext()) {
       hp = (HalfPair) values.next().get();
       sim += hp.getSimilarity();
       if (counter++ % REPORTER_INTERVAL == 0) reporter.progress();
     }
     if (hp != null) {
       payload.set(hp.getID(), sim);
       outValue.set(payload);
       output.collect(key, outValue);
     } else {
       assert false : "There is nothing to combine!";
     }
   }
 }
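  // Decides whether a URL record should be skipped, incrementing a Reporter counter for each skip or allow decision.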
  boolean skipRecord(GoogleURL urlObject, Reporter reporter) {

    if (_skipDomain) {
      reporter.incrCounter(Counters.SKIPPING_BAD_DOMAIN_URL, 1);
      return true;
    }

    if (!urlObject.isValid()) {
      reporter.incrCounter(Counters.SKIPPING_INVALID_URL, 1);
      return true;
    } else if (urlObject.has_query()) {
      reporter.incrCounter(Counters.HIT_QUERY_CHECK_CONDITION, 1);
      if ((_flags & (HAS_HOMEPAGE_URLDATA | HAS_FEED_URLDATA)) == 0) {
        reporter.incrCounter(Counters.SKIPPING_QUERY_URL, 1);
        return true;
      }
    } else {
      // if redirect ... skip
      if ((_flags & HAS_REDIRECT_DATA) != 0) {
        reporter.incrCounter(Counters.SKIPPING_REDIRECTED_URL, 1);
        return true;
      }

      if ((_flags & (HAS_HOMEPAGE_URLDATA | HAS_FEED_URLDATA)) != 0) {
        if (!_skipEverythingButHomepage || ((_flags & HAS_HOMEPAGE_URLDATA) != 0)) {
          reporter.incrCounter(Counters.ALLOWING_HOMEPAGE_OR_FEEDURL, 1);
          return false;
        }
      }

      if (_skipEverythingButHomepage) {
        reporter.incrCounter(Counters.SKIPPING_EVERYTHING_BUT_HOMEPAGE_URL, 1);
        return true;
      }

      if (_crawlStatus != null) {
        if (_crawlStatus.has("crawl_status")) {
          JsonObject realCrawlStatus = _crawlStatus.get("crawl_status").getAsJsonObject();
          if (realCrawlStatus.has("http_result")) {
            int httpResult = realCrawlStatus.get("http_result").getAsInt();
            if (httpResult == 200 || httpResult == 404) {
              if ((_flags & HAS_BLOGPROBE_URLDATA) != 0) {
                if (_blogURLSkipFlag.get()) {
                  reporter.incrCounter(Counters.SKIPPING_BLOGPROBE_URL, 1);
                  return true;
                } else {
                  reporter.incrCounter(Counters.RECRAWLING_BLOGPROBE_URL, 1);
                  return false;
                }
              } else {
                reporter.incrCounter(Counters.SKIPPING_ALREADY_FETCHED, 1);
                return true;
              }
            }
          }
        }
      }
    }
    return false;
  }
Example #3
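    // Pops each bubble recorded on the node, sending kill / extra-coverage messages to the affected
    // nodes and incrementing the "Contrail" bubblespopped counter.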
    public void map(
        LongWritable lineid, Text nodetxt, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
      Node node = new Node();
      node.fromNodeMsg(nodetxt.toString());

      List<String> bubbles = node.getBubbles();
      if (bubbles != null) {
        for (String bubble : bubbles) {
          String[] vals = bubble.split("\\|");
          String minor = vals[0];
          String minord = vals[1];
          String dead = vals[2];
          String newd = vals[3];
          String newid = vals[4];
          String extracov = vals[5];

          output.collect(
              new Text(minor),
              new Text(
                  Node.KILLLINKMSG + "\t" + minord + "\t" + dead + "\t" + newd + "\t" + newid));

          output.collect(new Text(dead), new Text(Node.KILLMSG));
          output.collect(new Text(newid), new Text(Node.EXTRACOV + "\t" + extracov));

          reporter.incrCounter("Contrail", "bubblespopped", 1);
        }

        node.clearBubbles();
      }

      output.collect(new Text(node.getNodeId()), new Text(node.toNodeMsg()));
      reporter.incrCounter("Contrail", "nodes", 1);
    }
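    // Word-count mapper: strips the configured skip patterns, tokenizes each line, emits (word, 1),
    // and updates the task status every 100 records.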
    /**
     * {@inheritDoc}
     *
     * @see org.apache.hadoop.mapred.Mapper#map(java.lang.Object, java.lang.Object,
     *     org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter)
     */
    @Override
    public void map(
        LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
        throws IOException {
      String line = m_caseSensitive ? value.toString() : value.toString().toLowerCase();

      for (String pattern : m_patternsToSkip) {
        line = line.replaceAll(pattern, "");
      }

      StringTokenizer tokenizer = new StringTokenizer(line);
      while (tokenizer.hasMoreTokens()) {
        m_word.set(tokenizer.nextToken());
        output.collect(m_word, ONE);
        reporter.incrCounter(Counters.INPUT_WORDS, 1);
      }

      if ((++m_numRecords % 100) == 0) {
        reporter.setStatus(
            "Finished processing "
                + m_numRecords
                + " records "
                + "from the input file: "
                + m_inputFile);
      }
    }
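    // Notifies each non-self tail neighbor with a HASUNIQUEP message, re-emits the node, and
    // increments the "Brush" remotemark and nodes counters.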
    public void map(
        LongWritable lineid, Text nodetxt, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
      Node node = new Node();
      node.fromNodeMsg(nodetxt.toString());

      for (String adj : Node.dirs) {
        node.setCanCompress(adj, false);

        TailInfo next = node.gettail(adj);

        if (next != null /*&& node.getBlackEdges() == null*/) {
          if (next.id.equals(node.getNodeId())) {
            continue;
          }

          reporter.incrCounter("Brush", "remotemark", 1);

          output.collect(
              new Text(next.id), new Text(Node.HASUNIQUEP + "\t" + node.getNodeId() + "\t" + adj));
        }
      }

      output.collect(new Text(node.getNodeId()), new Text(node.toNodeMsg()));

      reporter.incrCounter("Brush", "nodes", 1);
    }
Example #6
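  // Runs a GATE application over a Behemoth document; the Reporter tracks status as well as
  // processed-document and exception counts.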
  public synchronized String processNative(BehemothDocument inputDoc, Reporter reporter) {
    if (reporter != null) reporter.setStatus("GATE : " + inputDoc.getUrl().toString());
    // process the text passed as value with the application
    // a) create a GATE document based on the text value
    gate.Document gatedocument = null;
    try {

      gatedocument = generateGATEDoc(inputDoc);
      // add it to the current corpus
      corpus.add(gatedocument);
      // get the application and assign the corpus to it
      this.GATEapplication.setCorpus(corpus);
      // process it with GATE
      this.GATEapplication.execute();

      // transfer the annotations from the GATE document
      // to the Behemoth one using the filters
      if (reporter != null) reporter.incrCounter("GATE", "Document", 1);

      return gatedocument.toXml();

    } catch (Exception e) {
      LOG.error(inputDoc.getUrl().toString(), e);
      if (reporter != null) reporter.incrCounter("GATE", "Exceptions", 1);
    } finally {
      // remove the document from the corpus again
      corpus.clear();
      // and from memory
      if (gatedocument != null) Factory.deleteResource(gatedocument);
    }
    return null;
  }
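    // Parses a whitespace-separated adjacency list into a PageRankNode and counts nodes, edges,
    // and active nodes.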
    public void map(
        LongWritable key,
        Text t,
        OutputCollector<IntWritable, PageRankNode> output,
        Reporter reporter)
        throws IOException {

      String[] arr = t.toString().trim().split("\\s+");

      nid.set(Integer.parseInt(arr[0]));
      if (arr.length == 1) {
        node.setNodeId(Integer.parseInt(arr[0]));
        node.setAdjacencyList(new ArrayListOfIntsWritable());

      } else {
        node.setNodeId(Integer.parseInt(arr[0]));

        int[] neighbors = new int[arr.length - 1];
        for (int i = 1; i < arr.length; i++) {
          neighbors[i - 1] = Integer.parseInt(arr[i]);
        }

        node.setAdjacencyList(new ArrayListOfIntsWritable(neighbors));
      }

      reporter.incrCounter("graph", "numNodes", 1);
      reporter.incrCounter("graph", "numEdges", arr.length - 1);

      if (arr.length > 1) {
        reporter.incrCounter("graph", "numActiveNodes", 1);
      }

      output.collect(nid, node);
    }
Example #8
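    // Adds any pruned remainder to the accumulated similarity and emits the pair if it reaches the
    // threshold, counting evaluated and similar pairs.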
    private boolean outputHelper(
        HalfPair hp,
        int vectorID,
        VectorComponentArrayWritable vector,
        float similarity,
        OutputCollector<VectorPair, FloatWritable> output,
        Reporter reporter)
        throws IOException {
      reporter.incrCounter(APS.EVALUATED, 1);
      reporter.progress();
      if (haspruned) {
        VectorComponentArrayWritable remainder = pruned.get(hp.getID());
        if (remainder != null) {
          // cheap upper bound dot(x,y) <= min(|x|,|y|) * maxweight(x) * maxweight(y)
          // double dotProdBound = min(remainder.length(), vector.length()) *
          // remainder.getMaxWeight()
          // * vector.getMaxWeight();
          // if (compare(similarity + dotProdBound, threshold) >= 0)
          similarity += VectorComponentArrayWritable.dotProduct(vector, remainder);

        } else {
          LOG.warn("No remainder found for vector " + hp.getID());
        }
      }
      if (compare(similarity, threshold) >= 0) {
        int firstID = VectorPair.canonicalFirst(vectorID, hp.getID());
        int secondID = VectorPair.canonicalSecond(vectorID, hp.getID());
        outKey.set(firstID, secondID);
        outValue.set(similarity);
        output.collect(outKey, outValue);
        reporter.incrCounter(APS.SIMILAR, 1);
        return true;
      }
      return false;
    }
Example #9
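  // Closes the consumer and records read time, request time, megabytes read, and event count as
  // per-topic counters.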
  public void close() throws IOException {
    if (_consumer != null) _consumer.close();

    String topic = _request.getTopic();
    long endTime = System.currentTimeMillis();
    _reporter.incrCounter(topic, "read-time(ms)", endTime - _startTime);
    _reporter.incrCounter(topic, "request-time(ms)", _requestTime);

    long bytesRead = _offset - _offsetRange[0];
    double megaRead = bytesRead / (1024.0 * 1024.0);
    _reporter.incrCounter(topic, "data-read(mb)", (long) megaRead);
    _reporter.incrCounter(topic, "event-count", _count);
  }
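  // Applies per-domain crawl statistics to decide whether to skip the domain (or everything but its
  // homepage), counting each decision and bloom-filter flush.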
  void setDomainStats(JsonObject domainStats, Reporter reporter) throws IOException {

    _domainStats = domainStats;
    if (_domainStats.has("dR")) {
      _domainRank = _domainStats.get("dR").getAsDouble();
    } else {
      _domainRank = 0.0;
    }

    if (_domainStats.has("urls")) {
      int urlCount = _domainStats.get("urls").getAsInt();
      int crawledCount = _domainStats.get("crawled").getAsInt();
      int Http200Count = (_domainStats.has("200")) ? _domainStats.get("200").getAsInt() : 0;
      if (urlCount != 0 && crawledCount != 0 && Http200Count == 0) {
        reporter.incrCounter(Counters.SKIPPING_BAD_DOMAIN_BASED_ON_CRAWL_HISTORY, 1);
        LOG.info(
            "Skipping Everything But Homepage for Domain:"
                + _newDomainBytes.toString()
                + " CrawledCount:"
                + crawledCount
                + " HTTP200Count:"
                + Http200Count
                + " URLCount:"
                + urlCount);
        _skipEverythingButHomepage = true;
      } else if (urlCount > 25000 && urlCount < 100000) {
        if (!_domainStats.has("dR") || _domainStats.get("dR").getAsDouble() < 3.0) {
          LOG.info("Skipping Domain:" + _newDomainBytes.toString());
          reporter.incrCounter(Counters.SKIPPING_DOMAIN_EXCEEDED_URL_COUNT_AND_LOW_DR, 1);
          _skipDomain = true;
        }
      } else if (urlCount > 250000 && urlCount < 1000000) {
        if (!_domainStats.has("dR") || _domainStats.get("dR").getAsDouble() < 4.0) {
          LOG.info("Skipping Domain:" + _newDomainBytes.toString());
          reporter.incrCounter(Counters.SKIPPING_DOMAIN_EXCEEDED_URL_COUNT_AND_LOW_DR, 1);
          _skipDomain = true;
        }
      } else if (urlCount > 1000000) {
        if (!_domainStats.has("dR") || _domainStats.get("dR").getAsDouble() < 5.0) {
          LOG.info("Skipping Domain:" + _newDomainBytes.toString());
          reporter.incrCounter(Counters.SKIPPING_DOMAIN_EXCEEDED_URL_COUNT_AND_LOW_DR, 1);
          _skipDomain = true;
        }
      }
    }
    if (_emittedURLSInFilter >= FLUSH_THRESHOLD) {
      _emittedURLSFilter.clear();
      _emittedURLSInFilter = 0;
      reporter.incrCounter(Counters.FLUSHED_BLOOMFILTER, 1);
    }
  }
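  // Emits the buffered URL record unless skipRecord or the duplicate filter rejects it, with
  // counters for every accept/reject path.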
  void emitLastRecord(Reporter reporter) throws IOException {

    if (_flags != 0) {
      if (_domainStats == null) {
        reporter.incrCounter(Counters.EMITTING_URL_RECORD_WITH_NULL_DOMAINSTATS, 1);
      } else {
        reporter.incrCounter(Counters.EMITTING_URL_RECORD_WITH_DOMINSTATS, 1);
      }

      if (_crawlStatus != null) {
        reporter.incrCounter(Counters.EMITTED_RECORD_HAD_CRAWLSTATUS, 1);
      } else {
        reporter.incrCounter(Counters.EMITTED_RECORD_HAD_NULL_CRAWLSTATUS, 1);
      }
    }

    if (_contextURLBytes.getLength() >= 4097) {
      reporter.incrCounter(Counters.SKIPPING_INVALID_LENGTH_URL, 1);
    } else {
      GoogleURL urlObject = new GoogleURL(_contextURLBytes.toString());

      if (!skipRecord(urlObject, reporter)) {

        if (urlObject.has_query()) {
          reporter.incrCounter(Counters.LET_THROUGH_QUERY_URL, 1);
        }

        URLFPV2 fp = URLUtils.getURLFPV2FromURLObject(urlObject);
        if (fp != null) {
          if (_emittedURLSFilter.isPresent(fp)) {
            reporter.incrCounter(Counters.SKIPPING_ALREADY_EMITTED_URL, 1);
          } else {
            _emittedURLSFilter.add(fp);
            _emittedURLSInFilter++;

            SegmentGeneratorItem itemValue = new SegmentGeneratorItem();

            itemValue.setDomainFP(fp.getDomainHash());
            itemValue.setRootDomainFP(fp.getRootDomainHash());
            itemValue.setUrlFP(fp.getUrlHash());
            itemValue.setUrl(urlObject.getCanonicalURL());
            itemValue.setPageRank(0);
            itemValue.setModifiedStatus((byte) 0);

            items.add(itemValue);

            if (items.size() >= SPILL_THRESHOLD) spillItems(reporter);
          }
        } else {
          reporter.incrCounter(Counters.NULL_FP_FOR_URL, 1);
        }
      }
    }

    // reset stuff
    _flags = 0;
    _crawlStatus = null;
    _contextURLBytes.clear();
    _blogURLSkipFlag.set(true);
  }
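    // Parses a parallel sentence chunk, skips missing or overly long sides, and emits a PhrasePair,
    // counting source/target words and chunks.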
    public void map(Text key, Text value, OutputCollector<Text, PhrasePair> oc, Reporter reporter)
        throws IOException {

      // key: a parallel sentence in both languages plus its alignment, in XML format
      // the value is ignored

      ParallelChunk c = pcr.parseString(key.toString());
      ok.set(c.idString());

      // Chunk is an array of tokens in the sentence, without any special tokenization (just
      // separated by spaces)
      Chunk fc = c.getChunk(src);
      Chunk ec = c.getChunk(tgt);
      if (fc == null || ec == null) {
        reporter.incrCounter(BitextCompilerCounters.WRONG_LANGUAGE, 1);
        return;
      }
      if (fc.getLength() > 200) {
        reporter.incrCounter(BitextCompilerCounters.SRC_TOO_LONG, 1);
        return;
      }
      if (ec.getLength() > 200) {
        reporter.incrCounter(BitextCompilerCounters.TGT_TOO_LONG, 1);
        return;
      }

      // ec,fc: English/French sentence represented as sequence of words
      // vocE,vocF: vocabularies for english and french, of type VocabularyWritable

      // ee,fe: integer representation of words in sentences ec and fc
      sLogger.debug("Target sentence:");
      int[] ee = convertStrings(tawp.preprocessWordsForAlignment(ec.getWords()), vocE);
      sLogger.debug("Source sentence:");
      int[] fe = convertStrings(sawp.preprocessWordsForAlignment(fc.getWords()), vocF);

      // e,f: phrase from whole sentence
      Phrase e = new Phrase(ee, 0);
      Phrase f = new Phrase(fe, 1);

      edu.umd.hooka.PhrasePair b = new PhrasePair(f, e);
      ReferenceAlignment ra = c.getReferenceAlignment(lp);
      if (ra != null) {
        b.setAlignment(ra);
      }
      reporter.incrCounter(BitextCompilerCounters.EN_WORDS, e.getWords().length);
      reporter.incrCounter(BitextCompilerCounters.FR_WORDS, f.getWords().length);
      reporter.incrCounter(BitextCompilerCounters.CHUNKS, 1);
      oc.collect(ok, b);
    }
Example #13
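    // Uploads 50,000 random rows to the table, calling reporter.progress() every 1,000 puts so the
    // task is not marked as hung.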
    public void map(
        WritableComparable key,
        Writable value,
        OutputCollector<WritableComparable, Writable> collector,
        Reporter reporter)
        throws IOException {
      LOG.info("Start Map");
      if (err != null) {
        throw err;
      }

      DecimalFormat df = new DecimalFormat("00000000000000000000");

      collector.collect(new Text(tableName), new Text(""));

      for (long i = 0; i < 50000; i++) {
        long randNum = rand.nextLong();
        Row.Key rowKey = new Row.Key(df.format(randNum));
        Row row = new Row(rowKey);
        row.addCell("Col1", new Cell(Cell.Key.EMPTY_KEY, this.data));
        ctable.put(row);
        if (i % 1000 == 0) {
          reporter.progress();
        }

        if (i % 10000 == 0) {
          LOG.info("uploaded: " + i);
        }
        collector.collect(new Text(df.format(randNum)), new Text(""));
      }

      LOG.info("End Map");
    }
Example #14
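    // Generates the outgoing link ids for each page in this slot's range and counts the bytes of
    // data generated.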
    public void map(
        LongWritable key, Text value, OutputCollector<LongWritable, Text> output, Reporter reporter)
        throws IOException {

      int slotId = Integer.parseInt(value.toString().trim());
      html.fireRandom(slotId);

      long[] range = HtmlCore.getPageRange(slotId, pages, slotpages);

      // emit the outgoing links for each page in this slot's range
      for (long i = range[0]; i < range[1]; i++) {
        key.set(i);

        long[] linkids = html.genPureLinkIds();
        for (int j = 0; j < linkids.length; j++) {
          String to = Long.toString(linkids[j]);
          Text v = new Text(to);
          output.collect(key, v);
          reporter.incrCounter(HiBench.Counters.BYTES_DATA_GENERATED, 8 + v.getLength());
        }

        if (0 == (i % 10000)) {
          log.info("still running: " + (i - range[0]) + " of " + slotpages);
        }
      }
    }
Example #15
    /**
     * Map method.
     *
     * @param offset samples starting from the (offset+1)th sample.
     * @param size the number of samples for this map
     * @param out output {true->numInside, false->numOutside}
     * @param reporter
     */
    public void map(
        LongWritable offset,
        LongWritable size,
        OutputCollector<BooleanWritable, LongWritable> out,
        Reporter reporter)
        throws IOException {

      final HaltonSequence haltonsequence = new HaltonSequence(offset.get());
      long numInside = 0L;
      long numOutside = 0L;

      for (long i = 0; i < size.get(); ) {
        // generate points in a unit square
        final double[] point = haltonsequence.nextPoint();

        // count points inside/outside of the inscribed circle of the square
        final double x = point[0] - 0.5;
        final double y = point[1] - 0.5;
        if (x * x + y * y > 0.25) {
          numOutside++;
        } else {
          numInside++;
        }

        // report status
        i++;
        if (i % 1000 == 0) {
          reporter.setStatus("Generated " + i + " samples.");
        }
      }

      // output map results
      out.collect(new BooleanWritable(true), new LongWritable(numInside));
      out.collect(new BooleanWritable(false), new LongWritable(numOutside));
    }
Example #16
 /**
  * Re-groups the values for a key into sub-groups based on a secondary key (the input tag).
  *
  * @param arg1 the iterator over this key's values
  * @return a map from input tag to an iterator over that tag's values
  */
 private SortedMap<Object, ResetableIterator> regroup(Object key, Iterator arg1, Reporter reporter)
     throws IOException {
   this.numOfValues = 0;
   SortedMap<Object, ResetableIterator> retv = new TreeMap<Object, ResetableIterator>();
   IntermediateData aRecord = null;
   while (arg1.hasNext()) {
     this.numOfValues += 1;
      // log progress while processing
     if (this.numOfValues % 100 == 0) {
       reporter.setStatus("key: " + key.toString() + " numOfValues: " + this.numOfValues);
     }
      // stop once the number of values exceeds the limit
     if (this.numOfValues > this.maxNumOfValuesPerGroup) {
       break;
     }
     aRecord = ((IntermediateData) arg1.next()).clone(job);
     Text tag = aRecord.getTag();
     ResetableIterator data = retv.get(tag);
     if (data == null) {
       data = createResetableIterator();
       retv.put(tag, data);
     }
     data.add(aRecord);
   }
   // LOG.info("EXIT while");
   if (this.numOfValues > this.largestNumOfValues) {
     this.largestNumOfValues = numOfValues;
     LOG.info("key: " + key.toString() + " this.largestNumOfValues: " + this.largestNumOfValues);
   }
   return retv;
 }
Example #17
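    // Caches the tagged matrix values for this index, runs the combine instructions, and records the
    // elapsed time in a counter.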
    @Override
    public void reduce(
        MatrixIndexes indexes,
        Iterator<TaggedMatrixValue> values,
        OutputCollector<MatrixIndexes, WeightedPair> out,
        Reporter reporter)
        throws IOException {

      long start = System.currentTimeMillis();

      if (firsttime) {
        cachedReporter = reporter;
        firsttime = false;
      }

      cachedValues.reset();

      while (values.hasNext()) {
        TaggedMatrixValue taggedValue = values.next();
        cachedValues.set(taggedValue.getTag(), indexes, taggedValue.getBaseObject(), true);
      }
      // LOG.info("before aggregation: \n"+cachedValues);
      // perform aggregate operations first
      // processAggregateInstructions(indexes, values);

      // LOG.info("after aggregation: \n"+cachedValues);

      // perform mixed operations
      // processReducerInstructions();

      processCombineInstructionsAndOutput(reporter);

      reporter.incrCounter(Counters.COMBINE_OR_REDUCE_TIME, System.currentTimeMillis() - start);
    }
Example #18
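    // Compaction mapper: instantiates the raw ACID reader for the split and rewrites every surviving
    // row, calling reporter.progress() per record.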
    @Override
    public void map(
        WritableComparable key,
        CompactorInputSplit split,
        OutputCollector<NullWritable, NullWritable> nullWritableVOutputCollector,
        Reporter reporter)
        throws IOException {
      // This will only get called once, since CompactRecordReader only returns one record,
      // the input split.
      // Based on the split we're passed we go instantiate the real reader and then iterate on it
      // until it finishes.
      @SuppressWarnings("unchecked") // since there is no way to parametrize instance of Class
      AcidInputFormat<WritableComparable, V> aif =
          instantiate(AcidInputFormat.class, jobConf.get(INPUT_FORMAT_CLASS_NAME));
      ValidTxnList txnList = new ValidReadTxnList(jobConf.get(ValidTxnList.VALID_TXNS_KEY));

      boolean isMajor = jobConf.getBoolean(IS_MAJOR, false);
      AcidInputFormat.RawReader<V> reader =
          aif.getRawReader(
              jobConf,
              isMajor,
              split.getBucket(),
              txnList,
              split.getBaseDir(),
              split.getDeltaDirs());
      RecordIdentifier identifier = reader.createKey();
      V value = reader.createValue();
      getWriter(reporter, reader.getObjectInspector(), split.getBucket());
      while (reader.next(identifier, value)) {
        if (isMajor && reader.isDelete(value)) continue;
        writer.write(value);
        reporter.progress();
      }
    }
Example #19
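    // Detects block boundaries while streaming Wikipedia pages and emits each block's starting
    // offset and file number, counting the blocks found.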
    public void run(
        RecordReader<IntWritable, WikipediaPage> input,
        OutputCollector<IntWritable, Text> output,
        Reporter reporter)
        throws IOException {
      IntWritable key = new IntWritable();
      WikipediaPage value = new WikipediaPage();

      long pos = -1;
      long prevPos = -1;

      int prevDocno = 0;

      pos = input.getPos();
      while (input.next(key, value)) {
        if (prevPos != -1 && prevPos != pos) {
          LOG.info(
              "- beginning of block at " + prevPos + ", docno:" + prevDocno + ", file:" + fileno);
          keyOut.set(prevDocno);
          valOut.set(prevPos + "\t" + fileno);
          output.collect(keyOut, valOut);
          reporter.incrCounter(Blocks.Total, 1);
        }

        prevPos = pos;
        pos = input.getPos();
        prevDocno = key.get();
      }
    }
Example #20
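 // Copies an operator's statistics into Reporter counters.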
 public void func(Operator op) {
   Map<Enum, Long> opStats = op.getStats();
   for (Map.Entry<Enum, Long> e : opStats.entrySet()) {
     if (this.rp != null) {
       rp.incrCounter(e.getKey(), e.getValue());
     }
   }
 }
Example #21
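    // Raids a file according to its policy, reporting block and size counters plus success and
    // failure counts.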
    /** Run a FileOperation */
    public void map(
        Text key,
        PolicyInfo policy,
        OutputCollector<WritableComparable, Text> out,
        Reporter reporter)
        throws IOException {
      this.reporter = reporter;
      try {
        LOG.info("Raiding file=" + key.toString() + " policy=" + policy);
        Path p = new Path(key.toString());
        FileStatus fs = p.getFileSystem(jobconf).getFileStatus(p);
        st.clear();
        RaidNode.doRaid(jobconf, policy, fs, st, reporter);

        ++succeedcount;

        reporter.incrCounter(Counter.PROCESSED_BLOCKS, st.numProcessedBlocks);
        reporter.incrCounter(Counter.PROCESSED_SIZE, st.processedSize);
        reporter.incrCounter(Counter.META_BLOCKS, st.numMetaBlocks);
        reporter.incrCounter(Counter.META_SIZE, st.metaSize);

        reporter.incrCounter(Counter.FILES_SUCCEEDED, 1);
      } catch (IOException e) {
        ++failcount;
        reporter.incrCounter(Counter.FILES_FAILED, 1);

        String s = "FAIL: " + policy + ", " + key + " " + StringUtils.stringifyException(e);
        out.collect(null, new Text(s));
        LOG.info(s);
      } finally {
        reporter.setStatus(getCountString());
      }
    }
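    // Read check: reads a generated file back, verifies it against the seeded random data, and
    // reports progress through setStatus.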
    public void map(
        Text key,
        LongWritable value,
        OutputCollector<Text, LongWritable> collector,
        Reporter reporter)
        throws IOException {

      String name = key.toString();
      long size = value.get();
      long seed = Long.parseLong(name);

      random.setSeed(seed);
      reporter.setStatus("opening " + name);

      DataInputStream in = new DataInputStream(fs.open(new Path(DATA_DIR, name)));

      long read = 0;
      try {
        while (read < size) {
          long remains = size - read;
          int n = (remains <= buffer.length) ? (int) remains : buffer.length;
          in.readFully(buffer, 0, n);
          read += n;
          if (fastCheck) {
            Arrays.fill(check, (byte) random.nextInt(Byte.MAX_VALUE));
          } else {
            random.nextBytes(check);
          }
          if (n != buffer.length) {
            Arrays.fill(buffer, n, buffer.length, (byte) 0);
            Arrays.fill(check, n, check.length, (byte) 0);
          }
          assertTrue(Arrays.equals(buffer, check));

          reporter.setStatus("reading " + name + "@" + read + "/" + size);
        }
      } finally {
        in.close();
      }

      collector.collect(new Text("bytes"), new LongWritable(read));

      reporter.setStatus("read " + name);
    }
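    // Write check: writes seeded random data to a temporary file, renames it into place, and reports
    // progress through setStatus.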
    public void map(
        Text key,
        LongWritable value,
        OutputCollector<Text, LongWritable> collector,
        Reporter reporter)
        throws IOException {

      String name = key.toString();
      long size = value.get();
      long seed = Long.parseLong(name);

      random.setSeed(seed);
      reporter.setStatus("creating " + name);

      // write to temp file initially to permit parallel execution
      Path tempFile = new Path(DATA_DIR, name + suffix);
      OutputStream out = fs.create(tempFile);

      long written = 0;
      try {
        while (written < size) {
          if (fastCheck) {
            Arrays.fill(buffer, (byte) random.nextInt(Byte.MAX_VALUE));
          } else {
            random.nextBytes(buffer);
          }
          long remains = size - written;
          int length = (remains <= buffer.length) ? (int) remains : buffer.length;
          out.write(buffer, 0, length);
          written += length;
          reporter.setStatus("writing " + name + "@" + written + "/" + size);
        }
      } finally {
        out.close();
      }
      // rename to final location
      fs.rename(tempFile, new Path(DATA_DIR, name));

      collector.collect(new Text("bytes"), new LongWritable(written));

      reporter.setStatus("wrote " + name);
    }
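    // Seek check: seeks to random positions in a generated file and verifies each byte against the
    // regenerated random stream.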
    public void map(
        Text key, LongWritable value, OutputCollector<K, LongWritable> collector, Reporter reporter)
        throws IOException {
      String name = key.toString();
      long size = value.get();
      long seed = Long.parseLong(name);

      if (size == 0) return;

      reporter.setStatus("opening " + name);

      FSDataInputStream in = fs.open(new Path(DATA_DIR, name));

      try {
        for (int i = 0; i < SEEKS_PER_FILE; i++) {
          // generate a random position
          long position = Math.abs(random.nextLong()) % size;

          // seek file to that position
          reporter.setStatus("seeking " + name);
          in.seek(position);
          byte b = in.readByte();

          // check that byte matches
          byte checkByte = 0;
          // advance random state to that position
          random.setSeed(seed);
          for (int p = 0; p <= position; p += check.length) {
            reporter.setStatus("generating data for " + name);
            if (fastCheck) {
              checkByte = (byte) random.nextInt(Byte.MAX_VALUE);
            } else {
              random.nextBytes(check);
              checkByte = check[(int) (position % check.length)];
            }
          }
          assertEquals(b, checkByte);
        }
      } finally {
        in.close();
      }
    }
Example #25
 /**
  * The subclass can override this method to perform additional filtering and/or other processing
  * logic before a value is collected.
  *
  * @param key
  * @param aRecord
  * @param output
  * @param reporter
  * @throws IOException
  */
 protected void collect(
     Object key, IntermediateData aRecord, OutputCollector output, Reporter reporter)
     throws IOException {
   this.collected += 1;
   addLongValue("collectedCount", 1);
   if (aRecord != null) {
     output.collect(key, aRecord.getData());
     reporter.setStatus("key: " + key.toString() + " collected: " + collected);
     addLongValue("actuallyCollectedCount", 1);
   }
 }
Example #26
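  // Nearest-center assignment: finds the closest center for each point, emits the point keyed by
  // that center, and counts points per center.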
  public void map(
      LongWritable key, Point value, OutputCollector<LongWritable, Point> output, Reporter reporter)
      throws IOException {
    double min = value.sumOfSquares(centers.get(0));
    int best = 0;

    for (int index = 1; index < numberOfCenters; ++index) {
      double current = value.sumOfSquares(centers.get(index));

      if (current < min) {
        min = current;
        best = index;
      }
    }

    reporter.incrCounter("NUMBER", "NODES", 1);
    reporter.incrCounter("CENTER", "" + best, 1);

    output.collect(new LongWritable(best), value);
  }
Example #27
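 // Emits the number of distinct values seen for each key.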
 @Override
 public void reduce(
     Text key, Iterator<Text> iter, OutputCollector<Text, Text> oc, Reporter reporter)
     throws IOException {
   HashSet<Text> hash = new HashSet<Text>();
   while (iter.hasNext()) {
     hash.add(iter.next());
   }
   oc.collect(key, new Text(Integer.toString(hash.size())));
   reporter.setStatus("OK");
 }
Example #28
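 // De-duplicates the values for each key and emits each distinct value once.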
 @Override
 public void reduce(
     Text key, Iterator<Text> iter, OutputCollector<Text, Text> oc, Reporter reporter)
     throws IOException {
   HashSet<Text> hash = new HashSet<Text>();
   while (iter.hasNext()) {
     hash.add(iter.next());
   }
   for (Text t : hash) oc.collect(key, t);
   reporter.setStatus("OK");
 }
Example #29
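    // Maps each TREC document to its assigned docno and counts documents.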
    public void map(
        LongWritable key,
        TrecDocument doc,
        OutputCollector<Text, IntWritable> output,
        Reporter reporter)
        throws IOException {
      reporter.incrCounter(Count.DOCS, 1);

      docid.set(doc.getDocid());
      one.set(docMapping.getDocno(doc.getDocid()));
      output.collect(docid, one);
    }
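    // Merges the node message with HASUNIQUEP notifications and marks edges that can be compressed,
    // incrementing the "Brush" compressible counter.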
    public void reduce(
        Text key, Iterator<Text> iter, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
      Node node = new Node(key.toString());
      Set<String> f_unique = new HashSet<String>();
      Set<String> r_unique = new HashSet<String>();

      int sawnode = 0;

      while (iter.hasNext()) {
        String msg = iter.next().toString();

        // System.err.println(key.toString() + "\t" + msg);

        String[] vals = msg.split("\t");

        if (vals[0].equals(Node.NODEMSG)) {
          node.parseNodeMsg(vals, 0);
          sawnode++;
        } else if (vals[0].equals(Node.HASUNIQUEP)) {
          if (vals[2].equals("f")) {
            f_unique.add(vals[1]);
          } else if (vals[2].equals("r")) {
            r_unique.add(vals[1]);
          }
        } else {
          throw new IOException("Unknown msgtype: " + msg);
        }
      }

      if (sawnode != 1) {
        throw new IOException(
            "ERROR: Didn't see exactly 1 nodemsg (" + sawnode + ") for " + key.toString());
      }

      for (String adj : Node.dirs) {
        TailInfo next = node.gettail(adj);

        if (next != null) {
          if ((next.dir.equals("f") && r_unique.contains(next.id))
              || (next.dir.equals("r") && f_unique.contains(next.id))) {
            // for path compress
            if (node.getBlackEdges() == null) {
              node.setCanCompress(adj, true);
            }
            reporter.incrCounter("Brush", "compressible", 1);
          }
        }
      }
      // System.err.println(node.getNodeId() + " " + node.toNodeMsg() );
      output.collect(new Text(node.getNodeId()), new Text(node.toNodeMsg()));
    }