public void map(
        LongWritable lineid, Text nodetxt, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
      Node node = new Node();
      node.fromNodeMsg(nodetxt.toString());

      for (String adj : Node.dirs) {
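        // reset the compress flag, then notify this direction's tail node via a HASUNIQUEP message (self-loops skipped)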
        node.setCanCompress(adj, false);

        TailInfo next = node.gettail(adj);

        if (next != null /*&& node.getBlackEdges() == null*/) {
          if (next.id.equals(node.getNodeId())) {
            continue;
          }

          reporter.incrCounter("Brush", "remotemark", 1);

          output.collect(
              new Text(next.id), new Text(Node.HASUNIQUEP + "\t" + node.getNodeId() + "\t" + adj));
        }
      }

      output.collect(new Text(node.getNodeId()), new Text(node.toNodeMsg()));

      reporter.incrCounter("Brush", "nodes", 1);
    }
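Example #2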
    public void map(
        LongWritable key,
        Text t,
        OutputCollector<IntWritable, PageRankNode> output,
        Reporter reporter)
        throws IOException {

      String[] arr = t.toString().trim().split("\\s+");
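      // arr[0] is the node id; any remaining tokens are its neighbor ids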

      nid.set(Integer.parseInt(arr[0]));
      if (arr.length == 1) {
        node.setNodeId(Integer.parseInt(arr[0]));
        node.setAdjacencyList(new ArrayListOfIntsWritable());

      } else {
        node.setNodeId(Integer.parseInt(arr[0]));

        int[] neighbors = new int[arr.length - 1];
        for (int i = 1; i < arr.length; i++) {
          neighbors[i - 1] = Integer.parseInt(arr[i]);
        }

        node.setAdjacencyList(new ArrayListOfIntsWritable(neighbors));
      }

      reporter.incrCounter("graph", "numNodes", 1);
      reporter.incrCounter("graph", "numEdges", arr.length - 1);

      if (arr.length > 1) {
        reporter.incrCounter("graph", "numActiveNodes", 1);
      }

      output.collect(nid, node);
    }
Example #3
    public void map(
        LongWritable lineid, Text nodetxt, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
      Node node = new Node();
      node.fromNodeMsg(nodetxt.toString());

      List<String> bubbles = node.getBubbles();
      if (bubbles != null) {
        for (String bubble : bubbles) {
          String[] vals = bubble.split("\\|");
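          // a bubble record encodes: minor node, direction, dead node, new direction, merged node id, extra coverage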
          String minor = vals[0];
          String minord = vals[1];
          String dead = vals[2];
          String newd = vals[3];
          String newid = vals[4];
          String extracov = vals[5];

          output.collect(
              new Text(minor),
              new Text(
                  Node.KILLLINKMSG + "\t" + minord + "\t" + dead + "\t" + newd + "\t" + newid));

          output.collect(new Text(dead), new Text(Node.KILLMSG));
          output.collect(new Text(newid), new Text(Node.EXTRACOV + "\t" + extracov));

          reporter.incrCounter("Contrail", "bubblespopped", 1);
        }

        node.clearBubbles();
      }

      output.collect(new Text(node.getNodeId()), new Text(node.toNodeMsg()));
      reporter.incrCounter("Contrail", "nodes", 1);
    }
Example #4
  public synchronized String processNative(BehemothDocument inputDoc, Reporter reporter) {
    if (reporter != null) reporter.setStatus("GATE : " + inputDoc.getUrl().toString());
    // process the text passed as value with the application
    // a) create a GATE document based on the text value
    gate.Document gatedocument = null;
    try {

      gatedocument = generateGATEDoc(inputDoc);
      // add it to the current corpus
      corpus.add(gatedocument);
      // get the application and assign the corpus to it
      this.GATEapplication.setCorpus(corpus);
      // process it with GATE
      this.GATEapplication.execute();

      // transfer the annotations from the GATE document
      // to the Behemoth one using the filters
      if (reporter != null) reporter.incrCounter("GATE", "Document", 1);

      return gatedocument.toXml();

    } catch (Exception e) {
      LOG.error(inputDoc.getUrl().toString(), e);
      if (reporter != null) reporter.incrCounter("GATE", "Exceptions", 1);
    } finally {
      // remove the document from the corpus again
      corpus.clear();
      // and from memory
      if (gatedocument != null) Factory.deleteResource(gatedocument);
    }
    return null;
  }
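Example #5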
  boolean skipRecord(GoogleURL urlObject, Reporter reporter) {
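    // returns true if this URL should be skipped; every skip path records its reason in a counter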

    if (_skipDomain) {
      reporter.incrCounter(Counters.SKIPPING_BAD_DOMAIN_URL, 1);
      return true;
    }

    if (!urlObject.isValid()) {
      reporter.incrCounter(Counters.SKIPPING_INVALID_URL, 1);
      return true;
    } else if (urlObject.has_query()) {
      reporter.incrCounter(Counters.HIT_QUERY_CHECK_CONDITION, 1);
      if ((_flags & (HAS_HOMEPAGE_URLDATA | HAS_FEED_URLDATA)) == 0) {
        reporter.incrCounter(Counters.SKIPPING_QUERY_URL, 1);
        return true;
      }
    } else {
      // if redirect ... skip
      if ((_flags & HAS_REDIRECT_DATA) != 0) {
        reporter.incrCounter(Counters.SKIPPING_REDIRECTED_URL, 1);
        return true;
      }

      if ((_flags & (HAS_HOMEPAGE_URLDATA | HAS_FEED_URLDATA)) != 0) {
        if (!_skipEverythingButHomepage || ((_flags & HAS_HOMEPAGE_URLDATA) != 0)) {
          reporter.incrCounter(Counters.ALLOWING_HOMEPAGE_OR_FEEDURL, 1);
          return false;
        }
      }

      if (_skipEverythingButHomepage) {
        reporter.incrCounter(Counters.SKIPPING_EVERYTHING_BUT_HOMEPAGE_URL, 1);
        return true;
      }

      if (_crawlStatus != null) {
        if (_crawlStatus.has("crawl_status")) {
          JsonObject realCrawlStatus = _crawlStatus.get("crawl_status").getAsJsonObject();
          if (realCrawlStatus.has("http_result")) {
            int httpResult = realCrawlStatus.get("http_result").getAsInt();
            if (httpResult == 200 || httpResult == 404) {
              if ((_flags & HAS_BLOGPROBE_URLDATA) != 0) {
                if (_blogURLSkipFlag.get()) {
                  reporter.incrCounter(Counters.SKIPPING_BLOGPROBE_URL, 1);
                  return true;
                } else {
                  reporter.incrCounter(Counters.RECRAWLING_BLOGPROBE_URL, 1);
                  return false;
                }
              } else {
                reporter.incrCounter(Counters.SKIPPING_ALREADY_FETCHED, 1);
                return true;
              }
            }
          }
        }
      }
    }
    return false;
  }
Example #6
    private boolean outputHelper(
        HalfPair hp,
        int vectorID,
        VectorComponentArrayWritable vector,
        float similarity,
        OutputCollector<VectorPair, FloatWritable> output,
        Reporter reporter)
        throws IOException {
      reporter.incrCounter(APS.EVALUATED, 1);
      reporter.progress();
      if (haspruned) {
        VectorComponentArrayWritable remainder = pruned.get(hp.getID());
        if (remainder != null) {
          // cheap upper bound dot(x,y) <= min(|x|,|y|) * maxweight(x) * maxweight(y)
          // double dotProdBound = min(remainder.length(), vector.length()) *
          // remainder.getMaxWeight()
          // * vector.getMaxWeight();
          // if (compare(similarity + dotProdBound, threshold) >= 0)
          similarity += VectorComponentArrayWritable.dotProduct(vector, remainder);

        } else {
          LOG.warn("No remainder found for vector " + hp.getID());
        }
      }
      if (compare(similarity, threshold) >= 0) {
        int firstID = VectorPair.canonicalFirst(vectorID, hp.getID());
        int secondID = VectorPair.canonicalSecond(vectorID, hp.getID());
        outKey.set(firstID, secondID);
        outValue.set(similarity);
        output.collect(outKey, outValue);
        reporter.incrCounter(APS.SIMILAR, 1);
        return true;
      }
      return false;
    }
Example #7
  public void close() throws IOException {
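    // close the consumer, then publish read statistics as counters grouped by topic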
    if (_consumer != null) _consumer.close();

    String topic = _request.getTopic();
    long endTime = System.currentTimeMillis();
    _reporter.incrCounter(topic, "read-time(ms)", endTime - _startTime);
    _reporter.incrCounter(topic, "request-time(ms)", _requestTime);

    long bytesRead = _offset - _offsetRange[0];
    double megaRead = bytesRead / (1024.0 * 1024.0);
    _reporter.incrCounter(topic, "data-read(mb)", (long) megaRead);
    _reporter.incrCounter(topic, "event-count", _count);
  }
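Example #8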
  void setDomainStats(JsonObject domainStats, Reporter reporter) throws IOException {

    _domainStats = domainStats;
    if (_domainStats.has("dR")) {
      _domainRank = _domainStats.get("dR").getAsDouble();
    } else {
      _domainRank = 0.0;
    }

    if (_domainStats.has("urls")) {
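      // the more URLs a domain has, the higher its domain rank must be to avoid being skipped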
      int urlCount = _domainStats.get("urls").getAsInt();
      int crawledCount = _domainStats.get("crawled").getAsInt();
      int http200Count = (_domainStats.has("200")) ? _domainStats.get("200").getAsInt() : 0;
      if (urlCount != 0 && crawledCount != 0 && http200Count == 0) {
        reporter.incrCounter(Counters.SKIPPING_BAD_DOMAIN_BASED_ON_CRAWL_HISTORY, 1);
        LOG.info(
            "Skipping Everything But Homepage for Domain:"
                + _newDomainBytes.toString()
                + " CrawledCount:"
                + crawledCount
                + " HTTP200Count:"
                + http200Count
                + " URLCount:"
                + urlCount);
        _skipEverythingButHomepage = true;
      } else if (urlCount > 25000 && urlCount < 100000) {
        if (!_domainStats.has("dR") || _domainStats.get("dR").getAsDouble() < 3.0) {
          LOG.info("Skipping Domain:" + _newDomainBytes.toString());
          reporter.incrCounter(Counters.SKIPPING_DOMAIN_EXCEEDED_URL_COUNT_AND_LOW_DR, 1);
          _skipDomain = true;
        }
      } else if (urlCount > 250000 && urlCount < 1000000) {
        if (!_domainStats.has("dR") || _domainStats.get("dR").getAsDouble() < 4.0) {
          LOG.info("Skipping Domain:" + _newDomainBytes.toString());
          reporter.incrCounter(Counters.SKIPPING_DOMAIN_EXCEEDED_URL_COUNT_AND_LOW_DR, 1);
          _skipDomain = true;
        }
      } else if (urlCount > 1000000) {
        if (!_domainStats.has("dR") || _domainStats.get("dR").getAsDouble() < 5.0) {
          LOG.info("Skipping Domain:" + _newDomainBytes.toString());
          reporter.incrCounter(Counters.SKIPPING_DOMAIN_EXCEEDED_URL_COUNT_AND_LOW_DR, 1);
          _skipDomain = true;
        }
      }
    }
    if (_emittedURLSInFilter >= FLUSH_THRESHOLD) {
      _emittedURLSFilter.clear();
      _emittedURLSInFilter = 0;
      reporter.incrCounter(Counters.FLUSHED_BLOOMFILTER, 1);
    }
  }
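Example #9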
  void emitLastRecord(Reporter reporter) throws IOException {
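    // emit the record accumulated for the current URL, after length, skip, and dedup checks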

    if (_flags != 0) {
      if (_domainStats == null) {
        reporter.incrCounter(Counters.EMITTING_URL_RECORD_WITH_NULL_DOMAINSTATS, 1);
      } else {
        reporter.incrCounter(Counters.EMITTING_URL_RECORD_WITH_DOMINSTATS, 1);
      }

      if (_crawlStatus != null) {
        reporter.incrCounter(Counters.EMITTED_RECORD_HAD_CRAWLSTATUS, 1);
      } else {
        reporter.incrCounter(Counters.EMITTED_RECORD_HAD_NULL_CRAWLSTATUS, 1);
      }
    }

    if (_contextURLBytes.getLength() >= 4097) {
      reporter.incrCounter(Counters.SKIPPING_INVALID_LENGTH_URL, 1);
    } else {
      GoogleURL urlObject = new GoogleURL(_contextURLBytes.toString());

      if (!skipRecord(urlObject, reporter)) {

        if (urlObject.has_query()) {
          reporter.incrCounter(Counters.LET_THROUGH_QUERY_URL, 1);
        }

        URLFPV2 fp = URLUtils.getURLFPV2FromURLObject(urlObject);
        if (fp != null) {
          if (_emittedURLSFilter.isPresent(fp)) {
            reporter.incrCounter(Counters.SKIPPING_ALREADY_EMITTED_URL, 1);
          } else {
            _emittedURLSFilter.add(fp);
            _emittedURLSInFilter++;

            SegmentGeneratorItem itemValue = new SegmentGeneratorItem();

            itemValue.setDomainFP(fp.getDomainHash());
            itemValue.setRootDomainFP(fp.getRootDomainHash());
            itemValue.setUrlFP(fp.getUrlHash());
            itemValue.setUrl(urlObject.getCanonicalURL());
            itemValue.setPageRank(0);
            itemValue.setModifiedStatus((byte) 0);

            items.add(itemValue);

            if (items.size() >= SPILL_THRESHOLD) spillItems(reporter);
          }
        } else {
          reporter.incrCounter(Counters.NULL_FP_FOR_URL, 1);
        }
      }
    }

    // reset stuff
    _flags = 0;
    _crawlStatus = null;
    _contextURLBytes.clear();
    _blogURLSkipFlag.set(true);
  }
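Example #10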
    public void map(Text key, Text value, OutputCollector<Text, PhrasePair> oc, Reporter reporter)
        throws IOException {

      // key: a parallel sentence pair and its alignment, in XML format; the value is ignored

      ParallelChunk c = pcr.parseString(key.toString());
      ok.set(c.idString());

      // Chunk is an array of tokens in the sentence, without any special tokenization (just
      // separated by spaces)
      Chunk fc = c.getChunk(src);
      Chunk ec = c.getChunk(tgt);
      if (fc == null || ec == null) {
        reporter.incrCounter(BitextCompilerCounters.WRONG_LANGUAGE, 1);
        return;
      }
      if (fc.getLength() > 200) {
        reporter.incrCounter(BitextCompilerCounters.SRC_TOO_LONG, 1);
        return;
      }
      if (ec.getLength() > 200) {
        reporter.incrCounter(BitextCompilerCounters.TGT_TOO_LONG, 1);
        return;
      }

      // ec, fc: the English and French sentences, each a sequence of words
      // vocE, vocF: vocabularies for English and French, of type VocabularyWritable

      // ee, fe: integer representations of the words in sentences ec and fc
      sLogger.debug("Target sentence:");
      int[] ee = convertStrings(tawp.preprocessWordsForAlignment(ec.getWords()), vocE);
      sLogger.debug("Source sentence:");
      int[] fe = convertStrings(sawp.preprocessWordsForAlignment(fc.getWords()), vocF);

      // e, f: phrases spanning the whole sentence
      Phrase e = new Phrase(ee, 0);
      Phrase f = new Phrase(fe, 1);

      edu.umd.hooka.PhrasePair b = new PhrasePair(f, e);
      ReferenceAlignment ra = c.getReferenceAlignment(lp);
      if (ra != null) {
        b.setAlignment(ra);
      }
      reporter.incrCounter(BitextCompilerCounters.EN_WORDS, e.getWords().length);
      reporter.incrCounter(BitextCompilerCounters.FR_WORDS, f.getWords().length);
      reporter.incrCounter(BitextCompilerCounters.CHUNKS, 1);
      oc.collect(ok, b);
    }
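Example #11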
    public void run(
        RecordReader<IntWritable, WikipediaPage> input,
        OutputCollector<IntWritable, Text> output,
        Reporter reporter)
        throws IOException {
      IntWritable key = new IntWritable();
      WikipediaPage value = new WikipediaPage();

      long pos = -1;
      long prevPos = -1;

      int prevDocno = 0;

      pos = input.getPos();
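      // a jump in file position between records marks the start of a new block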
      while (input.next(key, value)) {
        if (prevPos != -1 && prevPos != pos) {
          LOG.info(
              "- beginning of block at " + prevPos + ", docno:" + prevDocno + ", file:" + fileno);
          keyOut.set(prevDocno);
          valOut.set(prevPos + "\t" + fileno);
          output.collect(keyOut, valOut);
          reporter.incrCounter(Blocks.Total, 1);
        }

        prevPos = pos;
        pos = input.getPos();
        prevDocno = key.get();
      }
    }
Example #12
    @Override
    public void reduce(
        MatrixIndexes indexes,
        Iterator<TaggedMatrixValue> values,
        OutputCollector<MatrixIndexes, WeightedPair> out,
        Reporter reporter)
        throws IOException {

      long start = System.currentTimeMillis();

      if (firsttime) {
        cachedReporter = reporter;
        firsttime = false;
      }

      cachedValues.reset();

      while (values.hasNext()) {
        TaggedMatrixValue taggedValue = values.next();
        cachedValues.set(taggedValue.getTag(), indexes, taggedValue.getBaseObject(), true);
      }
      // LOG.info("before aggregation: \n"+cachedValues);
      // perform aggregate operations first
      // processAggregateInstructions(indexes, values);

      // LOG.info("after aggregation: \n"+cachedValues);

      // perform mixed operations
      // processReducerInstructions();

      processCombineInstructionsAndOutput(reporter);

      reporter.incrCounter(Counters.COMBINE_OR_REDUCE_TIME, System.currentTimeMillis() - start);
    }
Example #13
    public void map(
        LongWritable key, Text value, OutputCollector<LongWritable, Text> output, Reporter reporter)
        throws IOException {

      int slotId = Integer.parseInt(value.toString().trim());
      html.fireRandom(slotId);

      long[] range = HtmlCore.getPageRange(slotId, pages, slotpages);

      // for each page in this slot's range, emit one record per outgoing link
      for (long i = range[0]; i < range[1]; i++) {
        key.set(i);

        long[] linkids = html.genPureLinkIds();
        for (int j = 0; j < linkids.length; j++) {
          String to = Long.toString(linkids[j]);
          Text v = new Text(to);
          output.collect(key, v);
          reporter.incrCounter(HiBench.Counters.BYTES_DATA_GENERATED, 8 + v.getLength());
        }

        if (0 == (i % 10000)) {
          log.info("still running: " + (i - range[0]) + " of " + slotpages);
        }
      }
    }
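Example #14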
    /**
     * {@inheritDoc}
     *
     * @see org.apache.hadoop.mapred.Mapper#map(java.lang.Object, java.lang.Object,
     *     org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter)
     */
    @Override
    public void map(
        LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
        throws IOException {
      String line = m_caseSensitive ? value.toString() : value.toString().toLowerCase();

      for (String pattern : m_patternsToSkip) {
        line = line.replaceAll(pattern, "");
      }

      StringTokenizer tokenizer = new StringTokenizer(line);
      while (tokenizer.hasMoreTokens()) {
        m_word.set(tokenizer.nextToken());
        output.collect(m_word, ONE);
        reporter.incrCounter(Counters.INPUT_WORDS, 1);
      }

      if ((++m_numRecords % 100) == 0) {
        reporter.setStatus(
            "Finished processing "
                + m_numRecords
                + " records "
                + "from the input file: "
                + m_inputFile);
      }
    }
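The counters these tasks increment through Reporter become visible on the client once the job completes. A minimal sketch of reading them back through the old mapred API (the JobConf is assumed to be configured for one of the mappers above; the CounterReport class and Stats enum here are hypothetical):

import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;

public class CounterReport {
  // hypothetical enum; lookups must use the same enum class the task incremented
  public enum Stats { INPUT_WORDS }

  public static void report(JobConf conf) throws Exception {
    RunningJob job = JobClient.runJob(conf); // blocks until the job finishes
    Counters counters = job.getCounters();
    // enum-based counters are looked up directly by the enum constant
    long words = counters.getCounter(Stats.INPUT_WORDS);
    // group/name counters (e.g. the "Brush" counters above) go through their group
    long nodes = counters.getGroup("Brush").getCounter("nodes");
    System.out.println("words=" + words + ", nodes=" + nodes);
  }
}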
Example #15
 public void func(Operator op) {
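   // forward each operator's stats to the Hadoop reporter as counters, when a reporter is available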
   Map<Enum, Long> opStats = op.getStats();
   for (Map.Entry<Enum, Long> e : opStats.entrySet()) {
     if (this.rp != null) {
       rp.incrCounter(e.getKey(), e.getValue());
     }
   }
 }
Example #16
  public void map(
      LongWritable key, Point value, OutputCollector<LongWritable, Point> output, Reporter reporter)
      throws IOException {
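    // find the nearest center by squared Euclidean distance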
    double min = value.sumOfSquares(centers.get(0));
    int best = 0;

    for (int index = 1; index < numberOfCenters; ++index) {
      double current = value.sumOfSquares(centers.get(index));

      if (current < min) {
        min = current;
        best = index;
      }
    }

    reporter.incrCounter("NUMBER", "NODES", 1);
    reporter.incrCounter("CENTER", "" + best, 1);

    output.collect(new LongWritable(best), value);
  }
Example #17
    public void map(
        LongWritable key,
        TrecDocument doc,
        OutputCollector<Text, IntWritable> output,
        Reporter reporter)
        throws IOException {
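      // emit a (docid -> internal docno) pair for each document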
      reporter.incrCounter(Count.DOCS, 1);

      docid.set(doc.getDocid());
      one.set(docMapping.getDocno(doc.getDocid()));
      output.collect(docid, one);
    }
Example #18
    /** Run a FileOperation */
    public void map(
        Text key,
        PolicyInfo policy,
        OutputCollector<WritableComparable, Text> out,
        Reporter reporter)
        throws IOException {
      this.reporter = reporter;
      try {
        LOG.info("Raiding file=" + key.toString() + " policy=" + policy);
        Path p = new Path(key.toString());
        FileStatus fs = p.getFileSystem(jobconf).getFileStatus(p);
        st.clear();
        RaidNode.doRaid(jobconf, policy, fs, st, reporter);

        ++succeedcount;

        reporter.incrCounter(Counter.PROCESSED_BLOCKS, st.numProcessedBlocks);
        reporter.incrCounter(Counter.PROCESSED_SIZE, st.processedSize);
        reporter.incrCounter(Counter.META_BLOCKS, st.numMetaBlocks);
        reporter.incrCounter(Counter.META_SIZE, st.metaSize);

        reporter.incrCounter(Counter.FILES_SUCCEEDED, 1);
      } catch (IOException e) {
        ++failcount;
        reporter.incrCounter(Counter.FILES_FAILED, 1);

        String s = "FAIL: " + policy + ", " + key + " " + StringUtils.stringifyException(e);
        out.collect(null, new Text(s));
        LOG.info(s);
      } finally {
        reporter.setStatus(getCountString());
      }
    }
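Example #19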
    public void reduce(
        Text key, Iterator<Text> iter, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
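      // merge this node's NODEMSG with HASUNIQUEP messages from its neighbors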
      Node node = new Node(key.toString());
      Set<String> f_unique = new HashSet<String>();
      Set<String> r_unique = new HashSet<String>();

      int sawnode = 0;

      while (iter.hasNext()) {
        String msg = iter.next().toString();

        // System.err.println(key.toString() + "\t" + msg);

        String[] vals = msg.split("\t");

        if (vals[0].equals(Node.NODEMSG)) {
          node.parseNodeMsg(vals, 0);
          sawnode++;
        } else if (vals[0].equals(Node.HASUNIQUEP)) {
          if (vals[2].equals("f")) {
            f_unique.add(vals[1]);
          } else if (vals[2].equals("r")) {
            r_unique.add(vals[1]);
          }
        } else {
          throw new IOException("Unknown msgtype: " + msg);
        }
      }

      if (sawnode != 1) {
        throw new IOException(
            "ERROR: Didn't see exactly 1 nodemsg (" + sawnode + ") for " + key.toString());
      }

      for (String adj : Node.dirs) {
        TailInfo next = node.gettail(adj);

        if (next != null) {
          if ((next.dir.equals("f") && r_unique.contains(next.id))
              || (next.dir.equals("r") && f_unique.contains(next.id))) {
            // for path compress
            if (node.getBlackEdges() == null) {
              node.setCanCompress(adj, true);
            }
            reporter.incrCounter("Brush", "compressible", 1);
          }
        }
      }
      // System.err.println(node.getNodeId() + " " + node.toNodeMsg() );
      output.collect(new Text(node.getNodeId()), new Text(node.toNodeMsg()));
    }
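Example #20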
  private boolean validate(String str, Reporter reporter) {
    String[] parts = str.split("\t");
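    // a valid line has exactly six tab-separated fields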

    if (parts.length != 6) {
      if (parts.length < 6) {
        reporter.incrCounter(LineCounters.TOO_FEW_TABS, 1);
      } else {
        reporter.incrCounter(LineCounters.TOO_MANY_TABS, 1);
      }

      reporter.incrCounter(LineCounters.BAD_LINES, 1);

      if ((reporter.getCounter(LineCounters.BAD_LINES).getCounter() % 10) == 0) {
        reporter.setStatus("Got 10 bad lines.");
        System.err.println("Read another 10 bad lines.");
      }

      return false;
    }

    return true;
  }
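Example #21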
  private void flushDomain(Reporter reporter) throws IOException {
    if (_currentDomainId != -1) {
      if (items.size() != 0) {
        spillItems(reporter);
      }

      if (reporter != null) {
        if (currentDomainSpilledItemCount >= 10000000) {
          reporter.incrCounter(Counters.DOMAIN_WITH_GT_10MILLION_URLS, 1);
        } else if (currentDomainSpilledItemCount >= 1000000) {
          reporter.incrCounter(Counters.DOMAIN_WITH_GT_1MILLION_URLS, 1);
        } else if (currentDomainSpilledItemCount >= 100000) {
          reporter.incrCounter(Counters.DOMAIN_WITH_GT_100K_URLS, 1);
        } else if (currentDomainSpilledItemCount >= 50000) {
          reporter.incrCounter(Counters.DOMAIN_WITH_GT_50K_URLS, 1);
        } else if (currentDomainSpilledItemCount >= 10000) {
          reporter.incrCounter(Counters.DOMAIN_WITH_GT_10K_URLS, 1);
        } else if (currentDomainSpilledItemCount >= 1000) {
          reporter.incrCounter(Counters.DOMAIN_WITH_GT_1K_URLS, 1);
        } else if (currentDomainSpilledItemCount >= 100) {
          reporter.incrCounter(Counters.DOMAIN_WITH_GT_100_URLS, 1);
        } else if (currentDomainSpilledItemCount >= 10) {
          reporter.incrCounter(Counters.DOMAIN_WITH_GT_10_URLS, 1);
        } else if (currentDomainSpilledItemCount > 1) {
          reporter.incrCounter(Counters.DOMAIN_WITH_LT_10_URLS, 1);
        } else if (currentDomainSpilledItemCount == 1) {
          reporter.incrCounter(Counters.DOMAIN_WITH_1_URL, 1);
        }
      }

      _currentDomainId = -1;
      currentDomainCrawlIdx = -1;
      currentDomainName = "";
      currentDomainSpilledItemCount = 0;
      currentDomainURLCount = 0;
    }
  }
Example #22
    @Override
    public void reduce(
        GenericKey key,
        Iterator<GenericValue> values,
        OutputCollector<VectorPair, FloatWritable> output,
        Reporter reporter)
        throws IOException {

      int vectorID = key.getPrimary();
      assert (key.getSecondary() == -1);
      // the vector is the first value
      VectorComponentArrayWritable vector = (VectorComponentArrayWritable) values.next().get();
      // half pairs are sorted such that all equal pairs are consecutive
      if (values.hasNext()) {
        reporter.incrCounter(APS.COMBINED, 1);
        HalfPair hp1 = (HalfPair) values.next().get();
        float similarity = hp1.getSimilarity();
        HalfPair hp2;
        int counter = 0;
        while (values.hasNext()) {
          reporter.incrCounter(APS.COMBINED, 1);
          if (counter++ % REPORTER_INTERVAL == 0) reporter.progress();
          hp2 = (HalfPair) values.next().get();
          if (hp1.equals(hp2)) {
            similarity += hp2.getSimilarity();
          } else {
            // output
            outputHelper(hp1, vectorID, vector, similarity, output, reporter);
            // start new stripe
            hp1 = hp2;
            similarity = hp1.getSimilarity();
          }
        }
        // output the last one
        outputHelper(hp1, vectorID, vector, similarity, output, reporter);
      }
    }
Example #23
    /** Outputs exactly one value for each key; this suppresses duplicates */
    @Override
    public void reduce(
        Text key, Iterator<LogRecord> vals, OutputCollector<Text, LogRecord> out, Reporter r)
        throws IOException {
      LogRecord i = vals.next();

      // out.collect(new Text(key.getKey()), i);
      out.collect(key, i);
      int dups = 0;
      while (vals.hasNext()) {
        vals.next();
        dups++;
      }
      r.incrCounter("app", "duplicate chunks", dups);
    }
Example #24
    @Override
    public void map(
        LongWritable key, Text value, OutputCollector<LongWritable, Text> output, Reporter reporter)
        throws IOException {

      int slotId = Integer.parseInt(value.toString().trim());
      long[] range = HtmlCore.getPageRange(slotId, pages, slotpages);

      for (long i = range[0]; i < range[1]; i++) {
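        // emit each page id as both key and value, counting the generated bytes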
        key.set(i);
        Text v = new Text(Long.toString(i));
        output.collect(key, v);
        reporter.incrCounter(HiBench.Counters.BYTES_DATA_GENERATED, 8 + v.getLength());
      }
    }
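Example #25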
  /** Potentially reset state on a domain id transition. */
  private void domainTransition(long newDomainFP, String newDomainName, Reporter reporter)
      throws IOException {
    if (_currentDomainId != -1) {
      flushDomain(reporter);
    }

    _flags = 0;
    _domainStats = null;
    _domainRank = 0.0;
    _skipDomain = false;
    _skipEverythingButHomepage = false;

    // zero out item count ...
    items.clear();
    // reset domain id
    _currentDomainId = newDomainFP;
    currentDomainCrawlIdx = (((int) _currentDomainId & Integer.MAX_VALUE) % crawlerCount);
    // reset current domain url count
    currentDomainURLCount = 0;
    currentDomainName = newDomainName;
    // and reset last bundle id
    currentBundleId = 0;
    // reset spill count for domain
    currentDomainSpilledItemCount = 0;

    if (BlockedDomainList.blockedDomains.contains(newDomainFP)) {
      reporter.incrCounter(Counters.SKIPPING_BLOCKED_DOMAIN, 1);
      LOG.info("Skipping Blocked Domain:" + newDomainName);
      _skipDomain = true;
    }

    if (ipAddressRegExPattern.matcher(currentDomainName.trim()).matches()) {
      reporter.incrCounter(Counters.SKIPPING_IP_ADDRESS, 1);
      _skipDomain = true;
    }
  }
Example #26
 /** Given an output filename, write a bunch of random records to it. */
 public void map(
     WritableComparable key,
     Writable value,
     OutputCollector<BytesWritable, BytesWritable> output,
     Reporter reporter)
     throws IOException {
   int itemCount = 0;
   while (numBytesToWrite > 0) {
     int keyLength = minKeySize + (keySizeRange != 0 ? random.nextInt(keySizeRange) : 0);
     randomKey.setSize(keyLength);
     randomizeBytes(randomKey.getBytes(), 0, randomKey.getLength());
     int valueLength = minValueSize + (valueSizeRange != 0 ? random.nextInt(valueSizeRange) : 0);
     randomValue.setSize(valueLength);
     randomizeBytes(randomValue.getBytes(), 0, randomValue.getLength());
     output.collect(randomKey, randomValue);
     numBytesToWrite -= keyLength + valueLength;
     reporter.incrCounter(Counters.BYTES_WRITTEN, keyLength + valueLength);
     reporter.incrCounter(Counters.RECORDS_WRITTEN, 1);
     if (++itemCount % 200 == 0) {
       reporter.setStatus("wrote record " + itemCount + ". " + numBytesToWrite + " bytes left.");
     }
   }
   reporter.setStatus("done with " + itemCount + " records.");
 }
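Example #27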
 public void warn(Object o, String msg, Enum warningEnum) {
   String displayMessage = o.getClass().getName() + ": " + msg;
   if (aggregate) {
     if (reporter != null) {
       reporter.incrCounter(warningEnum, 1);
     } else {
       // TODO:
       // In local mode, if the PigHadoopLogger is used initially, aggregation
       // cannot be performed because the reporter will be null: Hadoop supplies
       // the reporter reference at run time, and in local mode there is no
       // Hadoop, hence no reporter. In that case, just print the warning as-is.
       // If a warning message is printed in MapReduce mode while aggregation is
       // turned on, we have a problem: it's a bug.
       log.warn(displayMessage);
     }
   } else {
     log.warn(displayMessage);
   }
 }
Example #28
 private FetchItem queueRedirect(Text redirUrl, FetchItem fit) throws ScoringFilterException {
   CrawlDatum newDatum =
       new CrawlDatum(
           CrawlDatum.STATUS_DB_UNFETCHED, fit.datum.getFetchInterval(), fit.datum.getScore());
   // transfer all existing metadata to the redirect
   newDatum.getMetaData().putAll(fit.datum.getMetaData());
   scfilters.initialScore(redirUrl, newDatum);
   if (reprUrl != null) {
     newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
   }
   fit = FetchItem.create(redirUrl, newDatum, queueMode);
   if (fit != null) {
     FetchItemQueue fiq = ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID);
     fiq.addInProgressFetchItem(fit);
   } else {
     // stop redirecting
     redirecting = false;
     reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1);
   }
   return fit;
 }
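Example #29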
  /** Spill cached items. */
  void spillItems(Reporter reporter) throws IOException {
    // spill if we have cached items (spill threshold exceeded, or we ran out of data)
    if (items.size() != 0) {
      // LOG.info("Spilling Bundle:" + currentBundleId + " for DH:" +
      // currentDomain + " ItemCount:" + subList.size());
      // flush items
      generateABundle(_currentDomainId, items, reporter);
      if (reporter != null) {
        reporter.progress();
      }
      // ok, increment counts ...
      currentDomainSpilledItemCount += items.size();

      if (currentDomainSpilledItemCount >= 1000000) {
        reporter.incrCounter(Counters.SPILLED_1_MILLION_SKIPPED_REST, 1);
        LOG.info("Skipping Remaining URLS for Domain:" + currentDomainName);
        _skipDomain = true;
      }
    }
    // reset list ...
    items.clear();
  }
Example #30
  @Override
  public void reduce(
      TaggedFirstSecondIndexes indexes,
      Iterator<MatrixValue> values,
      OutputCollector<Writable, Writable> out,
      Reporter report)
      throws IOException {
    long start = System.currentTimeMillis();
    //		LOG.info("---------- key: "+indexes);

    commonSetup(report);

    // perform aggregate
    MatrixValue aggregateValue = performAggregateInstructions(indexes, values);

    if (aggregateValue == null) return;

    int tag = indexes.getTag();
    long firstIndex = indexes.getFirstIndex();
    long secondIndex = indexes.getSecondIndex();

    // a different k (first index): reset the cached values
    if (prevFirstIndex != firstIndex) {
      resetCache();
      prevFirstIndex = firstIndex;
    } else if (prevTag > tag)
      throw new RuntimeException("tag is not ordered correctly: " + prevTag + " > " + tag);

    remainingbuffer.set(secondIndex, aggregateValue);
    try {
      processJoin(tag, remainingbuffer);
    } catch (Exception e) {
      throw new IOException(e);
    }
    prevTag = tag;

    report.incrCounter(Counters.COMBINE_OR_REDUCE_TIME, System.currentTimeMillis() - start);
  }