public void map(LongWritable lineid, Text nodetxt,
    OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
  Node node = new Node();
  node.fromNodeMsg(nodetxt.toString());

  for (String adj : Node.dirs) {
    node.setCanCompress(adj, false);

    TailInfo next = node.gettail(adj);
    if (next != null /*&& node.getBlackEdges() == null*/) {
      // skip self-loops
      if (next.id.equals(node.getNodeId())) {
        continue;
      }

      reporter.incrCounter("Brush", "remotemark", 1);
      output.collect(new Text(next.id),
          new Text(Node.HASUNIQUEP + "\t" + node.getNodeId() + "\t" + adj));
    }
  }

  output.collect(new Text(node.getNodeId()), new Text(node.toNodeMsg()));
  reporter.incrCounter("Brush", "nodes", 1);
}
public void map(LongWritable key, Text t,
    OutputCollector<IntWritable, PageRankNode> output, Reporter reporter) throws IOException {
  String[] arr = t.toString().trim().split("\\s+");

  nid.set(Integer.parseInt(arr[0]));
  if (arr.length == 1) {
    // dangling node: no outgoing edges
    node.setNodeId(Integer.parseInt(arr[0]));
    node.setAdjacencyList(new ArrayListOfIntsWritable());
  } else {
    node.setNodeId(Integer.parseInt(arr[0]));
    int[] neighbors = new int[arr.length - 1];
    for (int i = 1; i < arr.length; i++) {
      neighbors[i - 1] = Integer.parseInt(arr[i]);
    }
    node.setAdjacencyList(new ArrayListOfIntsWritable(neighbors));
  }

  reporter.incrCounter("graph", "numNodes", 1);
  reporter.incrCounter("graph", "numEdges", arr.length - 1);
  if (arr.length > 1) {
    reporter.incrCounter("graph", "numActiveNodes", 1);
  }

  output.collect(nid, node);
}
public void map(LongWritable lineid, Text nodetxt,
    OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
  Node node = new Node();
  node.fromNodeMsg(nodetxt.toString());

  List<String> bubbles = node.getBubbles();
  if (bubbles != null) {
    for (String bubble : bubbles) {
      // bubble record format: minor|minord|dead|newd|newid|extracov
      String[] vals = bubble.split("\\|");
      String minor = vals[0];
      String minord = vals[1];
      String dead = vals[2];
      String newd = vals[3];
      String newid = vals[4];
      String extracov = vals[5];

      output.collect(new Text(minor),
          new Text(Node.KILLLINKMSG + "\t" + minord + "\t" + dead + "\t" + newd + "\t" + newid));
      output.collect(new Text(dead), new Text(Node.KILLMSG));
      output.collect(new Text(newid), new Text(Node.EXTRACOV + "\t" + extracov));

      reporter.incrCounter("Contrail", "bubblespopped", 1);
    }
    node.clearBubbles();
  }

  output.collect(new Text(node.getNodeId()), new Text(node.toNodeMsg()));
  reporter.incrCounter("Contrail", "nodes", 1);
}
public synchronized String processNative(BehemothDocument inputDoc, Reporter reporter) {
  if (reporter != null)
    reporter.setStatus("GATE : " + inputDoc.getUrl().toString());

  // process the text passed as value with the application
  // a) create a GATE document based on the text value
  gate.Document gatedocument = null;
  try {
    gatedocument = generateGATEDoc(inputDoc);
    // add it to the current corpus
    corpus.add(gatedocument);
    // get the application and assign the corpus to it
    this.GATEapplication.setCorpus(corpus);
    // process it with GATE
    this.GATEapplication.execute();

    // transfer the annotations from the GATE document
    // to the Behemoth one using the filters
    if (reporter != null)
      reporter.incrCounter("GATE", "Document", 1);
    return gatedocument.toXml();
  } catch (Exception e) {
    LOG.error(inputDoc.getUrl().toString(), e);
    if (reporter != null)
      reporter.incrCounter("GATE", "Exceptions", 1);
  } finally {
    // remove the document from the corpus again
    corpus.clear();
    // and from memory
    if (gatedocument != null)
      Factory.deleteResource(gatedocument);
  }
  return null;
}
boolean skipRecord(GoogleURL urlObject, Reporter reporter) {
  if (_skipDomain) {
    reporter.incrCounter(Counters.SKIPPING_BAD_DOMAIN_URL, 1);
    return true;
  }

  if (!urlObject.isValid()) {
    reporter.incrCounter(Counters.SKIPPING_INVALID_URL, 1);
    return true;
  } else if (urlObject.has_query()) {
    reporter.incrCounter(Counters.HIT_QUERY_CHECK_CONDITION, 1);
    if ((_flags & (HAS_HOMEPAGE_URLDATA | HAS_FEED_URLDATA)) == 0) {
      reporter.incrCounter(Counters.SKIPPING_QUERY_URL, 1);
      return true;
    }
  } else {
    // if redirect ... skip
    if ((_flags & HAS_REDIRECT_DATA) != 0) {
      reporter.incrCounter(Counters.SKIPPING_REDIRECTED_URL, 1);
      return true;
    }

    if ((_flags & (HAS_HOMEPAGE_URLDATA | HAS_FEED_URLDATA)) != 0) {
      if (!_skipEverythingButHomepage || ((_flags & HAS_HOMEPAGE_URLDATA) != 0)) {
        reporter.incrCounter(Counters.ALLOWING_HOMEPAGE_OR_FEEDURL, 1);
        return false;
      }
    }

    if (_skipEverythingButHomepage) {
      reporter.incrCounter(Counters.SKIPPING_EVERYTHING_BUT_HOMEPAGE_URL, 1);
      return true;
    }

    if (_crawlStatus != null) {
      if (_crawlStatus.has("crawl_status")) {
        JsonObject realCrawlStatus = _crawlStatus.get("crawl_status").getAsJsonObject();
        if (realCrawlStatus.has("http_result")) {
          int httpResult = realCrawlStatus.get("http_result").getAsInt();
          if (httpResult == 200 || httpResult == 404) {
            if ((_flags & HAS_BLOGPROBE_URLDATA) != 0) {
              if (_blogURLSkipFlag.get()) {
                reporter.incrCounter(Counters.SKIPPING_BLOGPROBE_URL, 1);
                return true;
              } else {
                reporter.incrCounter(Counters.RECRAWLING_BLOGPROBE_URL, 1);
                return false;
              }
            } else {
              reporter.incrCounter(Counters.SKIPPING_ALREADY_FETCHED, 1);
              return true;
            }
          }
        }
      }
    }
  }
  return false;
}
private boolean outputHelper(HalfPair hp, int vectorID, VectorComponentArrayWritable vector,
    float similarity, OutputCollector<VectorPair, FloatWritable> output, Reporter reporter)
    throws IOException {
  reporter.incrCounter(APS.EVALUATED, 1);
  reporter.progress();

  if (haspruned) {
    VectorComponentArrayWritable remainder = pruned.get(hp.getID());
    if (remainder != null) {
      // cheap upper bound dot(x,y) <= min(|x|,|y|) * maxweight(x) * maxweight(y)
      // double dotProdBound = min(remainder.length(), vector.length())
      //     * remainder.getMaxWeight() * vector.getMaxWeight();
      // if (compare(similarity + dotProdBound, threshold) >= 0)
      similarity += VectorComponentArrayWritable.dotProduct(vector, remainder);
    } else {
      LOG.warn("No remainder found for vector " + hp.getID());
    }
  }

  if (compare(similarity, threshold) >= 0) {
    int firstID = VectorPair.canonicalFirst(vectorID, hp.getID());
    int secondID = VectorPair.canonicalSecond(vectorID, hp.getID());
    outKey.set(firstID, secondID);
    outValue.set(similarity);
    output.collect(outKey, outValue);
    reporter.incrCounter(APS.SIMILAR, 1);
    return true;
  }
  return false;
}
public void close() throws IOException {
  if (_consumer != null)
    _consumer.close();

  // report per-topic read statistics as counters
  String topic = _request.getTopic();
  long endTime = System.currentTimeMillis();
  _reporter.incrCounter(topic, "read-time(ms)", endTime - _startTime);
  _reporter.incrCounter(topic, "request-time(ms)", _requestTime);

  long bytesRead = _offset - _offsetRange[0];
  double megaRead = bytesRead / (1024.0 * 1024.0);
  _reporter.incrCounter(topic, "data-read(mb)", (long) megaRead);
  _reporter.incrCounter(topic, "event-count", _count);
}
void setDomainStats(JsonObject domainStats, Reporter reporter) throws IOException {
  _domainStats = domainStats;

  if (_domainStats.has("dR")) {
    _domainRank = _domainStats.get("dR").getAsDouble();
  } else {
    _domainRank = 0.0;
  }

  if (_domainStats.has("urls")) {
    int urlCount = _domainStats.get("urls").getAsInt();
    int crawledCount = _domainStats.get("crawled").getAsInt();
    int Http200Count = (_domainStats.has("200")) ? _domainStats.get("200").getAsInt() : 0;

    if (urlCount != 0 && crawledCount != 0 && Http200Count == 0) {
      // domain has been crawled but never returned a 200: restrict it to its homepage
      reporter.incrCounter(Counters.SKIPPING_BAD_DOMAIN_BASED_ON_CRAWL_HISTORY, 1);
      LOG.info("Skipping Everything But Homepage for Domain:" + _newDomainBytes.toString()
          + " CrawledCount:" + crawledCount + " HTTP200Count:" + Http200Count
          + " URLCount:" + urlCount);
      _skipEverythingButHomepage = true;
    } else if (urlCount > 25000 && urlCount < 100000) {
      // larger domains need a progressively higher domain rank to stay in the crawl
      if (!_domainStats.has("dR") || _domainStats.get("dR").getAsDouble() < 3.0) {
        LOG.info("Skipping Domain:" + _newDomainBytes.toString());
        reporter.incrCounter(Counters.SKIPPING_DOMAIN_EXCEEDED_URL_COUNT_AND_LOW_DR, 1);
        _skipDomain = true;
      }
    } else if (urlCount > 250000 && urlCount < 1000000) {
      if (!_domainStats.has("dR") || _domainStats.get("dR").getAsDouble() < 4.0) {
        LOG.info("Skipping Domain:" + _newDomainBytes.toString());
        reporter.incrCounter(Counters.SKIPPING_DOMAIN_EXCEEDED_URL_COUNT_AND_LOW_DR, 1);
        _skipDomain = true;
      }
    } else if (urlCount > 1000000) {
      if (!_domainStats.has("dR") || _domainStats.get("dR").getAsDouble() < 5.0) {
        LOG.info("Skipping Domain:" + _newDomainBytes.toString());
        reporter.incrCounter(Counters.SKIPPING_DOMAIN_EXCEEDED_URL_COUNT_AND_LOW_DR, 1);
        _skipDomain = true;
      }
    }
  }

  if (_emittedURLSInFilter >= FLUSH_THRESHOLD) {
    _emittedURLSFilter.clear();
    _emittedURLSInFilter = 0;
    reporter.incrCounter(Counters.FLUSHED_BLOOMFILTER, 1);
  }
}
void emitLastRecord(Reporter reporter) throws IOException {
  if (_flags != 0) {
    if (_domainStats == null) {
      reporter.incrCounter(Counters.EMITTING_URL_RECORD_WITH_NULL_DOMAINSTATS, 1);
    } else {
      reporter.incrCounter(Counters.EMITTING_URL_RECORD_WITH_DOMINSTATS, 1);
    }

    if (_crawlStatus != null) {
      reporter.incrCounter(Counters.EMITTED_RECORD_HAD_CRAWLSTATUS, 1);
    } else {
      reporter.incrCounter(Counters.EMITTED_RECORD_HAD_NULL_CRAWLSTATUS, 1);
    }
  }

  if (_contextURLBytes.getLength() >= 4097) {
    reporter.incrCounter(Counters.SKIPPING_INVALID_LENGTH_URL, 1);
  } else {
    GoogleURL urlObject = new GoogleURL(_contextURLBytes.toString());
    if (!skipRecord(urlObject, reporter)) {
      if (urlObject.has_query()) {
        reporter.incrCounter(Counters.LET_THROUGH_QUERY_URL, 1);
      }

      URLFPV2 fp = URLUtils.getURLFPV2FromURLObject(urlObject);
      if (fp != null) {
        if (_emittedURLSFilter.isPresent(fp)) {
          reporter.incrCounter(Counters.SKIPPING_ALREADY_EMITTED_URL, 1);
        } else {
          _emittedURLSFilter.add(fp);
          _emittedURLSInFilter++;

          SegmentGeneratorItem itemValue = new SegmentGeneratorItem();
          itemValue.setDomainFP(fp.getDomainHash());
          itemValue.setRootDomainFP(fp.getRootDomainHash());
          itemValue.setUrlFP(fp.getUrlHash());
          itemValue.setUrl(urlObject.getCanonicalURL());
          itemValue.setPageRank(0);
          itemValue.setModifiedStatus((byte) 0);

          items.add(itemValue);

          if (items.size() >= SPILL_THRESHOLD)
            spillItems(reporter);
        }
      } else {
        reporter.incrCounter(Counters.NULL_FP_FOR_URL, 1);
      }
    }
  }

  // reset stuff
  _flags = 0;
  _crawlStatus = null;
  _contextURLBytes.clear();
  _blogURLSkipFlag.set(true);
}
public void map(Text key, Text value, OutputCollector<Text, PhrasePair> oc, Reporter reporter)
    throws IOException {
  // key: a single sentence in both languages and alignment
  // ignore value. each key is a parallel sentence and its alignment, in xml format
  ParallelChunk c = pcr.parseString(key.toString());
  ok.set(c.idString());

  // Chunk is an array of tokens in the sentence, without any special tokenization
  // (just separated by spaces)
  Chunk fc = c.getChunk(src);
  Chunk ec = c.getChunk(tgt);
  if (fc == null || ec == null) {
    reporter.incrCounter(BitextCompilerCounters.WRONG_LANGUAGE, 1);
    return;
  }
  if (fc.getLength() > 200) {
    reporter.incrCounter(BitextCompilerCounters.SRC_TOO_LONG, 1);
    return;
  }
  if (ec.getLength() > 200) {
    reporter.incrCounter(BitextCompilerCounters.TGT_TOO_LONG, 1);
    return;
  }

  // ec,fc: English/French sentence represented as sequence of words
  // vocE,vocF: vocabularies for english and french, of type VocabularyWritable
  // ee,fe: integer representation of words in sentences ec and fc
  sLogger.debug("Target sentence:");
  int[] ee = convertStrings(tawp.preprocessWordsForAlignment(ec.getWords()), vocE);
  sLogger.debug("Source sentence:");
  int[] fe = convertStrings(sawp.preprocessWordsForAlignment(fc.getWords()), vocF);

  // e,f: phrase from whole sentence
  Phrase e = new Phrase(ee, 0);
  Phrase f = new Phrase(fe, 1);
  edu.umd.hooka.PhrasePair b = new PhrasePair(f, e);
  ReferenceAlignment ra = c.getReferenceAlignment(lp);
  if (ra != null) {
    b.setAlignment(ra);
  }

  reporter.incrCounter(BitextCompilerCounters.EN_WORDS, e.getWords().length);
  reporter.incrCounter(BitextCompilerCounters.FR_WORDS, f.getWords().length);
  reporter.incrCounter(BitextCompilerCounters.CHUNKS, 1);
  oc.collect(ok, b);
}
public void run(RecordReader<IntWritable, WikipediaPage> input,
    OutputCollector<IntWritable, Text> output, Reporter reporter) throws IOException {
  IntWritable key = new IntWritable();
  WikipediaPage value = new WikipediaPage();

  long pos = -1;
  long prevPos = -1;
  int prevDocno = 0;

  pos = input.getPos();
  while (input.next(key, value)) {
    // the file position only changes when the reader crosses a block boundary;
    // emit the docno of the first page in each block along with the block's offset
    if (prevPos != -1 && prevPos != pos) {
      LOG.info("- beginning of block at " + prevPos + ", docno:" + prevDocno
          + ", file:" + fileno);
      keyOut.set(prevDocno);
      valOut.set(prevPos + "\t" + fileno);
      output.collect(keyOut, valOut);
      reporter.incrCounter(Blocks.Total, 1);
    }

    prevPos = pos;
    pos = input.getPos();
    prevDocno = key.get();
  }
}
@Override
public void reduce(MatrixIndexes indexes, Iterator<TaggedMatrixValue> values,
    OutputCollector<MatrixIndexes, WeightedPair> out, Reporter reporter) throws IOException {
  long start = System.currentTimeMillis();

  if (firsttime) {
    cachedReporter = reporter;
    firsttime = false;
  }

  cachedValues.reset();
  while (values.hasNext()) {
    TaggedMatrixValue taggedValue = values.next();
    cachedValues.set(taggedValue.getTag(), indexes, taggedValue.getBaseObject(), true);
  }
  // LOG.info("before aggregation: \n" + cachedValues);
  // perform aggregate operations first
  // processAggregateInstructions(indexes, values);
  // LOG.info("after aggregation: \n" + cachedValues);
  // perform mixed operations
  // processReducerInstructions();

  processCombineInstructionsAndOutput(reporter);

  reporter.incrCounter(Counters.COMBINE_OR_REDUCE_TIME, System.currentTimeMillis() - start);
}
public void map(LongWritable key, Text value, OutputCollector<LongWritable, Text> output,
    Reporter reporter) throws IOException {
  int slotId = Integer.parseInt(value.toString().trim());
  html.fireRandom(slotId);

  long[] range = HtmlCore.getPageRange(slotId, pages, slotpages);

  // emit one record per outgoing link for every page in this slot's range
  for (long i = range[0]; i < range[1]; i++) {
    key.set(i);
    long[] linkids = html.genPureLinkIds();
    for (int j = 0; j < linkids.length; j++) {
      String to = Long.toString(linkids[j]);
      Text v = new Text(to);
      output.collect(key, v);
      reporter.incrCounter(HiBench.Counters.BYTES_DATA_GENERATED, 8 + v.getLength());
    }
    if (0 == (i % 10000)) {
      log.info("still running: " + (i - range[0]) + " of " + slotpages);
    }
  }
}
/**
 * {@inheritDoc}
 *
 * @see org.apache.hadoop.mapred.Mapper#map(java.lang.Object, java.lang.Object,
 *     org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter)
 */
@Override
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output,
    Reporter reporter) throws IOException {
  String line = m_caseSensitive ? value.toString() : value.toString().toLowerCase();

  for (String pattern : m_patternsToSkip) {
    line = line.replaceAll(pattern, "");
  }

  StringTokenizer tokenizer = new StringTokenizer(line);
  while (tokenizer.hasMoreTokens()) {
    m_word.set(tokenizer.nextToken());
    output.collect(m_word, ONE);
    reporter.incrCounter(Counters.INPUT_WORDS, 1);
  }

  if ((++m_numRecords % 100) == 0) {
    reporter.setStatus("Finished processing " + m_numRecords + " records "
        + "from the input file: " + m_inputFile);
  }
}
public void func(Operator op) {
  // copy the operator's accumulated stats into Hadoop counters
  Map<Enum, Long> opStats = op.getStats();
  for (Map.Entry<Enum, Long> e : opStats.entrySet()) {
    if (this.rp != null) {
      rp.incrCounter(e.getKey(), e.getValue());
    }
  }
}
public void map(LongWritable key, Point value, OutputCollector<LongWritable, Point> output,
    Reporter reporter) throws IOException {
  // assign the point to the closest center (smallest sum-of-squares distance)
  double min = value.sumOfSquares(centers.get(0));
  int best = 0;
  for (int index = 1; index < numberOfCenters; ++index) {
    double current = value.sumOfSquares(centers.get(index));
    if (current < min) {
      min = current;
      best = index;
    }
  }

  reporter.incrCounter("NUMBER", "NODES", 1);
  reporter.incrCounter("CENTER", "" + best, 1);
  output.collect(new LongWritable(best), value);
}
public void map(LongWritable key, TrecDocument doc, OutputCollector<Text, IntWritable> output,
    Reporter reporter) throws IOException {
  reporter.incrCounter(Count.DOCS, 1);

  // emit a (docid, docno) pair for each document
  docid.set(doc.getDocid());
  one.set(docMapping.getDocno(doc.getDocid()));
  output.collect(docid, one);
}
/** Run a FileOperation */
public void map(Text key, PolicyInfo policy, OutputCollector<WritableComparable, Text> out,
    Reporter reporter) throws IOException {
  this.reporter = reporter;
  try {
    LOG.info("Raiding file=" + key.toString() + " policy=" + policy);
    Path p = new Path(key.toString());
    FileStatus fs = p.getFileSystem(jobconf).getFileStatus(p);
    st.clear();
    RaidNode.doRaid(jobconf, policy, fs, st, reporter);

    ++succeedcount;

    reporter.incrCounter(Counter.PROCESSED_BLOCKS, st.numProcessedBlocks);
    reporter.incrCounter(Counter.PROCESSED_SIZE, st.processedSize);
    reporter.incrCounter(Counter.META_BLOCKS, st.numMetaBlocks);
    reporter.incrCounter(Counter.META_SIZE, st.metaSize);
    reporter.incrCounter(Counter.FILES_SUCCEEDED, 1);
  } catch (IOException e) {
    ++failcount;
    reporter.incrCounter(Counter.FILES_FAILED, 1);

    String s = "FAIL: " + policy + ", " + key + " " + StringUtils.stringifyException(e);
    out.collect(null, new Text(s));
    LOG.info(s);
  } finally {
    reporter.setStatus(getCountString());
  }
}
public void reduce(Text key, Iterator<Text> iter, OutputCollector<Text, Text> output,
    Reporter reporter) throws IOException {
  Node node = new Node(key.toString());
  Set<String> f_unique = new HashSet<String>();
  Set<String> r_unique = new HashSet<String>();

  int sawnode = 0;

  while (iter.hasNext()) {
    String msg = iter.next().toString();
    // System.err.println(key.toString() + "\t" + msg);
    String[] vals = msg.split("\t");

    if (vals[0].equals(Node.NODEMSG)) {
      node.parseNodeMsg(vals, 0);
      sawnode++;
    } else if (vals[0].equals(Node.HASUNIQUEP)) {
      if (vals[2].equals("f")) {
        f_unique.add(vals[1]);
      } else if (vals[2].equals("r")) {
        r_unique.add(vals[1]);
      }
    } else {
      throw new IOException("Unknown msgtype: " + msg);
    }
  }

  if (sawnode != 1) {
    throw new IOException(
        "ERROR: Didn't see exactly 1 nodemsg (" + sawnode + ") for " + key.toString());
  }

  for (String adj : Node.dirs) {
    TailInfo next = node.gettail(adj);
    if (next != null) {
      if ((next.dir.equals("f") && r_unique.contains(next.id))
          || (next.dir.equals("r") && f_unique.contains(next.id))) {
        // for path compress
        if (node.getBlackEdges() == null) {
          node.setCanCompress(adj, true);
        }
        reporter.incrCounter("Brush", "compressible", 1);
      }
    }
  }

  // System.err.println(node.getNodeId() + " " + node.toNodeMsg());
  output.collect(new Text(node.getNodeId()), new Text(node.toNodeMsg()));
}
private boolean validate(String str, Reporter reporter) {
  String[] parts = str.split("\t");
  if (parts.length != 6) {
    if (parts.length < 6) {
      reporter.incrCounter(LineCounters.TOO_FEW_TABS, 1);
    } else {
      reporter.incrCounter(LineCounters.TOO_MANY_TABS, 1);
    }
    reporter.incrCounter(LineCounters.BAD_LINES, 1);

    // update the task status every 10 bad lines, reading the counter back
    if ((reporter.getCounter(LineCounters.BAD_LINES).getCounter() % 10) == 0) {
      reporter.setStatus("Got 10 bad lines.");
      System.err.println("Read another 10 bad lines.");
    }
    return false;
  }
  return true;
}
private void flushDomain(Reporter reporter) throws IOException {
  if (_currentDomainId != -1) {
    if (items.size() != 0) {
      spillItems(reporter);
    }

    if (reporter != null) {
      // bucket the domain into a histogram counter by spilled URL count
      if (currentDomainSpilledItemCount >= 10000000) {
        reporter.incrCounter(Counters.DOMAIN_WITH_GT_10MILLION_URLS, 1);
      } else if (currentDomainSpilledItemCount >= 1000000) {
        reporter.incrCounter(Counters.DOMAIN_WITH_GT_1MILLION_URLS, 1);
      } else if (currentDomainSpilledItemCount >= 100000) {
        reporter.incrCounter(Counters.DOMAIN_WITH_GT_100K_URLS, 1);
      } else if (currentDomainSpilledItemCount >= 50000) {
        reporter.incrCounter(Counters.DOMAIN_WITH_GT_50K_URLS, 1);
      } else if (currentDomainSpilledItemCount >= 10000) {
        reporter.incrCounter(Counters.DOMAIN_WITH_GT_10K_URLS, 1);
      } else if (currentDomainSpilledItemCount >= 1000) {
        reporter.incrCounter(Counters.DOMAIN_WITH_GT_1K_URLS, 1);
      } else if (currentDomainSpilledItemCount >= 100) {
        reporter.incrCounter(Counters.DOMAIN_WITH_GT_100_URLS, 1);
      } else if (currentDomainSpilledItemCount >= 10) {
        reporter.incrCounter(Counters.DOMAIN_WITH_GT_10_URLS, 1);
      } else if (currentDomainSpilledItemCount > 1) {
        reporter.incrCounter(Counters.DOMAIN_WITH_LT_10_URLS, 1);
      } else if (currentDomainSpilledItemCount == 1) {
        reporter.incrCounter(Counters.DOMAIN_WITH_1_URL, 1);
      }
    }

    _currentDomainId = -1;
    currentDomainCrawlIdx = -1;
    currentDomainName = "";
    currentDomainSpilledItemCount = 0;
    currentDomainURLCount = 0;
  }
}
@Override
public void reduce(GenericKey key, Iterator<GenericValue> values,
    OutputCollector<VectorPair, FloatWritable> output, Reporter reporter) throws IOException {
  int vectorID = key.getPrimary();
  assert (key.getSecondary() == -1);

  // the vector is the first value
  VectorComponentArrayWritable vector = (VectorComponentArrayWritable) values.next().get();

  // half pairs are sorted such that all equal pairs are consecutive
  if (values.hasNext()) {
    reporter.incrCounter(APS.COMBINED, 1);
    HalfPair hp1 = (HalfPair) values.next().get();
    float similarity = hp1.getSimilarity();

    HalfPair hp2;
    int counter = 0;
    while (values.hasNext()) {
      reporter.incrCounter(APS.COMBINED, 1);
      if (counter++ % REPORTER_INTERVAL == 0)
        reporter.progress();

      hp2 = (HalfPair) values.next().get();
      if (hp1.equals(hp2)) {
        similarity += hp2.getSimilarity();
      } else {
        // output
        outputHelper(hp1, vectorID, vector, similarity, output, reporter);
        // start new stripe
        hp1 = hp2;
        similarity = hp1.getSimilarity();
      }
    }

    // output the last one
    outputHelper(hp1, vectorID, vector, similarity, output, reporter);
  }
}
/** Outputs exactly one value for each key; this suppresses duplicates */
@Override
public void reduce(Text key, Iterator<LogRecord> vals, OutputCollector<Text, LogRecord> out,
    Reporter r) throws IOException {
  LogRecord i = vals.next();
  // out.collect(new Text(key.getKey()), i);
  out.collect(key, i);

  int dups = 0;
  while (vals.hasNext()) {
    vals.next();
    dups++;
  }
  r.incrCounter("app", "duplicate chunks", dups);
}
@Override
public void map(LongWritable key, Text value, OutputCollector<LongWritable, Text> output,
    Reporter reporter) throws IOException {
  int slotId = Integer.parseInt(value.toString().trim());
  long[] range = HtmlCore.getPageRange(slotId, pages, slotpages);

  for (long i = range[0]; i < range[1]; i++) {
    key.set(i);
    Text v = new Text(Long.toString(i));
    output.collect(key, v);
    reporter.incrCounter(HiBench.Counters.BYTES_DATA_GENERATED, 8 + v.getLength());
  }
}
/** Potentially reset state based on a domain id transition. */
private void domainTransition(long newDomainFP, String newDomainName, Reporter reporter)
    throws IOException {
  if (_currentDomainId != -1) {
    flushDomain(reporter);
  }

  _flags = 0;
  _domainStats = null;
  _domainRank = 0.0;
  _skipDomain = false;
  _skipEverythingButHomepage = false;
  // zero out item count ...
  items.clear();
  // reset domain id
  _currentDomainId = newDomainFP;
  currentDomainCrawlIdx = (((int) _currentDomainId & Integer.MAX_VALUE) % crawlerCount);
  // reset current domain url count
  currentDomainURLCount = 0;
  currentDomainName = newDomainName;
  // and reset last bundle id
  currentBundleId = 0;
  // reset spill count for domain
  currentDomainSpilledItemCount = 0;

  if (BlockedDomainList.blockedDomains.contains(newDomainFP)) {
    reporter.incrCounter(Counters.SKIPPING_BLOCKED_DOMAIN, 1);
    LOG.info("Skipping Blocked Domain:" + newDomainName);
    _skipDomain = true;
  }

  if (ipAddressRegExPattern.matcher(currentDomainName.trim()).matches()) {
    reporter.incrCounter(Counters.SKIPPING_IP_ADDRESS, 1);
    _skipDomain = true;
  }
}
/** Given an output filename, write a bunch of random records to it. */
public void map(WritableComparable key, Writable value,
    OutputCollector<BytesWritable, BytesWritable> output, Reporter reporter) throws IOException {
  int itemCount = 0;
  while (numBytesToWrite > 0) {
    int keyLength = minKeySize + (keySizeRange != 0 ? random.nextInt(keySizeRange) : 0);
    randomKey.setSize(keyLength);
    randomizeBytes(randomKey.getBytes(), 0, randomKey.getLength());

    int valueLength = minValueSize + (valueSizeRange != 0 ? random.nextInt(valueSizeRange) : 0);
    randomValue.setSize(valueLength);
    randomizeBytes(randomValue.getBytes(), 0, randomValue.getLength());

    output.collect(randomKey, randomValue);

    numBytesToWrite -= keyLength + valueLength;
    reporter.incrCounter(Counters.BYTES_WRITTEN, keyLength + valueLength);
    reporter.incrCounter(Counters.RECORDS_WRITTEN, 1);
    if (++itemCount % 200 == 0) {
      reporter.setStatus("wrote record " + itemCount + ". " + numBytesToWrite + " bytes left.");
    }
  }
  reporter.setStatus("done with " + itemCount + " records.");
}
public void warn(Object o, String msg, Enum warningEnum) {
  String displayMessage = o.getClass().getName() + ": " + msg;

  if (aggregate) {
    if (reporter != null) {
      reporter.incrCounter(warningEnum, 1);
    } else {
      // TODO:
      // In local mode of execution, if the PigHadoopLogger is used initially,
      // then aggregation cannot be performed as the reporter will be null.
      // The reference to a reporter is given by Hadoop at run time.
      // In local mode, due to the absence of Hadoop, there will be no reporter.
      // Just print the warning message as is.
      // If a warning message is printed in map reduce mode when aggregation
      // is turned on, then we have a problem; it's a bug.
      log.warn(displayMessage);
    }
  } else {
    log.warn(displayMessage);
  }
}
private FetchItem queueRedirect(Text redirUrl, FetchItem fit) throws ScoringFilterException {
  CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
      fit.datum.getFetchInterval(), fit.datum.getScore());
  // transfer all existing metadata to the redirect
  newDatum.getMetaData().putAll(fit.datum.getMetaData());
  scfilters.initialScore(redirUrl, newDatum);
  if (reprUrl != null) {
    newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
  }
  fit = FetchItem.create(redirUrl, newDatum, queueMode);
  if (fit != null) {
    FetchItemQueue fiq = ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID);
    fiq.addInProgressFetchItem(fit);
  } else {
    // stop redirecting
    redirecting = false;
    reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1);
  }
  return fit;
}
/** Spill cached items. */
void spillItems(Reporter reporter) throws IOException {
  // if item count exceeds spill threshold .. or we ran out of data ...
  if (items.size() != 0) {
    // LOG.info("Spilling Bundle:" + currentBundleId + " for DH:" +
    //     currentDomain + " ItemCount:" + subList.size());
    // flush items
    generateABundle(_currentDomainId, items, reporter);

    if (reporter != null) {
      reporter.progress();
    }

    // ok, increment counts ...
    currentDomainSpilledItemCount += items.size();

    if (currentDomainSpilledItemCount >= 1000000) {
      reporter.incrCounter(Counters.SPILLED_1_MILLION_SKIPPED_REST, 1);
      LOG.info("Skipping Remaining URLS for Domain:" + currentDomainName);
      _skipDomain = true;
    }
  }
  // reset list ...
  items.clear();
}
@Override
public void reduce(TaggedFirstSecondIndexes indexes, Iterator<MatrixValue> values,
    OutputCollector<Writable, Writable> out, Reporter report) throws IOException {
  long start = System.currentTimeMillis();
  // LOG.info("---------- key: " + indexes);

  commonSetup(report);

  // perform aggregate
  MatrixValue aggregateValue = performAggregateInstructions(indexes, values);
  if (aggregateValue == null)
    return;

  int tag = indexes.getTag();
  long firstIndex = indexes.getFirstIndex();
  long secondIndex = indexes.getSecondIndex();

  // for a different k
  if (prevFirstIndex != firstIndex) {
    resetCache();
    prevFirstIndex = firstIndex;
  } else if (prevTag > tag)
    throw new RuntimeException("tag is not ordered correctly: " + prevTag + " > " + tag);

  remainingbuffer.set(secondIndex, aggregateValue);
  try {
    processJoin(tag, remainingbuffer);
  } catch (Exception e) {
    throw new IOException(e);
  }
  prevTag = tag;

  report.incrCounter(Counters.COMBINE_OR_REDUCE_TIME, System.currentTimeMillis() - start);
}
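All of the snippets above share one idiom: incrementing task-level counters through the old org.apache.hadoop.mapred Reporter handle, either via a type-safe enum or via dynamic group/name strings. The following minimal mapper is a self-contained sketch of that idiom in isolation; the class name and counter names are illustrative assumptions, not taken from any of the projects above.

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class CounterPatternSketch extends MapReduceBase
    implements Mapper<LongWritable, Text, Text, LongWritable> {

  // enum-backed counters are type-safe and grouped under the enum's class name
  enum Counters { RECORDS, EMPTY_LINES }

  private final LongWritable one = new LongWritable(1);
  private final Text word = new Text();

  @Override
  public void map(LongWritable key, Text value,
      OutputCollector<Text, LongWritable> output, Reporter reporter) throws IOException {
    // enum form: reporter.incrCounter(Enum<?>, long)
    reporter.incrCounter(Counters.RECORDS, 1);

    String line = value.toString().trim();
    if (line.isEmpty()) {
      reporter.incrCounter(Counters.EMPTY_LINES, 1);
      return;
    }

    // dynamic form: reporter.incrCounter(String group, String counter, long)
    reporter.incrCounter("sketch", "non-empty-lines", 1);

    word.set(line);
    output.collect(word, one);
  }
}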