@Override public void reduce( GenericKey key, Iterator<GenericValue> values, OutputCollector<GenericKey, GenericValue> output, Reporter reporter) throws IOException { if (key.getSecondary() < Preprocesser.MINIMUM_ID) { // vector output.collect(key, values.next()); if (values.hasNext()) assert false : "Vectors should not get grouped by combiner: " + key; } else { // addend reporter.progress(); int counter = 0; float sim = 0; HalfPair hp = null; while (values.hasNext()) { hp = (HalfPair) values.next().get(); sim += hp.getSimilarity(); if (counter++ % REPORTER_INTERVAL == 0) reporter.progress(); } if (hp != null) { payload.set(hp.getID(), sim); outValue.set(payload); output.collect(key, outValue); } else { assert false : "There is nothing to combine!"; } } }
/**
 * Decides whether the current URL record should be dropped rather than emitted, incrementing
 * counters that record which rule produced the decision.
 *
 * <p>Rules are evaluated in order: domain-level skip flag, URL validity, the query-URL rule, and
 * finally (for valid URLs without a query) redirect / homepage-or-feed / homepage-only /
 * previous-crawl-status rules. When no rule matches, the record is kept and no counter is
 * incremented.
 *
 * @param urlObject the parsed URL of the record under consideration
 * @param reporter receives the per-rule counters
 * @return {@code true} if the record must be skipped, {@code false} if it may be emitted
 */
boolean skipRecord(GoogleURL urlObject, Reporter reporter) {
  // The whole domain was flagged to be skipped.
  if (_skipDomain) {
    reporter.incrCounter(Counters.SKIPPING_BAD_DOMAIN_URL, 1);
    return true;
  }
  if (!urlObject.isValid()) {
    reporter.incrCounter(Counters.SKIPPING_INVALID_URL, 1);
    return true;
  } else if (urlObject.has_query()) {
    reporter.incrCounter(Counters.HIT_QUERY_CHECK_CONDITION, 1);
    // Query URLs are skipped unless they carry homepage or feed data.
    if ((_flags & (HAS_HOMEPAGE_URLDATA | HAS_FEED_URLDATA)) == 0) {
      reporter.incrCounter(Counters.SKIPPING_QUERY_URL, 1);
      return true;
    }
    // NOTE(review): a query URL with homepage/feed data falls through to the final
    // "return false" without going through any of the checks in the else-branch below.
  } else {
    // if redirect ... skip
    if ((_flags & HAS_REDIRECT_DATA) != 0) {
      reporter.incrCounter(Counters.SKIPPING_REDIRECTED_URL, 1);
      return true;
    }
    // Homepage/feed URLs are let through; in homepage-only mode only homepage URLs qualify.
    if ((_flags & (HAS_HOMEPAGE_URLDATA | HAS_FEED_URLDATA)) != 0) {
      if (!_skipEverythingButHomepage || ((_flags & HAS_HOMEPAGE_URLDATA) != 0)) {
        reporter.incrCounter(Counters.ALLOWING_HOMEPAGE_OR_FEEDURL, 1);
        return false;
      }
    }
    // Anything reaching this point in homepage-only mode is not an allowed homepage URL.
    if (_skipEverythingButHomepage) {
      reporter.incrCounter(Counters.SKIPPING_EVERYTHING_BUT_HOMEPAGE_URL, 1);
      return true;
    }
    if (_crawlStatus != null) {
      if (_crawlStatus.has("crawl_status")) {
        JsonObject realCrawlStatus = _crawlStatus.get("crawl_status").getAsJsonObject();
        if (realCrawlStatus.has("http_result")) {
          int httpResult = realCrawlStatus.get("http_result").getAsInt();
          // Only a recorded HTTP result of 200 or 404 triggers the rules below.
          if (httpResult == 200 || httpResult == 404) {
            if ((_flags & HAS_BLOGPROBE_URLDATA) != 0) {
              // For blog-probe URLs the decision is delegated to _blogURLSkipFlag.
              if (_blogURLSkipFlag.get()) {
                reporter.incrCounter(Counters.SKIPPING_BLOGPROBE_URL, 1);
                return true;
              } else {
                reporter.incrCounter(Counters.RECRAWLING_BLOGPROBE_URL, 1);
                return false;
              }
            } else {
              reporter.incrCounter(Counters.SKIPPING_ALREADY_FETCHED, 1);
              return true;
            }
          }
        }
      }
    }
  }
  // No rule matched: keep the record.
  return false;
}
/**
 * Turns every bubble record stored on a node into three messages, clears the bubbles, and
 * re-emits the node itself.
 *
 * <p>A bubble record is a {@code '|'}-separated string whose first six fields are read as:
 * minor, minor direction, dead, new direction, new id, extra coverage.
 *
 * @param lineid input key (unused)
 * @param nodetxt serialized node message
 * @param output collector for the bubble messages and the node message
 * @param reporter receives the "Contrail" counters
 * @throws IOException if the collector fails
 */
public void map(
    LongWritable lineid, Text nodetxt, OutputCollector<Text, Text> output, Reporter reporter)
    throws IOException {
  Node node = new Node();
  node.fromNodeMsg(nodetxt.toString());

  List<String> pending = node.getBubbles();
  if (pending != null) {
    for (String entry : pending) {
      // Read every field up front so a malformed record fails before anything is emitted.
      String[] fields = entry.split("\\|");
      String minor = fields[0];
      String minorDir = fields[1];
      String dead = fields[2];
      String newDir = fields[3];
      String newId = fields[4];
      String extraCov = fields[5];

      String killLink =
          Node.KILLLINKMSG + "\t" + minorDir + "\t" + dead + "\t" + newDir + "\t" + newId;
      output.collect(new Text(minor), new Text(killLink));
      output.collect(new Text(dead), new Text(Node.KILLMSG));
      output.collect(new Text(newId), new Text(Node.EXTRACOV + "\t" + extraCov));
      reporter.incrCounter("Contrail", "bubblespopped", 1);
    }
    node.clearBubbles();
  }

  // The node is serialized after its bubbles have been cleared.
  output.collect(new Text(node.getNodeId()), new Text(node.toNodeMsg()));
  reporter.incrCounter("Contrail", "nodes", 1);
}
/**
 * {@inheritDoc}
 *
 * <p>Optionally lower-cases the line, strips every configured skip pattern, then emits
 * {@code (token, ONE)} for each whitespace-separated token. A status message is published
 * every 100 records.
 *
 * @see org.apache.hadoop.mapred.Mapper#map(java.lang.Object, java.lang.Object,
 *     org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter)
 */
@Override
public void map(
    LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
    throws IOException {
  String line = value.toString();
  if (!m_caseSensitive) {
    line = line.toLowerCase();
  }

  // Each skip pattern is applied as a regular expression and removed from the line.
  for (String pattern : m_patternsToSkip) {
    line = line.replaceAll(pattern, "");
  }

  for (StringTokenizer tokens = new StringTokenizer(line); tokens.hasMoreTokens(); ) {
    m_word.set(tokens.nextToken());
    output.collect(m_word, ONE);
    reporter.incrCounter(Counters.INPUT_WORDS, 1);
  }

  m_numRecords++;
  if ((m_numRecords % 100) == 0) {
    reporter.setStatus(
        "Finished processing "
            + m_numRecords
            + " records "
            + "from the input file: "
            + m_inputFile);
  }
}
/**
 * Resets the compress flag for every direction of a node and, for each direction that has a
 * tail pointing at a different node, sends that node a {@code HASUNIQUEP} message. The node
 * itself is re-emitted afterwards.
 *
 * @param lineid input key (unused)
 * @param nodetxt serialized node message
 * @param output collector for the remote marks and the node message
 * @param reporter receives the "Brush" counters
 * @throws IOException if the collector fails
 */
public void map(
    LongWritable lineid, Text nodetxt, OutputCollector<Text, Text> output, Reporter reporter)
    throws IOException {
  Node node = new Node();
  node.fromNodeMsg(nodetxt.toString());

  for (String adj : Node.dirs) {
    node.setCanCompress(adj, false);

    TailInfo tail = node.gettail(adj);
    // No tail in this direction, or the tail is the node itself: nothing to notify.
    if (tail == null || tail.id.equals(node.getNodeId())) {
      continue;
    }

    reporter.incrCounter("Brush", "remotemark", 1);
    String mark = Node.HASUNIQUEP + "\t" + node.getNodeId() + "\t" + adj;
    output.collect(new Text(tail.id), new Text(mark));
  }

  output.collect(new Text(node.getNodeId()), new Text(node.toNodeMsg()));
  reporter.incrCounter("Brush", "nodes", 1);
}
public synchronized String processNative(BehemothDocument inputDoc, Reporter reporter) { if (reporter != null) reporter.setStatus("GATE : " + inputDoc.getUrl().toString()); // process the text passed as value with the application // a) create a GATE document based on the text value gate.Document gatedocument = null; try { gatedocument = generateGATEDoc(inputDoc); // add it to the current corpus corpus.add(gatedocument); // get the application and assign the corpus to it this.GATEapplication.setCorpus(corpus); // process it with GATE this.GATEapplication.execute(); // transfer the annotations from the GATE document // to the Behemoth one using the filters if (reporter != null) reporter.incrCounter("GATE", "Document", 1); return gatedocument.toXml(); } catch (Exception e) { LOG.error(inputDoc.getUrl().toString(), e); if (reporter != null) reporter.incrCounter("GATE", "Exceptions", 1); } finally { // remove the document from the corpus again corpus.clear(); // and from memory if (gatedocument != null) Factory.deleteResource(gatedocument); } return null; }
/**
 * Parses one adjacency-list line ({@code nodeId neighbor1 neighbor2 ...}, whitespace
 * separated) into the reusable {@code node} and emits it keyed by node id.
 *
 * @param key input key (unused)
 * @param t the adjacency-list line
 * @param output collector receiving {@code (nodeId, node)}
 * @param reporter receives the "graph" counters
 * @throws IOException if the collector fails
 */
public void map(
    LongWritable key,
    Text t,
    OutputCollector<IntWritable, PageRankNode> output,
    Reporter reporter)
    throws IOException {
  final String[] tokens = t.toString().trim().split("\\s+");
  final int id = Integer.parseInt(tokens[0]);
  final int degree = tokens.length - 1;

  nid.set(id);
  node.setNodeId(id);

  if (degree == 0) {
    // A line holding only the node id: the node has no outgoing edges.
    node.setAdjacencyList(new ArrayListOfIntsWritable());
  } else {
    int[] neighbors = new int[degree];
    for (int k = 0; k < degree; k++) {
      neighbors[k] = Integer.parseInt(tokens[k + 1]);
    }
    node.setAdjacencyList(new ArrayListOfIntsWritable(neighbors));
  }

  reporter.incrCounter("graph", "numNodes", 1);
  reporter.incrCounter("graph", "numEdges", degree);
  if (degree > 0) {
    reporter.incrCounter("graph", "numActiveNodes", 1);
  }

  output.collect(nid, node);
}
private boolean outputHelper( HalfPair hp, int vectorID, VectorComponentArrayWritable vector, float similarity, OutputCollector<VectorPair, FloatWritable> output, Reporter reporter) throws IOException { reporter.incrCounter(APS.EVALUATED, 1); reporter.progress(); if (haspruned) { VectorComponentArrayWritable remainder = pruned.get(hp.getID()); if (remainder != null) { // cheap upper bound dot(x,y) <= min(|x|,|y|) * maxweight(x) * maxweight(y) // double dotProdBound = min(remainder.length(), vector.length()) * // remainder.getMaxWeight() // * vector.getMaxWeight(); // if (compare(similarity + dotProdBound, threshold) >= 0) similarity += VectorComponentArrayWritable.dotProduct(vector, remainder); } else { LOG.warn("No remainder found for vector " + hp.getID()); } } if (compare(similarity, threshold) >= 0) { int firstID = VectorPair.canonicalFirst(vectorID, hp.getID()); int secondID = VectorPair.canonicalSecond(vectorID, hp.getID()); outKey.set(firstID, secondID); outValue.set(similarity); output.collect(outKey, outValue); reporter.incrCounter(APS.SIMILAR, 1); return true; } return false; }
/**
 * Closes the consumer (if any) and publishes per-topic read statistics as counters: elapsed
 * read time, request time, whole megabytes read, and event count.
 *
 * @throws IOException if closing the consumer fails
 */
public void close() throws IOException {
  if (_consumer != null) {
    _consumer.close();
  }

  final String topic = _request.getTopic();

  final long elapsedMs = System.currentTimeMillis() - _startTime;
  _reporter.incrCounter(topic, "read-time(ms)", elapsedMs);
  _reporter.incrCounter(topic, "request-time(ms)", _requestTime);

  // Bytes consumed = distance travelled from the first offset of the range.
  final long bytesRead = _offset - _offsetRange[0];
  // Fractional megabytes are truncated by the cast.
  final long wholeMegabytes = (long) (bytesRead / (1024.0 * 1024.0));
  _reporter.incrCounter(topic, "data-read(mb)", wholeMegabytes);
  _reporter.incrCounter(topic, "event-count", _count);
}
/**
 * Installs the statistics of the domain about to be processed and derives the per-domain skip
 * flags from them. Also flushes the emitted-URL bloom filter once it has absorbed
 * {@code FLUSH_THRESHOLD} entries.
 *
 * <p>Members read from {@code domainStats}: {@code "dR"} (domain rank, defaults to 0.0),
 * {@code "urls"}, {@code "crawled"} and {@code "200"} (both default to 0 when absent).
 *
 * <ul>
 *   <li>URLs known, some crawled, none answered 200: only the homepage is kept
 *       ({@code _skipEverythingButHomepage}).
 *   <li>Otherwise, a URL count inside one of the size bands combined with a domain rank below
 *       that band's minimum marks the whole domain as skipped ({@code _skipDomain}).
 * </ul>
 *
 * <p>The flags are only ever set here, never cleared.
 *
 * @param domainStats JSON statistics for the domain; must not be {@code null}
 * @param reporter receives the decision counters
 * @throws IOException declared for callers; nothing in this body throws it directly
 */
void setDomainStats(JsonObject domainStats, Reporter reporter) throws IOException {
  _domainStats = domainStats;

  final double domainRank = _domainStats.has("dR") ? _domainStats.get("dR").getAsDouble() : 0.0;
  _domainRank = domainRank;

  if (_domainStats.has("urls")) {
    int urlCount = _domainStats.get("urls").getAsInt();
    // "crawled" is treated as optional, like "200": previously a stats object with "urls" but
    // no "crawled" member caused a NullPointerException here.
    int crawledCount = _domainStats.has("crawled") ? _domainStats.get("crawled").getAsInt() : 0;
    int http200Count = _domainStats.has("200") ? _domainStats.get("200").getAsInt() : 0;

    if (urlCount != 0 && crawledCount != 0 && http200Count == 0) {
      reporter.incrCounter(Counters.SKIPPING_BAD_DOMAIN_BASED_ON_CRAWL_HISTORY, 1);
      LOG.info(
          "Skipping Everything But Homepage for Domain:"
              + _newDomainBytes.toString()
              + " CrawledCount:"
              + crawledCount
              + " HTTP200Count:"
              + http200Count
              + " URLCount:"
              + urlCount);
      _skipEverythingButHomepage = true;
    } else {
      // Minimum domain rank a domain of this size needs in order to be kept.
      // NOTE(review): the bands leave gaps - counts in [100000, 250000] and exactly 1000000
      // match no band and are never skipped. This looks like a typo in one of the bounds, but
      // the intended value cannot be determined from here, so the bounds are left untouched.
      final double requiredRank;
      if (urlCount > 25000 && urlCount < 100000) {
        requiredRank = 3.0;
      } else if (urlCount > 250000 && urlCount < 1000000) {
        requiredRank = 4.0;
      } else if (urlCount > 1000000) {
        requiredRank = 5.0;
      } else {
        // No band applies: no rank can be "too low".
        requiredRank = Double.NEGATIVE_INFINITY;
      }

      // A missing "dR" yields rank 0.0, which is below every band minimum.
      if (domainRank < requiredRank) {
        LOG.info("Skipping Domain:" + _newDomainBytes.toString());
        reporter.incrCounter(Counters.SKIPPING_DOMAIN_EXCEEDED_URL_COUNT_AND_LOW_DR, 1);
        _skipDomain = true;
      }
    }
  }

  if (_emittedURLSInFilter >= FLUSH_THRESHOLD) {
    _emittedURLSFilter.clear();
    _emittedURLSInFilter = 0;
    reporter.incrCounter(Counters.FLUSHED_BLOOMFILTER, 1);
  }
}
void emitLastRecord(Reporter reporter) throws IOException { if (_flags != 0) { if (_domainStats == null) { reporter.incrCounter(Counters.EMITTING_URL_RECORD_WITH_NULL_DOMAINSTATS, 1); } else { reporter.incrCounter(Counters.EMITTING_URL_RECORD_WITH_DOMINSTATS, 1); } if (_crawlStatus != null) { reporter.incrCounter(Counters.EMITTED_RECORD_HAD_CRAWLSTATUS, 1); } else { reporter.incrCounter(Counters.EMITTED_RECORD_HAD_NULL_CRAWLSTATUS, 1); } } if (_contextURLBytes.getLength() >= 4097) { reporter.incrCounter(Counters.SKIPPING_INVALID_LENGTH_URL, 1); } else { GoogleURL urlObject = new GoogleURL(_contextURLBytes.toString()); if (!skipRecord(urlObject, reporter)) { if (urlObject.has_query()) { reporter.incrCounter(Counters.LET_THROUGH_QUERY_URL, 1); } URLFPV2 fp = URLUtils.getURLFPV2FromURLObject(urlObject); if (fp != null) { if (_emittedURLSFilter.isPresent(fp)) { reporter.incrCounter(Counters.SKIPPING_ALREADY_EMITTED_URL, 1); } else { _emittedURLSFilter.add(fp); _emittedURLSInFilter++; SegmentGeneratorItem itemValue = new SegmentGeneratorItem(); itemValue.setDomainFP(fp.getDomainHash()); itemValue.setRootDomainFP(fp.getRootDomainHash()); itemValue.setUrlFP(fp.getUrlHash()); itemValue.setUrl(urlObject.getCanonicalURL()); itemValue.setPageRank(0); itemValue.setModifiedStatus((byte) 0); items.add(itemValue); if (items.size() >= SPILL_THRESHOLD) spillItems(reporter); } } else { reporter.incrCounter(Counters.NULL_FP_FOR_URL, 1); } } } // reset stuff _flags = 0; _crawlStatus = null; _contextURLBytes.clear(); _blogURLSkipFlag.set(true); }
public void map(Text key, Text value, OutputCollector<Text, PhrasePair> oc, Reporter reporter) throws IOException { // key: a single sentence in both languages and alignment // ignore value. each key is parallel sentence and its alignment, in xml format ParallelChunk c = pcr.parseString(key.toString()); ok.set(c.idString()); // Chunk is an array of tokens in the sentence, without any special tokenization (just // separated by spaces) Chunk fc = c.getChunk(src); Chunk ec = c.getChunk(tgt); if (fc == null || ec == null) { reporter.incrCounter(BitextCompilerCounters.WRONG_LANGUAGE, 1); return; } if (fc.getLength() > 200) { reporter.incrCounter(BitextCompilerCounters.SRC_TOO_LONG, 1); return; } if (ec.getLength() > 200) { reporter.incrCounter(BitextCompilerCounters.TGT_TOO_LONG, 1); return; } // ec,fc: English/French sentence represented as sequence of words // vocE,vocF: vocabularies for english and french, of type VocabularyWritable // ee,fe: integer representation of words in sentences ec and fc sLogger.debug("Target sentence:"); int[] ee = convertStrings(tawp.preprocessWordsForAlignment(ec.getWords()), vocE); sLogger.debug("Source sentence:"); int[] fe = convertStrings(sawp.preprocessWordsForAlignment(fc.getWords()), vocF); // e,f: phrase from whole sentence Phrase e = new Phrase(ee, 0); Phrase f = new Phrase(fe, 1); edu.umd.hooka.PhrasePair b = new PhrasePair(f, e); ReferenceAlignment ra = c.getReferenceAlignment(lp); if (ra != null) { b.setAlignment(ra); } reporter.incrCounter(BitextCompilerCounters.EN_WORDS, e.getWords().length); reporter.incrCounter(BitextCompilerCounters.FR_WORDS, f.getWords().length); reporter.incrCounter(BitextCompilerCounters.CHUNKS, 1); oc.collect(ok, b); }
/**
 * Uploads 50000 rows with random, zero-padded 20-digit keys into {@code ctable} and emits each
 * generated key. The table name is emitted once before the rows.
 *
 * @param key input key (unused)
 * @param value input value (unused)
 * @param collector receives the table name followed by every generated row key
 * @param reporter pinged every 1000 rows
 * @throws IOException the stored {@code err} if one was recorded earlier, or any failure from
 *     the table or the collector
 */
public void map(
    WritableComparable key,
    Writable value,
    OutputCollector<WritableComparable, Writable> collector,
    Reporter reporter)
    throws IOException {
  LOG.info("Start Map");
  if (err != null) {
    throw err;
  }

  DecimalFormat keyFormat = new DecimalFormat("00000000000000000000");
  collector.collect(new Text(tableName), new Text(""));

  long uploaded = 0;
  while (uploaded < 50000) {
    String formattedKey = keyFormat.format(rand.nextLong());

    Row row = new Row(new Row.Key(formattedKey));
    row.addCell("Col1", new Cell(Cell.Key.EMPTY_KEY, this.data));
    ctable.put(row);

    if (uploaded % 1000 == 0) {
      reporter.progress();
    }
    if (uploaded % 10000 == 0) {
      LOG.info("uploaded: " + uploaded);
    }
    collector.collect(new Text(formattedKey), new Text(""));
    uploaded++;
  }
  LOG.info("End Map");
}
/**
 * Generates the outgoing links of every page in the page range assigned to one slot.
 *
 * <p>The input value holds the slot id; for each page id in the slot's range, one
 * {@code (pageId, linkId)} record is emitted per generated link. The input {@code key} object
 * is reused as the output key.
 *
 * @param key reused as the output key (set to the current page id)
 * @param value textual slot id
 * @param output collector receiving the link records
 * @param reporter receives the generated-bytes counter
 * @throws IOException if the collector fails
 */
public void map(
    LongWritable key, Text value, OutputCollector<LongWritable, Text> output, Reporter reporter)
    throws IOException {
  final int slotId = Integer.parseInt(value.toString().trim());
  html.fireRandom(slotId);

  final long[] range = HtmlCore.getPageRange(slotId, pages, slotpages);
  final long firstPage = range[0];
  final long endPage = range[1];

  for (long page = firstPage; page < endPage; page++) {
    key.set(page);
    for (long linkId : html.genPureLinkIds()) {
      Text link = new Text(Long.toString(linkId));
      output.collect(key, link);
      // 8 + text length per record; the 8 presumably accounts for the long key.
      reporter.incrCounter(HiBench.Counters.BYTES_DATA_GENERATED, 8 + link.getLength());
    }

    if (page % 10000 == 0) {
      log.info("still running: " + (page - firstPage) + " of " + slotpages);
    }
  }
}
/** * Map method. * * @param offset samples starting from the (offset+1)th sample. * @param size the number of samples for this map * @param out output {ture->numInside, false->numOutside} * @param reporter */ public void map( LongWritable offset, LongWritable size, OutputCollector<BooleanWritable, LongWritable> out, Reporter reporter) throws IOException { final HaltonSequence haltonsequence = new HaltonSequence(offset.get()); long numInside = 0L; long numOutside = 0L; for (long i = 0; i < size.get(); ) { // generate points in a unit square final double[] point = haltonsequence.nextPoint(); // count points inside/outside of the inscribed circle of the square final double x = point[0] - 0.5; final double y = point[1] - 0.5; if (x * x + y * y > 0.25) { numOutside++; } else { numInside++; } // report status i++; if (i % 1000 == 0) { reporter.setStatus("Generated " + i + " samples."); } } // output map results out.collect(new BooleanWritable(true), new LongWritable(numInside)); out.collect(new BooleanWritable(false), new LongWritable(numOutside)); }
/** * This is the function that re-groups values for a key into sub-groups based on a secondary key * (input tag). * * @param arg1 * @return */ private SortedMap<Object, ResetableIterator> regroup(Object key, Iterator arg1, Reporter reporter) throws IOException { this.numOfValues = 0; SortedMap<Object, ResetableIterator> retv = new TreeMap<Object, ResetableIterator>(); IntermediateData aRecord = null; while (arg1.hasNext()) { this.numOfValues += 1; // make log while processing if (this.numOfValues % 100 == 0) { reporter.setStatus("key: " + key.toString() + " numOfValues: " + this.numOfValues); } // skip out when exccess limit if (this.numOfValues > this.maxNumOfValuesPerGroup) { break; } aRecord = ((IntermediateData) arg1.next()).clone(job); Text tag = aRecord.getTag(); ResetableIterator data = retv.get(tag); if (data == null) { data = createResetableIterator(); retv.put(tag, data); } data.add(aRecord); } // LOG.info("EXIT while"); if (this.numOfValues > this.largestNumOfValues) { this.largestNumOfValues = numOfValues; LOG.info("key: " + key.toString() + " this.largestNumOfValues: " + this.largestNumOfValues); } return retv; }
@Override public void reduce( MatrixIndexes indexes, Iterator<TaggedMatrixValue> values, OutputCollector<MatrixIndexes, WeightedPair> out, Reporter reporter) throws IOException { long start = System.currentTimeMillis(); if (firsttime) { cachedReporter = reporter; firsttime = false; } cachedValues.reset(); while (values.hasNext()) { TaggedMatrixValue taggedValue = values.next(); cachedValues.set(taggedValue.getTag(), indexes, taggedValue.getBaseObject(), true); } // LOG.info("before aggregation: \n"+cachedValues); // perform aggregate operations first // processAggregateInstructions(indexes, values); // LOG.info("after aggregation: \n"+cachedValues); // perform mixed operations // processReducerInstructions(); processCombineInstructionsAndOutput(reporter); reporter.incrCounter(Counters.COMBINE_OR_REDUCE_TIME, System.currentTimeMillis() - start); }
@Override public void map( WritableComparable key, CompactorInputSplit split, OutputCollector<NullWritable, NullWritable> nullWritableVOutputCollector, Reporter reporter) throws IOException { // This will only get called once, since CompactRecordReader only returns one record, // the input split. // Based on the split we're passed we go instantiate the real reader and then iterate on it // until it finishes. @SuppressWarnings("unchecked") // since there is no way to parametrize instance of Class AcidInputFormat<WritableComparable, V> aif = instantiate(AcidInputFormat.class, jobConf.get(INPUT_FORMAT_CLASS_NAME)); ValidTxnList txnList = new ValidReadTxnList(jobConf.get(ValidTxnList.VALID_TXNS_KEY)); boolean isMajor = jobConf.getBoolean(IS_MAJOR, false); AcidInputFormat.RawReader<V> reader = aif.getRawReader( jobConf, isMajor, split.getBucket(), txnList, split.getBaseDir(), split.getDeltaDirs()); RecordIdentifier identifier = reader.createKey(); V value = reader.createValue(); getWriter(reporter, reader.getObjectInspector(), split.getBucket()); while (reader.next(identifier, value)) { if (isMajor && reader.isDelete(value)) continue; writer.write(value); reporter.progress(); } }
/**
 * Scans the input and emits one record each time the reader position changes between two
 * consecutive records, i.e. at the beginning of a new block.
 *
 * <p>The emitted record maps the docno of the previous record to
 * {@code "<previous position>\t<fileno>"}.
 *
 * @param input reader over {@code (docno, page)} records
 * @param output collector receiving the block boundary records
 * @param reporter receives the {@code Blocks.Total} counter
 * @throws IOException if reading or collecting fails
 */
public void run(
    RecordReader<IntWritable, WikipediaPage> input,
    OutputCollector<IntWritable, Text> output,
    Reporter reporter)
    throws IOException {
  IntWritable key = new IntWritable();
  WikipediaPage value = new WikipediaPage();

  long prevPos = -1;
  int prevDocno = 0;
  long pos = input.getPos();

  while (input.next(key, value)) {
    // -1 marks "no previous record yet"; a differing position marks a block boundary.
    final boolean atBlockStart = prevPos != -1 && prevPos != pos;
    if (atBlockStart) {
      LOG.info(
          "- beginning of block at " + prevPos + ", docno:" + prevDocno + ", file:" + fileno);
      keyOut.set(prevDocno);
      valOut.set(prevPos + "\t" + fileno);
      output.collect(keyOut, valOut);
      reporter.incrCounter(Blocks.Total, 1);
    }

    prevPos = pos;
    pos = input.getPos();
    prevDocno = key.get();
  }
}
/**
 * Adds every statistic exposed by an operator to the reporter's counters. Does nothing per
 * entry when no reporter is set.
 *
 * @param op the operator whose statistics are published
 */
public void func(Operator op) {
  for (Map.Entry<Enum, Long> stat : op.getStats().entrySet()) {
    if (this.rp == null) {
      continue;
    }
    rp.incrCounter(stat.getKey(), stat.getValue());
  }
}
/**
 * Run a FileOperation: raids a single file according to its policy.
 *
 * <p>On success the statistics gathered in {@code st} are added to the job counters. An
 * {@code IOException} is not propagated: the failure is counted, logged, and emitted as a
 * record with a {@code null} key. The task status is refreshed in every case.
 *
 * @param key path of the file to raid
 * @param policy the policy to apply
 * @param out receives one failure record per failed file
 * @param reporter receives counters and status; also stored in the {@code reporter} field
 * @throws IOException if emitting the failure record itself fails
 */
public void map(
    Text key,
    PolicyInfo policy,
    OutputCollector<WritableComparable, Text> out,
    Reporter reporter)
    throws IOException {
  this.reporter = reporter;
  try {
    LOG.info("Raiding file=" + key.toString() + " policy=" + policy);

    Path file = new Path(key.toString());
    FileStatus status = file.getFileSystem(jobconf).getFileStatus(file);

    st.clear();
    RaidNode.doRaid(jobconf, policy, status, st, reporter);
    ++succeedcount;

    reporter.incrCounter(Counter.PROCESSED_BLOCKS, st.numProcessedBlocks);
    reporter.incrCounter(Counter.PROCESSED_SIZE, st.processedSize);
    reporter.incrCounter(Counter.META_BLOCKS, st.numMetaBlocks);
    reporter.incrCounter(Counter.META_SIZE, st.metaSize);
    reporter.incrCounter(Counter.FILES_SUCCEEDED, 1);
  } catch (IOException e) {
    ++failcount;
    reporter.incrCounter(Counter.FILES_FAILED, 1);

    String failure = "FAIL: " + policy + ", " + key + " " + StringUtils.stringifyException(e);
    out.collect(null, new Text(failure));
    LOG.info(failure);
  } finally {
    reporter.setStatus(getCountString());
  }
}
/**
 * Reads a data file back and verifies its content against the pseudo-random stream seeded by
 * the file name.
 *
 * <p>The file is consumed in {@code buffer}-sized chunks; for each chunk the expected bytes
 * are regenerated into {@code check} and compared. For a final partial chunk the unused tails
 * of both arrays are zeroed before comparing.
 *
 * @param key file name, which is also the decimal random seed
 * @param value expected file size in bytes
 * @param collector receives {@code ("bytes", bytesRead)}
 * @param reporter receives status updates
 * @throws IOException if the file cannot be opened or read
 */
public void map(
    Text key,
    LongWritable value,
    OutputCollector<Text, LongWritable> collector,
    Reporter reporter)
    throws IOException {
  final String name = key.toString();
  final long size = value.get();
  random.setSeed(Long.parseLong(name));

  reporter.setStatus("opening " + name);
  DataInputStream in = new DataInputStream(fs.open(new Path(DATA_DIR, name)));

  long read = 0;
  try {
    while (read < size) {
      // Read a full buffer, or whatever is left for the last chunk.
      final int chunk = (int) Math.min(size - read, buffer.length);
      in.readFully(buffer, 0, chunk);
      read += chunk;

      // Regenerate the expected content for this chunk.
      if (fastCheck) {
        Arrays.fill(check, (byte) random.nextInt(Byte.MAX_VALUE));
      } else {
        random.nextBytes(check);
      }

      if (chunk != buffer.length) {
        // Partial chunk: blank the stale tails so they compare equal.
        Arrays.fill(buffer, chunk, buffer.length, (byte) 0);
        Arrays.fill(check, chunk, check.length, (byte) 0);
      }

      assertTrue(Arrays.equals(buffer, check));
      reporter.setStatus("reading " + name + "@" + read + "/" + size);
    }
  } finally {
    in.close();
  }

  collector.collect(new Text("bytes"), new LongWritable(read));
  reporter.setStatus("read " + name);
}
public void map( Text key, LongWritable value, OutputCollector<Text, LongWritable> collector, Reporter reporter) throws IOException { String name = key.toString(); long size = value.get(); long seed = Long.parseLong(name); random.setSeed(seed); reporter.setStatus("creating " + name); // write to temp file initially to permit parallel execution Path tempFile = new Path(DATA_DIR, name + suffix); OutputStream out = fs.create(tempFile); long written = 0; try { while (written < size) { if (fastCheck) { Arrays.fill(buffer, (byte) random.nextInt(Byte.MAX_VALUE)); } else { random.nextBytes(buffer); } long remains = size - written; int length = (remains <= buffer.length) ? (int) remains : buffer.length; out.write(buffer, 0, length); written += length; reporter.setStatus("writing " + name + "@" + written + "/" + size); } } finally { out.close(); } // rename to final location fs.rename(tempFile, new Path(DATA_DIR, name)); collector.collect(new Text("bytes"), new LongWritable(written)); reporter.setStatus("wrote " + name); }
public void map( Text key, LongWritable value, OutputCollector<K, LongWritable> collector, Reporter reporter) throws IOException { String name = key.toString(); long size = value.get(); long seed = Long.parseLong(name); if (size == 0) return; reporter.setStatus("opening " + name); FSDataInputStream in = fs.open(new Path(DATA_DIR, name)); try { for (int i = 0; i < SEEKS_PER_FILE; i++) { // generate a random position long position = Math.abs(random.nextLong()) % size; // seek file to that position reporter.setStatus("seeking " + name); in.seek(position); byte b = in.readByte(); // check that byte matches byte checkByte = 0; // advance random state to that position random.setSeed(seed); for (int p = 0; p <= position; p += check.length) { reporter.setStatus("generating data for " + name); if (fastCheck) { checkByte = (byte) random.nextInt(Byte.MAX_VALUE); } else { random.nextBytes(check); checkByte = check[(int) (position % check.length)]; } } assertEquals(b, checkByte); } } finally { in.close(); } }
/**
 * Collects one record. Subclasses can override this method to perform additional filtering
 * and/or other processing before a value is collected.
 *
 * <p>Every call is counted ("collectedCount"); only calls with a non-null record reach the
 * output and are counted as "actuallyCollectedCount".
 *
 * @param key the output key
 * @param aRecord the record whose data is emitted; {@code null} records are not emitted
 * @param output the collector receiving {@code (key, aRecord.getData())}
 * @param reporter receives a status update for each emitted record
 * @throws IOException if the collector fails
 */
protected void collect(
    Object key, IntermediateData aRecord, OutputCollector output, Reporter reporter)
    throws IOException {
  this.collected += 1;
  addLongValue("collectedCount", 1);

  if (aRecord == null) {
    return;
  }

  output.collect(key, aRecord.getData());
  reporter.setStatus("key: " + key.toString() + " collected: " + collected);
  addLongValue("actuallyCollectedCount", 1);
}
/**
 * Assigns a point to the center with the smallest sum of squares and emits the point keyed by
 * that center's index. Ties keep the lowest index.
 *
 * @param key input key (unused)
 * @param value the point to assign
 * @param output collector receiving {@code (centerIndex, point)}
 * @param reporter receives the node counter and a per-center counter
 * @throws IOException if the collector fails
 */
public void map(
    LongWritable key, Point value, OutputCollector<LongWritable, Point> output, Reporter reporter)
    throws IOException {
  int closest = 0;
  double smallest = value.sumOfSquares(centers.get(0));

  int candidate = 1;
  while (candidate < numberOfCenters) {
    final double distance = value.sumOfSquares(centers.get(candidate));
    if (distance < smallest) {
      smallest = distance;
      closest = candidate;
    }
    ++candidate;
  }

  reporter.incrCounter("NUMBER", "NODES", 1);
  reporter.incrCounter("CENTER", Integer.toString(closest), 1);
  output.collect(new LongWritable(closest), value);
}
/**
 * Emits the number of distinct values seen for a key.
 *
 * <p>Each value is copied before it is stored: the framework reuses the value object it hands
 * to the reducer, so storing the reference itself would leave the set holding a single,
 * repeatedly overwritten object and the count would be wrong.
 *
 * @param key the group key
 * @param iter the values of the group
 * @param oc collector receiving {@code (key, distinctCount)}
 * @param reporter receives the final status
 * @throws IOException if the collector fails
 */
@Override
public void reduce(
    Text key, Iterator<Text> iter, OutputCollector<Text, Text> oc, Reporter reporter)
    throws IOException {
  HashSet<Text> distinct = new HashSet<Text>();
  while (iter.hasNext()) {
    // Defensive copy - see the method comment.
    distinct.add(new Text(iter.next()));
  }
  oc.collect(key, new Text(Integer.toString(distinct.size())));
  reporter.setStatus("OK");
}
/**
 * Emits every distinct value of a key exactly once.
 *
 * <p>Each value is copied before it is stored: the framework reuses the value object it hands
 * to the reducer, so storing the reference itself would leave the set holding a single,
 * repeatedly overwritten object and de-duplication would not work.
 *
 * @param key the group key
 * @param iter the values of the group
 * @param oc collector receiving one {@code (key, value)} per distinct value
 * @param reporter receives the final status
 * @throws IOException if the collector fails
 */
@Override
public void reduce(
    Text key, Iterator<Text> iter, OutputCollector<Text, Text> oc, Reporter reporter)
    throws IOException {
  HashSet<Text> distinct = new HashSet<Text>();
  while (iter.hasNext()) {
    // Defensive copy - see the method comment.
    distinct.add(new Text(iter.next()));
  }
  for (Text unique : distinct) {
    oc.collect(key, unique);
  }
  reporter.setStatus("OK");
}
/**
 * Emits the {@code (docid, docno)} pair of one document, looking the docno up in
 * {@code docMapping}.
 *
 * @param key input key (unused)
 * @param doc the document
 * @param output collector receiving the pair; the reusable {@code docid} and {@code one}
 *     writables are used as key and value
 * @param reporter receives the {@code Count.DOCS} counter
 * @throws IOException if the collector fails
 */
public void map(
    LongWritable key,
    TrecDocument doc,
    OutputCollector<Text, IntWritable> output,
    Reporter reporter)
    throws IOException {
  reporter.incrCounter(Count.DOCS, 1);

  docid.set(doc.getDocid());

  // Despite its name, "one" carries the document number here.
  final int docno = docMapping.getDocno(doc.getDocid());
  one.set(docno);

  output.collect(docid, one);
}
public void reduce( Text key, Iterator<Text> iter, OutputCollector<Text, Text> output, Reporter reporter) throws IOException { Node node = new Node(key.toString()); Set<String> f_unique = new HashSet<String>(); Set<String> r_unique = new HashSet<String>(); int sawnode = 0; while (iter.hasNext()) { String msg = iter.next().toString(); // System.err.println(key.toString() + "\t" + msg); String[] vals = msg.split("\t"); if (vals[0].equals(Node.NODEMSG)) { node.parseNodeMsg(vals, 0); sawnode++; } else if (vals[0].equals(Node.HASUNIQUEP)) { if (vals[2].equals("f")) { f_unique.add(vals[1]); } else if (vals[2].equals("r")) { r_unique.add(vals[1]); } } else { throw new IOException("Unknown msgtype: " + msg); } } if (sawnode != 1) { throw new IOException( "ERROR: Didn't see exactly 1 nodemsg (" + sawnode + ") for " + key.toString()); } for (String adj : Node.dirs) { TailInfo next = node.gettail(adj); if (next != null) { if ((next.dir.equals("f") && r_unique.contains(next.id)) || (next.dir.equals("r") && f_unique.contains(next.id))) { // for path compress if (node.getBlackEdges() == null) { node.setCanCompress(adj, true); } reporter.incrCounter("Brush", "compressible", 1); } } } // System.err.println(node.getNodeId() + " " + node.toNodeMsg() ); output.collect(new Text(node.getNodeId()), new Text(node.toNodeMsg())); }