public synchronized String processNative(BehemothDocument inputDoc, Reporter reporter) {
  if (reporter != null) reporter.setStatus("GATE : " + inputDoc.getUrl().toString());
  // process the text passed as value with the application
  // a) create a GATE document based on the text value
  gate.Document gatedocument = null;
  try {
    gatedocument = generateGATEDoc(inputDoc);
    // add it to the current corpus
    corpus.add(gatedocument);
    // get the application and assign the corpus to it
    this.GATEapplication.setCorpus(corpus);
    // process it with GATE
    this.GATEapplication.execute();
    // return the annotated GATE document serialised as XML
    if (reporter != null) reporter.incrCounter("GATE", "Document", 1);
    return gatedocument.toXml();
  } catch (Exception e) {
    LOG.error(inputDoc.getUrl().toString(), e);
    if (reporter != null) reporter.incrCounter("GATE", "Exceptions", 1);
  } finally {
    // remove the document from the corpus again
    corpus.clear();
    // and from memory
    if (gatedocument != null) Factory.deleteResource(gatedocument);
  }
  return null;
}
/**
 * Map method.
 *
 * @param offset samples starting from the (offset+1)th sample
 * @param size the number of samples for this map
 * @param out output {true -> numInside, false -> numOutside}
 * @param reporter
 */
public void map(
    LongWritable offset,
    LongWritable size,
    OutputCollector<BooleanWritable, LongWritable> out,
    Reporter reporter)
    throws IOException {
  final HaltonSequence haltonsequence = new HaltonSequence(offset.get());
  long numInside = 0L;
  long numOutside = 0L;

  for (long i = 0; i < size.get(); ) {
    // generate points in a unit square
    final double[] point = haltonsequence.nextPoint();

    // count points inside/outside of the inscribed circle of the square
    final double x = point[0] - 0.5;
    final double y = point[1] - 0.5;
    if (x * x + y * y > 0.25) {
      numOutside++;
    } else {
      numInside++;
    }

    // report status
    i++;
    if (i % 1000 == 0) {
      reporter.setStatus("Generated " + i + " samples.");
    }
  }

  // output map results
  out.collect(new BooleanWritable(true), new LongWritable(numInside));
  out.collect(new BooleanWritable(false), new LongWritable(numOutside));
}
/** Run a FileOperation */
public void map(
    Text key, PolicyInfo policy, OutputCollector<WritableComparable, Text> out, Reporter reporter)
    throws IOException {
  this.reporter = reporter;
  try {
    LOG.info("Raiding file=" + key.toString() + " policy=" + policy);
    Path p = new Path(key.toString());
    FileStatus fs = p.getFileSystem(jobconf).getFileStatus(p);
    st.clear();
    RaidNode.doRaid(jobconf, policy, fs, st, reporter);

    ++succeedcount;

    reporter.incrCounter(Counter.PROCESSED_BLOCKS, st.numProcessedBlocks);
    reporter.incrCounter(Counter.PROCESSED_SIZE, st.processedSize);
    reporter.incrCounter(Counter.META_BLOCKS, st.numMetaBlocks);
    reporter.incrCounter(Counter.META_SIZE, st.metaSize);

    reporter.incrCounter(Counter.FILES_SUCCEEDED, 1);
  } catch (IOException e) {
    ++failcount;
    reporter.incrCounter(Counter.FILES_FAILED, 1);

    String s = "FAIL: " + policy + ", " + key + " " + StringUtils.stringifyException(e);
    out.collect(null, new Text(s));
    LOG.info(s);
  } finally {
    reporter.setStatus(getCountString());
  }
}
/**
 * This is the function that re-groups values for a key into sub-groups based on a secondary key
 * (input tag).
 *
 * @param key the map output key
 * @param arg1 the values for this key
 * @param reporter
 * @return a map from each input tag to the values carrying that tag
 */
private SortedMap<Object, ResetableIterator> regroup(Object key, Iterator arg1, Reporter reporter)
    throws IOException {
  this.numOfValues = 0;
  SortedMap<Object, ResetableIterator> retv = new TreeMap<Object, ResetableIterator>();
  IntermediateData aRecord = null;
  while (arg1.hasNext()) {
    this.numOfValues += 1;
    // log progress periodically
    if (this.numOfValues % 100 == 0) {
      reporter.setStatus("key: " + key.toString() + " numOfValues: " + this.numOfValues);
    }
    // stop once the per-group limit is exceeded
    if (this.numOfValues > this.maxNumOfValuesPerGroup) {
      break;
    }
    aRecord = ((IntermediateData) arg1.next()).clone(job);
    Text tag = aRecord.getTag();
    ResetableIterator data = retv.get(tag);
    if (data == null) {
      data = createResetableIterator();
      retv.put(tag, data);
    }
    data.add(aRecord);
  }
  // LOG.info("EXIT while");
  if (this.numOfValues > this.largestNumOfValues) {
    this.largestNumOfValues = numOfValues;
    LOG.info("key: " + key.toString() + " this.largestNumOfValues: " + this.largestNumOfValues);
  }
  return retv;
}
/**
 * {@inheritDoc}
 *
 * @see org.apache.hadoop.mapred.Mapper#map(java.lang.Object, java.lang.Object,
 *     org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter)
 */
@Override
public void map(
    LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
    throws IOException {
  String line = m_caseSensitive ? value.toString() : value.toString().toLowerCase();

  for (String pattern : m_patternsToSkip) {
    line = line.replaceAll(pattern, "");
  }

  StringTokenizer tokenizer = new StringTokenizer(line);
  while (tokenizer.hasMoreTokens()) {
    m_word.set(tokenizer.nextToken());
    output.collect(m_word, ONE);
    reporter.incrCounter(Counters.INPUT_WORDS, 1);
  }

  if ((++m_numRecords % 100) == 0) {
    reporter.setStatus(
        "Finished processing " + m_numRecords + " records " + "from the input file: "
            + m_inputFile);
  }
}
public void map(
    Text key, LongWritable value, OutputCollector<Text, LongWritable> collector, Reporter reporter)
    throws IOException {
  String name = key.toString();
  long size = value.get();
  long seed = Long.parseLong(name);

  random.setSeed(seed);
  reporter.setStatus("opening " + name);

  DataInputStream in = new DataInputStream(fs.open(new Path(DATA_DIR, name)));

  long read = 0;
  try {
    while (read < size) {
      long remains = size - read;
      int n = (remains <= buffer.length) ? (int) remains : buffer.length;
      in.readFully(buffer, 0, n);
      read += n;
      if (fastCheck) {
        Arrays.fill(check, (byte) random.nextInt(Byte.MAX_VALUE));
      } else {
        random.nextBytes(check);
      }
      if (n != buffer.length) {
        Arrays.fill(buffer, n, buffer.length, (byte) 0);
        Arrays.fill(check, n, check.length, (byte) 0);
      }
      assertTrue(Arrays.equals(buffer, check));

      reporter.setStatus("reading " + name + "@" + read + "/" + size);
    }
  } finally {
    in.close();
  }

  collector.collect(new Text("bytes"), new LongWritable(read));

  reporter.setStatus("read " + name);
}
public void map(
    Text key, LongWritable value, OutputCollector<K, LongWritable> collector, Reporter reporter)
    throws IOException {
  String name = key.toString();
  long size = value.get();
  long seed = Long.parseLong(name);

  if (size == 0) return;

  reporter.setStatus("opening " + name);

  FSDataInputStream in = fs.open(new Path(DATA_DIR, name));

  try {
    for (int i = 0; i < SEEKS_PER_FILE; i++) {
      // generate a random position
      long position = Math.abs(random.nextLong()) % size;

      // seek file to that position
      reporter.setStatus("seeking " + name);
      in.seek(position);
      byte b = in.readByte();

      // check that byte matches
      byte checkByte = 0;
      // advance random state to that position
      random.setSeed(seed);
      for (int p = 0; p <= position; p += check.length) {
        reporter.setStatus("generating data for " + name);
        if (fastCheck) {
          checkByte = (byte) random.nextInt(Byte.MAX_VALUE);
        } else {
          random.nextBytes(check);
          checkByte = check[(int) (position % check.length)];
        }
      }
      assertEquals(b, checkByte);
    }
  } finally {
    in.close();
  }
}
public void map(
    Text key, LongWritable value, OutputCollector<Text, LongWritable> collector, Reporter reporter)
    throws IOException {
  String name = key.toString();
  long size = value.get();
  long seed = Long.parseLong(name);

  random.setSeed(seed);
  reporter.setStatus("creating " + name);

  // write to temp file initially to permit parallel execution
  Path tempFile = new Path(DATA_DIR, name + suffix);
  OutputStream out = fs.create(tempFile);

  long written = 0;
  try {
    while (written < size) {
      if (fastCheck) {
        Arrays.fill(buffer, (byte) random.nextInt(Byte.MAX_VALUE));
      } else {
        random.nextBytes(buffer);
      }
      long remains = size - written;
      int length = (remains <= buffer.length) ? (int) remains : buffer.length;
      out.write(buffer, 0, length);
      written += length;
      reporter.setStatus("writing " + name + "@" + written + "/" + size);
    }
  } finally {
    out.close();
  }
  // rename to final location
  fs.rename(tempFile, new Path(DATA_DIR, name));

  collector.collect(new Text("bytes"), new LongWritable(written));

  reporter.setStatus("wrote " + name);
}
/**
 * A subclass can override this method to perform additional filtering and/or other processing
 * logic before a value is collected.
 *
 * @param key
 * @param aRecord
 * @param output
 * @param reporter
 * @throws IOException
 */
protected void collect(
    Object key, IntermediateData aRecord, OutputCollector output, Reporter reporter)
    throws IOException {
  this.collected += 1;
  addLongValue("collectedCount", 1);
  if (aRecord != null) {
    output.collect(key, aRecord.getData());
    reporter.setStatus("key: " + key.toString() + " collected: " + collected);
    addLongValue("actuallyCollectedCount", 1);
  }
}
@Override
public void reduce(
    Text key, Iterator<Text> iter, OutputCollector<Text, Text> oc, Reporter reporter)
    throws IOException {
  HashSet<Text> hash = new HashSet<Text>();
  while (iter.hasNext()) {
    hash.add(iter.next());
  }
  for (Text t : hash) oc.collect(key, t);
  reporter.setStatus("OK");
}
@Override
public void reduce(
    Text key, Iterator<Text> iter, OutputCollector<Text, Text> oc, Reporter reporter)
    throws IOException {
  HashSet<Text> hash = new HashSet<Text>();
  while (iter.hasNext()) {
    hash.add(iter.next());
  }
  oc.collect(key, new Text(Integer.toString(hash.size())));
  reporter.setStatus("OK");
}
public RecordReader<Text, Text> getRecordReader(
    InputSplit genericSplit, JobConf job, Reporter reporter) throws IOException {
  reporter.setStatus(genericSplit.toString());
  FileSplit split = (FileSplit) genericSplit;
  final Path file = split.getPath();
  FileSystem fs = file.getFileSystem(job);
  FSDataInputStream fileIn = fs.open(split.getPath());
  if (compressionCodecs != null && compressionCodecs.getCodec(file) != null)
    throw new RuntimeException("Not handling compression!");
  return new StreamXmlRecordReader(fileIn, split, reporter, job, FileSystem.get(job));
}
@Override // IOMapperBase
public Long doIO(Reporter reporter, String name, long totalSize // in bytes
    ) throws IOException {
  InputStream in = (InputStream) this.stream;
  long actualSize = 0;
  while (actualSize < totalSize) {
    int curSize = in.read(buffer, 0, bufferSize);
    if (curSize < 0) break;
    actualSize += curSize;
    reporter.setStatus(
        "reading " + name + "@" + actualSize + "/" + totalSize + " ::host = " + hostName);
  }
  return Long.valueOf(actualSize);
}
@Override // IOMapperBase
public Long doIO(Reporter reporter, String name, long totalSize // in bytes
    ) throws IOException {
  PositionedReadable in = (PositionedReadable) this.stream;
  long actualSize = 0;
  for (long pos = nextOffset(-1); actualSize < totalSize; pos = nextOffset(pos)) {
    int curSize = in.read(pos, buffer, 0, bufferSize);
    if (curSize < 0) break;
    actualSize += curSize;
    reporter.setStatus(
        "reading " + name + "@" + actualSize + "/" + totalSize + " ::host = " + hostName);
  }
  return Long.valueOf(actualSize);
}
@SuppressWarnings("unchecked") @Override /** * Instantiates a FileCollectionRecordReader using the specified spit (which is assumed to be a * CombineFileSplit. * * @param genericSplit contains files to be processed, assumed to be a CombineFileSplit * @param job JobConf of this job * @param reported To report progress */ public RecordReader<Text, SplitAwareWrapper<Document>> getRecordReader( InputSplit genericSplit, JobConf job, Reporter reporter) throws IOException { reporter.setStatus(genericSplit.toString()); return new FileCollectionRecordReader(job, (PositionAwareSplit<CombineFileSplit>) genericSplit); }
/** Given an output filename, write a bunch of random records to it. */
public void map(
    WritableComparable key,
    Writable value,
    OutputCollector<BytesWritable, BytesWritable> output,
    Reporter reporter)
    throws IOException {
  int itemCount = 0;
  while (numBytesToWrite > 0) {
    int keyLength = minKeySize + (keySizeRange != 0 ? random.nextInt(keySizeRange) : 0);
    randomKey.setSize(keyLength);
    randomizeBytes(randomKey.getBytes(), 0, randomKey.getLength());
    int valueLength = minValueSize + (valueSizeRange != 0 ? random.nextInt(valueSizeRange) : 0);
    randomValue.setSize(valueLength);
    randomizeBytes(randomValue.getBytes(), 0, randomValue.getLength());
    output.collect(randomKey, randomValue);
    numBytesToWrite -= keyLength + valueLength;
    reporter.incrCounter(Counters.BYTES_WRITTEN, keyLength + valueLength);
    reporter.incrCounter(Counters.RECORDS_WRITTEN, 1);
    if (++itemCount % 200 == 0) {
      reporter.setStatus("wrote record " + itemCount + ". " + numBytesToWrite + " bytes left.");
    }
  }
  reporter.setStatus("done with " + itemCount + " records.");
}
@Override
public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) {
  // try{ compatible with hadoop-0.14 TODO MC
  reporter.setStatus(split.toString());
  /* } catch (IOException e) {
    throw new RuntimeException("Cannot set status for reported:", e);
  } */

  // find part name
  SegmentPart segmentPart;
  final String spString;
  try {
    segmentPart = SegmentPart.get((FileSplit) split);
    spString = segmentPart.toString();
  } catch (IOException e) {
    throw new RuntimeException("Cannot identify segment:", e);
  }

  try {
    return new SequenceFileRecordReader(job, (FileSplit) split) {
      @Override
      public synchronized boolean next(Writable key, Writable value) throws IOException {
        LOG.debug("Running OIF.next()");

        MetaWrapper wrapper = (MetaWrapper) value;
        try {
          wrapper.set(getValueClass().newInstance());
        } catch (Exception e) {
          throw new IOException(e.toString());
        }

        boolean res = super.next(key, (Writable) wrapper.get());
        wrapper.setMeta(SEGMENT_PART_KEY, spString);
        return res;
      }

      @Override
      public Writable createValue() {
        return new MetaWrapper();
      }
    };
  } catch (IOException e) {
    throw new RuntimeException("Cannot create RecordReader: ", e);
  }
}
@Override // IOMapperBase
public Long doIO(Reporter reporter, String name, long totalSize // in bytes
    ) throws IOException {
  OutputStream out = (OutputStream) this.stream;

  // write to the file
  long nrRemaining;
  for (nrRemaining = totalSize; nrRemaining > 0; nrRemaining -= bufferSize) {
    int curSize = (bufferSize < nrRemaining) ? bufferSize : (int) nrRemaining;
    out.write(buffer, 0, curSize);
    reporter.setStatus(
        "writing " + name + "@" + (totalSize - nrRemaining) + "/" + totalSize
            + " ::host = " + hostName);
  }
  return Long.valueOf(totalSize);
}
private boolean validate(String str, Reporter reporter) {
  String[] parts = str.split("\t");
  if (parts.length != 6) {
    if (parts.length < 6) {
      reporter.incrCounter(LineCounters.TOO_FEW_TABS, 1);
    } else {
      reporter.incrCounter(LineCounters.TOO_MANY_TABS, 1);
    }
    reporter.incrCounter(LineCounters.BAD_LINES, 1);

    if ((reporter.getCounter(LineCounters.BAD_LINES).getCounter() % 10) == 0) {
      reporter.setStatus("Got 10 bad lines.");
      System.err.println("Read another 10 bad lines.");
    }
    return false;
  }
  return true;
}
public Long doIO(Reporter reporter, String name, long totalSize) throws IOException {
  totalSize *= MEGA;

  // create instance of local filesystem
  FileSystem localFS = FileSystem.getLocal(fsConfig);

  try {
    // native runtime
    Runtime runTime = Runtime.getRuntime();

    // copy the dso and executable from dfs
    synchronized (this) {
      localFS.delete(HDFS_TEST_DIR, true);
      if (!(localFS.mkdirs(HDFS_TEST_DIR))) {
        throw new IOException("Failed to create " + HDFS_TEST_DIR + " on local filesystem");
      }
    }

    synchronized (this) {
      if (!localFS.exists(HDFS_SHLIB)) {
        if (!FileUtil.copy(fs, HDFS_SHLIB, localFS, HDFS_SHLIB, false, fsConfig)) {
          throw new IOException("Failed to copy " + HDFS_SHLIB + " to local filesystem");
        }
        String chmodCmd = new String(CHMOD + " a+x " + HDFS_SHLIB);
        Process process = runTime.exec(chmodCmd);
        int exitStatus = process.waitFor();
        if (exitStatus != 0) {
          throw new IOException(chmodCmd + ": Failed with exitStatus: " + exitStatus);
        }
      }
    }

    synchronized (this) {
      if (!localFS.exists(HDFS_READ)) {
        if (!FileUtil.copy(fs, HDFS_READ, localFS, HDFS_READ, false, fsConfig)) {
          throw new IOException("Failed to copy " + HDFS_READ + " to local filesystem");
        }
        String chmodCmd = new String(CHMOD + " a+x " + HDFS_READ);
        Process process = runTime.exec(chmodCmd);
        int exitStatus = process.waitFor();
        if (exitStatus != 0) {
          throw new IOException(chmodCmd + ": Failed with exitStatus: " + exitStatus);
        }
      }
    }

    // exec the C program
    Path inFile = new Path(DATA_DIR, name);
    String readCmd = new String(HDFS_READ + " " + inFile + " " + totalSize + " " + bufferSize);
    Process process = runTime.exec(readCmd, null, new File(HDFS_TEST_DIR.toString()));
    int exitStatus = process.waitFor();
    if (exitStatus != 0) {
      throw new IOException(HDFS_READ + ": Failed with exitStatus: " + exitStatus);
    }
  } catch (InterruptedException interruptedException) {
    reporter.setStatus(interruptedException.toString());
  } finally {
    localFS.close();
  }
  return new Long(totalSize);
}
protected void sequenceCrush(FileSystem fs, FileStatus[] status)
    throws IOException, CrushException {
  l4j.info("Sequence file crushing activated");
  Class keyClass = null;
  Class valueClass = null;
  SequenceFile.Writer writer = null;
  for (FileStatus stat : status) {
    if (reporter != null) {
      reporter.setStatus("Crushing on " + stat.getPath());
      l4j.info("Current file " + stat.getPath());
      l4j.info("length " + stat.getLen());
      reporter.incrCounter(CrushMapper.CrushCounters.FILES_CRUSHED, 1);
    }
    Path p1 = stat.getPath();
    SequenceFile.Reader read = new SequenceFile.Reader(fs, p1, jobConf);
    if (keyClass == null) {
      keyClass = read.getKeyClass();
      valueClass = read.getValueClass();
      writer =
          SequenceFile.createWriter(
              fs, jobConf, outPath, keyClass, valueClass, this.compressionType, this.codec);
    } else {
      if (!(keyClass.equals(read.getKeyClass()) && valueClass.equals(read.getValueClass()))) {
        read.close();
        writer.close();
        throw new CrushException(
            "File " + stat.getPath() + " keyClass " + read.getKeyClass()
                + " valueClass " + read.getValueClassName() + " does not match"
                + " other files in folder");
      }
    }
    Writable k = (Writable) ReflectionUtils.newInstance(keyClass, jobConf);
    Writable v = (Writable) ReflectionUtils.newInstance(valueClass, jobConf);
    int rowCount = 0;
    while (read.next(k, v)) {
      writer.append(k, v);
      rowCount++;
      if (rowCount % 100000 == 0) {
        if (reporter != null) {
          reporter.setStatus(stat + " at row " + rowCount);
          l4j.debug(stat + " at row " + rowCount);
        }
      }
    }
    read.close();
    if (reporter != null) {
      reporter.incrCounter(CrushMapper.CrushCounters.ROWS_WRITTEN, rowCount);
    }
  } // end for
  writer.close();
  l4j.info("crushed file written to " + outPath);
}
public TableJoinRecordReader(
    JobConf jobConf, CloudataConf conf, TableSplit tableSplit, Reporter reporter)
    throws IOException {
  this.conf = conf;

  String mergeEvaluatorClass = tableSplit.getInputTableInfo().getMergeEvaluatorClass();
  MergeEvaluator mergeEvaluator = null;
  if (mergeEvaluatorClass != null && mergeEvaluatorClass.length() > 0) {
    try {
      mergeEvaluator = (MergeEvaluator) Class.forName(mergeEvaluatorClass).newInstance();
    } catch (Exception e) {
      LOG.error("mergeEvaluator:" + mergeEvaluatorClass + "," + e.getMessage());
      IOException err = new IOException(e.getMessage() + ":" + mergeEvaluatorClass);
      err.initCause(e);
      throw err;
    }
  }

  RowFilter splitRowFilter = tableSplit.getRowFilter();
  InputTableInfo inputTableInfo = tableSplit.getInputTableInfo();

  this.startRowKey = splitRowFilter.getStartRowKey();
  this.endRowKey = splitRowFilter.getEndRowKey();

  RowFilter rowFilter = inputTableInfo.getRowFilter();
  rowFilter.setStartRowKey(startRowKey);
  rowFilter.setEndRowKey(endRowKey);

  CTable ctable = CTable.openTable(conf, inputTableInfo.getTableName());

  TableScanner pivotScanner = null;
  TableScanner targetScanner = null;

  try {
    pivotScanner =
        ScannerFactory.openScanner(ctable, rowFilter, TableScanner.SCANNER_OPEN_TIMEOUT);

    Row.Key firstRowKey = null;
    try {
      // if this is not the first tablet, skip the first row
      if (!startRowKey.equals(Row.Key.MIN_KEY)) {
        Row pivotRow = pivotScanner.nextRow();
        if (pivotRow == null) {
          end = true;
          return;
        }
        if (firstRowKey == null) {
          firstRowKey = pivotRow.getKey();
        }
        if (firstRowKey.equals(pivotRow.getKey())) {
          pivotRow = pivotScanner.nextRow();
          if (pivotRow == null) {
            end = true;
            return;
          }
        }
        pivotScanner.close();
        rowFilter.setStartRowKey(firstRowKey);
        pivotScanner =
            ScannerFactory.openScanner(ctable, rowFilter, TableScanner.SCANNER_OPEN_TIMEOUT);
      } else {
        firstRowKey = startRowKey;
      }
    } catch (Exception e) {
      if (pivotScanner != null) {
        pivotScanner.close();
      }
      throw e;
    }

    RowFilter joinRowFilter = inputTableInfo.getJoinRowFilter();
    if (mergeEvaluator != null) {
      if (!firstRowKey.equals(Row.Key.MIN_KEY)) {
        joinRowFilter.setStartRowKey(mergeEvaluator.parseTargetRowKey(firstRowKey, 0));
      } else {
        joinRowFilter.setStartRowKey(Row.Key.MIN_KEY);
      }
      if (!rowFilter.getEndRowKey().equals(Row.Key.MAX_KEY)) {
        joinRowFilter.setEndRowKey(mergeEvaluator.parseTargetRowKey(rowFilter.getEndRowKey(), 0));
      } else {
        joinRowFilter.setEndRowKey(Row.Key.MAX_KEY);
      }
    } else {
      joinRowFilter.setStartRowKey(firstRowKey);
      joinRowFilter.setEndRowKey(rowFilter.getEndRowKey());
    }

    reporter.setStatus(
        inputTableInfo.getTableName() + ":" + startRowKey + " ~ " + endRowKey + ", "
            + inputTableInfo.getJoinTableName() + ":" + joinRowFilter.getStartRowKey()
            + " ~ " + joinRowFilter.getEndRowKey());

    // open a scanner on the target table covering the pivot table's startRow ~ endRow range
    CTable targetTable = CTable.openTable(conf, inputTableInfo.getJoinTableName());
    targetScanner =
        ScannerFactory.openScanner(targetTable, joinRowFilter, TableScanner.SCANNER_OPEN_TIMEOUT);

    this.scanner =
        new MergeScanner(new TableScanner[] {pivotScanner, targetScanner}, mergeEvaluator);
  } catch (Exception e) {
    if (targetScanner != null) {
      targetScanner.close();
    }
    IOException err = new IOException(e.getMessage());
    err.initCause(e);
    throw err;
  }
}
@Override
public RecordReader<LongWritable, Text> getRecordReader(
    InputSplit split, JobConf job, Reporter reporter) throws IOException {
  reporter.setStatus(split.toString());
  return new ExampleRecordReader(job, (FileSplit) split);
}
// Process an input document with GATE and a Reporter
public synchronized BehemothDocument[] process(BehemothDocument inputDoc, Reporter reporter) {
  if (reporter != null) reporter.setStatus("GATE : " + inputDoc.getUrl().toString());

  boolean clearBehemothAnnotations = config.getBoolean("gate.deleteBehemothAnnotations", false);

  // process the text passed as value with the application
  // a) create a GATE document based on the text value
  gate.Document gatedocument = null;
  try {
    gatedocument = generateGATEDoc(inputDoc);
    // add it to the current corpus
    corpus.add(gatedocument);
    // get the application and assign the corpus to it
    this.GATEapplication.setCorpus(corpus);
    // process it with GATE
    this.GATEapplication.execute();

    AnnotationSet annots = null;
    if ("".equals(filters.getAnnotationSetName())) annots = gatedocument.getAnnotations();
    else annots = gatedocument.getAnnotations(filters.getAnnotationSetName());

    // enrich the input doc with the annotations from
    // the GATE application
    // transfer the annotations from the GATE document
    // to the Behemoth one using the filters
    List<com.digitalpebble.behemoth.Annotation> beheannotations =
        convertGATEAnnotationsToBehemoth(annots, inputDoc);

    // sort the annotations before adding them?
    Collections.sort(beheannotations);

    // clear the existing behemoth annotations
    if (clearBehemothAnnotations) {
      inputDoc.getAnnotations().clear();
    }

    inputDoc.getAnnotations().addAll(beheannotations);

    // add counters about num of annotations added
    if (reporter != null)
      for (com.digitalpebble.behemoth.Annotation annot : beheannotations) {
        reporter.incrCounter("GATE", annot.getType(), 1);
      }

    // Add the document features from GATE to Behemoth
    Set<String> docFeatFilter = this.filters.getDocFeaturesFilter();
    MapWritable beheMD = inputDoc.getMetadata(true);
    if (docFeatFilter.size() > 0) {
      for (String docFeatName : docFeatFilter) {
        Object featValue = gatedocument.getFeatures().get(docFeatName);
        if (featValue != null) {
          beheMD.put(new Text(docFeatName), new Text(featValue.toString()));
        }
      }
    }

    if (reporter != null) reporter.incrCounter("GATE", "Document", 1);
  } catch (Exception e) {
    LOG.error(inputDoc.getUrl().toString(), e);
    if (reporter != null) reporter.incrCounter("GATE", "Exceptions", 1);
  } finally {
    // remove the document from the corpus again
    corpus.clear();
    // and from memory
    if (gatedocument != null) Factory.deleteResource(gatedocument);
  }
  // currently returns only the input document
  return new BehemothDocument[] {inputDoc};
}
public void status(String message) { reporter.setStatus(message.toString()); }
@Override
public RecordReader<LongWritable, Text> getRecordReader(
    InputSplit split, JobConf conf, Reporter reporter) throws IOException {
  reporter.setStatus(split.toString());
  return new DeprecatedLzoLineRecordReader(conf, (FileSplit) split);
}
// constructor used by the old API
ESRecordReader(org.apache.hadoop.mapred.InputSplit split, Configuration job, Reporter reporter) {
  reporter.setStatus(split.toString());
  init((ESInputSplit) split, job);
}
private void updateStatus(Reporter reporter) { reporter.setStatus(getCountString()); }
@Override
public void map(LongWritable key, Text value, OutputCollector<Text, Text> oc, Reporter reporter)
    throws IOException {
  BufferedReader fin = null;
  InputStream is = null;
  try {
    String s3Path = value.toString();
    URL url = new URL(s3Path);
    URLConnection conn = url.openConnection();
    conn.setConnectTimeout(20000);
    conn.setReadTimeout(20000);
    is = conn.getInputStream();
    // skip the two-byte "BZ" magic header before wrapping the stream in CBZip2InputStream
    is.read();
    is.read();
    fin = new BufferedReader(new InputStreamReader(new CBZip2InputStream(is), "UTF-8"));
    String currentTitle = "";
    // int cnt = 0;
    String line = null;
    StringWriter merged = null;
    CsvWriter writer;
    while ((line = fin.readLine()) != null) {
      if ("<page>".equals(line.trim())) {
        String secondLine = fin.readLine();
        currentTitle =
            new String(
                secondLine.substring(
                    secondLine.indexOf(pre) + pre.length(), secondLine.indexOf(suf)));
        secondLine = null;
      }
      if (line.trim().startsWith("{{Infobox")) {
        sb = new StringBuilder();
        merged = new StringWriter();
        writer = new CsvWriter(merged, ',');
        sb.append(line);
        sb.append(sep);
        while (true) {
          line = fin.readLine().trim();
          sb.append(line);
          sb.append(sep);
          if ("}}".equals(line)) {
            sb.append(line);
            sb.append(sep);
            break;
          }
          reporter.progress();
        }
        writer.writeRecord(new String[] {currentTitle, sb.toString()});
        writer.flush();
        oc.collect(new Text(""), new Text(merged.toString()));
        reporter.progress();
        reporter.setStatus(value.toString() + " processed");
        sb = null;
        merged = null;
        writer = null;
      }
      line = null;
    }
  } catch (IOException ioe) {
    reporter.setStatus("This task didn't get fully passed");
  } finally {
    try {
      // guard against NPE when the connection failed before fin was assigned
      if (fin != null) {
        fin.close();
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
}
/**
 * Copy a file to a destination.
 *
 * @param srcstat src path and metadata
 * @param relativedst relative dst path
 * @param reporter
 */
private void copy(
    FileStatus srcstat,
    Path relativedst,
    OutputCollector<WritableComparable<?>, Text> outc,
    Reporter reporter)
    throws IOException {
  Path absdst = new Path(destPath, relativedst);
  int totfiles = job.getInt(SRC_COUNT_LABEL, -1);
  assert totfiles >= 0 : "Invalid file count " + totfiles;

  // if a directory, ensure created even if empty
  if (srcstat.isDir()) {
    if (destFileSys.exists(absdst)) {
      if (!destFileSys.getFileStatus(absdst).isDir()) {
        throw new IOException("Failed to mkdirs: " + absdst + " is a file.");
      }
    } else if (!destFileSys.mkdirs(absdst)) {
      throw new IOException("Failed to mkdirs " + absdst);
    }
    // TODO: when modification times can be set, directories should be
    // emitted to reducers so they might be preserved. Also, mkdirs does
    // not currently return an error when the directory already exists;
    // if this changes, all directory work might as well be done in reduce
    return;
  }

  if (destFileSys.exists(absdst) && !overwrite && !needsUpdate(srcstat, destFileSys, absdst)) {
    outc.collect(null, new Text("SKIP: " + srcstat.getPath()));
    ++skipcount;
    reporter.incrCounter(Counter.SKIP, 1);
    updateStatus(reporter);
    return;
  }

  Path tmpfile = new Path(job.get(TMP_DIR_LABEL), relativedst);
  long cbcopied = 0L;
  FSDataInputStream in = null;
  FSDataOutputStream out = null;
  try {
    // open src file
    try {
      in = srcstat.getPath().getFileSystem(job).open(srcstat.getPath());
    } catch (IOException e) {
      LOG.error("Failed to open src file " + srcstat.getPath() + ", ignore and return");
      in = null;
      return;
    }
    reporter.incrCounter(Counter.BYTESEXPECTED, srcstat.getLen());
    // open tmp file
    out = create(tmpfile, reporter, srcstat);
    // copy file
    for (int cbread; (cbread = in.read(buffer)) >= 0; ) {
      out.write(buffer, 0, cbread);
      cbcopied += cbread;
      reporter.setStatus(
          String.format("%.2f ", cbcopied * 100.0 / srcstat.getLen())
              + absdst + " [ " + StringUtils.humanReadableInt(cbcopied) + " / "
              + StringUtils.humanReadableInt(srcstat.getLen()) + " ]");
    }
  } finally {
    checkAndClose(in);
    checkAndClose(out);
  }

  if (cbcopied != srcstat.getLen()) {
    if (srcstat.getLen() == 0 && cbcopied > 0) {
      LOG.info("most likely see a WAL file corruption: " + srcstat.getPath());
    } else {
      throw new IOException(
          "File size not matched: copied " + bytesString(cbcopied) + " to tmpfile (=" + tmpfile
              + ") but expected " + bytesString(srcstat.getLen()) + " from " + srcstat.getPath());
    }
  } else {
    if (totfiles == 1) {
      // Copying a single file; use dst path provided by user as destination
      // rather than destination directory, if a file
      Path dstparent = absdst.getParent();
      if (!(destFileSys.exists(dstparent) && destFileSys.getFileStatus(dstparent).isDir())) {
        absdst = dstparent;
      }
    }
    if (destFileSys.exists(absdst) && destFileSys.getFileStatus(absdst).isDir()) {
      throw new IOException(absdst + " is a directory");
    }
    if (!destFileSys.mkdirs(absdst.getParent())) {
      throw new IOException("Failed to create parent dir: " + absdst.getParent());
    }
    rename(tmpfile, absdst);

    FileStatus dststat = destFileSys.getFileStatus(absdst);
    if (dststat.getLen() != srcstat.getLen()) {
      destFileSys.delete(absdst, false);
      throw new IOException(
          "File size not matched: copied " + bytesString(dststat.getLen()) + " to dst (=" + absdst
              + ") but expected " + bytesString(srcstat.getLen()) + " from " + srcstat.getPath());
    }
    updatePermissions(srcstat, dststat);
  }

  // report at least once for each file
  ++copycount;
  reporter.incrCounter(Counter.BYTESCOPIED, cbcopied);
  reporter.incrCounter(Counter.COPY, 1);
  updateStatus(reporter);
}